├── .github ├── actionlint.yaml ├── actions │ └── free-disk-space │ │ └── action.yml ├── dependabot.yml ├── mergify.yml ├── stale_bot.yml └── workflows │ ├── actionlint.dockerfile │ ├── actionlint.yml │ ├── constraints-update.yml │ ├── docs.yml │ ├── e2e-nvidia-l4-x1.yml │ ├── e2e-nvidia-l40s-x4.yml │ ├── lint.yml │ ├── matchers │ ├── actionlint.json │ └── pylint.json │ ├── pypi.yaml │ ├── spellcheck.yml │ └── test.yml ├── .gitignore ├── .isort.cfg ├── .markdownlint-cli2.yaml ├── .pre-commit-config.yaml ├── .pylintrc ├── .spellcheck-en-custom.txt ├── .spellcheck.yml ├── CHANGELOG.md ├── DCO.txt ├── LICENSE ├── Makefile ├── README.md ├── constraints-dev.txt ├── constraints-dev.txt.in ├── docs ├── ci.md └── release-strategy.md ├── pyproject.toml ├── requirements-dev.txt ├── requirements-files.in ├── requirements-leaderboard.txt ├── requirements-ruler.txt ├── requirements.txt ├── scripts ├── evaluate_best_checkpoint.py ├── functional-tests.sh ├── ruff.sh ├── test_branch_gen_answers.py ├── test_branch_generator.py ├── test_branch_judge_answers.py ├── test_gen_answers.py ├── test_judge_answers.py ├── test_leaderboard.py ├── test_mmlu.py └── test_mmlu_branch.py ├── src └── instructlab │ ├── __init__.py │ └── eval │ ├── __init__.py │ ├── data │ ├── mt_bench │ │ ├── judge_prompts.jsonl │ │ ├── question.jsonl │ │ └── reference_answer │ │ │ └── gpt-4.jsonl │ └── mt_bench_branch │ │ └── judge_prompts.jsonl │ ├── evaluator.py │ ├── exceptions.py │ ├── leaderboard.py │ ├── logger_config.py │ ├── mmlu.py │ ├── mt_bench.py │ ├── mt_bench_answers.py │ ├── mt_bench_branch_generator.py │ ├── mt_bench_common.py │ ├── mt_bench_conversation.py │ ├── mt_bench_judgment.py │ ├── mt_bench_model_adapter.py │ ├── ragas.py │ └── ruler.py ├── tests ├── test_mmlu.py ├── test_mt_bench.py ├── test_mt_bench_answers.py ├── test_mt_bench_common.py ├── test_mt_bench_judgment.py ├── test_mt_bench_model_adapter.py ├── test_project.py ├── test_ragas.py └── testdata │ └── sdg │ ├── _default_template_yaml │ ├── tonsil_data.jsonl │ └── tonsil_task.yaml └── tox.ini /.github/actionlint.yaml: -------------------------------------------------------------------------------- 1 | self-hosted-runner: 2 | labels: 3 | - ubuntu-gpu 4 | -------------------------------------------------------------------------------- /.github/actions/free-disk-space/action.yml: -------------------------------------------------------------------------------- 1 | name: 'Free Disk Space' 2 | description: 'Frees disk space on the runner' 3 | runs: 4 | using: "composite" 5 | steps: 6 | - run: | 7 | df -h 8 | sudo docker rmi "$(docker image ls -aq)" >/dev/null 2>&1 || true 9 | sudo rm -rf \ 10 | /usr/share/dotnet /usr/local/lib/android /opt/ghc \ 11 | /usr/local/share/powershell /usr/share/swift /usr/local/.ghcup \ 12 | /usr/lib/jvm || true 13 | sudo apt install aptitude -y >/dev/null 2>&1 14 | sudo aptitude purge '~n ^mysql' -f -y >/dev/null 2>&1 15 | sudo aptitude purge '~n ^dotnet' -f -y >/dev/null 2>&1 16 | sudo apt-get autoremove -y >/dev/null 2>&1 17 | sudo apt-get autoclean -y >/dev/null 2>&1 18 | df -h 19 | shell: bash 20 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | # GitHub Dependabot configuration file 4 | version: 2 5 | updates: 6 | 7 | # Maintain dependencies for GitHub Actions 8 | - package-ecosystem: "github-actions" 9 | directory: "/" 10 | schedule: 11 | 
interval: "daily" 12 | - package-ecosystem: "docker" 13 | directory: "/.github/workflows" 14 | schedule: 15 | interval: "daily" 16 | -------------------------------------------------------------------------------- /.github/mergify.yml: -------------------------------------------------------------------------------- 1 | pull_request_rules: 2 | - name: auto-merge 3 | description: automatic merge for main with >= 2 approved reviews, all requested reviews have given feedback, not held, and CI is successful 4 | conditions: 5 | - "#approved-reviews-by>=2" 6 | - "#review-requested=0" 7 | - "#changes-requested-reviews-by=0" 8 | - or: 9 | - base=main 10 | - base~=^release- 11 | - label!=hold 12 | - label!=do-not-merge 13 | - label!=needs-rebase 14 | 15 | # The files conditions regex should match the globs in workflow files 16 | # If workflow configuration files in .github/ are changed, the actionlint check must pass 17 | - or: 18 | - and: 19 | - check-success=actionlint 20 | - or: 21 | - files~=^\.github/(actions|workflows)/.*\.ya?ml$ 22 | - files~=^\.github/workflows/actionlint\. 23 | - and: 24 | - -files~=^\.github/(actions|workflows)/.*\.ya?ml$ 25 | - -files~=^\.github/workflows/actionlint\. 26 | 27 | # e2e medium workflow 28 | - or: 29 | - and: 30 | # note this should match the triggering criteria in 'e2e-nvidia-l4-x1.yml' 31 | - check-success~=e2e-medium-workflow-complete 32 | - or: 33 | - files~=\.py$ 34 | - files=pyproject.toml 35 | - files~=^requirements.*\.txt$ 36 | - files=.github/workflows/e2e-nvidia-l4-x1.yml 37 | - and: 38 | - -files~=\.py$ 39 | - -files=pyproject.toml 40 | - -files~=^requirements.*\.txt$ 41 | - -files=.github/workflows/e2e-nvidia-l4-x1.yml 42 | 43 | # lint must pass if files change that would trigger this job 44 | - or: 45 | - and: 46 | - check-success=lint-workflow-complete 47 | - or: 48 | # see .github/workflows/lint.yml and test.yml 49 | - files~=\.py$ 50 | - files=pyproject.toml 51 | - files~=^requirements.*\.txt$ 52 | - files=tox.ini 53 | - files~=^scripts/[^/]+\.sh$ 54 | - files=.github/workflows/lint.yml 55 | - and: 56 | - -files~=\.py$ 57 | - -files=pyproject.toml 58 | - -files~=^requirements.*\.txt$ 59 | - -files=tox.ini 60 | - -files~=^scripts/[^/]+\.sh$ 61 | - -files=.github/workflows/lint.yml 62 | 63 | - or: 64 | - and: 65 | - check-success=markdown-lint 66 | - or: 67 | - files~=\.md$ 68 | - files=.markdownlint-cli2.yaml 69 | - files=.github/workflows/docs.yml 70 | - and: 71 | - -files~=\.md$ 72 | - -files=.markdownlint-cli2.yaml 73 | - -files=.github/workflows/docs.yml 74 | 75 | - or: 76 | - and: 77 | - check-success=spellcheck 78 | - or: 79 | - files~=\.md$ 80 | - files=tox.ini 81 | - files~=^\.spellcheck[^/]+$ 82 | - files=.github/workflows/spellcheck.yml 83 | - and: 84 | - -files~=\.md$ 85 | - -files=tox.ini 86 | - -files~=^\.spellcheck[^/]+$ 87 | - -files=.github/workflows/spellcheck.yml 88 | 89 | actions: 90 | merge: 91 | method: merge 92 | delete_head_branch: 93 | 94 | - name: label-cicd 95 | description: Automatically apply CI/CD label 96 | conditions: 97 | - or: 98 | - files=.github/mergify.yml 99 | - files~=^\.github/(actions|workflows)/ 100 | - files=scripts/ruff.sh 101 | - files=.pre-commit-config.yaml 102 | - files=.pylintrc 103 | - files~=^\.spellcheck[^/]+$ 104 | - files=tox.ini 105 | - files=.markdownlint-cli2.yaml 106 | actions: 107 | label: 108 | add: 109 | - CI/CD 110 | 111 | - name: label-documentation 112 | description: Automatically apply documentation label 113 | conditions: 114 | - or: 115 | - files~=^[^/]+\.md$ 116 | actions: 117 | 
label: 118 | add: 119 | - documentation 120 | 121 | - name: label-testing 122 | description: Automatically apply testing label 123 | conditions: 124 | - or: 125 | - files~=^tests/ 126 | - files=tox.ini 127 | actions: 128 | label: 129 | add: 130 | - testing 131 | 132 | - name: ping author on conflicts and add 'needs-rebase' label 133 | conditions: 134 | - conflict 135 | - -closed 136 | actions: 137 | label: 138 | add: 139 | - needs-rebase 140 | comment: 141 | message: | 142 | This pull request has merge conflicts that must be resolved before it can be 143 | merged. @{{author}} please rebase it. https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork 144 | 145 | - name: remove 'needs-rebase' label when conflict is resolved 146 | conditions: 147 | - -conflict 148 | - -closed 149 | actions: 150 | label: 151 | remove: 152 | - needs-rebase 153 | 154 | - name: release-branch-label 155 | description: Automatically apply the release-branch label to release branch PRs 156 | conditions: 157 | - base~=^release- 158 | actions: 159 | label: 160 | add: 161 | - release-branch 162 | 163 | - name: Apply ci-failure label if any CI checks have failed 164 | conditions: 165 | - "#check-failure>0" 166 | actions: 167 | label: 168 | add: 169 | - ci-failure 170 | 171 | - name: Remove ci-failure label if no failures are present 172 | conditions: 173 | - "#check-failure=0" 174 | actions: 175 | label: 176 | remove: 177 | - ci-failure 178 | 179 | - name: Apply 'one-approval' label if one of the maintainer approved the PR 180 | conditions: 181 | - "#approved-reviews-by=1" 182 | actions: 183 | label: 184 | add: 185 | - one-approval 186 | 187 | - name: Remove 'one-approval' label if the approval was reset 188 | conditions: 189 | - "#approved-reviews-by!=1" 190 | actions: 191 | label: 192 | remove: 193 | - one-approval 194 | 195 | - name: label-dependencies 196 | description: Automatically apply dependencies label 197 | conditions: 198 | - or: 199 | - files~=^requirements.*\.txt$ 200 | - files~=^requirements/ 201 | actions: 202 | label: 203 | add: 204 | - dependencies 205 | -------------------------------------------------------------------------------- /.github/stale_bot.yml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | name: 'Close stale issues and PRs' 4 | 5 | on: 6 | schedule: 7 | - cron: '30 1 * * *' 8 | 9 | env: 10 | LC_ALL: en_US.UTF-8 11 | 12 | defaults: 13 | run: 14 | shell: bash 15 | 16 | permissions: 17 | contents: read 18 | 19 | jobs: 20 | stale: 21 | permissions: 22 | issues: write 23 | pull-requests: write 24 | runs-on: ubuntu-latest 25 | steps: 26 | - name: "Harden Runner" 27 | uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1 28 | with: 29 | egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs 30 | 31 | - name: "Stale Action" 32 | uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0 33 | with: 34 | stale-issue-label: 'stale' 35 | stale-issue-message: > 36 | This issue has been automatically marked as stale because it has not had activity within 90 days. 37 | It will be automatically closed if no further activity occurs within 30 days. 38 | close-issue-message: > 39 | This issue has been automatically closed due to inactivity. Please feel free to reopen if you feel it is still relevant! 
40 | days-before-issue-stale: 90 41 | days-before-issue-close: 30 42 | stale-pr-label: 'stale' 43 | stale-pr-message: > 44 | This pull request has been automatically marked as stale because it has not had activity within 90 days. 45 | It will be automatically closed if no further activity occurs within 30 days. 46 | close-pr-message: > 47 | This pull request has been automatically closed due to inactivity. Please feel free to reopen if you intend to continue working on it! 48 | days-before-pr-stale: 90 49 | days-before-pr-close: 30 50 | -------------------------------------------------------------------------------- /.github/workflows/actionlint.dockerfile: -------------------------------------------------------------------------------- 1 | # Since dependabot cannot update workflows using docker, 2 | # we use this indirection since dependabot can update this file. 3 | FROM rhysd/actionlint:1.7.7@sha256:887a259a5a534f3c4f36cb02dca341673c6089431057242cdc931e9f133147e9 4 | -------------------------------------------------------------------------------- /.github/workflows/actionlint.yml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | name: Lint GitHub Actions workflows 4 | on: 5 | push: 6 | branches: 7 | - "main" 8 | paths: 9 | - '.github/workflows/*.ya?ml' 10 | - '.github/workflows/actionlint.*' # This workflow 11 | pull_request: 12 | branches: 13 | - "main" 14 | paths: 15 | - '.github/workflows/*.ya?ml' 16 | - '.github/workflows/actionlint.*' # This workflow 17 | 18 | env: 19 | LC_ALL: en_US.UTF-8 20 | 21 | defaults: 22 | run: 23 | shell: bash 24 | 25 | permissions: 26 | contents: read 27 | 28 | jobs: 29 | actionlint: 30 | runs-on: ubuntu-latest 31 | steps: 32 | - name: "Harden Runner" 33 | uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0 34 | with: 35 | egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs 36 | 37 | - name: "Checkout" 38 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 39 | with: 40 | fetch-depth: 0 41 | 42 | - name: "Download actionlint" 43 | run: | 44 | docker build --tag actionlint - < .github/workflows/actionlint.dockerfile 45 | - name: "Check workflow files" 46 | run: | 47 | echo "::add-matcher::.github/workflows/matchers/actionlint.json" 48 | docker run --volume="${PWD}:/repo" --workdir=/repo actionlint -color 49 | -------------------------------------------------------------------------------- /.github/workflows/constraints-update.yml: -------------------------------------------------------------------------------- 1 | name: Update constraints-dev.txt 2 | 3 | on: 4 | schedule: 5 | - cron: '0 3 * * 1' # Every Monday at 03:00 UTC 6 | workflow_dispatch: 7 | 8 | jobs: 9 | update-constraints: 10 | runs-on: ubuntu-latest 11 | permissions: 12 | contents: write 13 | pull-requests: write 14 | 15 | steps: 16 | - name: Checkout repository 17 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 18 | 19 | - name: Checkout "update-constraints" in-house CI action 20 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 21 | with: 22 | repository: instructlab/ci-actions 23 | path: ci-actions 24 | # no tag that includes https://github.com/instructlab/ci-actions/pull/26, yet 25 | ref: 88641ccaf122964eacdc1a82b18bda369b6f99bd # main 26 | sparse-checkout: | 27 | actions/update-constraints 28 | 29 | - name: Update constraints 30 | id: update-constraints 31 | uses: 
./ci-actions/actions/update-constraints 32 | with: 33 | gh-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} 34 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | name: Lint Markdown documents 4 | 5 | on: 6 | push: 7 | branches: 8 | - "main" 9 | paths: 10 | - '**/*.md' 11 | - '.markdownlint-cli2.yaml' 12 | - '.github/workflows/docs.yml' # This workflow 13 | pull_request: 14 | branches: 15 | - "main" 16 | paths: 17 | - '**/*.md' 18 | - '.markdownlint-cli2.yaml' 19 | - '.github/workflows/docs.yml' # This workflow 20 | 21 | env: 22 | LC_ALL: en_US.UTF-8 23 | 24 | defaults: 25 | run: 26 | shell: bash 27 | 28 | permissions: 29 | contents: read 30 | 31 | jobs: 32 | markdown-lint: 33 | runs-on: ubuntu-latest 34 | steps: 35 | - name: "Harden Runner" 36 | uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0 37 | with: 38 | egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs 39 | - name: "Checkout" 40 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 41 | with: 42 | fetch-depth: 0 43 | - name: "Check Markdown documents" 44 | uses: DavidAnson/markdownlint-cli2-action@992badcdf24e3b8eb7e87ff9287fe931bcb00c6e # v20.0.0 45 | with: 46 | globs: '**/*.md' 47 | -------------------------------------------------------------------------------- /.github/workflows/e2e-nvidia-l4-x1.yml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | name: E2E (NVIDIA L4 x1) 4 | 5 | on: 6 | # run against every merge commit to 'main' and release branches 7 | push: 8 | branches: 9 | - main 10 | - release-* 11 | # only run on PRs that touch certain regex paths 12 | pull_request_target: 13 | branches: 14 | - main 15 | - release-* 16 | paths: 17 | # note this should match the merging criteria in 'mergify.yml' 18 | - '**.py' 19 | - 'pyproject.toml' 20 | - 'requirements**.txt' 21 | - 'constraints-dev.txt' 22 | - '.github/workflows/e2e-nvidia-l4-x1.yml' # This workflow 23 | workflow_dispatch: 24 | 25 | concurrency: 26 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 27 | cancel-in-progress: true 28 | 29 | env: 30 | LC_ALL: en_US.UTF-8 31 | TMPDIR: /home/tmp 32 | 33 | defaults: 34 | run: 35 | shell: bash 36 | 37 | permissions: 38 | contents: read 39 | 40 | jobs: 41 | start-medium-ec2-runner: 42 | runs-on: ubuntu-latest 43 | outputs: 44 | label: ${{ steps.start-ec2-runner.outputs.label }} 45 | ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }} 46 | steps: 47 | - name: Configure AWS credentials 48 | uses: aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df # v4.2.1 49 | with: 50 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 51 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 52 | aws-region: ${{ vars.AWS_REGION }} 53 | 54 | - name: Start EC2 runner 55 | id: start-ec2-runner 56 | uses: machulav/ec2-github-runner@fb91019e71385fb10dfcbec812b4de8c61589f7b # v2.4.1 57 | with: 58 | mode: start 59 | github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} 60 | ec2-image-id: ${{ vars.AWS_EC2_AMI }} 61 | ec2-instance-type: g6.8xlarge 62 | subnet-id: subnet-02d230cffd9385bd4 63 | security-group-id: sg-06300447c4a5fbef3 64 | iam-role-name: instructlab-ci-runner 65 | aws-resource-tags: > 66 | [ 67 | 
{"Key": "Name", "Value": "instructlab-ci-github-medium-runner"}, 68 | {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}, 69 | {"Key": "GitHubRef", "Value": "${{ github.ref }}"}, 70 | {"Key": "GitHubPR", "Value": "${{ github.event.number }}"} 71 | ] 72 | 73 | e2e-medium-test: 74 | needs: 75 | - start-medium-ec2-runner 76 | runs-on: ${{ needs.start-medium-ec2-runner.outputs.label }} 77 | 78 | # It is important that this job has no write permissions and has 79 | # no access to any secrets. This part (e2e) is where we are running 80 | # untrusted code from PRs. 81 | permissions: {} 82 | 83 | steps: 84 | - name: Install Packages 85 | run: | 86 | cat /etc/os-release 87 | mkdir -p "${TMPDIR}" 88 | sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel 89 | 90 | - name: Checkout instructlab/instructlab 91 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 92 | with: 93 | repository: "instructlab/instructlab" 94 | path: "instructlab" 95 | fetch-depth: 0 96 | 97 | - name: Checkout instructlab/eval 98 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 99 | with: 100 | repository: "instructlab/eval" 101 | path: "eval" 102 | # https://github.com/actions/checkout/issues/249 103 | fetch-depth: 0 104 | 105 | - name: Fetch and checkout PR 106 | id: fetch_pr 107 | if: github.event_name == 'pull_request_target' 108 | working-directory: ./eval 109 | run: | 110 | git fetch origin pull/${{ github.event.pull_request.number }}/head:pr-${{ github.event.pull_request.number }} 111 | git checkout pr-${{ github.event.pull_request.number }} 112 | 113 | - name: Install ilab 114 | working-directory: ./instructlab 115 | run: | 116 | PYTHON=python3.11 ./scripts/install-ilab-with-cuda.sh 117 | 118 | - name: Update instructlab-eval library 119 | working-directory: ./eval 120 | run: | 121 | . ../instructlab/venv/bin/activate 122 | # Patch out our own pin from the ilab repo constraints file 123 | ilab_constraints=../instructlab/constraints-dev.txt 124 | sed -i '/instructlab-eval==/d' $ilab_constraints 125 | 126 | # Since we reuse the virtual environment prepared using ilab 127 | # constraints, we should stick to the same constraints when 128 | # installing latest eval. 129 | # 130 | # FIX: this is not ideal; a proper fix would require decoupling the 131 | # two repos in CI: either by removing the job completely and relying 132 | # on "sdk" (no ilab) test runs; or by preparing a separate 133 | # constraints file that would consider both the requirements files 134 | # for the eval library AND for the ilab - so that they are 135 | # consistent. 136 | pip_install="pip install -c $ilab_constraints" 137 | $pip_install . 138 | $pip_install .[cuda] 139 | 140 | - name: Run e2e test 141 | working-directory: ./instructlab 142 | run: | 143 | . 
venv/bin/activate 144 | ./scripts/e2e-ci.sh -m 145 | 146 | stop-medium-ec2-runner: 147 | needs: 148 | - start-medium-ec2-runner 149 | - e2e-medium-test 150 | runs-on: ubuntu-latest 151 | if: ${{ always() }} 152 | steps: 153 | - name: Configure AWS credentials 154 | uses: aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df # v4.2.1 155 | with: 156 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 157 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 158 | aws-region: ${{ vars.AWS_REGION }} 159 | - name: Stop EC2 runner 160 | uses: machulav/ec2-github-runner@fb91019e71385fb10dfcbec812b4de8c61589f7b # v2.4.1 161 | with: 162 | mode: stop 163 | github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} 164 | label: ${{ needs.start-medium-ec2-runner.outputs.label }} 165 | ec2-instance-id: ${{ needs.start-medium-ec2-runner.outputs.ec2-instance-id }} 166 | 167 | e2e-medium-workflow-complete: 168 | # we don't want to block PRs on failed EC2 cleanup 169 | # so not requiring "stop-runner" as well 170 | needs: ["start-medium-ec2-runner", "e2e-medium-test"] 171 | runs-on: ubuntu-latest 172 | steps: 173 | - name: E2E Workflow Complete 174 | run: echo "E2E Workflow Complete" 175 | -------------------------------------------------------------------------------- /.github/workflows/e2e-nvidia-l40s-x4.yml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | name: E2E (NVIDIA L40S x4) 4 | 5 | on: 6 | schedule: 7 | - cron: '0 16 * * *' # Runs at 4PM UTC every day 8 | workflow_dispatch: 9 | inputs: 10 | pr_or_branch: 11 | description: 'pull request number or branch name' 12 | required: true 13 | default: 'main' 14 | 15 | env: 16 | TMPDIR: /home/tmp 17 | 18 | jobs: 19 | start-large-ec2-runner: 20 | runs-on: ubuntu-latest 21 | outputs: 22 | label: ${{ steps.start-ec2-runner.outputs.label }} 23 | ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }} 24 | steps: 25 | - name: Configure AWS credentials 26 | uses: aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df # v4.2.1 27 | with: 28 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 29 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 30 | aws-region: ${{ vars.AWS_REGION }} 31 | 32 | - name: Start EC2 runner 33 | id: start-ec2-runner 34 | uses: machulav/ec2-github-runner@fb91019e71385fb10dfcbec812b4de8c61589f7b # v2.4.1 35 | with: 36 | mode: start 37 | github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} 38 | ec2-image-id: ${{ vars.AWS_EC2_AMI }} 39 | ec2-instance-type: g6e.12xlarge 40 | subnet-id: subnet-024298cefa3bedd61 41 | security-group-id: sg-06300447c4a5fbef3 42 | iam-role-name: instructlab-ci-runner 43 | aws-resource-tags: > 44 | [ 45 | {"Key": "Name", "Value": "instructlab-ci-github-large-runner"}, 46 | {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}, 47 | {"Key": "GitHubRef", "Value": "${{ github.ref }}"}, 48 | {"Key": "GitHubPR", "Value": "${{ github.event.number }}"} 49 | ] 50 | 51 | e2e-large-test: 52 | needs: 53 | - start-large-ec2-runner 54 | runs-on: ${{ needs.start-large-ec2-runner.outputs.label }} 55 | 56 | permissions: 57 | pull-requests: write 58 | 59 | steps: 60 | - name: Install Packages 61 | run: | 62 | cat /etc/os-release 63 | mkdir -p "${TMPDIR}" 64 | sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel 65 | 66 | - name: Checkout instructlab/instructlab 67 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 
68 | with: 69 | repository: "instructlab/instructlab" 70 | path: "instructlab" 71 | # https://github.com/actions/checkout/issues/249 72 | fetch-depth: 0 73 | 74 | - name: Checkout instructlab/eval 75 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 76 | with: 77 | repository: "instructlab/eval" 78 | path: "eval" 79 | # https://github.com/actions/checkout/issues/249 80 | fetch-depth: 0 81 | 82 | - name: Determine if pr_or_branch is a PR number 83 | id: check_pr 84 | run: | 85 | PR_OR_BRANCH=${{ github.event.inputs.pr_or_branch || 'main' }} # Default to 'main' if not set 86 | if [[ "$PR_OR_BRANCH" =~ ^[0-9]+$ ]]; then 87 | echo "is_pr=true" >> "$GITHUB_OUTPUT" 88 | else 89 | echo "is_pr=false" >> "$GITHUB_OUTPUT" 90 | fi 91 | echo "pr_or_branch=$PR_OR_BRANCH" >> "$GITHUB_OUTPUT" 92 | 93 | - name: Check if gh cli is installed 94 | id: gh_cli 95 | run: | 96 | if command -v gh &> /dev/null ; then 97 | echo "gh_cli_installed=true" >> "$GITHUB_OUTPUT" 98 | else 99 | echo "gh_cli_installed=false" >> "$GITHUB_OUTPUT" 100 | fi 101 | 102 | - name: Install gh CLI 103 | if: steps.gh_cli.outputs.gh_cli_installed == 'false' 104 | run: | 105 | sudo dnf install 'dnf-command(config-manager)' -y 106 | sudo dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo 107 | sudo dnf install gh --repo gh-cli -y 108 | 109 | - name: test gh CLI 110 | run: | 111 | gh --version 112 | 113 | - name: set default repo 114 | working-directory: ./eval 115 | run: | 116 | gh repo set-default ${{ github.server_url }}/${{ github.repository }} 117 | env: 118 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 119 | 120 | - name: Add comment to PR 121 | if: steps.check_pr.outputs.is_pr == 'true' 122 | working-directory: ./eval 123 | run: | 124 | gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "${{ github.workflow }} workflow launched on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})" 125 | env: 126 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 127 | 128 | - name: Fetch and checkout PR 129 | if: steps.check_pr.outputs.is_pr == 'true' 130 | working-directory: ./eval 131 | run: | 132 | gh pr checkout ${{ steps.check_pr.outputs.pr_or_branch }} 133 | env: 134 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 135 | 136 | - name: Checkout branch 137 | if: steps.check_pr.outputs.is_pr == 'false' 138 | working-directory: ./eval 139 | run: | 140 | git checkout ${{ steps.check_pr.outputs.pr_or_branch }} 141 | 142 | - name: Install ilab 143 | working-directory: ./instructlab 144 | run: | 145 | PYTHON=python3.11 ./scripts/install-ilab-with-cuda.sh 146 | 147 | - name: Update instructlab-eval library 148 | working-directory: ./eval 149 | run: | 150 | . ../instructlab/venv/bin/activate 151 | # Patch out our own pin from the ilab repo constraints file 152 | ilab_constraints=../instructlab/constraints-dev.txt 153 | sed -i '/instructlab-eval==/d' $ilab_constraints 154 | 155 | # Since we reuse the virtual environment prepared using ilab 156 | # constraints, we should stick to the same constraints when 157 | # installing latest eval. 158 | # 159 | # FIX: this is not ideal; a proper fix would require decoupling the 160 | # two repos in CI: either by removing the job completely and relying 161 | # on "sdk" (no ilab) test runs; or by preparing a separate 162 | # constraints file that would consider both the requirements files 163 | # for the eval library AND for the ilab - so that they are 164 | # consistent. 
165 | pip_install="pip install -c $ilab_constraints" 166 | $pip_install . 167 | $pip_install .[cuda] 168 | 169 | - name: Check disk before tests 170 | run: | 171 | df -h 172 | 173 | - name: Run e2e test 174 | working-directory: ./instructlab 175 | env: 176 | HF_TOKEN: ${{ secrets.HF_TOKEN }} 177 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 178 | run: | 179 | . venv/bin/activate 180 | ./scripts/e2e-ci.sh -l 181 | 182 | - name: Check disk after tests 183 | run: | 184 | df -h 185 | 186 | - name: Add comment to PR if the workflow failed 187 | if: failure() && steps.check_pr.outputs.is_pr == 'true' 188 | working-directory: ./eval 189 | run: | 190 | gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow failed on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), please investigate." 191 | env: 192 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 193 | 194 | - name: Add comment to PR if the workflow succeeded 195 | if: success() && steps.check_pr.outputs.is_pr == 'true' 196 | working-directory: ./eval 197 | run: | 198 | gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow succeeded on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), congrats!" 199 | env: 200 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 201 | 202 | - name: Send Discord notification for failure 203 | if: failure() && steps.check_pr.outputs.is_pr == 'false' 204 | uses: sarisia/actions-status-discord@5ddd3b114a98457dd80a39b2f00b6a998cd69008 # v1.15.3 205 | with: 206 | webhook: ${{ secrets.SON_OF_JEEVES_DISCORD_WEBHOOK }} 207 | status: ${{ job.status }} 208 | title: "e2e-nvidia-l40s-x4" 209 | description: | 210 | Job in **${{ github.repository }}** running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed **with failures** ❌ 211 | Click [here](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) to view details. 212 | color: 0xCB2431 # Red color for failure 213 | 214 | - name: Send Discord notification for success 215 | if: success() && steps.check_pr.outputs.is_pr == 'false' 216 | uses: sarisia/actions-status-discord@5ddd3b114a98457dd80a39b2f00b6a998cd69008 # v1.15.3 217 | with: 218 | webhook: ${{ secrets.SON_OF_JEEVES_DISCORD_WEBHOOK }} 219 | status: ${{ job.status }} 220 | title: "e2e-nvidia-l40s-x4" 221 | description: | 222 | Job in **${{ github.repository }}** running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed **successfully** ✅ 223 | Click [here](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) to view details. 
224 | color: 0x28A745 # Green color for success 225 | 226 | stop-large-ec2-runner: 227 | needs: 228 | - start-large-ec2-runner 229 | - e2e-large-test 230 | runs-on: ubuntu-latest 231 | if: ${{ always() }} 232 | steps: 233 | - name: Configure AWS credentials 234 | uses: aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df # v4.2.1 235 | with: 236 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 237 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 238 | aws-region: ${{ vars.AWS_REGION }} 239 | 240 | - name: Stop EC2 runner 241 | uses: machulav/ec2-github-runner@fb91019e71385fb10dfcbec812b4de8c61589f7b # v2.4.1 242 | with: 243 | mode: stop 244 | github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} 245 | label: ${{ needs.start-large-ec2-runner.outputs.label }} 246 | ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }} 247 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | name: Lint, Format, and MyPy 4 | 5 | on: 6 | push: 7 | branches: 8 | - "main" 9 | - "release-**" 10 | paths: 11 | - '**.py' 12 | - 'pyproject.toml' 13 | - 'requirements*.txt' 14 | - 'constraints-dev.txt' 15 | - 'tox.ini' 16 | - '.pylintrc' 17 | - 'scripts/*.sh' # Used by this workflow 18 | - '.github/workflows/lint.yml' # This workflow 19 | pull_request: 20 | branches: 21 | - "main" 22 | - "release-**" 23 | paths: 24 | - '**.py' 25 | - 'pyproject.toml' 26 | - 'requirements*.txt' 27 | - 'constraints-dev.txt' 28 | - 'tox.ini' 29 | - '.pylintrc' 30 | - 'scripts/*.sh' # Used by this workflow 31 | - '.github/workflows/lint.yml' # This workflow 32 | 33 | env: 34 | LC_ALL: en_US.UTF-8 35 | 36 | defaults: 37 | run: 38 | shell: bash 39 | 40 | permissions: 41 | contents: read 42 | 43 | jobs: 44 | lint: 45 | runs-on: ubuntu-latest 46 | name: "${{ matrix.lint.name }}" 47 | strategy: 48 | fail-fast: false 49 | matrix: 50 | lint: 51 | - name: "ruff" 52 | commands: | 53 | tox -e ruff -- check 54 | - name: "pylint" 55 | commands: | 56 | echo "::add-matcher::.github/workflows/matchers/pylint.json" 57 | tox -e lint 58 | - name: "mypy" 59 | commands: | 60 | tox -e mypy 61 | steps: 62 | - name: "Harden Runner" 63 | uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0 64 | with: 65 | egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs 66 | 67 | - name: "Checkout" 68 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 69 | with: 70 | # https://github.com/actions/checkout/issues/249 71 | fetch-depth: 0 72 | 73 | - name: Free disk space 74 | uses: ./.github/actions/free-disk-space 75 | 76 | - name: Setup Python 3.11 77 | uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 78 | with: 79 | python-version: 3.11 80 | cache: pip 81 | cache-dependency-path: | 82 | **/pyproject.toml 83 | **/requirements*.txt 84 | 85 | - name: Install tox 86 | run: | 87 | pip_install="python -m pip install -c constraints-dev.txt" 88 | $pip_install --upgrade pip 89 | $pip_install tox tox-gh 90 | 91 | - name: "${{ matrix.lint.name }}" 92 | run: | 93 | ${{ matrix.lint.commands }} 94 | env: 95 | RUFF_OUTPUT_FORMAT: github 96 | 97 | lint-workflow-complete: 98 | needs: ["lint"] 99 | runs-on: ubuntu-latest 100 | steps: 101 | - name: Lint Workflow Complete 102 | run: echo "Lint Workflow Complete" 103 | 
-------------------------------------------------------------------------------- /.github/workflows/matchers/actionlint.json: -------------------------------------------------------------------------------- 1 | { 2 | "problemMatcher": [ 3 | { 4 | "owner": "actionlint", 5 | "pattern": [ 6 | { 7 | "regexp": "^(?:\\x1b\\[\\d+m)?(.+?)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*: (?:\\x1b\\[\\d+m)*(.+?)(?:\\x1b\\[\\d+m)* \\[(.+?)\\]$", 8 | "file": 1, 9 | "line": 2, 10 | "column": 3, 11 | "message": 4, 12 | "code": 5 13 | } 14 | ] 15 | } 16 | ] 17 | } 18 | -------------------------------------------------------------------------------- /.github/workflows/matchers/pylint.json: -------------------------------------------------------------------------------- 1 | { 2 | "problemMatcher": [ 3 | { 4 | "owner": "pylint-error", 5 | "severity": "error", 6 | "pattern": [ 7 | { 8 | "regexp": "^(.+):(\\d+):(\\d+):\\s(([EF]\\d{4}):\\s.+)$", 9 | "file": 1, 10 | "line": 2, 11 | "column": 3, 12 | "message": 4, 13 | "code": 5 14 | } 15 | ] 16 | }, 17 | { 18 | "owner": "pylint-warning", 19 | "severity": "warning", 20 | "pattern": [ 21 | { 22 | "regexp": "^(.+):(\\d+):(\\d+):\\s(([CRW]\\d{4}):\\s.+)$", 23 | "file": 1, 24 | "line": 2, 25 | "column": 3, 26 | "message": 4, 27 | "code": 5 28 | } 29 | ] 30 | } 31 | ] 32 | } 33 | -------------------------------------------------------------------------------- /.github/workflows/pypi.yaml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | name: Build, test, and upload PyPI package 4 | 5 | on: 6 | push: 7 | branches: 8 | - "main" 9 | - "release-**" 10 | tags: 11 | - "v*" 12 | pull_request: 13 | branches: 14 | - "main" 15 | - "release-**" 16 | release: 17 | types: 18 | - published 19 | 20 | env: 21 | LC_ALL: en_US.UTF-8 22 | 23 | defaults: 24 | run: 25 | shell: bash 26 | 27 | permissions: 28 | contents: read 29 | 30 | jobs: 31 | # Create and verify release artifacts 32 | # - build source dist (tar ball) and wheel 33 | # - validate artifacts with various tools 34 | # - upload artifacts to GHA 35 | build-package: 36 | name: Build and check packages 37 | runs-on: ubuntu-latest 38 | steps: 39 | - name: "Harden Runner" 40 | uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0 41 | with: 42 | egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs 43 | 44 | - name: "Checkout" 45 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 46 | with: 47 | # for setuptools-scm 48 | fetch-depth: 0 49 | 50 | - name: "Build and Inspect" 51 | uses: hynek/build-and-inspect-python-package@b5076c307dc91924a82ad150cdd1533b444d3310 # v2.12.0 52 | 53 | # push to Test PyPI on 54 | # - a new GitHub release is published 55 | # - a PR is merged into main branch 56 | publish-test-pypi: 57 | name: Publish packages to test.pypi.org 58 | # environment: publish-test-pypi 59 | if: ${{ (github.repository_owner == 'instructlab') && ((github.event.action == 'published') || ((github.event_name == 'push') && (github.ref == 'refs/heads/main'))) }} 60 | permissions: 61 | contents: read 62 | # see https://docs.pypi.org/trusted-publishers/ 63 | id-token: write 64 | runs-on: ubuntu-latest 65 | needs: build-package 66 | 67 | steps: 68 | - name: "Harden Runner" 69 | uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0 70 | with: 71 | egress-policy: audit # TODO: change to 
'egress-policy: block' after couple of runs 72 | 73 | - name: "Download build artifacts" 74 | uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 75 | with: 76 | name: Packages 77 | path: dist 78 | 79 | - name: "Upload to Test PyPI" 80 | uses: pypa/gh-action-pypi-publish@76f52bc884231f62b9a034ebfe128415bbaabdfc # v1.12.4 81 | with: 82 | repository-url: https://test.pypi.org/legacy/ 83 | 84 | # push to Production PyPI on 85 | # - a new GitHub release is published 86 | publish-pypi: 87 | name: Publish release to pypi.org 88 | # environment: publish-pypi 89 | if: ${{ (github.repository_owner == 'instructlab') && (github.event.action == 'published') }} 90 | permissions: 91 | # see https://docs.pypi.org/trusted-publishers/ 92 | id-token: write 93 | # allow gh release upload 94 | contents: write 95 | 96 | runs-on: ubuntu-latest 97 | needs: build-package 98 | 99 | steps: 100 | - name: "Harden Runner" 101 | uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0 102 | with: 103 | egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs 104 | 105 | - name: "Download build artifacts" 106 | uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 107 | with: 108 | name: Packages 109 | path: dist 110 | 111 | - name: "Sigstore sign package" 112 | uses: sigstore/gh-action-sigstore-python@f514d46b907ebcd5bedc05145c03b69c1edd8b46 # v3.0.0 113 | with: 114 | release-signing-artifacts: false 115 | inputs: | 116 | ./dist/*.tar.gz 117 | ./dist/*.whl 118 | 119 | - name: "Upload artifacts and signatures to GitHub release" 120 | run: | 121 | gh release upload '${{ github.ref_name }}' dist/* --repo '${{ github.repository }}' 122 | env: 123 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 124 | 125 | # PyPI does not accept .sigstore artifacts and 126 | # gh-action-pypi-publish has no option to ignore them. 
127 | - name: "Remove sigstore signatures before uploading to PyPI" 128 | run: | 129 | rm ./dist/*.sigstore.json 130 | 131 | - name: "Upload to PyPI" 132 | uses: pypa/gh-action-pypi-publish@76f52bc884231f62b9a034ebfe128415bbaabdfc # v1.12.4 133 | -------------------------------------------------------------------------------- /.github/workflows/spellcheck.yml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | name: Spellcheck 4 | 5 | on: 6 | push: 7 | branches: 8 | - "main" 9 | paths: 10 | - '**.md' 11 | - '.github/workflows/spellcheck.yml' # This workflow 12 | pull_request: 13 | branches: 14 | - "main" 15 | paths: 16 | - '**.md' 17 | - '.github/workflows/spellcheck.yml' # This workflow 18 | 19 | env: 20 | LC_ALL: en_US.UTF-8 21 | 22 | defaults: 23 | run: 24 | shell: bash 25 | 26 | permissions: 27 | contents: read 28 | 29 | jobs: 30 | spellcheck: 31 | runs-on: ubuntu-latest 32 | steps: 33 | - name: "Harden Runner" 34 | uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0 35 | with: 36 | egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs 37 | 38 | - name: "Checkout" 39 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 40 | with: 41 | fetch-depth: 0 42 | 43 | - name: Spellcheck 44 | uses: rojopolis/spellcheck-github-actions@584b2ae95998967a53af7fbfb7f5b15352c38748 # v0.49.0 45 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | name: Test 4 | 5 | on: 6 | workflow_dispatch: 7 | push: 8 | branches: 9 | - "main" 10 | - "release-**" 11 | paths: 12 | - '**.py' 13 | - 'pyproject.toml' 14 | - 'requirements**.txt' 15 | - 'constraints-dev.txt' 16 | - 'tox.ini' 17 | - 'scripts/*.sh' # Used by this workflow 18 | - '.github/workflows/test.yml' # This workflow 19 | pull_request: 20 | branches: 21 | - "main" 22 | - "release-**" 23 | paths: 24 | - '**.py' 25 | - 'pyproject.toml' 26 | - 'requirements**.txt' 27 | - 'constraints-dev.txt' 28 | - 'tox.ini' 29 | - 'scripts/*.sh' # Used by this workflow 30 | - '.github/workflows/test.yml' # This workflow 31 | 32 | env: 33 | LC_ALL: en_US.UTF-8 34 | 35 | defaults: 36 | run: 37 | shell: bash 38 | 39 | permissions: 40 | contents: read 41 | 42 | jobs: 43 | test: 44 | name: "test: ${{ matrix.python }} on ${{ matrix.platform }}" 45 | runs-on: "${{ matrix.platform }}" 46 | strategy: 47 | fail-fast: false 48 | matrix: 49 | python: 50 | - "3.11" 51 | platform: 52 | - "ubuntu-latest" 53 | include: 54 | - python: "3.11" 55 | platform: "macos-latest" 56 | steps: 57 | - name: "Harden Runner" 58 | uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0 59 | with: 60 | egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs 61 | 62 | - name: Checkout 63 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 64 | with: 65 | # https://github.com/actions/checkout/issues/249 66 | fetch-depth: 0 67 | 68 | - name: Free disk space 69 | if: matrix.platform != 'macos-latest' 70 | uses: ./.github/actions/free-disk-space 71 | 72 | - name: Install the expect package 73 | if: startsWith(matrix.platform, 'ubuntu') 74 | run: | 75 | sudo apt-get install -y expect 76 | 77 | - name: Install tools on MacOS 78 | if: startsWith(matrix.platform, 'macos') 79 | run: | 80 | 
brew install expect coreutils bash 81 | 82 | - name: Setup Python ${{ matrix.python }} 83 | uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 84 | with: 85 | python-version: ${{ matrix.python }} 86 | cache: pip 87 | cache-dependency-path: | 88 | **/pyproject.toml 89 | **/requirements*.txt 90 | 91 | - name: Remove llama-cpp-python from cache 92 | run: | 93 | pip cache remove llama_cpp_python 94 | 95 | - name: Cache huggingface 96 | uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 97 | with: 98 | path: ~/.cache/huggingface 99 | # config contains DEFAULT_MODEL 100 | key: huggingface-${{ hashFiles('src/instructlab/configuration.py') }} 101 | 102 | - name: Install dependencies 103 | run: | 104 | pip_install="python -m pip install -c constraints-dev.txt" 105 | $pip_install --upgrade pip 106 | $pip_install tox tox-gh>=1.2 107 | 108 | - name: Run unit and functional tests with tox 109 | run: | 110 | tox 111 | 112 | - name: Remove llama-cpp-python from cache 113 | if: always() 114 | run: | 115 | pip cache remove llama_cpp_python 116 | 117 | test-workflow-complete: 118 | needs: ["test"] 119 | runs-on: ubuntu-latest 120 | steps: 121 | - name: Test Workflow Complete 122 | run: echo "Test Workflow Complete" 123 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Auto generated 2 | src/instructlab/eval/_version.py 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | coverage-py3-* 52 | coverage 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | cover/ 58 | durations/* 59 | 60 | # Functional tests 61 | mt_bench_branch_generator/* 62 | eval_output/* 63 | 64 | # Translations 65 | *.mo 66 | *.pot 67 | 68 | # Django stuff: 69 | *.log 70 | local_settings.py 71 | db.sqlite3 72 | db.sqlite3-journal 73 | 74 | # Flask stuff: 75 | instance/ 76 | .webassets-cache 77 | 78 | # Scrapy stuff: 79 | .scrapy 80 | 81 | # Sphinx documentation 82 | docs/_build/ 83 | 84 | # PyBuilder 85 | .pybuilder/ 86 | target/ 87 | 88 | # Jupyter Notebook 89 | .ipynb_checkpoints 90 | 91 | # IPython 92 | profile_default/ 93 | ipython_config.py 94 | 95 | # pyenv 96 | # For a library or package, you might want to ignore these files since the code is 97 | # intended to run in multiple environments; otherwise, check them in: 98 | # .python-version 99 | 100 | # pipenv 101 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
102 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 103 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 104 | # install all needed dependencies. 105 | #Pipfile.lock 106 | 107 | # poetry 108 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 109 | # This is especially recommended for binary packages to ensure reproducibility, and is more 110 | # commonly ignored for libraries. 111 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 112 | #poetry.lock 113 | 114 | # pdm 115 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 116 | #pdm.lock 117 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 118 | # in version control. 119 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 120 | .pdm.toml 121 | .pdm-python 122 | .pdm-build/ 123 | 124 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 125 | __pypackages__/ 126 | 127 | # Celery stuff 128 | celerybeat-schedule 129 | celerybeat.pid 130 | 131 | # SageMath parsed files 132 | *.sage.py 133 | 134 | # Environments 135 | .env 136 | .venv 137 | env/ 138 | venv/ 139 | ENV/ 140 | env.bak/ 141 | venv.bak/ 142 | dictionary.dic 143 | 144 | # Spyder project settings 145 | .spyderproject 146 | .spyproject 147 | 148 | # Rope project settings 149 | .ropeproject 150 | 151 | # mkdocs documentation 152 | /site 153 | 154 | # mypy 155 | .mypy_cache/ 156 | .dmypy.json 157 | dmypy.json 158 | 159 | # Pyre type checker 160 | .pyre/ 161 | 162 | # pytype static type analyzer 163 | .pytype/ 164 | 165 | # Cython debug symbols 166 | cython_debug/ 167 | 168 | # PyCharm 169 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 170 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 171 | # and can be added to the global gitignore or merged into this file. For a more nuclear 172 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
173 | #.idea/ 174 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | profile=black 3 | from_first=true 4 | import_heading_future=Future 5 | import_heading_stdlib=Standard 6 | import_heading_thirdparty=Third Party 7 | import_heading_firstparty=First Party 8 | import_heading_localfolder=Local 9 | known_firstparty= 10 | known_localfolder=tuning 11 | -------------------------------------------------------------------------------- /.markdownlint-cli2.yaml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | config: 4 | line-length: false 5 | no-emphasis-as-header: false 6 | first-line-heading: false 7 | code-block-style: false 8 | no-duplicate-header: false 9 | single-trailing-newline: false 10 | descriptive-link-text: false 11 | globs: 12 | - "**/*.md" 13 | ignores: 14 | - ".github/**" 15 | - "venv/**" 16 | - ".venv/**" 17 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | repos: 4 | - repo: https://github.com/PyCQA/isort 5 | rev: 5.11.5 6 | hooks: 7 | - id: isort 8 | exclude: imports 9 | - repo: https://github.com/astral-sh/ruff-pre-commit 10 | # Ruff version. 11 | rev: v0.3.4 12 | hooks: 13 | # Run the linter (most fixers are disabled for now). 14 | - id: ruff 15 | # Run the formatter. 16 | - id: ruff-format 17 | -------------------------------------------------------------------------------- /.spellcheck-en-custom.txt: -------------------------------------------------------------------------------- 1 | 2 | # make spellcheck-sort 3 | # Please keep this file sorted: 4 | # SPDX-License-Identifier: Apache-2.0 5 | Backport 6 | backported 7 | benchmarking 8 | codebase 9 | cli 10 | dev 11 | dr 12 | eval 13 | gpt 14 | hoc 15 | http 16 | instructlab 17 | jsonl 18 | justfile 19 | MMLU 20 | openai 21 | pre 22 | SDG 23 | Tatsu 24 | tl 25 | TODO 26 | tox 27 | venv 28 | vllm 29 | barebones 30 | LM 31 | -------------------------------------------------------------------------------- /.spellcheck.yml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | matrix: 4 | - name: markdown 5 | aspell: 6 | lang: en 7 | d: en_US 8 | camel-case: true 9 | mode: markdown 10 | sources: 11 | - "**/*.md|!.tox/**|!venv/**" 12 | dictionary: 13 | wordlists: 14 | - .spellcheck-en-custom.txt 15 | pipeline: 16 | - pyspelling.filters.context: 17 | context_visible_first: true 18 | escapes: '\\[\\`~]' 19 | delimiters: 20 | # Ignore multiline content between fences (fences can have 3 or more back ticks) 21 | # ```language 22 | # content 23 | # ``` 24 | - open: '(?s)^(?P *`{3,}).*?$' 25 | close: '^(?P=open)$' 26 | # Ignore text between inline back ticks 27 | - open: '(?P`+)' 28 | close: '(?P=open)' 29 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## 0.5.0 2 | 3 | * Introduces Ragas as a supported evaluation framework. This integration only supports the `RubricsScore` metric and OpenAI models. 
Users can pass in either a dataset with a pre-computed `user_input`, `reference` and `response` fields or they can provide a dataset containing `user_input` and `reference` along with information about a model endpoint that will be used for computing the `response` field. 4 | 5 | ## 0.4.2 6 | 7 | * Adds the ability to provide a custom system prompt to the MMLU-based evaluators. When a system prompt is provided, LM-eval applies the chat template under the hood, else it will pass the model a barebones prompt. 8 | * Adds an `extra_args` parameter to the `.run` method of all MMLU-based evaluators. This way, consumers are able to directly pass any additional arguments they want through to the `lm_eval.evaluators.simple_evaluate` function. 9 | 10 | ## 0.4 11 | 12 | * Added ability to specify a custom http client to MT-Bench 13 | 14 | ## v0.2 15 | -------------------------------------------------------------------------------- /DCO.txt: -------------------------------------------------------------------------------- 1 | Developer Certificate of Origin 2 | Version 1.1 3 | 4 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 5 | 6 | Everyone is permitted to copy and distribute verbatim copies of this 7 | license document, but changing it is not allowed. 8 | 9 | 10 | Developer's Certificate of Origin 1.1 11 | 12 | By making a contribution to this project, I certify that: 13 | 14 | (a) The contribution was created in whole or in part by me and I 15 | have the right to submit it under the open source license 16 | indicated in the file; or 17 | 18 | (b) The contribution is based upon previous work that, to the best 19 | of my knowledge, is covered under an appropriate open source 20 | license and I have the right under that license to submit that 21 | work with modifications, whether created in whole or in part 22 | by me, under the same open source license (unless I am 23 | permitted to submit under a different license), as indicated 24 | in the file; or 25 | 26 | (c) The contribution was provided directly to me by some other 27 | person who certified (a), (b) or (c) and I have not modified 28 | it. 29 | 30 | (d) I understand and agree that this project and the contribution 31 | are public and that a record of the contribution (including all 32 | personal information I submit with it, including my sign-off) is 33 | maintained indefinitely and may be redistributed consistent with 34 | this project or the open source license(s) involved. 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | # 4 | # If you want to see the full commands, run: 5 | # NOISY_BUILD=y make 6 | # 7 | ifeq ($(NOISY_BUILD),) 8 | ECHO_PREFIX=@ 9 | CMD_PREFIX=@ 10 | PIPE_DEV_NULL=> /dev/null 2> /dev/null 11 | else 12 | ECHO_PREFIX=@\# 13 | CMD_PREFIX= 14 | PIPE_DEV_NULL= 15 | endif 16 | 17 | .PHONY: help 18 | help: 19 | @awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf " \033[36m%-18s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST) 20 | 21 | .PHONY: action-lint actionlint 22 | action-lint: actionlint 23 | actionlint: ## Lint GitHub Action workflows 24 | $(ECHO_PREFIX) printf " %-12s .github/...\n" "[ACTION LINT]" 25 | $(CMD_PREFIX) if ! command -v actionlint $(PIPE_DEV_NULL) ; then \ 26 | echo "Please install actionlint." ; \ 27 | echo "go install github.com/rhysd/actionlint/cmd/actionlint@latest" ; \ 28 | exit 1 ; \ 29 | fi 30 | $(CMD_PREFIX) if ! command -v shellcheck $(PIPE_DEV_NULL) ; then \ 31 | echo "Please install shellcheck." ; \ 32 | echo "https://github.com/koalaman/shellcheck#user-content-installing" ; \ 33 | exit 1 ; \ 34 | fi 35 | $(CMD_PREFIX) actionlint -color 36 | 37 | .PHONY: check-tox 38 | check-tox: 39 | @command -v tox &> /dev/null || (echo "'tox' is not installed" && exit 1) 40 | 41 | .PHONY: md-lint 42 | md-lint: ## Lint markdown files 43 | $(ECHO_PREFIX) printf " %-12s ./...\n" "[MD LINT]" 44 | $(CMD_PREFIX) podman run --rm -v $(CURDIR):/workdir --security-opt label=disable docker.io/davidanson/markdownlint-cli2:latest > /dev/null 45 | 46 | .PHONY: spellcheck 47 | spellcheck: ## Spellcheck markdown files 48 | tox p -e spellcheck 49 | 50 | .PHONY: spellcheck-sort 51 | spellcheck-sort: .spellcheck-en-custom.txt ## Sort spellcheck directory 52 | sort -d -f -o $< $< 53 | 54 | .PHONY: verify 55 | verify: check-tox ## Run linting, typing, and formatting checks via tox 56 | tox p -e fastlint,mypy,ruff 57 | 58 | ##@ Development 59 | 60 | .PHONY: tests 61 | tests: check-tox ## Run unit and type checks 62 | tox -e py3-unit,mypy 63 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # eval 2 | 3 | ![Lint](https://github.com/instructlab/eval/actions/workflows/lint.yml/badge.svg?branch=main) 4 | ![Tests](https://github.com/instructlab/eval/actions/workflows/test.yml/badge.svg?branch=main) 5 | ![Build](https://github.com/instructlab/eval/actions/workflows/pypi.yaml/badge.svg?branch=main) 6 | ![Release](https://img.shields.io/github/v/release/instructlab/eval) 7 | ![License](https://img.shields.io/github/license/instructlab/eval) 8 | 9 | ![`e2e-nvidia-l4-x1.yml` on `main`](https://github.com/instructlab/eval/actions/workflows/e2e-nvidia-l4-x1.yml/badge.svg?branch=main) 10 | ![`e2e-nvidia-l40s-x4.yml` on 
`main`](https://github.com/instructlab/eval/actions/workflows/e2e-nvidia-l40s-x4.yml/badge.svg?branch=main) 11 | 12 | Python Library for Evaluation 13 | 14 | ## What is Evaluation? 15 | 16 | Evaluation allows us to assess how a given model is performing against a set of specific tasks. This is done by running a set of standardized benchmark tests against 17 | the model. Running evaluation produces numerical scores across these various benchmarks, as well as logs excerpts/samples of the outputs the model produced during these 18 | benchmarks. Using a combination of these artifacts as reference, along with a manual smoke test, allows us to get the best idea about whether or not a model has learned 19 | and improved on something we are trying to teach it. There are 2 stages of model evaluation in the InstructLab process: 20 | 21 | ### Inter-checkpoint Evaluation 22 | 23 | This step occurs during multi-phase training. Each phase of training produces multiple different “checkpoints” of the model that are taken at various stages during 24 | the phase. At the end of each phase, we evaluate all the checkpoints in order to find the one that provides the best results. This is done as part of the 25 | [InstructLab Training](https://github.com/instructlab/training) library. 26 | 27 | ### Full-scale final Evaluation 28 | 29 | Once training is complete, and we have picked the best checkpoint from the output of the final phase, we can run full-scale evaluation suite which runs MT-Bench, MMLU, 30 | MT-Bench Branch and MMLU Branch. 31 | 32 | ### Leaderboard Evaluation 33 | 34 | For cases when you want to run the full Open LLM Leaderboard v2 evaluation suite, we provide an optional dependency package for the leaderboard tasks. This includes additional benchmarks like GPQA, IFEVAL, BBH, MMLU-PRO, MUSR, and MATH-HARD. 35 | 36 | To install the optional leaderboard dependencies, use: 37 | 38 | ```bash 39 | pip install instructlab-eval[leaderboard] 40 | ``` 41 | 42 | ## Methods of Evaluation 43 | 44 | Below are more in-depth explanations of the suite of benchmarks we are using as methods for evaluation of models. 45 | 46 | ### Multi-turn benchmark (MT-Bench) 47 | 48 | **tl;dr** Full model evaluation of performance on **skills** 49 | 50 | MT-Bench is a type of benchmarking that involves asking a model 80 multi-turn questions - i.e. 51 | 52 | ```text 53 | 54 | ``` 55 | 56 | A “judge” model reviews the given multi-turn question, the provided model answer, and rate the answer with a score out of 10. The scores are then averaged out 57 | and the final score produced is the “MT-bench score” for that model. This benchmark assumes no factual knowledge on the model’s part. The questions are static, but do not get obsolete with time. 58 | 59 | You can read more about MT-Bench [here](https://arxiv.org/abs/2306.05685) 60 | 61 | ### MT-Bench Branch 62 | 63 | MT-Bench Branch is an adaptation of MT-Bench that is designed to test custom skills that are added to the model with the InstructLab project. These new skills 64 | come in the form of question/answer pairs in a Git branch of the [taxonomy](https://github.com/instructlab/taxonomy). 65 | 66 | MT-Bench Branch uses the user supplied seed questions to have the candidate model generate answers to, which are then judged by the judge model using the user supplied 67 | seed answers as a reference. 
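
As a rough sketch of what this flow looks like in code (mirroring `scripts/test_branch_gen_answers.py` and `scripts/test_branch_judge_answers.py` in this repository), the `MTBenchBranchEvaluator` first has the candidate model generate answers and then judges them against the seed reference answers. The model names, taxonomy path, branch, and endpoint URL below are illustrative placeholders and assume an OpenAI-compatible server (for example vLLM) is already serving the models:

```python
# Illustrative sketch only -- model names, paths, and the endpoint URL are placeholders.
# Assumes an OpenAI-compatible server (e.g. vLLM) is serving the candidate and judge models.
from instructlab.eval.mt_bench import MTBenchBranchEvaluator

evaluator = MTBenchBranchEvaluator(
    "instructlab/granite-7b-lab",  # candidate model being evaluated
    "instructlab/granite-7b-lab",  # judge model
    "../taxonomy",                 # local clone of the taxonomy repository
    "main",                        # taxonomy branch holding the seed question/answer pairs
)

# Have the candidate model answer the seed questions from the taxonomy branch.
evaluator.gen_answers("http://localhost:8000/v1")

# Judge the generated answers against the user-supplied seed answers.
overall_score, qa_pairs, error_rate = evaluator.judge_answers("http://localhost:8000/v1")
print(f"Overall Score: {overall_score}")
print(f"Error Rate: {error_rate}")
```

Each entry in `qa_pairs` carries the per-question score along with the question, answer, category, and originating `qna_file`, which is useful when tracing a low score back to a specific taxonomy entry.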
68 | 69 | ### Massive Multitask Language Understanding (MMLU) 70 | 71 | **tl;dr** Full model evaluation of performance on **knowledge** 72 | 73 | MMLU is a type of benchmarking that involves a series of fact-based multiple choice questions, along with 4 options for answers. It tests if a model is able to interpret 74 | the questions correctly, along the answers, formulate its own answer, then selects the correct option out of the provided ones. The questions are designed as a set 75 | of 57 “tasks”, and each task has a given domain. The domains cover a number of topics ranging from Chemistry and Biology to US History and Math. 76 | 77 | The performance number is then compared against the set of known correct answers for each question to determine how many the model got right. The final MMLU score is the 78 | average of its scores. This benchmark does not involve any reference/critic model, and is a completely objective benchmark. This benchmark does assume factual knowledge 79 | on the model’s part. The questions are static, therefore MMLU cannot be used to gauge the model’s knowledge on more recent topics. 80 | 81 | InstructLab uses an implementation found [here](https://github.com/EleutherAI/lm-evaluation-harness) for running MMLU. 82 | 83 | You can read more about MMLU [here](https://arxiv.org/abs/2306.05685) 84 | 85 | ### MMLU Branch 86 | 87 | MMLU Branch is an adaptation of MMLU that is designed to test custom knowledge that is being added to the model via a Git branch of the [taxonomy](https://github.com/instructlab/taxonomy). 88 | 89 | A teacher model is used to generate new multiple choice questions based on the knowledge document included in the taxonomy Git branch. A “task” is then constructed that references the newly generated answer choices. These tasks are then used to score the model’s grasp on new knowledge the same way MMLU works. Generation of these tasks are done as part of the [InstructLab SDG](https://github.com/instructlab/sdg) library. 90 | 91 | ## Development 92 | 93 | > **⚠️ Note:** Must use Python version 3.11 or later. 94 | 95 | ### Set up your dev environment 96 | 97 | The following tools are required: 98 | 99 | - [`git`](https://git-scm.com) 100 | - [`python`](https://www.python.org) (v3.11) 101 | - [`pip`](https://pypi.org/project/pip/) (v23.0+) 102 | - [`bash`](https://www.gnu.org/software/bash/) (v5+, for functional tests) 103 | 104 | #### Optional: Use [cloud-instance.sh](https://github.com/instructlab/instructlab/tree/main/scripts/infra) to launch and setup an instance 105 | 106 | ```shell 107 | scripts/infra/cloud-instance.sh ec2 launch -t g6.2xlarge 108 | scripts/infra/cloud-instance.sh ec2 setup-rh-devenv 109 | scripts/infra/cloud-instance.sh ec2 install-rh-nvidia-drivers 110 | scripts/infra/cloud-instance.sh ec2 ssh sudo reboot 111 | scripts/infra/cloud-instance.sh ec2 ssh 112 | ``` 113 | 114 | #### Regardless of how you setup your instance 115 | 116 | ```shell 117 | git clone https://github.com/instructlab/taxonomy.git && pushd taxonomy && git branch rc && popd 118 | git clone --bare https://github.com/instructlab/eval.git && git clone eval.git/ && cd eval && git remote add syncrepo ../eval.git 119 | python3 -m venv venv 120 | source venv/bin/activate 121 | pip install -r requirements.txt 122 | pip install -r requirements-dev.txt 123 | pip install -e . 124 | pip install vllm 125 | ``` 126 | 127 | ### Testing 128 | 129 | Before pushing changes to GitHub, you need to run the tests as shown below. 
They can be run individually as shown in each sub-section 130 | or can be run with the one command: 131 | 132 | ```shell 133 | tox 134 | ``` 135 | 136 | #### Unit tests 137 | 138 | Unit tests are enforced by the CI system using [`pytest`](https://docs.pytest.org/). When making changes, run these tests before pushing the changes to avoid CI issues. 139 | 140 | Running unit tests can be done with: 141 | 142 | ```shell 143 | tox -e py3-unit 144 | ``` 145 | 146 | By default, all tests found within the `tests` directory are run. However, specific unit tests can run by passing filenames, classes and/or methods to `pytest` using tox positional arguments. The following example invokes a single test method `test_mt_bench` that is declared in the `tests/test_mt_bench.py` file: 147 | 148 | ```shell 149 | tox -e py3-unit -- tests/test_mt_bench.py::test_mt_bench 150 | ``` 151 | 152 | #### Functional tests 153 | 154 | Functional tests are enforced by the CI system. When making changes, run the tests before pushing the changes to avoid CI issues. 155 | 156 | Running functional tests can be done with: 157 | 158 | ```shell 159 | tox -e py3-functional 160 | ``` 161 | 162 | #### Coding style 163 | 164 | Cli follows the python [`pep8`](https://peps.python.org/pep-0008/) coding style. The coding style is enforced by the CI system, and your PR will fail until the style has been applied correctly. 165 | 166 | We use [pre-commit](https://pre-commit.com/) to enforce coding style using [`black`](https://github.com/psf/black), and [`isort`](https://pycqa.github.io/isort/). 167 | 168 | You can invoke formatting with: 169 | 170 | ```shell 171 | tox -e ruff 172 | ``` 173 | 174 | In addition, we use [`pylint`](https://www.pylint.org) to perform static code analysis of the code. 175 | 176 | You can invoke the linting with the following command 177 | 178 | ```shell 179 | tox -e lint 180 | ``` 181 | 182 | ### MT-Bench / MT-Bench Branch Example Usage 183 | 184 | Launch vllm serving granite-7b-lab 185 | 186 | ```shell 187 | python -m vllm.entrypoints.openai.api_server --model instructlab/granite-7b-lab --tensor-parallel-size 1 188 | ``` 189 | 190 | In another shell window 191 | 192 | ```shell 193 | export INSTRUCTLAB_EVAL_FIRST_N_QUESTIONS=10 # Optional if you want to shorten run times 194 | # Commands relative to eval directory 195 | python3 scripts/test_gen_answers.py 196 | python3 scripts/test_branch_gen_answers.py 197 | ``` 198 | 199 | Example output tree 200 | 201 | ```shell 202 | eval_output/ 203 | ├── mt_bench 204 | │   └── model_answer 205 | │   └── instructlab 206 | │   └── granite-7b-lab.jsonl 207 | └── mt_bench_branch 208 | ├── main 209 | │   ├── model_answer 210 | │   │   └── instructlab 211 | │   │   └── granite-7b-lab.jsonl 212 | │   ├── question.jsonl 213 | │   └── reference_answer 214 | │   └── instructlab 215 | │   └── granite-7b-lab.jsonl 216 | └── rc 217 | ├── model_answer 218 | │   └── instructlab 219 | │   └── granite-7b-lab.jsonl 220 | ├── question.jsonl 221 | └── reference_answer 222 | └── instructlab 223 | └── granite-7b-lab.jsonl 224 | ``` 225 | 226 | ```shell 227 | python3 scripts/test_judge_answers.py 228 | python3 scripts/test_branch_judge_answers.py 229 | ``` 230 | 231 | Example output tree 232 | 233 | ```shell 234 | eval_output/ 235 | ├── mt_bench 236 | │   ├── model_answer 237 | │   │   └── instructlab 238 | │   │   └── granite-7b-lab.jsonl 239 | │   └── model_judgment 240 | │   └── instructlab 241 | │   └── granite-7b-lab_single.jsonl 242 | └── mt_bench_branch 243 | ├── main 244 | │   ├── 
model_answer 245 | │   │   └── instructlab 246 | │   │   └── granite-7b-lab.jsonl 247 | │   ├── model_judgment 248 | │   │   └── instructlab 249 | │   │   └── granite-7b-lab_single.jsonl 250 | │   ├── question.jsonl 251 | │   └── reference_answer 252 | │   └── instructlab 253 | │   └── granite-7b-lab.jsonl 254 | └── rc 255 | ├── model_answer 256 | │   └── instructlab 257 | │   └── granite-7b-lab.jsonl 258 | ├── model_judgment 259 | │   └── instructlab 260 | │   └── granite-7b-lab_single.jsonl 261 | ├── question.jsonl 262 | └── reference_answer 263 | └── instructlab 264 | └── granite-7b-lab.jsonl 265 | ``` 266 | 267 | ## Developer Certificate of Origin 268 | 269 | When you make a contribution to InstructLab eval, you implicitly agree to the Developer Certificate of Origin terms as set in `DCO.txt` at the root of this repository. 270 | -------------------------------------------------------------------------------- /constraints-dev.txt.in: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/instructlab/eval/11683ac841df1df6ee65ea6478d1066bb744833d/constraints-dev.txt.in -------------------------------------------------------------------------------- /docs/ci.md: -------------------------------------------------------------------------------- 1 | # CI for InstructLab Eval 2 | 3 | Before running any testing locally, ensure you have run `pip install -r requirements-dev.txt` in your environment. 4 | 5 | ## Unit tests 6 | 7 | Unit tests are designed to test specific Eval components or features in isolation. Generally, new code should be adding or modifying unit tests. 8 | 9 | All unit tests currently live in the `tests/` directory and are run with [pytest](https://docs.pytest.org/) via [tox](https://tox.wiki/). 10 | 11 | To run the unit tests, you can run `tox -e unit` or `tox -e unitcov` if you want to generate coverage metrics as well. 12 | 13 | In CI, the tests are run with Python 3.11 on Ubuntu and MacOS runners - you can see the details [here](https://github.com/instructlab/eval/blob/main/.github/workflows/test.yml) 14 | 15 | ## Functional tests 16 | 17 | Functional tests are designed to test Eval components or features in tandem, but not necessarily as part of a complex workflow. New code may or may not need a functional test but should strive to implement one if possible. 18 | 19 | The functional test script is Shell-based and can be found at `scripts/functional-tests.sh`. 20 | 21 | To run the functional tests, you can run `tox -e functional`. 22 | 23 | In CI, the tests are run with Python 3.11 on Ubuntu and MacOS runners - you can see the details [here](https://github.com/instructlab/eval/blob/main/.github/workflows/test.yml) 24 | 25 | ## End-to-end (E2E) tests 26 | 27 | InstructLab Eval has several end-to-end jobs that run to ensure compatibility with the [InstructLab Core](https://github.com/instructlab/instructlab) project. 28 | You can see details about the types of jobs being run in the matrix below. 29 | 30 | For more details about the E2E scripts themselves, see [the InstructLab Core documentation](https://github.com/instructlab/instructlab/blob/main/docs/maintainers/ci.md#end-to-end-e2e-tests). 31 | 32 | ### Current E2E Jobs 33 | 34 | | Name | T-Shirt Size | Runner Host | Instance Type | OS | GPU Type | Script | Flags | Runs when? | Slack/Discord reporting? 
| 35 | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | 36 | | [`e2e-nvidia-l4-x1.yml`](https://github.com/instructlab/sdg/blob/main/.github/workflows/e2e-nvidia-l4-x1.yml) | Medium | AWS |[`g6.8xlarge`](https://aws.amazon.com/ec2/instance-types/g5/) | CentOS Stream 9 | 1 x NVIDIA L4 w/ 24 GB VRAM | `e2e-ci.sh` | `m` | Pull Requests, Push to `main` or `release-*` branch | No | 37 | | [`e2e-nvidia-l40s-x4.yml`](https://github.com/instructlab/sdg/blob/main/.github/workflows/e2e-nvidia-l40s-x4.yml) | Large | AWS |[`g6e.12xlarge`](https://aws.amazon.com/ec2/instance-types/g6e/) | CentOS Stream 9 | 4 x NVIDIA L40S w/ 48 GB VRAM (192 GB) | `e2e-ci.sh` | `l` | Manually by Maintainers, Automatically against `main` branch at 4PM UTC | Yes | 38 | 39 | ### Discord/Slack reporting 40 | 41 | Some E2E jobs send their results to the channel `#e2e-ci-results` via the `Son of Jeeves` bot in both Discord and Slack. You can see which jobs currently have reporting via the "Current E2E Jobs" table above. 42 | 43 | In Slack, this has been implemented via [the official Slack GitHub Action](https://github.com/slackapi/slack-github-action?tab=readme-ov-file#technique-2-slack-app). 44 | In Discord, we use [actions/actions-status-discord](https://github.com/sarisia/actions-status-discord) and the built-in channel webhooks feature. 45 | 46 | ### Triggering an E2E job via GitHub Web UI 47 | 48 | For the E2E jobs that can be launched manually, they take an input field that 49 | specifies the PR number or git branch to run them against. If you run them 50 | against a PR, they will automatically post a comment to the PR when the tests 51 | begin and end so it's easier for those involved in the PR to follow the results. 52 | 53 | 1. Visit the [Actions tab](https://github.com/instructlab/eval/actions). 54 | 2. Click on one of the E2E workflows on the left side of the page. 55 | 3. Click on the `Run workflow` button on the right side of the page. 56 | 4. Enter a branch name or a PR number in the input field. 57 | 5. Click the green `Run workflow` button. 58 | 59 | > [!NOTE] 60 | > Only users with "Write" permissions to the repo can run CI jobs manually 61 | -------------------------------------------------------------------------------- /docs/release-strategy.md: -------------------------------------------------------------------------------- 1 | # InstructLab Eval Release Strategy 2 | 3 | This document discusses the release strategy and processes for the 4 | `instructlab-eval` Python package built from the 5 | git repository. 6 | 7 | ## Versioning Scheme 8 | 9 | Releases use a `X.Y.Z` numbering scheme. 10 | 11 | X-stream release are for major releases. At this stage in the project a major release has not been cut and we expect each release to be a new Y-stream. 12 | 13 | Z-stream releases are meant for critical bug and documentation fixes. Z-stream releases are cut as maintainers see fit. 14 | 15 | ## Schedule 16 | 17 | The project currently operates on an ad-hoc release schedule based on the discretion of the maintainers team. 18 | 19 | The cadence for major releases starting from 1.0 onward will be determined as the project matures. 20 | 21 | A schedule will be updated in a markdown file on the GitHub repository. 22 | 23 | ## Release Tracking 24 | 25 | Currently there is no formal process of release tracking. GitHub Issues are used for tracking individual work items. 26 | 27 | In the future, the project may use Milestones or Project Boards for more formal release planning. 
At that time this document will be updated. 28 | 29 | ## Git Branches and Tags 30 | 31 | Every `X.Y` release stream gets a new branch. 32 | 33 | Each release, `X.Y.Z`, exists as a tag named `vX.Y.Z`. 34 | 35 | ## Release Branch Maintenance 36 | 37 | Maintenance efforts are only on the most recent Y-stream. 38 | Critical bug fixes are backported to the most recent release branch. 39 | 40 | ## Release Mechanics 41 | 42 | Release mechanics are done by a Release Manager identified for that release. 43 | The Release Manager is a member of the Eval Maintainers team that has agreed to take on these responsibilities. 44 | The Release Manager can change on a per-release basis. 45 | 46 | The following are the steps for how Y-stream and Z-stream releases gets cut. 47 | 48 | ### Y-Stream 49 | 50 | 1. Determine a commit on the main branch that will serve as the basis for the next release - most of the time this should be the latest commit. 51 | 1. Create a new release branch in the format `release-vX.Y` off of the determined commit (will match `main` if the latest commit is chosen). 52 | 1. Create a new release on GitHub targeting the release branch and using the latest Y-Stream tag as the previous release (e.g. `0.15.1` precedes `0.16.0`). 53 | 1. Announce release via the following: 54 | - The `#eval` channel on Slack 55 | - The `#eval` channel on Discord 56 | - The `dev` mailing list 57 | 58 | ### Z-Stream 59 | 60 | 1. Backport all relevant commits from `main` to the `release-vX.Y` branch. 61 | - It may also be the case you wish to update release branch first - if this approach is taken, ensure any relevant commits are subsequently backported to `main` 62 | 1. Create a new release on GitHub targeting the release branch and using the previous Z-Stream tag as the previous release (e.g. `0.15.0` precedes `0.15.1`). 63 | 1. Announce release via the following: 64 | - The `#eval` channel on Slack 65 | - The `#eval` channel on Discord 66 | - The `dev` mailing list 67 | 68 | ## Release Notes 69 | 70 | The project maintains a single `CHANGELOG.md` file that documents all releases. To ensure our users 71 | are well-informed about new features, improvements, and breaking changes, we maintain a 72 | `CHANGELOG.md` file. This file serves as a centralized place to document changes that will be 73 | included in the next (X) or (Y) release. Given that the project is in its early stages, we are 74 | currently focused on documenting changes for the next (Y) release. 75 | 76 | ### Editing Release Notes 77 | 78 | When submitting a Pull Request (PR) that introduces notable features or breaking changes, committers 79 | need to update the `CHANGELOG.md` file. Clearly describe the changes, their impact, and 80 | any actions users might need to take. We want clear, concise, and user-friendly notes. 81 | 82 | ### Branching for a New Release 83 | 84 | Each time we prepare for a new (X) or (Y) release, we branch out from the main codebase. 85 | As part of this branching process, the contents of `CHANGELOG.md` are reviewed and 86 | finalized. 
87 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | [build-system] 4 | requires = ["setuptools>=64", "setuptools_scm>=8"] 5 | build-backend = "setuptools.build_meta" 6 | 7 | [project] 8 | name = "instructlab-eval" 9 | authors = [ 10 | { name="InstructLab", email="dev@instructlab.ai" }, 11 | ] 12 | description = "Evaluation" 13 | readme = "README.md" 14 | license = {text = "Apache-2.0"} 15 | requires-python = ">=3.11" 16 | classifiers = [ 17 | "Development Status :: 3 - Alpha", 18 | "Environment :: Console", 19 | "License :: OSI Approved :: Apache Software License", 20 | "License :: OSI Approved :: MIT License", 21 | "Operating System :: MacOS :: MacOS X", 22 | "Operating System :: POSIX :: Linux", 23 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 24 | "Programming Language :: Python :: 3", 25 | "Programming Language :: Python :: 3.11", 26 | "Programming Language :: Python :: 3.12", 27 | "Programming Language :: Python :: Implementation :: CPython", 28 | ] 29 | dynamic = ["dependencies", "optional-dependencies", "version"] 30 | 31 | [project.scripts] 32 | 33 | [project.urls] 34 | homepage = "https://instructlab.ai" 35 | source = "https://github.com/instructlab/eval" 36 | issues = "https://github.com/instructlab/eval/issues" 37 | 38 | [project.entry-points."instructlab.eval.evaluator"] 39 | "mmlu" = "instructlab.eval.mmlu:MMLUEvaluator" 40 | "mmlu_branch" = "instructlab.eval.mmlu:MMLUBranchEvaluator" 41 | "mt_bench" = "instructlab.eval.mt_bench:MTBenchEvaluator" 42 | "mt_bench_branch" = "instructlab.eval.mt_bench:MTBenchBranchEvaluator" 43 | "leaderboard_v2" = "instructlab.eval.leaderboard:LeaderboardV2Evaluator" 44 | "ruler" = "instructlab.eval.ruler:RulerEvaluator" 45 | 46 | [tool.setuptools_scm] 47 | version_file = "src/instructlab/eval/_version.py" 48 | # do not include +gREV local version, required for Test PyPI upload 49 | local_scheme = "no-local-version" 50 | 51 | [tool.setuptools] 52 | package-dir = {"" = "src"} 53 | 54 | [tool.setuptools.dynamic] 55 | dependencies = {file = ["requirements.txt"]} 56 | optional-dependencies = {leaderboard = {file = ["requirements-leaderboard.txt"]}} 57 | 58 | [tool.setuptools.packages.find] 59 | where = ["src"] 60 | include = ["instructlab.eval"] 61 | 62 | [tool.ruff] 63 | target-version = "py39" 64 | # same as black's default line length 65 | line-length = 88 66 | 67 | [tool.ruff.lint] 68 | # Allow fix for all enabled rules (when `--fix`) is provided. 69 | fixable = ["ALL"] 70 | unfixable = [] 71 | 72 | # Fixers will be enabled gradually. 73 | select = [ 74 | # "B", # flake8-bugbear 75 | # "E", # pycodestyle 76 | # "F", # Pyflakes 77 | "Q", # flake8-quotes 78 | # Ruff does not support isort's import_headings feature, yet. 
79 | # "I", # isort 80 | # "UP", # pyupgrade 81 | # "SIM", # flake8-simplify 82 | "TID", # flake8-tidy-imports 83 | ] 84 | ignore = [ 85 | # some embedded strings are longer than 88 characters 86 | "E501", # line too long 87 | "TID252", # Prefer absolute imports over relative imports from parent modules 88 | ] 89 | 90 | [tool.ruff.lint.isort] 91 | # same as .isort.cfg 92 | from-first = true 93 | # not supported yet 94 | # import-heading-future=Future 95 | # import-heading-stdlib=Standard 96 | # import-heading-thirdparty=Third Party 97 | # import-heading-firstparty=First Party 98 | # import-heading-localfolder=Local 99 | known-local-folder = ["tuning"] 100 | 101 | [tool.mypy] 102 | ignore_missing_imports = true 103 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | -r requirements.txt 4 | 5 | pre-commit>=3.0.4 6 | pylint>=2.16.2 7 | pylint-pydantic 8 | tox>=4.4.2 9 | 10 | pytest 11 | pytest-asyncio 12 | pytest-cov 13 | pytest-html 14 | 15 | ruff 16 | isort 17 | pyspelling 18 | 19 | mypy>=1.10.0 20 | types-tqdm 21 | types-PyYAML 22 | -------------------------------------------------------------------------------- /requirements-files.in: -------------------------------------------------------------------------------- 1 | requirements.txt 2 | requirements-dev.txt 3 | requirements-leaderboard.txt 4 | -------------------------------------------------------------------------------- /requirements-leaderboard.txt: -------------------------------------------------------------------------------- 1 | lm-eval[ifeval,vllm,math,sentencepiece]>=0.4.4 2 | 3 | # vLLM 0.8.3 + torch 2.6.0 doesn't work when running vLLM on granite-3.1-8b-instruct 4 | vllm<=0.7.3 5 | torch<=2.5.1 6 | -------------------------------------------------------------------------------- /requirements-ruler.txt: -------------------------------------------------------------------------------- 1 | lm-eval[ruler]>=0.4.8 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | GitPython>=3.1.42 3 | shortuuid 4 | openai>=1.13.3 5 | psutil 6 | torch 7 | transformers 8 | accelerate 9 | pandas 10 | pandas-stubs 11 | # Base lm-eval dependency 12 | lm-eval>=0.4.4 13 | httpx 14 | ragas>=0.2.11 15 | -------------------------------------------------------------------------------- /scripts/evaluate_best_checkpoint.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Example usage: 5 | python scripts/evaluate_best_checkpoint.py \ 6 | /path/to/checkpoint_dir \ 7 | --output-file /path/to/output_file 8 | """ 9 | 10 | # Standard 11 | from pathlib import Path 12 | from typing import Optional 13 | import json 14 | 15 | # Third Party 16 | import typer 17 | 18 | app = typer.Typer() 19 | 20 | 21 | @app.command() 22 | def main( 23 | input_dir: Path = typer.Argument(..., help="Input directory to process"), 24 | output_file: Optional[Path] = typer.Option(None, help="Optional output file path"), 25 | ): 26 | """ 27 | Process files in the input directory and optionally save results to an output file. 
28 | """ 29 | if not input_dir.exists(): 30 | typer.echo(f"Error: Input directory '{input_dir}' does not exist") 31 | raise typer.Exit(1) 32 | 33 | if not input_dir.is_dir(): 34 | typer.echo(f"Error: '{input_dir}' is not a directory") 35 | raise typer.Exit(1) 36 | 37 | checkpoint_dirs = list(input_dir.glob("hf_format/samples_*")) 38 | typer.echo(f"Found {len(checkpoint_dirs)} samples files") 39 | 40 | if not checkpoint_dirs: 41 | typer.echo( 42 | f"No checkpoint directories found in the input directory: {input_dir}" 43 | ) 44 | raise typer.Exit(1) 45 | 46 | typer.echo("importing LeaderboardV2Evaluator, this may take a while...") 47 | # First Party 48 | from instructlab.eval.leaderboard import LeaderboardV2Evaluator 49 | 50 | checkpoint_results = {} 51 | for checkpoint in checkpoint_dirs: 52 | typer.echo(f"Processing checkpoint: {checkpoint}") 53 | ckpt_output_file = checkpoint / "leaderboard_results.json" 54 | evaluator = LeaderboardV2Evaluator( 55 | model_path=str(checkpoint), output_file=ckpt_output_file, num_gpus=8 56 | ) 57 | result = evaluator.run() 58 | checkpoint_results[checkpoint.name] = result 59 | typer.echo(f"Checkpoint {checkpoint.name} results: {result['overall_score']}") 60 | 61 | # Sort checkpoints by score 62 | sorted_checkpoints = sorted( 63 | checkpoint_results.items(), key=lambda x: x[1]["overall_score"], reverse=True 64 | ) 65 | typer.echo("Sorted checkpoints by score:") 66 | for checkpoint_name, result in sorted_checkpoints: 67 | typer.echo(f"{'=' * 100}") 68 | typer.echo(json.dumps(result, indent=2)) 69 | 70 | typer.echo(f"{'=' * 100}") 71 | typer.echo(f"Best checkpoint: {sorted_checkpoints[0][0]}") 72 | 73 | if output_file: 74 | typer.echo(f"Output will be saved to: {output_file}") 75 | with open(output_file, "w") as f: 76 | json.dump(checkpoint_results, f, indent=2) 77 | 78 | # Add your processing logic here 79 | 80 | typer.echo("Processing complete!") 81 | 82 | 83 | if __name__ == "__main__": 84 | app() 85 | -------------------------------------------------------------------------------- /scripts/functional-tests.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # SPDX-License-Identifier: Apache-2.0 3 | # 4 | # This test script is laid out as follows: 5 | # - UTILITIES: utility functions 6 | # - TESTS: test functions 7 | # - SETUP: environment setup steps 8 | # - MAIN: test execution steps 9 | # 10 | # If you are running locally and calling the script multiple times you may want to run like this: 11 | # 12 | # TEST_DIR=/tmp/foo ./scripts/functional-tests.sh 13 | 14 | set -ex 15 | 16 | ############# 17 | # UTILITIES # 18 | ############# 19 | 20 | clone_taxonomy(){ 21 | if [ ! 
-d taxonomy ]; then 22 | git clone https://github.com/instructlab/taxonomy.git 23 | fi 24 | } 25 | 26 | ######### 27 | # TESTS # 28 | ######### 29 | 30 | test_branch_generator(){ 31 | python3 ${SCRIPTDIR}/test_branch_generator.py --test-dir "${TEST_DIR}" 32 | } 33 | 34 | ######### 35 | # SETUP # 36 | ######### 37 | 38 | # shellcheck disable=SC2155 39 | export SCRIPTDIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 40 | # build a prompt string that includes the time, source file, line number, and function name 41 | export PS4='+$(date +"%Y-%m-%d %T") ${BASH_VERSION}:${BASH_SOURCE}:${LINENO}: ${FUNCNAME[0]:+${FUNCNAME[0]}(): }' 42 | 43 | # Support overriding the test directory for local testing otherwise creates a temporary directory 44 | TEST_DIR=${TEST_DIR:-$(mktemp -d)} 45 | 46 | export TEST_DIR 47 | export PACKAGE_NAME='instructlab-eval' 48 | 49 | 50 | ######## 51 | # MAIN # 52 | ######## 53 | 54 | pushd $TEST_DIR 55 | 56 | clone_taxonomy 57 | 58 | test_branch_generator 59 | 60 | 61 | popd 62 | exit 0 63 | -------------------------------------------------------------------------------- /scripts/ruff.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # SPDX-License-Identifier: Apache-2.0 3 | set -e 4 | 5 | # wrapper to combine ruff check, ruff format, and isort 6 | # 7 | # "ruff.sh fix" runs fixes and reformats the code 8 | # "ruff.sh check" checks style, format, and isort 9 | # "ruff.sh " passes abitrary args to ruff 10 | 11 | if [ -z "$1" ]; then 12 | echo "USAGE: $0 [check|fix|]" >&2 13 | exit 2 14 | fi 15 | 16 | run() { 17 | declare -i err 18 | 19 | echo "RUN: '$*'" 20 | "$@" 21 | err=$? 22 | echo 23 | return $err 24 | } 25 | 26 | case $1 in 27 | "check") 28 | declare -i exitcode=0 29 | 30 | set +e 31 | run ruff check . 32 | exitcode=$(( exitcode + $? )) 33 | 34 | run ruff format --diff . 35 | exitcode=$(( exitcode + $? )) 36 | 37 | run isort --check --diff . 38 | exitcode=$(( exitcode + $? )) 39 | set -e 40 | 41 | if [ $exitcode -ne 0 ]; then 42 | echo "ERROR: one or more checks have failed." >&2 43 | echo "Run 'tox -e ruff' to auto-correct all fixable errors." >&2 44 | exit 3 45 | fi 46 | ;; 47 | "fix") 48 | run ruff check --fix . 49 | run ruff format . 50 | run isort . 
51 | ;; 52 | *) 53 | ruff "$@" 54 | esac 55 | -------------------------------------------------------------------------------- /scripts/test_branch_gen_answers.py: -------------------------------------------------------------------------------- 1 | # Third Party 2 | import httpx 3 | 4 | # First Party 5 | from instructlab.eval.mt_bench import MTBenchBranchEvaluator 6 | 7 | mt_bench_branch = MTBenchBranchEvaluator( 8 | "instructlab/granite-7b-lab", 9 | "instructlab/granite-7b-lab", 10 | "../taxonomy", 11 | "main", 12 | ) 13 | mt_bench_branch.gen_answers( 14 | "http://localhost:8000/v1", 15 | http_client=httpx.Client(verify=False), 16 | ) 17 | -------------------------------------------------------------------------------- /scripts/test_branch_generator.py: -------------------------------------------------------------------------------- 1 | # Standard 2 | import argparse 3 | import os 4 | 5 | # First Party 6 | from instructlab.eval import mt_bench_branch_generator 7 | 8 | 9 | def test_mt_bench_branch_generator(test_dir): 10 | output_dir = os.path.join(test_dir, "mt_bench_branch_generator") 11 | mt_bench_branch_generator.generate( 12 | "prometheus-eval/prometheus-8x7b-v2.0", 13 | "main", 14 | "taxonomy", 15 | output_dir, 16 | ) 17 | main_dir = os.path.join(output_dir, "mt_bench_branch", "main") 18 | assert os.path.isfile(os.path.join(main_dir, "question.jsonl")) 19 | assert os.path.isfile( 20 | os.path.join( 21 | main_dir, 22 | "reference_answer", 23 | "prometheus-eval", 24 | "prometheus-8x7b-v2.0.jsonl", 25 | ) 26 | ) 27 | 28 | 29 | if __name__ == "__main__": 30 | parser = argparse.ArgumentParser(description="Test Branch Generator") 31 | parser.add_argument("--test-dir", help="Base test working directory") 32 | args = parser.parse_args() 33 | test_dir = args.test_dir 34 | 35 | test_mt_bench_branch_generator(test_dir) 36 | -------------------------------------------------------------------------------- /scripts/test_branch_judge_answers.py: -------------------------------------------------------------------------------- 1 | # Standard 2 | import pprint 3 | 4 | # First Party 5 | from instructlab.eval.mt_bench import MTBenchBranchEvaluator 6 | 7 | mt_bench_branch = MTBenchBranchEvaluator( 8 | "instructlab/granite-7b-lab", 9 | "instructlab/granite-7b-lab", 10 | "../taxonomy", 11 | "main", 12 | ) 13 | overall_score, qa_pairs, error_rate = mt_bench_branch.judge_answers( 14 | "http://localhost:8000/v1" 15 | ) 16 | 17 | print(f"Overall Score: {overall_score}") 18 | print(f"Error Rate: {error_rate}") 19 | print(f"QA Pair 0:") 20 | pprint.pprint(qa_pairs[0]) 21 | 22 | print(f"qa_pairs length: {len(qa_pairs)}") 23 | 24 | for qa_pair in qa_pairs: 25 | question_id = qa_pair.get("question_id") 26 | assert question_id is not None 27 | assert qa_pair.get("score") is not None 28 | assert qa_pair.get("category") is not None 29 | assert qa_pair.get("question") is not None 30 | assert qa_pair.get("answer") is not None 31 | assert qa_pair.get("qna_file") is not None 32 | -------------------------------------------------------------------------------- /scripts/test_gen_answers.py: -------------------------------------------------------------------------------- 1 | # First Party 2 | from instructlab.eval.mt_bench import MTBenchEvaluator 3 | 4 | mt_bench = MTBenchEvaluator("instructlab/granite-7b-lab", "instructlab/granite-7b-lab") 5 | mt_bench.gen_answers("http://localhost:8000/v1") 6 | -------------------------------------------------------------------------------- /scripts/test_judge_answers.py: 
-------------------------------------------------------------------------------- 1 | # Standard 2 | import pprint 3 | 4 | # First Party 5 | from instructlab.eval.mt_bench import MTBenchEvaluator 6 | 7 | mt_bench = MTBenchEvaluator("instructlab/granite-7b-lab", "instructlab/granite-7b-lab") 8 | overall_score, qa_pairs, turn_scores, error_rate = mt_bench.judge_answers( 9 | "http://localhost:8000/v1" 10 | ) 11 | 12 | print(f"Overall Score: {overall_score}") 13 | print(f"Turn 1 Score: {turn_scores[0]}") 14 | print(f"Turn 2 Score: {turn_scores[1]}") 15 | print(f"Error Rate: {error_rate}") 16 | print(f"QA Pair 0:") 17 | pprint.pprint(qa_pairs[0]) 18 | 19 | print(f"qa_pairs length: {len(qa_pairs)}") 20 | 21 | for qa_pair in qa_pairs: 22 | assert qa_pair.get("question_id") is not None 23 | assert qa_pair.get("score") is not None 24 | assert qa_pair.get("category") is not None 25 | assert qa_pair.get("question") is not None 26 | assert qa_pair.get("answer") is not None 27 | -------------------------------------------------------------------------------- /scripts/test_leaderboard.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # NOTE: This script requires the leaderboard optional dependencies. 5 | # Install with: pip install instructlab-eval[leaderboard] 6 | 7 | # Standard 8 | import json 9 | 10 | # First Party 11 | from instructlab.eval.leaderboard import LeaderboardV2Evaluator 12 | 13 | if __name__ == "__main__": 14 | evaluator = LeaderboardV2Evaluator( 15 | model_path="ibm-granite/granite-3.1-8b-base", 16 | eval_config={ 17 | "apply_chat_template": False, 18 | }, 19 | ) 20 | results = evaluator.run() 21 | print("got results from leaderboard v2") 22 | print(json.dumps(results, indent=2)) 23 | -------------------------------------------------------------------------------- /scripts/test_mmlu.py: -------------------------------------------------------------------------------- 1 | # Standard 2 | from typing import Dict, List, Tuple, TypedDict 3 | 4 | # First Party 5 | from instructlab.eval.mmlu import MMLUEvaluator 6 | 7 | SYSTEM_PROMPT = """I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.""" 8 | 9 | 10 | class MMLUSample(TypedDict): 11 | """ 12 | Example of a single sample returned from lm_eval when running MMLU. 13 | This is not a comprehensive type, just the subset of fields we care about for this test. 14 | """ 15 | 16 | # Arguments is the list of (prompt, answer) pairs passed to MMLU as few-shot samples. 
17 | # They will not be present with few_shot=0 18 | arguments: List[Tuple[str, str]] 19 | 20 | 21 | def all_samples_contain_system_prompt( 22 | samples: Dict[str, List[MMLUSample]], prompt: str 23 | ) -> bool: 24 | """ 25 | Given a mapping of evaluation --> list of results, validates that all few-shot examples 26 | included the system prompt 27 | """ 28 | for topic, samples_set in samples.items(): 29 | for sample in samples_set: 30 | for mmlu_prompt, _ in sample["arguments"]: 31 | if prompt not in mmlu_prompt: 32 | # we are looking for the exact system prompt, so no need to convert to normalize to lowercase 33 | print(f"found a sample in the '{topic}' MMLU topic set") 34 | return False 35 | 36 | return True 37 | 38 | 39 | def test_minimal_mmlu(): 40 | print("===> Executing 'test_minimal_mmlu'...") 41 | try: 42 | model_path = "instructlab/granite-7b-lab" 43 | tasks = ["mmlu_anatomy", "mmlu_astronomy"] 44 | mmlu = MMLUEvaluator( 45 | model_path=model_path, 46 | tasks=tasks, 47 | system_prompt=SYSTEM_PROMPT, 48 | ) 49 | overall_score, individual_scores = mmlu.run( 50 | extra_args={"log_samples": True, "write_out": True} 51 | ) 52 | samples = mmlu.results["samples"] 53 | 54 | print(overall_score) 55 | print(individual_scores) 56 | 57 | # we need n-shots > 1 to be able to validate the inclusion of the system prompt 58 | eligible_samples = { 59 | topic: samples[topic] 60 | for topic, shot in mmlu.results["n-shot"].items() 61 | if shot > 1 62 | } 63 | if eligible_samples: 64 | if not all_samples_contain_system_prompt(eligible_samples, SYSTEM_PROMPT): 65 | return False 66 | else: 67 | print( 68 | "MMLU was run in zero-shot mode, cannot confirm that system prompt was included, skipping check..." 69 | ) 70 | 71 | except Exception as exc: 72 | print(f"'test_minimal_mmlu' failed: {exc}") 73 | return False 74 | return True 75 | 76 | 77 | if __name__ == "__main__": 78 | assert test_minimal_mmlu() == True 79 | -------------------------------------------------------------------------------- /scripts/test_mmlu_branch.py: -------------------------------------------------------------------------------- 1 | # Standard 2 | import os 3 | 4 | # First Party 5 | from instructlab.eval.mmlu import MMLUBranchEvaluator 6 | 7 | 8 | def test_mmlu_branch(): 9 | print("===> Executing 'test_mmlu_branch'...") 10 | try: 11 | model_path = "instructlab/granite-7b-lab" 12 | tasks_dir = ( 13 | f"{os.path.dirname(os.path.realpath(__file__))}/../tests/testdata/sdg" 14 | ) 15 | tasks = ["mmlu_pr"] 16 | mmlu = MMLUBranchEvaluator( 17 | model_path=model_path, tasks_dir=tasks_dir, tasks=tasks 18 | ) 19 | overall_score, individual_scores = mmlu.run() 20 | print(overall_score) 21 | print(individual_scores) 22 | except Exception as exc: 23 | print(f"'test_mmlu_branch' failed: {exc}") 24 | return False 25 | return True 26 | 27 | 28 | if __name__ == "__main__": 29 | assert test_mmlu_branch() == True 30 | -------------------------------------------------------------------------------- /src/instructlab/__init__.py: -------------------------------------------------------------------------------- 1 | __path__ = __import__("pkgutil").extend_path(__path__, __name__) 2 | -------------------------------------------------------------------------------- /src/instructlab/eval/__init__.py: -------------------------------------------------------------------------------- 1 | # Standard 2 | import os 3 | 4 | # Inherit logging from caller rather than from vLLM 5 | os.environ["VLLM_CONFIGURE_LOGGING"] = "0" 6 | 
-------------------------------------------------------------------------------- /src/instructlab/eval/data/mt_bench/judge_prompts.jsonl: -------------------------------------------------------------------------------- 1 | {"name": "pair-v2", "type": "pairwise", "system_prompt": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user question displayed below. You should choose the assistant that follows the user's instructions and answers the user's question better. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of their responses. Begin your evaluation by comparing the two responses and provide a short explanation. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie.", "prompt_template": "[User Question]\n{question}\n\n[The Start of Assistant A's Answer]\n{answer_a}\n[The End of Assistant A's Answer]\n\n[The Start of Assistant B's Answer]\n{answer_b}\n[The End of Assistant B's Answer]", "description": "Prompt for general questions", "category": "general", "output_format": "[[A]]"} 2 | {"name": "pair-v2-multi-turn", "type": "pairwise", "system_prompt": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user questions. You should choose the assistant that follows the user's instructions and answers the user's questions better. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of their responses. You should focus on who provides a better answer to the second user question. Begin your evaluation by comparing the responses of the two assistants and provide a short explanation. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie.", "prompt_template": "<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_a_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_a_2}\n\n<|The End of Assistant A's Conversation with User|>\n\n\n<|The Start of Assistant B's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant B:\n{answer_b_1}\n\n### User:\n{question_2}\n\n### Assistant B:\n{answer_b_2}\n\n<|The End of Assistant B's Conversation with User|>", "description": "Prompt for multi-turn general questions", "category": "general", "output_format": "[[A]]"} 3 | {"name": "pair-math-v1", "type": "pairwise", "system_prompt": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user question displayed below. Your evaluation should consider correctness and helpfulness. 
You will be given a reference answer, assistant A's answer, and assistant B's answer. Your job is to evaluate which assistant's answer is better. Begin your evaluation by comparing both assistants' answers with the reference answer. Identify and correct any mistakes. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie.", "prompt_template": "[User Question]\n{question}\n\n[The Start of Reference Answer]\n{ref_answer_1}\n[The End of Reference Answer]\n\n[The Start of Assistant A's Answer]\n{answer_a}\n[The End of Assistant A's Answer]\n\n[The Start of Assistant B's Answer]\n{answer_b}\n[The End of Assistant B's Answer]", "description": "Prompt for math questions", "category": "math", "output_format": "[[A]]"} 4 | {"name": "pair-math-v1-multi-turn", "type": "pairwise", "system_prompt": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user questions. Your evaluation should consider correctness and helpfulness. You will be given reference answers, the assistant A's answers, the assistant B's answers. Your job is to determine which assistant provides correct and helpful answers to the second user question. Begin your evaluation by comparing both assistants' answers with the reference answers. Identify and correct any mistakes. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie.", "prompt_template": "<|The Start of Reference Answer|>\n\n### User:\n{question_1}\n\n### Reference answer:\n{ref_answer_1}\n\n### User:\n{question_2}\n\n### Reference answer:\n{ref_answer_2}\n\n<|The End of Reference Answer|>\n\n\n<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_a_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_a_2}\n\n<|The End of Assistant A's Conversation with User|>\n\n\n<|The Start of Assistant B's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant B:\n{answer_b_1}\n\n### User:\n{question_2}\n\n### Assistant B:\n{answer_b_2}\n\n<|The End of Assistant B's Conversation with User|>", "description": "Prompt for multi-turn general questions", "category": "general", "output_format": "[[A]]"} 5 | {"name": "single-v1", "type": "single", "system_prompt": "You are a helpful assistant.", "prompt_template": "[Instruction]\nPlease act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. Begin your evaluation by providing a short explanation. Be as objective as possible. 
After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n[Question]\n{question}\n\n[The Start of Assistant's Answer]\n{answer}\n[The End of Assistant's Answer]", "description": "Prompt for general questions", "category": "general", "output_format": "[[rating]]"} 6 | {"name": "single-math-v1", "type": "single", "system_prompt": "You are a helpful assistant.", "prompt_template": "[Instruction]\nPlease act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider correctness and helpfulness. You will be given a reference answer and the assistant's answer. Begin your evaluation by comparing the assistant's answer with the reference answer. Identify and correct any mistakes. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n[Question]\n{question}\n\n[The Start of Reference Answer]\n{ref_answer_1}\n[The End of Reference Answer]\n\n[The Start of Assistant's Answer]\n{answer}\n[The End of Assistant's Answer]", "description": "Prompt for general questions", "category": "math", "output_format": "[[rating]]"} 7 | {"name": "single-v1-multi-turn", "type": "single", "system_prompt": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. You evaluation should focus on the assistant's answer to the second user question. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n", "prompt_template": "<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_2}\n\n<|The End of Assistant A's Conversation with User|>", "description": "Prompt for general questions", "category": "general", "output_format": "[[rating]]"} 8 | {"name": "single-math-v1-multi-turn", "type": "single", "system_prompt": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question. Your evaluation should consider correctness and helpfulness. You will be given a reference answer and the assistant's answer. You evaluation should focus on the assistant's answer to the second question. Begin your evaluation by comparing the assistant's answer with the reference answer. Identify and correct any mistakes. Be as objective as possible. 
After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n", "prompt_template": "<|The Start of Reference Answer|>\n\n### User:\n{question_1}\n\n### Reference answer:\n{ref_answer_1}\n\n### User:\n{question_2}\n\n### Reference answer:\n{ref_answer_2}\n\n<|The End of Reference Answer|>\n\n\n<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_2}\n\n<|The End of Assistant A's Conversation with User|>", "description": "Prompt for general questions", "category": "math", "output_format": "[[rating]]"} 9 | -------------------------------------------------------------------------------- /src/instructlab/eval/data/mt_bench_branch/judge_prompts.jsonl: -------------------------------------------------------------------------------- 1 | {"name": "pair-v2", "type": "pairwise", "system_prompt": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user question displayed below. You should choose the assistant that follows the user's instructions and answers the user's question better. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of their responses. Begin your evaluation by comparing the two responses and provide a short explanation. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie.", "prompt_template": "[User Question]\n{question}\n\n[The Start of Assistant A's Answer]\n{answer_a}\n[The End of Assistant A's Answer]\n\n[The Start of Assistant B's Answer]\n{answer_b}\n[The End of Assistant B's Answer]", "description": "Prompt for general questions", "category": "general", "output_format": "[[A]]"} 2 | {"name": "pair-v2-multi-turn", "type": "pairwise", "system_prompt": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user questions. You should choose the assistant that follows the user's instructions and answers the user's questions better. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of their responses. You should focus on who provides a better answer to the second user question. Begin your evaluation by comparing the responses of the two assistants and provide a short explanation. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. 
After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie.", "prompt_template": "<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_a_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_a_2}\n\n<|The End of Assistant A's Conversation with User|>\n\n\n<|The Start of Assistant B's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant B:\n{answer_b_1}\n\n### User:\n{question_2}\n\n### Assistant B:\n{answer_b_2}\n\n<|The End of Assistant B's Conversation with User|>", "description": "Prompt for multi-turn general questions", "category": "general", "output_format": "[[A]]"} 3 | {"name": "pair-math-v1", "type": "pairwise", "system_prompt": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user question displayed below. Your evaluation should consider correctness and helpfulness. You will be given a reference answer, assistant A's answer, and assistant B's answer. Your job is to evaluate which assistant's answer is better. Begin your evaluation by comparing both assistants' answers with the reference answer. Identify and correct any mistakes. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie.", "prompt_template": "[User Question]\n{question}\n\n[The Start of Reference Answer]\n{ref_answer_1}\n[The End of Reference Answer]\n\n[The Start of Assistant A's Answer]\n{answer_a}\n[The End of Assistant A's Answer]\n\n[The Start of Assistant B's Answer]\n{answer_b}\n[The End of Assistant B's Answer]", "description": "Prompt for math questions", "category": "math", "output_format": "[[A]]"} 4 | {"name": "pair-math-v1-multi-turn", "type": "pairwise", "system_prompt": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user questions. Your evaluation should consider correctness and helpfulness. You will be given reference answers, the assistant A's answers, the assistant B's answers. Your job is to determine which assistant provides correct and helpful answers to the second user question. Begin your evaluation by comparing both assistants' answers with the reference answers. Identify and correct any mistakes. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. 
After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie.", "prompt_template": "<|The Start of Reference Answer|>\n\n### User:\n{question_1}\n\n### Reference answer:\n{ref_answer_1}\n\n### User:\n{question_2}\n\n### Reference answer:\n{ref_answer_2}\n\n<|The End of Reference Answer|>\n\n\n<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_a_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_a_2}\n\n<|The End of Assistant A's Conversation with User|>\n\n\n<|The Start of Assistant B's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant B:\n{answer_b_1}\n\n### User:\n{question_2}\n\n### Assistant B:\n{answer_b_2}\n\n<|The End of Assistant B's Conversation with User|>", "description": "Prompt for multi-turn general questions", "category": "general", "output_format": "[[A]]"} 5 | {"name": "single-v1", "type": "single", "system_prompt": "You are a helpful assistant.", "prompt_template": "[Instruction]\nPlease act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n[Question]\n{question}\n\n[The Start of Assistant's Answer]\n{answer}\n[The End of Assistant's Answer]", "description": "Prompt for general questions", "category": "general", "output_format": "[[rating]]"} 6 | {"name": "single-math-v1", "type": "single", "system_prompt": "You are a helpful assistant.", "prompt_template": "[Instruction]\nPlease act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider correctness and helpfulness. You will be given a reference answer and the assistant's answer. Begin your evaluation by comparing the assistant's answer with the reference answer. Identify and correct any mistakes. If correct, an assistant's answer that follows a similar style of the reference answer is preferable. Do not bias to any particular style that does not appear in the reference answer. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n[Question]\n{question}\n\n[The Start of Reference Answer]\n{ref_answer_1}\n[The End of Reference Answer]\n\n[The Start of Assistant's Answer]\n{answer}\n[The End of Assistant's Answer]", "description": "Prompt for general questions", "category": "math", "output_format": "[[rating]]"} 7 | {"name": "single-v1-multi-turn", "type": "single", "system_prompt": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. You evaluation should focus on the assistant's answer to the second user question. Begin your evaluation by providing a short explanation. 
Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n", "prompt_template": "<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_2}\n\n<|The End of Assistant A's Conversation with User|>", "description": "Prompt for general questions", "category": "general", "output_format": "[[rating]]"} 8 | {"name": "single-math-v1-multi-turn", "type": "single", "system_prompt": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question. Your evaluation should consider correctness and helpfulness. You will be given a reference answer and the assistant's answer. You evaluation should focus on the assistant's answer to the second question. Begin your evaluation by comparing the assistant's answer with the reference answer. Identify and correct any mistakes. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n", "prompt_template": "<|The Start of Reference Answer|>\n\n### User:\n{question_1}\n\n### Reference answer:\n{ref_answer_1}\n\n### User:\n{question_2}\n\n### Reference answer:\n{ref_answer_2}\n\n<|The End of Reference Answer|>\n\n\n<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_2}\n\n<|The End of Assistant A's Conversation with User|>", "description": "Prompt for general questions", "category": "math", "output_format": "[[rating]]"} 9 | -------------------------------------------------------------------------------- /src/instructlab/eval/evaluator.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | 4 | class Evaluator: 5 | """ 6 | Parent class for Evaluators 7 | """ 8 | 9 | name: str 10 | 11 | def __init__(self) -> None: 12 | pass 13 | -------------------------------------------------------------------------------- /src/instructlab/eval/exceptions.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | 4 | class EvalError(Exception): 5 | """ 6 | Parent class for all of instructlab-eval exceptions 7 | """ 8 | 9 | 10 | class ModelNotFoundError(EvalError): 11 | """ 12 | Error raised when model is not able to be found 13 | 14 | Attributes 15 | message error message to be printed on raise 16 | path filepath of model location 17 | """ 18 | 19 | def __init__(self, path) -> None: 20 | super().__init__() 21 | self.path = path 22 | self.message = f"Model could not be found at {path}" 23 | 24 | 25 | class InvalidModelError(EvalError): 26 | """ 27 | Error raised when model can be found but is invalid 28 | 29 | Attributes 30 | message error message to be printed on raise 31 | path filepath of model location 32 | reason root cause for model invalidity 33 | """ 34 | 35 | def __init__(self, path, reason) -> None: 36 | super().__init__() 37 | self.path = path 38 | self.reason = reason 39 | self.message = f"Model found at {path} but was invalid due to: {reason}" 40 | 41 | 42 | class InvalidMaxWorkersError(EvalError): 43 | """ 44 | Error raised when max_workers isn't an int or 
"auto" 45 | 46 | Attributes 47 | message error message to be printed on raise 48 | max_workers max_workers specified 49 | """ 50 | 51 | def __init__(self, max_workers) -> None: 52 | super().__init__() 53 | self.max_workers = max_workers 54 | self.message = f"Invalid max_workers '{max_workers}' specified. Valid values are positive integers or 'auto'." 55 | 56 | 57 | class InvalidGitRepoError(EvalError): 58 | """ 59 | Error raised when taxonomy dir provided isn't a valid git repo 60 | Attributes 61 | message error message to be printed on raise 62 | taxonomy_dir supplied taxonomy directory 63 | """ 64 | 65 | def __init__(self, taxonomy_dir) -> None: 66 | super().__init__() 67 | self.taxonomy_dir = taxonomy_dir 68 | self.message = f"Invalid git repo: {taxonomy_dir}" 69 | 70 | 71 | class GitRepoNotFoundError(EvalError): 72 | """ 73 | Error raised when taxonomy dir provided does not exist 74 | Attributes 75 | message error message to be printed on raise 76 | taxonomy_dir supplied taxonomy directory 77 | """ 78 | 79 | def __init__(self, taxonomy_dir) -> None: 80 | super().__init__() 81 | self.taxonomy_dir = taxonomy_dir 82 | self.message = f"Taxonomy git repo not found: {taxonomy_dir}" 83 | 84 | 85 | class InvalidGitBranchError(EvalError): 86 | """ 87 | Error raised when branch provided is invalid 88 | Attributes 89 | message error message to be printed on raise 90 | branch supplied branch 91 | """ 92 | 93 | def __init__(self, branch) -> None: 94 | super().__init__() 95 | self.branch = branch 96 | self.message = f"Invalid git branch: {branch}" 97 | 98 | 99 | class TasksDirNotFoundError(EvalError): 100 | """ 101 | Error raised when the tasks dir doesn't exist 102 | Attributes 103 | message error message to be printed on raise 104 | tasks_dir tasks dir 105 | """ 106 | 107 | def __init__(self, tasks_dir) -> None: 108 | super().__init__() 109 | self.tasks_dir = tasks_dir 110 | self.message = f"Tasks dir not found: {tasks_dir}" 111 | 112 | 113 | class InvalidTasksDirError(EvalError): 114 | """ 115 | Error raised when the tasks dir is invalid 116 | Attributes 117 | message error message to be printed on raise 118 | tasks_dir tasks dir 119 | """ 120 | 121 | def __init__(self, tasks_dir) -> None: 122 | super().__init__() 123 | self.tasks_dir = tasks_dir 124 | self.message = f"Invalid Tasks Dir: {tasks_dir}" 125 | 126 | 127 | class InvalidEvaluationResult(EvalError): 128 | """ 129 | Error raised for invalid eval results 130 | Attributes 131 | message error message to be printed on raise 132 | """ 133 | 134 | def __init__(self, message) -> None: 135 | super().__init__() 136 | self.message = message 137 | 138 | 139 | class ModelServingAPIError(EvalError): 140 | """ 141 | Error raised when reply retrieval from model serving fails. 142 | Attributes 143 | message error message to be printed on raise 144 | """ 145 | 146 | def __init__(self) -> None: 147 | super().__init__() 148 | self.message = "Failed to receive a reply from model serving API." 
149 | 150 | 151 | class EmptyTaxonomyError(EvalError): 152 | """ 153 | Error raised when taxonomy doesn't contain any skill QNAs 154 | Attributes 155 | message error message to be printed on raise 156 | """ 157 | 158 | def __init__(self) -> None: 159 | super().__init__() 160 | self.message = "Provided taxonomy doesn't contain any skill qna.yaml files" 161 | -------------------------------------------------------------------------------- /src/instructlab/eval/logger_config.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # Standard 3 | import logging 4 | 5 | 6 | def setup_logger(name): 7 | # Set up the logger 8 | logger = logging.getLogger(name) 9 | return logger 10 | -------------------------------------------------------------------------------- /src/instructlab/eval/mmlu.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | """ 4 | MMLU - Massive Multitask Language Understanding 5 | https://en.wikipedia.org/wiki/MMLU 6 | https://arxiv.org/abs/2009.03300 7 | """ 8 | 9 | # Standard 10 | from typing import Any, Dict, Optional, Union 11 | import os 12 | 13 | # Third Party 14 | from lm_eval.evaluator import simple_evaluate 15 | from lm_eval.tasks import TaskManager 16 | import torch 17 | 18 | # First Party 19 | from instructlab.eval.evaluator import Evaluator 20 | from instructlab.eval.exceptions import ( 21 | InvalidModelError, 22 | InvalidTasksDirError, 23 | ModelNotFoundError, 24 | TasksDirNotFoundError, 25 | ) 26 | 27 | # Local 28 | from .logger_config import setup_logger 29 | 30 | logger = setup_logger(__name__) 31 | 32 | MMLU_TASKS = [ 33 | "mmlu_abstract_algebra", 34 | "mmlu_anatomy", 35 | "mmlu_astronomy", 36 | "mmlu_business_ethics", 37 | "mmlu_clinical_knowledge", 38 | "mmlu_college_biology", 39 | "mmlu_college_chemistry", 40 | "mmlu_college_computer_science", 41 | "mmlu_college_mathematics", 42 | "mmlu_college_medicine", 43 | "mmlu_college_physics", 44 | "mmlu_computer_security", 45 | "mmlu_conceptual_physics", 46 | "mmlu_econometrics", 47 | "mmlu_electrical_engineering", 48 | "mmlu_elementary_mathematics", 49 | "mmlu_formal_logic", 50 | "mmlu_global_facts", 51 | "mmlu_high_school_biology", 52 | "mmlu_high_school_chemistry", 53 | "mmlu_high_school_computer_science", 54 | "mmlu_high_school_european_history", 55 | "mmlu_high_school_geography", 56 | "mmlu_high_school_government_and_politics", 57 | "mmlu_high_school_macroeconomics", 58 | "mmlu_high_school_mathematics", 59 | "mmlu_high_school_microeconomics", 60 | "mmlu_high_school_physics", 61 | "mmlu_high_school_psychology", 62 | "mmlu_high_school_statistics", 63 | "mmlu_high_school_us_history", 64 | "mmlu_high_school_world_history", 65 | "mmlu_human_aging", 66 | "mmlu_human_sexuality", 67 | "mmlu_international_law", 68 | "mmlu_jurisprudence", 69 | "mmlu_logical_fallacies", 70 | "mmlu_machine_learning", 71 | "mmlu_management", 72 | "mmlu_marketing", 73 | "mmlu_medical_genetics", 74 | "mmlu_miscellaneous", 75 | "mmlu_moral_disputes", 76 | "mmlu_moral_scenarios", 77 | "mmlu_nutrition", 78 | "mmlu_philosophy", 79 | "mmlu_prehistory", 80 | "mmlu_professional_accounting", 81 | "mmlu_professional_law", 82 | "mmlu_professional_medicine", 83 | "mmlu_professional_psychology", 84 | "mmlu_public_relations", 85 | "mmlu_security_studies", 86 | "mmlu_sociology", 87 | "mmlu_us_foreign_policy", 88 | "mmlu_virology", 89 | "mmlu_world_religions", 90 | ] 91 | 92 | 93 | class 
AbstractMMLUEvaluator(Evaluator): 94 | """ 95 | Abstract child class of an Evaluator for Massive Multitask Language Understanding Branch 96 | 97 | Attributes: 98 | model_path absolute path to or name of a huggingface model 99 | tasks_dir path where the .jsonl and _task.yaml files for the branches being evaluated are stored 100 | tasks list of tasks for MMLU to test the model with 101 | model_dtype dtype of model when served 102 | few_shots number of examples 103 | batch_size batch size for evaluation. Valid values are a positive integer or 'auto' to select the largest batch size that will fit in memory, or 'auto:N' to reselect the largest batch size N times'. 104 | device PyTorch device (e.g. "cpu" or "cuda:0") for running models 105 | system_prompt system prompt to be used when applying the chat template 106 | results full output from the `lm_eval.evaluator.simple_evaluate` function after MMLU has run. 107 | """ 108 | 109 | def __init__( 110 | self, 111 | model_path, 112 | tasks_dir: Optional[str], 113 | tasks: list[str], 114 | model_dtype="bfloat16", 115 | few_shots: int = 5, 116 | batch_size: Optional[Union[int, str]] = "auto", 117 | device: str = ("cuda" if torch.cuda.is_available() else "cpu"), 118 | system_prompt: Optional[str] = None, 119 | ) -> None: 120 | self.model_path = model_path 121 | self.system_prompt = system_prompt 122 | self.tasks_dir = tasks_dir 123 | self.tasks = tasks 124 | self.model_dtype = model_dtype 125 | self.few_shots = few_shots 126 | self.batch_size = batch_size 127 | self.device = device 128 | self._results = None 129 | 130 | @property 131 | def results(self) -> Dict[str, Any] | None: 132 | """ 133 | Returns the results of the last MMLU evaluation, if one has taken place. 134 | 135 | Returns: 136 | Dict[str, Any] | None: The output from `lm_eval.evaluator.simple_evaluate` 137 | """ 138 | return self._results 139 | 140 | def run( 141 | self, server_url: str | None = None, extra_args: Dict[str, Any] | None = None 142 | ) -> tuple: 143 | """ 144 | Runs evaluation 145 | 146 | Attributes 147 | server_url Model server endpoint (Ex: http://localhost:8000/v1) for the model being evaluated 148 | extra_args Dictionary containing any extra arguments to be passed into the lm_eval `lm_eval.evaluator.simple_evaluate` function. 149 | 150 | Returns: 151 | overall_score Average score for the task group 152 | individual_scores Individual scores for each task in the task group 153 | """ 154 | extra_args = {} if not extra_args else extra_args 155 | logger.debug(locals()) 156 | 157 | # TODO: make this a parameter for class? 
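        # The remainder of run() delegates to _run_mmlu(), which wraps
        # lm_eval's simple_evaluate(), and then averages the per-task
        # "acc,none" values into a single overall score. A hypothetical call,
        # with the model path and returned numbers shown for illustration only:
        #
        #     evaluator = MMLUEvaluator(model_path="instructlab/granite-7b-lab")
        #     overall_score, individual_scores = evaluator.run()
        #     # overall_score                      -> e.g. 0.52
        #     # individual_scores["mmlu_anatomy"]  -> {"score": ..., "stderr": ...}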
158 | os.environ["TOKENIZERS_PARALLELISM"] = "true" 159 | 160 | individual_scores: dict = {} 161 | agg_score: float = 0.0 162 | 163 | results = self._run_mmlu(server_url) 164 | for task, result in results.items(): 165 | agg_score += float(result["acc,none"]) 166 | individual_scores[task] = { 167 | "score": float(result["acc,none"]), 168 | "stderr": float(result["acc_stderr,none"]), 169 | } 170 | 171 | overall_score = float(agg_score / len(individual_scores)) 172 | 173 | return overall_score, individual_scores 174 | 175 | def _run_mmlu( 176 | self, server_url: str | None = None, extra_args: Dict[str, Any] | None = None 177 | ) -> dict: 178 | extra_args = {} if not extra_args else extra_args 179 | if server_url is not None: 180 | # Requires lm_eval >= 0.4.4 181 | model_args = f"base_url={server_url}/completions,model={self.model_path},tokenizer_backend=huggingface" 182 | model = "local-completions" 183 | else: 184 | model_args = f"pretrained={self.model_path},dtype={self.model_dtype}" 185 | model = "hf" 186 | tm = None 187 | if self.tasks_dir is not None: 188 | if not os.path.exists(self.tasks_dir): 189 | raise TasksDirNotFoundError(self.tasks_dir) 190 | if not os.access(self.tasks_dir, os.R_OK): 191 | raise InvalidTasksDirError(self.tasks_dir) 192 | tm = TaskManager(verbosity="DEBUG", include_path=self.tasks_dir) 193 | should_apply_chat_template = self.system_prompt is not None 194 | 195 | # configure the args here so users can override them as necessary 196 | simple_evaluate_kwargs = { 197 | "model": model, 198 | "model_args": model_args, 199 | "tasks": self.tasks, 200 | "num_fewshot": self.few_shots, 201 | "batch_size": self.batch_size, 202 | "device": self.device, 203 | "task_manager": tm, 204 | "system_instruction": self.system_prompt, 205 | "apply_chat_template": should_apply_chat_template, 206 | } 207 | simple_evaluate_kwargs.update(extra_args) 208 | 209 | results = self._simple_evaluate_with_error_handling(**simple_evaluate_kwargs) 210 | self._results = results 211 | return results["results"] 212 | 213 | # This method converts general errors from simple_evaluate 214 | # into a more user-understandable error 215 | def _simple_evaluate_with_error_handling(self, **kwargs): 216 | try: 217 | return simple_evaluate(**kwargs) 218 | except KeyError as ke: 219 | # If the first task key file cannot be found in tasks_dir, simple_evaluate() will return 220 | # an obscure KeyError(first task key) 221 | if ( 222 | self.tasks_dir is not None 223 | and len(self.tasks) > 0 224 | and ke.args[0] == self.tasks[0] 225 | ): 226 | raise InvalidTasksDirError(self.tasks_dir) from ke 227 | raise 228 | except OSError as ose: 229 | # If a model can not be found, simple_evaluate() will return 230 | # an obscure OSError with a message 231 | if "is not a valid model" in str( 232 | ose 233 | ) or "does not appear to have a file named" in str(ose): 234 | raise ModelNotFoundError(self.model_path) from ose 235 | if "is not a valid JSON file" in str(ose): 236 | reason = "Looked for valid JSON file but couldn't find one - are you pointing at a directory with a 'config.json'?" 
237 | raise InvalidModelError(self.model_path, reason) from ose 238 | raise 239 | 240 | 241 | class MMLUEvaluator(AbstractMMLUEvaluator): 242 | """ 243 | Evaluator for Massive Multitask Language Understanding (MMLU) 244 | 245 | Attributes: 246 | model_path absolute path to or name of a huggingface model 247 | tasks list of tasks for MMLU to test the model with 248 | model_dtype dtype of model when served 249 | few_shots number of examples 250 | batch_size batch size for evaluation. Valid values are a positive integer or 'auto' to select the largest batch size that will fit in memory, or 'auto:N' to reselect the largest batch size N times'. 251 | device PyTorch device (e.g. "cpu" or "cuda:0") for running models 252 | system_prompt system prompt to be used when applying the chat template 253 | """ 254 | 255 | name = "mmlu" 256 | 257 | def __init__( 258 | self, 259 | model_path, 260 | tasks: list[str] = MMLU_TASKS, 261 | model_dtype="bfloat16", 262 | few_shots: int = 5, 263 | batch_size: Optional[Union[int, str]] = "auto", 264 | device: str = ("cuda" if torch.cuda.is_available() else "cpu"), 265 | system_prompt: Optional[str] = None, 266 | ) -> None: 267 | super().__init__( 268 | model_path, 269 | None, 270 | tasks, 271 | model_dtype, 272 | few_shots, 273 | batch_size, 274 | device, 275 | system_prompt=system_prompt, 276 | ) 277 | 278 | 279 | class MMLUBranchEvaluator(AbstractMMLUEvaluator): 280 | """ 281 | Evaluator for Massive Multitask Language Understanding Branch (MMLUBranch) 282 | 283 | Attributes: 284 | model_path absolute path to or name of a huggingface model 285 | system_prompt system prompt to be used when applying the chat template 286 | tasks_dir path where the .jsonl and _task.yaml files for the branches being evaluated are stored 287 | tasks group name that is shared by all the MMLUBranch tasks 288 | model_dtype dtype of model when served 289 | few_shots number of examples 290 | batch_size batch size for evaluation. Valid values are a positive integer or 'auto' to select the largest batch size that will fit in memory, or 'auto:N' to reselect the largest batch size N times'. 291 | device PyTorch device (e.g. 
"cpu" or "cuda:0") for running models 292 | """ 293 | 294 | name = "mmlu_branch" 295 | -------------------------------------------------------------------------------- /src/instructlab/eval/mt_bench_answers.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # Standard 3 | import concurrent.futures 4 | import json 5 | import os 6 | import time 7 | 8 | # Third Party 9 | import shortuuid 10 | import tqdm 11 | 12 | # Local 13 | from .logger_config import setup_logger 14 | from .mt_bench_common import ( 15 | bench_dir, 16 | chat_completion_openai, 17 | get_openai_client, 18 | load_questions, 19 | temperature_config, 20 | ) 21 | from .mt_bench_model_adapter import get_conversation_template # type: ignore 22 | 23 | logger = setup_logger(__name__) 24 | 25 | 26 | def reorg_answer_file(answer_file): 27 | """Sort by question id and de-duplication""" 28 | logger.debug(locals()) 29 | with open(answer_file, "r+", encoding="utf-8") as f: 30 | answers = {} 31 | for l in f: 32 | qid = json.loads(l)["question_id"] 33 | answers[qid] = l 34 | 35 | # Reset to the beginning of the file and clear it 36 | f.seek(0) 37 | f.truncate() 38 | 39 | qids = sorted(list(answers.keys())) 40 | for qid in qids: 41 | f.write(answers[qid]) 42 | 43 | 44 | def get_answer( 45 | question: dict, 46 | model: str, 47 | num_choices: int, 48 | max_tokens: int, 49 | answer_file: str, 50 | force_temperature: float, 51 | openai_client, 52 | ): 53 | """Answer a question with the model""" 54 | assert force_temperature is None or question.get("required_temperature") is None 55 | if force_temperature is not None: 56 | temperature = force_temperature 57 | elif "required_temperature" in question.keys(): 58 | temperature = question["required_temperature"] 59 | elif question["category"] in temperature_config: 60 | temperature = temperature_config[question["category"]] 61 | else: 62 | temperature = 0.7 63 | 64 | choices = [] 65 | for i in range(num_choices): 66 | conv = get_conversation_template(model, "granite") 67 | 68 | turns = [] 69 | for j in range(len(question["turns"])): 70 | conv.append_message(conv.roles[0], question["turns"][j]) 71 | conv.append_message(conv.roles[1], None) 72 | 73 | output = chat_completion_openai( 74 | openai_client, 75 | model, 76 | conv, 77 | temperature, 78 | max_tokens, 79 | ) 80 | 81 | conv.update_last_message(output) 82 | turns.append(output) 83 | 84 | choices.append({"index": i, "turns": turns}) 85 | 86 | # Dump answers 87 | ans = { 88 | "question_id": question["question_id"], 89 | "answer_id": shortuuid.uuid(), 90 | "model_id": model, 91 | "choices": choices, 92 | "tstamp": time.time(), 93 | } 94 | 95 | os.makedirs(os.path.dirname(answer_file), exist_ok=True) 96 | with open(answer_file, "a", encoding="utf-8") as fout: 97 | fout.write(json.dumps(ans) + "\n") 98 | 99 | 100 | def generate_answers( 101 | model_name, 102 | model_api_base, 103 | api_key=None, 104 | branch=None, 105 | output_dir="eval_output", 106 | data_dir=None, 107 | question_begin=None, 108 | question_end=None, 109 | force_temperature=None, 110 | num_choices=1, 111 | max_tokens=1024, 112 | max_workers=1, 113 | bench_name="mt_bench", 114 | http_client=None, 115 | ): 116 | """Generate model answers to be judged""" 117 | logger.debug(locals()) 118 | 119 | openai_client = get_openai_client(model_api_base, api_key, http_client) 120 | 121 | if data_dir is None: 122 | data_dir = os.path.join(os.path.dirname(__file__), "data") 123 | 124 | data_base_dir = bench_dir(data_dir, 
bench_name, branch) 125 | output_base_dir = bench_dir(output_dir, bench_name, branch) 126 | 127 | question_file = os.path.join(data_base_dir, "question.jsonl") 128 | questions = load_questions(question_file, question_begin, question_end) 129 | 130 | answer_file = os.path.join(output_base_dir, "model_answer", f"{model_name}.jsonl") 131 | if os.path.isfile(answer_file): 132 | os.remove(answer_file) 133 | logger.debug("Removing previous answer file: %s", answer_file) 134 | 135 | first_n = None 136 | first_n_env = os.environ.get("INSTRUCTLAB_EVAL_FIRST_N_QUESTIONS") 137 | if first_n_env: 138 | first_n = int(first_n_env) 139 | logger.debug("INSTRUCTLAB_EVAL_FIRST_N_QUESTIONS=%s", first_n) 140 | 141 | with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: 142 | futures = [] 143 | for i, question in enumerate(questions): 144 | if first_n is not None and i >= first_n: 145 | break 146 | 147 | future = executor.submit( 148 | get_answer, 149 | question, 150 | model_name, 151 | num_choices, 152 | max_tokens, 153 | answer_file, 154 | force_temperature, 155 | openai_client, 156 | ) 157 | futures.append(future) 158 | 159 | for future in tqdm.tqdm( 160 | concurrent.futures.as_completed(futures), total=len(futures) 161 | ): 162 | future.result() 163 | 164 | reorg_answer_file(answer_file) 165 | -------------------------------------------------------------------------------- /src/instructlab/eval/mt_bench_branch_generator.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # Standard 3 | from pathlib import Path 4 | import hashlib 5 | import json 6 | import os 7 | import time 8 | 9 | # Third Party 10 | from tqdm import tqdm 11 | import git 12 | import shortuuid 13 | import yaml 14 | 15 | # Local 16 | from .exceptions import ( 17 | EmptyTaxonomyError, 18 | GitRepoNotFoundError, 19 | InvalidGitBranchError, 20 | InvalidGitRepoError, 21 | ) 22 | from .logger_config import setup_logger 23 | from .mt_bench_common import bench_dir 24 | 25 | logger = setup_logger(__name__) 26 | 27 | 28 | def get_file_paths(directory): 29 | logger.debug(locals()) 30 | file_paths = [] 31 | root_paths = [ 32 | entry 33 | for entry in Path(directory).iterdir() 34 | if entry.is_dir() 35 | if not entry.name.startswith(".") 36 | if entry.name != "knowledge" 37 | if entry.name != "docs" 38 | if entry.name != "scripts" 39 | ] 40 | for basedir in root_paths: 41 | for root, _, files in os.walk(basedir): 42 | file_paths.extend( 43 | [os.path.join(root, file) for file in files if file == "qna.yaml"] 44 | ) 45 | return file_paths 46 | 47 | 48 | def read_qna(fn): 49 | with open(fn, "r", encoding="utf-8") as file: 50 | contents = yaml.safe_load(file) 51 | return contents.get("seed_examples") 52 | 53 | 54 | def generate(judge_model_name, branch, taxonomy_dir, output_dir): 55 | """Create questions and reference answers from taxonomy""" 56 | logger.debug(locals()) 57 | restore_branch = None 58 | try: 59 | if branch is not None: 60 | taxonomy_repo = git.Repo(taxonomy_dir) 61 | restore_branch = taxonomy_repo.active_branch 62 | taxonomy_repo.git.checkout(branch) 63 | 64 | qna_file_list = get_file_paths(taxonomy_dir) 65 | if len(qna_file_list) == 0: 66 | raise EmptyTaxonomyError 67 | 68 | question_lst = [] 69 | reference_answers = [] 70 | for qna_file_path in tqdm(qna_file_list): 71 | examples = read_qna(qna_file_path) 72 | qna_file = qna_file_path[len(taxonomy_dir) + 1 :] 73 | if examples is None: 74 | logger.warning("failed to load %s. 
skipping...", qna_file) 75 | continue 76 | for ex in examples: 77 | q, a = ex.get("question"), ex.get("answer") 78 | if q is None or a is None: 79 | logger.warning("Skipping malformed file %s", qna_file) 80 | continue 81 | 82 | c = ex.get("context") 83 | if c is not None: 84 | t_1 = ( 85 | "Given the context below:\n" 86 | + c 87 | + "\n" 88 | + "Answer the following question: " 89 | + q 90 | ) 91 | else: 92 | t_1 = q 93 | 94 | # Generate a consistent hash to have consistent question_id across qna_files from different runs 95 | str_bytes = bytes(q, "UTF-8") 96 | m = hashlib.md5(str_bytes) 97 | question_id = str(int(m.hexdigest(), base=16)) 98 | question_lst.append( 99 | { 100 | "qna_file": qna_file, 101 | "question_id": question_id, 102 | "category": "taxonomy", 103 | "turns": [t_1], 104 | "reference": [a], 105 | } 106 | ) 107 | 108 | reference_answers.append( 109 | { 110 | "question_id": question_id, 111 | "answer_id": shortuuid.uuid(), 112 | "model_id": judge_model_name, 113 | "choices": [{"index": 0, "turns": [a]}], 114 | "tstamp": time.time(), 115 | } 116 | ) 117 | 118 | logger.debug("Generated %s questions", len(question_lst)) 119 | 120 | output_base_dir = bench_dir(output_dir, "mt_bench_branch", branch) 121 | os.makedirs(output_base_dir, exist_ok=True) 122 | question_file = os.path.join(output_base_dir, "question.jsonl") 123 | logger.debug("Generating question file: %s", question_file) 124 | with open(question_file, "w", encoding="utf-8") as outfile: 125 | for entry in question_lst: 126 | json.dump(entry, outfile) 127 | outfile.write("\n") 128 | 129 | answer_file = os.path.join( 130 | output_base_dir, "reference_answer", f"{judge_model_name}.jsonl" 131 | ) 132 | logger.debug("Generating answer file: %s", answer_file) 133 | os.makedirs(os.path.dirname(answer_file), exist_ok=True) 134 | with open( 135 | answer_file, 136 | "w", 137 | encoding="utf-8", 138 | ) as outfile: 139 | for entry in reference_answers: 140 | json.dump(entry, outfile) 141 | outfile.write("\n") 142 | except git.exc.NoSuchPathError as nspe: 143 | raise GitRepoNotFoundError(taxonomy_dir) from nspe 144 | except git.exc.GitCommandError as gce: 145 | raise InvalidGitBranchError(branch) from gce 146 | except (git.exc.InvalidGitRepositoryError, git.exc.GitError) as ge: 147 | raise InvalidGitRepoError(taxonomy_dir) from ge 148 | finally: 149 | if restore_branch is not None: 150 | taxonomy_repo.git.checkout(restore_branch) 151 | -------------------------------------------------------------------------------- /src/instructlab/eval/mt_bench_conversation.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | """ 3 | Conversation prompt templates. 
4 | """ 5 | 6 | # Standard 7 | from enum import IntEnum, auto 8 | from typing import Dict, List, Tuple, Union 9 | import dataclasses 10 | 11 | 12 | class SeparatorStyle(IntEnum): 13 | """Separator styles.""" 14 | 15 | ADD_COLON_SINGLE = auto() 16 | ADD_COLON_TWO = auto() 17 | ADD_COLON_SPACE_SINGLE = auto() 18 | NO_COLON_SINGLE = auto() 19 | NO_COLON_TWO = auto() 20 | ADD_NEW_LINE_SINGLE = auto() 21 | LLAMA2 = auto() 22 | DEFAULT = auto() 23 | 24 | 25 | @dataclasses.dataclass 26 | class Conversation: 27 | # pylint: disable=too-many-instance-attributes 28 | """A class that manages prompt templates and keeps all conversation history.""" 29 | 30 | # The name of this template 31 | name: str 32 | # The template of the system prompt 33 | system_template: str = "{system_message}" 34 | # The system message 35 | system_message: str = "" 36 | # The names of two roles 37 | roles: Tuple[str, str] = ("USER", "ASSISTANT") 38 | # All messages. Each item is (role, message). 39 | # Each message is either a string or a tuple of (string, List[image_url]). 40 | messages: List[List[str | None]] = dataclasses.field(default_factory=list) 41 | # The number of few shot examples 42 | offset: int = 0 43 | # The separator style and configurations 44 | sep_style: SeparatorStyle = SeparatorStyle.ADD_COLON_SINGLE 45 | sep: str | None = "\n" 46 | sep2: str | None = None 47 | # Stop criteria (the default one is EOS token) 48 | stop_str: Union[str, List[str]] | None = None 49 | # Stops generation if meeting any token in this list 50 | stop_token_ids: List[int] | None = None 51 | 52 | def set_system_message(self, system_message: str): 53 | """Set the system message.""" 54 | self.system_message = system_message 55 | 56 | def get_system_message(self): 57 | """return the system message.""" 58 | return self.system_message 59 | 60 | def append_message(self, role: str, message: str | None): 61 | """Append a new message.""" 62 | self.messages.append([role, message]) 63 | 64 | def update_last_message(self, message: str): 65 | """Update the last output. 66 | 67 | The last message is typically set to be None when constructing the prompt, 68 | so we need to update it in-place after getting the response from a model. 
69 | """ 70 | self.messages[-1][1] = message 71 | 72 | def to_openai_api_messages(self): 73 | """Convert the conversation to OpenAI chat completion format.""" 74 | if self.system_message == "": 75 | ret = [] 76 | else: 77 | ret = [{"role": "system", "content": self.system_message}] 78 | 79 | for i, (_, msg) in enumerate(self.messages[self.offset :]): 80 | if i % 2 == 0: 81 | ret.append({"role": "user", "content": msg}) 82 | else: 83 | if msg is not None: 84 | ret.append({"role": "assistant", "content": msg}) 85 | return ret 86 | 87 | def copy(self): 88 | return Conversation( 89 | name=self.name, 90 | system_template=self.system_template, 91 | system_message=self.system_message, 92 | roles=self.roles, 93 | messages=[[x, y] for x, y in self.messages], 94 | offset=self.offset, 95 | sep_style=self.sep_style, 96 | sep=self.sep, 97 | sep2=self.sep2, 98 | stop_str=self.stop_str, 99 | stop_token_ids=self.stop_token_ids, 100 | ) 101 | 102 | def dict(self): 103 | return { 104 | "template_name": self.name, 105 | "system_message": self.system_message, 106 | "roles": self.roles, 107 | "messages": self.extract_text_from_messages(), 108 | "offset": self.offset, 109 | } 110 | 111 | 112 | # A global registry for all conversation templates 113 | conv_templates: Dict[str, Conversation] = {} 114 | 115 | 116 | def register_conv_template(template: Conversation, override: bool = False): 117 | """Register a new conversation template.""" 118 | if not override: 119 | assert template.name not in conv_templates, ( 120 | f"{template.name} has been registered." 121 | ) 122 | 123 | conv_templates[template.name] = template 124 | 125 | 126 | def get_conv_template(name: str) -> Conversation: 127 | """Get a conversation template.""" 128 | return conv_templates[name].copy() 129 | 130 | 131 | # An empty template for raw conversation. 132 | register_conv_template( 133 | Conversation( 134 | name="raw", 135 | system_message="", 136 | roles=("", ""), 137 | sep_style=SeparatorStyle.NO_COLON_SINGLE, 138 | sep="", 139 | ) 140 | ) 141 | 142 | 143 | # api-based default template 144 | register_conv_template( 145 | Conversation( 146 | name="api_based_default", 147 | system_message="", 148 | roles=("user", "assistant"), 149 | sep_style=SeparatorStyle.DEFAULT, 150 | sep=None, 151 | ) 152 | ) 153 | 154 | 155 | # ChatGPT default template 156 | register_conv_template( 157 | Conversation( 158 | name="chatgpt", 159 | system_message="You are a helpful assistant.", 160 | roles=("user", "assistant"), 161 | sep_style=SeparatorStyle.DEFAULT, 162 | sep=None, 163 | ) 164 | ) 165 | 166 | # Mistral template 167 | # source: https://docs.mistral.ai/llm/mistral-instruct-v0.1#chat-template 168 | register_conv_template( 169 | Conversation( 170 | name="mistral", 171 | system_template="[INST] {system_message}\n", 172 | roles=("[INST]", "[/INST]"), 173 | sep_style=SeparatorStyle.LLAMA2, 174 | sep=" ", 175 | sep2="", 176 | ) 177 | ) 178 | 179 | register_conv_template( 180 | Conversation( 181 | name="labrador-chat", 182 | system_template="<|system|>\n{system_message}", 183 | system_message="""You are Labrador, an AI language model developed by IBM DMF (Data Model Factory) Alignment Team. You are a cautious assistant. You carefully follow instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior. You always respond to greetings (for example, hi, hello, g'day, morning, afternoon, evening, night, what's up, nice to meet you, sup, etc) with "Hello! I am Labrador, created by the IBM DMF Alignment Team. 
How can I help you today?". Please do not say anything else and do not start a conversation.""", 184 | roles=("<|user|>", "<|assistant|>"), 185 | sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE, 186 | sep="\n", 187 | stop_str="<|endoftext|>", 188 | ) 189 | ) 190 | 191 | register_conv_template( 192 | Conversation( 193 | name="ibm-generic", 194 | system_template="<|system|>\n{system_message}", 195 | system_message="""You are an AI language model developed by IBM Research. You are a cautious assistant. You carefully follow instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior.""", 196 | roles=("<|user|>", "<|assistant|>"), 197 | sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE, 198 | sep="\n", 199 | stop_str="<|endoftext|>", 200 | ) 201 | ) 202 | 203 | register_conv_template( 204 | Conversation( 205 | name="granite-chat", 206 | system_template="<|system|>\n{system_message}", 207 | system_message="""You are Granite Chat, an AI language model developed by IBM. You are a cautious assistant. You carefully follow instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior.""", 208 | roles=("<|user|>", "<|assistant|>"), 209 | sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE, 210 | sep="\n", 211 | stop_str="<|endoftext|>", 212 | ) 213 | ) 214 | -------------------------------------------------------------------------------- /src/instructlab/eval/mt_bench_judgment.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # Standard 3 | from concurrent.futures import ThreadPoolExecutor 4 | import os 5 | 6 | # Third Party 7 | from tqdm import tqdm 8 | import numpy as np 9 | import pandas as pd 10 | 11 | # First Party 12 | from instructlab.eval import exceptions 13 | 14 | # Local 15 | from .logger_config import setup_logger 16 | from .mt_bench_common import ( 17 | NEED_REF_CATS, 18 | Judge, 19 | MatchSingle, 20 | bench_dir, 21 | check_data, 22 | get_model_list, 23 | get_openai_client, 24 | load_judge_prompts, 25 | load_model_answers, 26 | load_questions, 27 | play_a_match_single, 28 | ) 29 | 30 | logger = setup_logger(__name__) 31 | 32 | 33 | def make_match_single( 34 | questions, 35 | models, 36 | model_answers, 37 | judge, 38 | ref_answers=None, 39 | multi_turn=False, 40 | ): 41 | """Setup a match""" 42 | matches = [] 43 | for q in questions: 44 | if multi_turn and len(q["turns"]) != 2: 45 | continue 46 | q_id = q["question_id"] 47 | for m in models: 48 | a = model_answers[m][q_id] 49 | if ref_answers is not None: 50 | ref = ref_answers[judge.model_name][q_id] 51 | matches.append( 52 | MatchSingle( 53 | dict(q), m, a, judge, ref_answer=ref, multi_turn=multi_turn 54 | ) 55 | ) 56 | else: 57 | matches.append(MatchSingle(dict(q), m, a, judge, multi_turn=multi_turn)) 58 | return matches 59 | 60 | 61 | def make_judge_single(judge_model_name, judge_prompts) -> dict: 62 | """Setup the judge""" 63 | judges = {} 64 | judges["default"] = Judge(judge_model_name, judge_prompts["single-v1"]) 65 | judges["math"] = Judge( 66 | judge_model_name, judge_prompts["single-math-v1"], ref_based=True 67 | ) 68 | judges["default-mt"] = Judge( 69 | judge_model_name, judge_prompts["single-v1-multi-turn"], multi_turn=True 70 | ) 71 | judges["math-mt"] = Judge( 72 | judge_model_name, 73 | judge_prompts["single-math-v1-multi-turn"], 74 | ref_based=True, 75 | multi_turn=True, 76 | ) 77 | return judges 78 | 79 | 80 | def make_judgment( 81 | question_file, 82 | 
judgment_file, 83 | answer_file, 84 | bench_name="mt_bench", 85 | ): 86 | """Create judgment output""" 87 | logger.debug(locals()) 88 | judgment_df_all = pd.read_json( 89 | judgment_file, lines=True, dtype={"question_id": str} 90 | ) 91 | judgment_df = judgment_df_all[["model", "score", "turn"]] 92 | judgments_len = len(judgment_df) 93 | judgment_df = judgment_df[judgment_df["score"] != -1] 94 | error_free_judgments_len = len(judgment_df) 95 | error_rate = (judgments_len - error_free_judgments_len) / judgments_len 96 | logger.debug("#judgments: %s", judgments_len) 97 | logger.debug("#error free judgments: %s", error_free_judgments_len) 98 | logger.debug("error rate: %s", error_rate) 99 | 100 | turn_scores = [] 101 | # First turn 102 | df_1 = judgment_df[judgment_df["turn"] == 1].groupby(["model", "turn"]).mean() 103 | if len(df_1.index) > 0: 104 | overall_score = df_1["score"].iloc[0] 105 | turn_scores.append(overall_score) 106 | else: 107 | raise exceptions.InvalidEvaluationResult( 108 | "Evaluation provided no result. See logs for more details." 109 | ) 110 | 111 | if bench_name == "mt_bench": 112 | # Second turn 113 | df_2 = judgment_df[judgment_df["turn"] == 2].groupby(["model", "turn"]).mean() 114 | if len(df_2.index) > 0: 115 | turn2_score = df_2["score"].iloc[0] 116 | turn_scores.append(turn2_score) 117 | 118 | # Average 119 | df_3 = judgment_df[["model", "score"]].groupby(["model"]).mean() 120 | overall_score = df_3["score"].iloc[0] 121 | else: 122 | turn_scores.append("N/A") 123 | 124 | question_df = pd.read_json(question_file, lines=True, dtype={"question_id": str}) 125 | 126 | answer_df = pd.read_json(answer_file, lines=True, dtype={"question_id": str}) 127 | 128 | # Join to get questions with answers 129 | join_columns = ["question_id", "choices", "turns", "category"] 130 | if bench_name == "mt_bench_branch": 131 | join_columns.append("qna_file") 132 | 133 | joined_df = question_df.join( 134 | answer_df.set_index("question_id"), on="question_id", rsuffix="_answer" 135 | )[join_columns] 136 | # Join to get scores 137 | join_columns.append("score") 138 | joined_df = judgment_df_all.join( 139 | joined_df.set_index("question_id"), on="question_id", lsuffix="_judgment" 140 | )[join_columns] 141 | joined_df = joined_df[joined_df["score"] != -1] 142 | 143 | qa_pairs = [] 144 | for _, row in joined_df.iterrows(): 145 | qa_pair = { 146 | "question_id": row["question_id"], 147 | "score": row["score"], 148 | "category": row["category"], 149 | "question": row["turns"], 150 | "answer": row["choices"], 151 | } 152 | if bench_name == "mt_bench_branch": 153 | qa_pair["qna_file"] = row["qna_file"] 154 | qa_pairs.append(qa_pair) 155 | return overall_score, qa_pairs, turn_scores, error_rate 156 | 157 | 158 | def judge_model( 159 | model_name, 160 | judge_model_name, 161 | openai_client, 162 | branch=None, 163 | bench_name="mt_bench", 164 | output_dir="eval_output", 165 | data_dir=None, 166 | max_workers=1, 167 | first_n=None, 168 | merge_system_user_message=False, 169 | ): 170 | """Judge the model based on questions and reference answers""" 171 | logger.debug(locals()) 172 | package_data_dir = os.path.join(os.path.dirname(__file__), "data") 173 | use_builtin_ref_answers = False 174 | if data_dir is None: 175 | use_builtin_ref_answers = True 176 | data_dir = package_data_dir 177 | 178 | data_base_dir = bench_dir(data_dir, bench_name, branch) 179 | output_base_dir = bench_dir(output_dir, bench_name, branch) 180 | 181 | judge_file = os.path.join(package_data_dir, bench_name, "judge_prompts.jsonl") 
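    # Hypothetical layout of the files resolved below, assuming
    # bench_name="mt_bench", branch=None, and the built-in package data dir
    # (paths are shown only to illustrate how the pieces fit together):
    #
    #   <data_dir>/mt_bench/question.jsonl                          questions
    #   <data_dir>/mt_bench/reference_answer/gpt-4.jsonl            built-in reference answers
    #   <output_dir>/mt_bench/model_answer/<model_name>.jsonl       answers being judged
    #   <output_dir>/mt_bench/model_judgment/<judge>_single.jsonl   judgments written by this run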
182 | 183 | question_file = os.path.join(data_base_dir, "question.jsonl") 184 | answer_file = os.path.join(output_base_dir, "model_answer", f"{model_name}.jsonl") 185 | if use_builtin_ref_answers: 186 | ref_answer_file = os.path.join(data_base_dir, "reference_answer", "gpt-4.jsonl") 187 | else: 188 | ref_answer_file = os.path.join( 189 | data_base_dir, "reference_answer", f"{judge_model_name}.jsonl" 190 | ) 191 | 192 | # Load questions 193 | questions = load_questions(question_file, None, None) 194 | 195 | # Load answers 196 | model_answers = load_model_answers(answer_file) 197 | ref_answers = load_model_answers(ref_answer_file, judge_model_name) 198 | 199 | # Load judge 200 | judge_prompts = load_judge_prompts(judge_file) 201 | 202 | if first_n: 203 | questions = questions[:first_n] 204 | 205 | models = get_model_list(answer_file) 206 | 207 | judges = make_judge_single(judge_model_name, judge_prompts) 208 | output_file = os.path.join( 209 | output_base_dir, "model_judgment", f"{judge_model_name}_single.jsonl" 210 | ) 211 | if os.path.isfile(output_file): 212 | os.remove(output_file) 213 | logger.debug("Removing previous judgment file: %s", output_file) 214 | 215 | check_data(questions, model_answers, ref_answers, models, judges) 216 | 217 | question_math = [q for q in questions if q["category"] in NEED_REF_CATS] 218 | question_default = [q for q in questions if q["category"] not in NEED_REF_CATS] 219 | 220 | # Make matches 221 | matches = [] 222 | matches += make_match_single( 223 | question_default, models, model_answers, judges["default"] 224 | ) 225 | matches += make_match_single( 226 | question_math, 227 | models, 228 | model_answers, 229 | judges["math"], 230 | ref_answers, 231 | ) 232 | matches += make_match_single( 233 | question_default, 234 | models, 235 | model_answers, 236 | judges["default-mt"], 237 | multi_turn=True, 238 | ) 239 | matches += make_match_single( 240 | question_math, 241 | models, 242 | model_answers, 243 | judges["math-mt"], 244 | ref_answers, 245 | multi_turn=True, 246 | ) 247 | 248 | logger.debug("bench_name=%s", bench_name) 249 | logger.debug("judge=%s", judge_model_name) 250 | logger.debug("model_list=%s", models) 251 | logger.debug("total_num_questions=%s", len(questions)) 252 | logger.debug("total_num_matches=%s", len(matches)) 253 | 254 | # Play matches 255 | if max_workers == 1: 256 | for match in tqdm(matches): 257 | play_a_match_single( 258 | openai_client, 259 | match, 260 | output_file=output_file, 261 | merge_system_user_message=merge_system_user_message, 262 | ) 263 | else: 264 | 265 | def play_a_match_wrapper(match): 266 | play_a_match_single( 267 | openai_client, 268 | match, 269 | output_file=output_file, 270 | merge_system_user_message=merge_system_user_message, 271 | ) 272 | 273 | np.random.seed(0) 274 | np.random.shuffle(matches) 275 | 276 | with ThreadPoolExecutor(max_workers) as executor: 277 | for match in tqdm( 278 | executor.map(play_a_match_wrapper, matches), total=len(matches) 279 | ): 280 | pass 281 | 282 | return question_file, output_file, answer_file 283 | 284 | 285 | def generate_judgment( 286 | model_name, 287 | judge_model_name, 288 | model_api_base, 289 | api_key=None, 290 | bench_name="mt_bench", 291 | output_dir="eval_output", 292 | data_dir=None, 293 | branch=None, 294 | max_workers=1, 295 | first_n=None, 296 | merge_system_user_message=False, 297 | http_client=None, 298 | ): 299 | """Generate judgment with scores and qa_pairs for a model""" 300 | logger.debug(locals()) 301 | 302 | openai_client = 
get_openai_client(model_api_base, api_key, http_client) 303 | 304 | first_n_env = os.environ.get("INSTRUCTLAB_EVAL_FIRST_N_QUESTIONS") 305 | if first_n_env is not None and first_n is None: 306 | first_n = int(first_n_env) 307 | logger.debug("INSTRUCTLAB_EVAL_FIRST_N_QUESTIONS=%s", first_n) 308 | 309 | question_file, judgment_file, answer_file = judge_model( 310 | model_name, 311 | judge_model_name, 312 | openai_client, 313 | bench_name=bench_name, 314 | output_dir=output_dir, 315 | data_dir=data_dir, 316 | branch=branch, 317 | max_workers=max_workers, 318 | first_n=first_n, 319 | merge_system_user_message=merge_system_user_message, 320 | ) 321 | 322 | return make_judgment( 323 | question_file, 324 | judgment_file, 325 | answer_file, 326 | bench_name=bench_name, 327 | ) 328 | -------------------------------------------------------------------------------- /src/instructlab/eval/mt_bench_model_adapter.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | """Model adapter registration.""" 3 | 4 | # Standard 5 | from functools import cache 6 | from typing import List 7 | import abc 8 | import os 9 | 10 | # Local 11 | from .logger_config import setup_logger 12 | from .mt_bench_conversation import Conversation, get_conv_template 13 | 14 | OPENAI_MODEL_LIST = ("gpt-4",) 15 | 16 | logger = setup_logger(__name__) 17 | 18 | 19 | class BaseModelAdapter: 20 | """The base and the default model adapter.""" 21 | 22 | @abc.abstractmethod 23 | def match(self, model_path: str) -> bool: 24 | pass 25 | 26 | @abc.abstractmethod 27 | def get_default_conv_template(self, model_path: str) -> Conversation: 28 | pass 29 | 30 | 31 | # A global registry for all model adapters 32 | model_adapters: List[BaseModelAdapter] = [] 33 | 34 | 35 | def register_model_adapter(cls): 36 | """Register a model adapter.""" 37 | model_adapters.append(cls()) 38 | 39 | 40 | @cache 41 | def get_model_adapter(model_path: str, default_adapter_name: str) -> BaseModelAdapter: 42 | """Get a model adapter for a model_path.""" 43 | model_path_basename = os.path.basename(os.path.normpath(model_path)) 44 | 45 | default_adapter = None 46 | 47 | # Try the basename of model_path at first 48 | for adapter in model_adapters: 49 | if adapter.match(model_path_basename): 50 | return adapter 51 | if adapter.match(default_adapter_name) and default_adapter is None: 52 | default_adapter = adapter 53 | 54 | # Then try the full path 55 | for adapter in model_adapters: 56 | if adapter.match(model_path): 57 | return adapter 58 | 59 | if default_adapter is not None: 60 | logger.warning( 61 | "No valid model adapter for %s, defaulting to %s adapter", 62 | model_path, 63 | default_adapter_name, 64 | ) 65 | return default_adapter 66 | raise ValueError(f"No valid model adapter for {model_path}") 67 | 68 | 69 | def get_conversation_template( 70 | model_path: str, default_adapter_name: str 71 | ) -> Conversation: 72 | """Get the default conversation template.""" 73 | adapter = get_model_adapter(model_path, default_adapter_name) 74 | return adapter.get_default_conv_template(model_path) 75 | 76 | 77 | class ChatGPTAdapter(BaseModelAdapter): 78 | """The model adapter for ChatGPT""" 79 | 80 | def match(self, model_path: str): 81 | return model_path in OPENAI_MODEL_LIST 82 | 83 | def get_default_conv_template(self, model_path: str) -> Conversation: 84 | if "browsing" in model_path: 85 | return get_conv_template("api_based_default") 86 | return get_conv_template("chatgpt") 87 | 88 | 89 | class 
MistralAdapter(BaseModelAdapter): 90 | """The model adapter for Mistral AI models""" 91 | 92 | def match(self, model_path: str): 93 | model_path = model_path.lower() 94 | return ( 95 | "mistral" in model_path 96 | or "mixtral" in model_path 97 | or "prometheus" in model_path 98 | ) 99 | 100 | def get_default_conv_template(self, model_path: str) -> Conversation: 101 | return get_conv_template("mistral") 102 | 103 | 104 | class LabradoriteAdapter(BaseModelAdapter): 105 | """The model adapter for ibm/labradorite-13b""" 106 | 107 | def match(self, model_path: str): 108 | return "labradorite" in model_path.lower() 109 | 110 | def get_default_conv_template(self, model_path: str) -> Conversation: 111 | return get_conv_template("labrador-chat") 112 | 113 | 114 | class MerliniteAdapter(BaseModelAdapter): 115 | """The model adapter for ibm/merlinite-7b and instructlab/merlinite-7b-lab""" 116 | 117 | def match(self, model_path: str): 118 | return "merlinite" in model_path.lower() 119 | 120 | def get_default_conv_template(self, model_path: str) -> Conversation: 121 | return get_conv_template("ibm-generic") 122 | 123 | 124 | class GraniteAdapter(BaseModelAdapter): 125 | """The model adapter for instructlab/granite-7b-lab""" 126 | 127 | def match(self, model_path: str): 128 | model_path = model_path.lower() 129 | return ( 130 | "granite" in model_path 131 | and "granite-old" not in model_path 132 | and "granite-chat" not in model_path 133 | and "granite-code" not in model_path 134 | ) 135 | 136 | def get_default_conv_template(self, model_path: str) -> Conversation: 137 | return get_conv_template("ibm-generic") 138 | 139 | 140 | class LabradorAdapter(BaseModelAdapter): 141 | """The model adapter for ibm/labradorite-13b""" 142 | 143 | def match(self, model_path: str): 144 | model_path = model_path.lower() 145 | return ("granite-chat" in model_path) or ( 146 | "labrador" in model_path and "labradorite" not in model_path 147 | ) 148 | 149 | def get_default_conv_template(self, model_path: str) -> Conversation: 150 | return get_conv_template("granite-chat") 151 | 152 | 153 | # Note: the registration order matters. 154 | # The one registered earlier has a higher matching priority. 155 | register_model_adapter(MistralAdapter) 156 | register_model_adapter(LabradoriteAdapter) 157 | register_model_adapter(MerliniteAdapter) 158 | register_model_adapter(GraniteAdapter) 159 | register_model_adapter(LabradorAdapter) 160 | register_model_adapter(ChatGPTAdapter) 161 | -------------------------------------------------------------------------------- /src/instructlab/eval/ruler.py: -------------------------------------------------------------------------------- 1 | # Standard 2 | from typing import Any, Dict, List, Optional 3 | import json 4 | import os 5 | import pathlib 6 | 7 | # Third Party 8 | from lm_eval.evaluator import simple_evaluate 9 | 10 | # First Party 11 | from instructlab.eval.evaluator import Evaluator 12 | 13 | RULER_TASKS = [ 14 | "niah_single_1", 15 | "niah_single_2", 16 | "niah_single_3", 17 | "niah_multikey_1", 18 | "niah_multikey_2", 19 | "niah_multikey_3", 20 | "niah_multiquery", 21 | "niah_multivalue", 22 | "ruler_vt", 23 | "ruler_cwe", 24 | "ruler_fwe", 25 | "ruler_qa_hotpot", 26 | "ruler_qa_squad", 27 | ] 28 | 29 | DEFAULT_MAX_LENGTH = 4096 30 | 31 | 32 | class RulerEvaluator(Evaluator): 33 | """ 34 | Class definition for running RULER benchmarking tasks. 
35 | """ 36 | 37 | name = "ruler" 38 | 39 | def __init__( 40 | self, 41 | model_path: Optional[str] = None, 42 | output_file: Optional[str] = None, 43 | tasks: list[str] = RULER_TASKS, 44 | api_endpoint: Optional[str] = None, 45 | max_length: Optional[int] = None, 46 | ) -> None: 47 | self.model_path = model_path 48 | self.tasks = tasks 49 | self.results: Dict[Any, Any] = {} 50 | self.output_file = output_file 51 | 52 | self.api_endpoint = api_endpoint or None 53 | self.max_length = max_length or 4096 54 | 55 | def save_to_file(self, output_file: Optional[str] = None) -> None: 56 | """Save results to a JSON file""" 57 | output_file = output_file if output_file else self.output_file 58 | if not output_file: 59 | raise ValueError("Output file path cannot be empty") 60 | 61 | os.makedirs(os.path.dirname(output_file), exist_ok=True) 62 | with open(output_file, "w", encoding="utf-8") as f: 63 | json.dump(self.results, f, indent=2) 64 | 65 | def process_lm_eval_results( 66 | self, 67 | fpath: Optional[pathlib.Path] = None, 68 | raw_results: Optional[dict[str, Any]] = None, 69 | ) -> dict[str, float]: 70 | """ 71 | Process the evaluation results from lm_eval for the given file path and extract 72 | aggregarted scores for each context length 73 | Args: 74 | fpath (pathlib.Path): The file path to the evaluation results. 75 | 76 | """ 77 | unqiue_metrics_dict: dict[str, Any] = {} 78 | 79 | # This is required because the lm_eval results are nested under 'ruler' if 80 | # that is the supplied task to it. The output contains a nested dictionary 81 | # in this case, using RULER tasks as the key. Each context length is a further subkey 82 | # in the dictionary. There is an additional key per context length which also 83 | # contains score adjusted for stderr, which we are ignoring here. 84 | def extract_metrics(results: dict, unqiue_metrics_dict: dict = {}): 85 | for k, v in results.items(): 86 | if isinstance(v, dict): 87 | extract_metrics(v, unqiue_metrics_dict) 88 | else: 89 | if "stderr" not in k: 90 | metric = k.split(",")[0] 91 | if metric not in unqiue_metrics_dict: 92 | unqiue_metrics_dict[metric] = [] 93 | unqiue_metrics_dict[metric].append(v) 94 | 95 | return unqiue_metrics_dict 96 | 97 | if fpath: 98 | with open(fpath, "r", encoding="utf-8") as f: 99 | raw_results = json.load(f) 100 | 101 | if raw_results is not None: 102 | extract_metrics(raw_results["results"], unqiue_metrics_dict) 103 | unique_float_metrics = {} 104 | # if value is list of floats, average the list 105 | for k, v in unqiue_metrics_dict.items(): 106 | if isinstance(v, list) and all(isinstance(i, float) for i in v): 107 | unique_float_metrics[k] = sum(v) / len(v) 108 | 109 | # find average of all float values in dict 110 | float_values = [ 111 | v for v in unique_float_metrics.values() if isinstance(v, float) 112 | ] 113 | if float_values: 114 | unique_float_metrics["avg"] = sum(float_values) / len(float_values) 115 | else: 116 | unique_float_metrics["avg"] = 0.0 117 | 118 | # result format 119 | # {'8192': 0.90, '32768': 0.82, '65536': 0.77, '131072': 0.71, 'avg': 0.80} 120 | return unique_float_metrics 121 | 122 | def run( 123 | self, 124 | model_path: Optional[str] = None, 125 | tasks: Optional[List[str]] = None, 126 | output_file: Optional[str] = None, 127 | api_endpoint: Optional[str] = None, 128 | max_length: Optional[int] = DEFAULT_MAX_LENGTH, 129 | ) -> None: 130 | """ 131 | Run the RULER evaluation using the specified model and tasks. 
132 | """ 133 | 134 | model_path = self.model_path if model_path is None else model_path 135 | tasks = self.tasks if not tasks else tasks 136 | output_file = self.output_file if not output_file else output_file 137 | 138 | # validate above params are not none and output file can be written to 139 | if not model_path: 140 | raise ValueError("Model path cannot be empty") 141 | if not output_file: 142 | raise ValueError("Output file path cannot be empty") 143 | if not api_endpoint: 144 | raise ValueError("API endpoint cannot be empty") 145 | 146 | # Prepare model_args 147 | model_args = { 148 | "pretrained": model_path, 149 | "base_url": api_endpoint, 150 | "max_length": max_length, 151 | } 152 | 153 | self.lm_eval_results = simple_evaluate( 154 | model="local-completions", 155 | model_args=model_args, 156 | tasks=tasks, 157 | ) 158 | 159 | self.result = self.process_lm_eval_results( 160 | raw_results=self.lm_eval_results, 161 | ) 162 | 163 | # write results to file 164 | if output_file: 165 | try: 166 | with open(output_file, "w", encoding="utf-8") as f: 167 | json.dump(self.result, f, indent=2) 168 | except (OSError, IOError) as e: 169 | raise ValueError(f"Failed to write to output file: {e}") from e 170 | -------------------------------------------------------------------------------- /tests/test_mmlu.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | # Standard 4 | from unittest import mock 5 | from unittest.mock import patch 6 | import os 7 | 8 | # First Party 9 | from instructlab.eval.mmlu import MMLUBranchEvaluator, MMLUEvaluator 10 | 11 | MMLU_EXAMPLE_OUTPUT = { 12 | "results": { 13 | "mmlu_astronomy": { 14 | "alias": "astronomy", 15 | "acc,none": 0.5592105263157895, 16 | "acc_stderr,none": 0.04040311062490436, 17 | }, 18 | "mmlu_anatomy": { 19 | "alias": "anatomy", 20 | "acc,none": 0.4444444444444444, 21 | "acc_stderr,none": 0.04292596718256981, 22 | }, 23 | "mmlu_abstract_algebra": { 24 | "alias": "abstract_algebra", 25 | "acc,none": 0.35, 26 | "acc_stderr,none": 0.047937248544110196, 27 | }, 28 | }, 29 | } 30 | 31 | MODEL_EXAMPLE = "instructlab/granite-7b-lab" 32 | 33 | 34 | def assert_example_mmlu_individual_scores(overall_score, individual_scores): 35 | assert round(overall_score, 2) == 0.45 36 | assert individual_scores == { 37 | "mmlu_abstract_algebra": {"score": 0.35, "stderr": 0.047937248544110196}, 38 | "mmlu_anatomy": {"score": 0.4444444444444444, "stderr": 0.04292596718256981}, 39 | "mmlu_astronomy": {"score": 0.5592105263157895, "stderr": 0.04040311062490436}, 40 | } 41 | 42 | 43 | @patch( 44 | "instructlab.eval.mmlu.AbstractMMLUEvaluator._simple_evaluate_with_error_handling", 45 | return_value=MMLU_EXAMPLE_OUTPUT, 46 | ) 47 | def test_mmlu_branch(eval_mock): 48 | tasks_dir = f"{os.path.dirname(os.path.realpath(__file__))}/testdata/sdg" 49 | tasks = ["mmlu_pr"] 50 | mmlu = MMLUBranchEvaluator( 51 | model_path=MODEL_EXAMPLE, 52 | tasks_dir=tasks_dir, 53 | tasks=tasks, 54 | system_prompt="You are an intelligent AI language model.", 55 | ) 56 | overall_score, individual_scores = mmlu.run() 57 | 58 | assert_example_mmlu_individual_scores(overall_score, individual_scores) 59 | eval_mock.assert_called() 60 | 61 | 62 | @patch( 63 | "instructlab.eval.mmlu.AbstractMMLUEvaluator._simple_evaluate_with_error_handling", 64 | return_value=MMLU_EXAMPLE_OUTPUT, 65 | ) 66 | def test_mmlu(eval_mock): 67 | tasks = ["mmlu_anatomy", "mmlu_astronomy", "mmlu_algebra"] 68 | mmlu = MMLUEvaluator( 69 | 
model_path=MODEL_EXAMPLE, 70 | tasks=tasks, 71 | system_prompt="You are an intelligent AI language model.", 72 | ) 73 | overall_score, individual_scores = mmlu.run() 74 | 75 | eval_mock.assert_called() 76 | assert_example_mmlu_individual_scores(overall_score, individual_scores) 77 | -------------------------------------------------------------------------------- /tests/test_mt_bench.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | # Standard 4 | from unittest import mock 5 | from unittest.mock import patch 6 | 7 | # First Party 8 | from instructlab.eval.mt_bench import MTBenchBranchEvaluator, MTBenchEvaluator 9 | 10 | 11 | def gen_qa_pairs(odd): 12 | i = 1 13 | qa_pairs = [] 14 | score = 0 15 | while i < 5: 16 | if i % 2: 17 | if odd: 18 | score = 0.2 19 | else: 20 | score = 0.1 21 | elif not i % 2: 22 | if odd: 23 | score = 0.3 24 | else: 25 | score = 0.4 26 | qa_pairs.append( 27 | { 28 | "question_id": i, 29 | "score": score, 30 | "qna_file": f"category{i}/qna.yaml", 31 | } 32 | ) 33 | i = i + 1 34 | qa_pairs.append( 35 | { 36 | "question_id": i, 37 | "score": 0.5, 38 | "qna_file": f"category{i}/qna.yaml", 39 | } 40 | ) 41 | if odd: 42 | qa_pairs.append( 43 | { 44 | "question_id": i + 1, 45 | "score": 0.6, 46 | "qna_file": f"category{i + 1}/qna.yaml", 47 | } 48 | ) 49 | return qa_pairs 50 | 51 | 52 | @patch("instructlab.eval.mt_bench_branch_generator.generate") 53 | @patch("instructlab.eval.mt_bench_answers.generate_answers") 54 | @patch( 55 | "instructlab.eval.mt_bench_judgment.generate_judgment", 56 | return_value=(0, gen_qa_pairs(True), None, 0), 57 | ) 58 | def test_mt_bench_branch(gen_judgment_mock, gen_answers_mock, generate_mock): 59 | mt_bench_branch = MTBenchBranchEvaluator( 60 | "instructlab/granite-7b-lab", 61 | "prometheus-eval/prometheus-8x7b-v2.0", 62 | "../taxonomy", 63 | "main", 64 | ) 65 | mt_bench_branch.gen_answers( 66 | "http://localhost:8000/v1", 67 | ) 68 | overall_score, qa_pairs, error_rate = mt_bench_branch.judge_answers( 69 | "http://localhost:8000/v1", 70 | ) 71 | assert overall_score == 0 72 | assert qa_pairs == gen_qa_pairs(True) 73 | assert error_rate == 0 74 | 75 | gen_judgment_mock.assert_called() 76 | gen_answers_mock.assert_called() 77 | generate_mock.assert_called() 78 | 79 | 80 | @patch("instructlab.eval.mt_bench_answers.generate_answers") 81 | @patch( 82 | "instructlab.eval.mt_bench_judgment.generate_judgment", 83 | return_value=(1.5001, [{}, {}], [1.002, 2], 0), 84 | ) 85 | def test_mt_bench(gen_judgment_mock, gen_answers_mock): 86 | mt_bench = MTBenchEvaluator( 87 | "instructlab/granite-7b-lab", 88 | "prometheus-eval/prometheus-8x7b-v2.0", 89 | ) 90 | mt_bench.gen_answers( 91 | "http://localhost:8000/v1", 92 | ) 93 | overall_score, qa_pairs, turn_scores, error_rate = mt_bench.judge_answers( 94 | "http://localhost:8000/v1", 95 | ) 96 | 97 | assert overall_score == 1.5001 98 | assert qa_pairs == [{}, {}] 99 | assert turn_scores == [1.002, 2] 100 | assert error_rate == 0 101 | 102 | gen_judgment_mock.assert_called() 103 | gen_answers_mock.assert_called() 104 | -------------------------------------------------------------------------------- /tests/test_mt_bench_answers.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | # Standard 4 | import json 5 | import os 6 | import random 7 | import shutil 8 | import tempfile 9 | 10 | # First Party 11 | from instructlab.eval.mt_bench_answers import 
reorg_answer_file 12 | 13 | 14 | def test_reorg_answer_file(): 15 | answer_file = os.path.join( 16 | os.path.dirname(__file__), 17 | "..", 18 | "src", 19 | "instructlab", 20 | "eval", 21 | "data", 22 | "mt_bench", 23 | "reference_answer", 24 | "gpt-4.jsonl", 25 | ) 26 | 27 | # Create a temporary file 28 | with tempfile.NamedTemporaryFile(delete=True) as temp_file: 29 | temp_answer_file = temp_file.name 30 | 31 | # Copy the original file to the temp file 32 | shutil.copy(answer_file, temp_answer_file) 33 | 34 | orig_length = 0 35 | with open(temp_answer_file, "r+", encoding="utf-8") as f: 36 | answers = {} 37 | for l in f: 38 | orig_length += 1 39 | qid = json.loads(l)["question_id"] 40 | answers[qid] = l 41 | 42 | # Reset to the beginning of the file and clear it 43 | f.seek(0) 44 | f.truncate() 45 | 46 | # Randomize the values 47 | qids = sorted(list(answers.keys()), key=lambda answer: random.random()) 48 | for qid in qids: 49 | f.write(answers[qid]) 50 | # Write each answer twice 51 | f.write(answers[qid]) 52 | 53 | # Run the reorg which should sort and dedup the file in place 54 | reorg_answer_file(temp_answer_file) 55 | 56 | new_length = 0 57 | with open(temp_answer_file, "r", encoding="utf-8") as fin: 58 | previous_question_id = -1 59 | for l in fin: 60 | new_length += 1 61 | qid = json.loads(l)["question_id"] 62 | assert qid > previous_question_id 63 | previous_question_id = qid 64 | 65 | assert new_length == orig_length 66 | -------------------------------------------------------------------------------- /tests/test_mt_bench_common.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | # Standard 4 | from unittest import mock 5 | 6 | # First Party 7 | from instructlab.eval.mt_bench_common import Judge, check_data 8 | 9 | CHECK_DATA_EXAMPLE_QUESTIONS = [ 10 | { 11 | "question_id": 81, 12 | "category": "writing", 13 | "turns": [ 14 | "Fake question", 15 | "Fake question", 16 | ], 17 | }, 18 | { 19 | "question_id": 101, 20 | "category": "reasoning", 21 | "turns": [ 22 | "Fake question", 23 | "Fake question", 24 | ], 25 | }, 26 | ] 27 | CHECK_DATA_EXAMPLE_MODEL_ANSWERS = { 28 | "granite-7b-lab": { 29 | 81: { 30 | "question_id": 81, 31 | "answer_id": "c4j9vPyHM8w3JHPGohrJQG", 32 | "model_id": "granite-7b-lab", 33 | "choices": [ 34 | { 35 | "index": 0, 36 | "turns": [ 37 | "Fake answer", 38 | "Fake answer", 39 | ], 40 | } 41 | ], 42 | "tstamp": 1730816201.883507, 43 | }, 44 | 101: { 45 | "question_id": 101, 46 | "answer_id": "kaQw7Fj2SDeE2VfvU25FJ4", 47 | "model_id": "granite-7b-lab", 48 | "choices": [ 49 | { 50 | "index": 0, 51 | "turns": [ 52 | "Fake answer", 53 | "Fake answer", 54 | ], 55 | } 56 | ], 57 | "tstamp": 1730816166.3719094, 58 | }, 59 | } 60 | } 61 | CHECK_DATA_EXAMPLE_REFERENCE_ANSWERS = { 62 | "merlinite-7b-lab": { 63 | 101: { 64 | "question_id": 101, 65 | "answer_id": "TFomieEmmAgdeCkvmuvwbc", 66 | "model_id": "gpt-4", 67 | "choices": [ 68 | { 69 | "index": 0, 70 | "turns": [ 71 | "Fake answer", 72 | "Fake answer", 73 | ], 74 | } 75 | ], 76 | "tstamp": 1686286924.844282, 77 | }, 78 | 102: { 79 | "question_id": 102, 80 | "answer_id": "hLH8WozvaB88bb5vV224H4", 81 | "model_id": "gpt-4", 82 | "choices": [ 83 | { 84 | "index": 0, 85 | "turns": [ 86 | "Fake answer", 87 | "Fake answer", 88 | ], 89 | } 90 | ], 91 | "tstamp": 1686286937.7164738, 92 | }, 93 | } 94 | } 95 | 96 | CHECK_DATA_EXAMPLE_MODELS = ["granite-7b-lab"] 97 | CHECK_DATA_EXAMPLE_JUDGES = { 98 | "default": Judge( 99 | 
model_name="merlinite-7b-lab", 100 | prompt_template={ 101 | "name": "single-v1", 102 | "type": "single", 103 | "system_prompt": "Fake prompt", 104 | "prompt_template": "Fake prompt", 105 | "description": "Prompt for general questions", 106 | "category": "general", 107 | "output_format": "[[rating]]", 108 | }, 109 | ref_based=False, 110 | multi_turn=False, 111 | ), 112 | "math": Judge( 113 | model_name="merlinite-7b-lab", 114 | prompt_template={ 115 | "name": "single-math-v1", 116 | "type": "single", 117 | "system_prompt": "Fake prompt", 118 | "prompt_template": "Fake prompt", 119 | "description": "Prompt for general questions", 120 | "category": "math", 121 | "output_format": "[[rating]]", 122 | }, 123 | ref_based=True, 124 | multi_turn=False, 125 | ), 126 | "default-mt": Judge( 127 | model_name="merlinite-7b-lab", 128 | prompt_template={ 129 | "name": "single-v1-multi-turn", 130 | "type": "single", 131 | "system_prompt": "Fake prompt", 132 | "prompt_template": "Fake prompt", 133 | "description": "Prompt for general questions", 134 | "category": "general", 135 | "output_format": "[[rating]]", 136 | }, 137 | ref_based=False, 138 | multi_turn=True, 139 | ), 140 | "math-mt": Judge( 141 | model_name="merlinite-7b-lab", 142 | prompt_template={ 143 | "name": "single-math-v1-multi-turn", 144 | "type": "single", 145 | "system_prompt": "Fake prompt", 146 | "prompt_template": "Fake prompt", 147 | "description": "Prompt for general questions", 148 | "category": "math", 149 | "output_format": "[[rating]]", 150 | }, 151 | ref_based=True, 152 | multi_turn=True, 153 | ), 154 | } 155 | 156 | 157 | def test_check_data(): 158 | check_data( 159 | CHECK_DATA_EXAMPLE_QUESTIONS, 160 | CHECK_DATA_EXAMPLE_MODEL_ANSWERS, 161 | CHECK_DATA_EXAMPLE_REFERENCE_ANSWERS, 162 | CHECK_DATA_EXAMPLE_MODELS, 163 | CHECK_DATA_EXAMPLE_JUDGES, 164 | ) 165 | 166 | try: 167 | check_data( 168 | CHECK_DATA_EXAMPLE_QUESTIONS, 169 | {"granite-7b-lab": {}}, 170 | CHECK_DATA_EXAMPLE_REFERENCE_ANSWERS, 171 | CHECK_DATA_EXAMPLE_MODELS, 172 | CHECK_DATA_EXAMPLE_JUDGES, 173 | ) 174 | except Exception as e: 175 | assert "Missing model granite-7b-lab's answer to Question" in str(e) 176 | else: 177 | assert False, "Didn't fail with missing model answer" 178 | 179 | try: 180 | check_data( 181 | CHECK_DATA_EXAMPLE_QUESTIONS, 182 | CHECK_DATA_EXAMPLE_MODEL_ANSWERS, 183 | {"merlinite-7b-lab": {}}, 184 | CHECK_DATA_EXAMPLE_MODELS, 185 | CHECK_DATA_EXAMPLE_JUDGES, 186 | ) 187 | except Exception as e: 188 | assert "Missing reference answer to Question" in str(e) 189 | else: 190 | assert False, "Didn't fail with missing reference answer" 191 | -------------------------------------------------------------------------------- /tests/test_mt_bench_judgment.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | # Standard 4 | import os 5 | 6 | # First Party 7 | from instructlab.eval.mt_bench_common import Judge 8 | from instructlab.eval.mt_bench_judgment import load_judge_prompts, make_judge_single 9 | 10 | 11 | def test_make_judge_single(): 12 | judge_file = os.path.join( 13 | os.path.dirname(__file__), 14 | "..", 15 | "src", 16 | "instructlab", 17 | "eval", 18 | "data", 19 | "mt_bench", 20 | "judge_prompts.jsonl", 21 | ) 22 | judge_prompts = load_judge_prompts(judge_file) 23 | judges = make_judge_single("prometheus-8x7b-v2-0", judge_prompts) 24 | assert len(judges) == 4 25 | assert isinstance(judges["default"], Judge) 26 | assert isinstance(judges["math"], Judge) 27 | assert 
judges["math"].ref_based 28 | assert isinstance(judges["default-mt"], Judge) 29 | assert judges["default-mt"].multi_turn 30 | assert isinstance(judges["math-mt"], Judge) 31 | assert judges["math-mt"].ref_based 32 | assert judges["math-mt"].multi_turn 33 | -------------------------------------------------------------------------------- /tests/test_mt_bench_model_adapter.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | # Third Party 4 | import pytest 5 | 6 | # First Party 7 | from instructlab.eval.mt_bench_model_adapter import ( 8 | GraniteAdapter, 9 | MistralAdapter, 10 | get_conversation_template, 11 | get_model_adapter, 12 | ) 13 | 14 | MISTRAL_DEFAULT_MODEL_NAME = "mistral" 15 | EXAMPLE_MISTRAL_MODEL_PATHS = [ 16 | "mistral", 17 | "mistralai/Mixtral-8x7B-Instruct-v0.1", 18 | "/cache/instructlab/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf", 19 | "prometheus-eval/prometheus-8x7b-v2.0", 20 | "/cache/instructlab/models/prometheus-eval/prometheus-8x7b-v2.0", 21 | ] 22 | 23 | GRANITE_DEFAULT_MODEL_NAME = "granite" 24 | EXAMPLE_GRANITE_MODEL_PATHS = [ 25 | "granite", 26 | "instructlab/granite-7b-lab", 27 | "/cache/instructlab/models/instructlab/granite-7b-lab.gguf", 28 | "instructlab/granite-8b-lab", 29 | ] 30 | 31 | TEST_TUPLES = [ 32 | ( 33 | MISTRAL_DEFAULT_MODEL_NAME, 34 | EXAMPLE_MISTRAL_MODEL_PATHS, 35 | MistralAdapter, 36 | MISTRAL_DEFAULT_MODEL_NAME, 37 | ), 38 | ( 39 | GRANITE_DEFAULT_MODEL_NAME, 40 | EXAMPLE_GRANITE_MODEL_PATHS, 41 | GraniteAdapter, 42 | "ibm-generic", 43 | ), 44 | ] 45 | 46 | 47 | def test_get_model_adapter(): 48 | for model, model_paths, adapter, _ in TEST_TUPLES: 49 | for model_path in model_paths: 50 | assert isinstance(get_model_adapter(model_path, model), adapter) 51 | 52 | # Test default adapter overrides as expected 53 | assert isinstance(get_model_adapter("", MISTRAL_DEFAULT_MODEL_NAME), MistralAdapter) 54 | 55 | 56 | def test_get_model_adapter_not_found(): 57 | with pytest.raises(ValueError): 58 | get_model_adapter("unknown", "unknown") 59 | 60 | 61 | def test_get_conversation_template(): 62 | for model, model_paths, _, conv_template_name in TEST_TUPLES: 63 | for model_path in model_paths: 64 | assert ( 65 | conv_template_name == get_conversation_template(model_path, model).name 66 | ) 67 | -------------------------------------------------------------------------------- /tests/test_project.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # Standard 3 | from importlib.metadata import entry_points 4 | 5 | # First Party 6 | from instructlab.eval.evaluator import Evaluator 7 | from instructlab.eval.leaderboard import LeaderboardV2Evaluator 8 | from instructlab.eval.mmlu import MMLUBranchEvaluator, MMLUEvaluator 9 | from instructlab.eval.mt_bench import MTBenchBranchEvaluator, MTBenchEvaluator 10 | from instructlab.eval.ruler import RulerEvaluator 11 | 12 | 13 | def test_evaluator_eps(): 14 | expected = { 15 | "mmlu": MMLUEvaluator, 16 | "mmlu_branch": MMLUBranchEvaluator, 17 | "mt_bench": MTBenchEvaluator, 18 | "mt_bench_branch": MTBenchBranchEvaluator, 19 | "leaderboard_v2": LeaderboardV2Evaluator, 20 | "ruler": RulerEvaluator, 21 | } 22 | eps = entry_points(group="instructlab.eval.evaluator") 23 | found = {} 24 | for ep in eps: 25 | # different project 26 | if not ep.module.startswith("instructlab.eval"): 27 | continue 28 | evaluator = ep.load() 29 | assert issubclass(evaluator, Evaluator) 30 | 
assert evaluator.name == ep.name 31 | found[ep.name] = evaluator 32 | 33 | assert found == expected 34 | -------------------------------------------------------------------------------- /tests/test_ragas.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # Standard 3 | from pathlib import Path 4 | from unittest.mock import MagicMock, patch 5 | import unittest 6 | 7 | # Third Party 8 | from pandas import DataFrame 9 | from ragas.callbacks import ChainRun 10 | from ragas.dataset_schema import EvaluationDataset, EvaluationResult 11 | 12 | # First Party 13 | from instructlab.eval.ragas import ModelConfig, RagasEvaluator, RunConfig 14 | 15 | 16 | class TestRagasEvaluator(unittest.TestCase): 17 | def setUp(self): 18 | # Common setup data for all tests 19 | self.student_model_response = "Paris" 20 | self.user_question = "What is the capital of France?" 21 | self.golden_answer = "The capital of France is Paris." 22 | self.metric = "mocked-metric" 23 | self.metric_score = 4.0 24 | self.base_ds = [ 25 | { 26 | "user_input": self.user_question, 27 | "reference": self.golden_answer, 28 | } 29 | ] 30 | self.student_model = ModelConfig( 31 | model_name="super-jeeves-8x700B", 32 | ) 33 | self.run_config = RunConfig(max_retries=3, max_wait=60, seed=42, timeout=30) 34 | 35 | @patch("instructlab.eval.ragas.ChatOpenAI") 36 | @patch("instructlab.eval.ragas.evaluate") 37 | @patch.object(RagasEvaluator, "_generate_answers_from_model") 38 | @patch.object(RagasEvaluator, "_get_metrics") 39 | def test_run_with_dataset( 40 | self, 41 | mock_get_metrics: MagicMock, 42 | mock_generate_answers_from_model: MagicMock, 43 | mock_evaluate: MagicMock, 44 | mock_ChatOpenAI: MagicMock, 45 | ): 46 | """ 47 | Test case 1: Directly passing a Python list/dict dataset to `RagasEvaluator.run()`. 
48 | """ 49 | # Prepare mocks 50 | mock_get_metrics.return_value = [self.metric] 51 | interim_df = DataFrame( 52 | { 53 | "user_input": [self.user_question], 54 | "response": [self.student_model_response], 55 | "reference": [self.golden_answer], 56 | } 57 | ) 58 | mock_generate_answers_from_model.return_value = interim_df 59 | mocked_evaluation_ds = EvaluationDataset.from_pandas(interim_df) 60 | _unimportant_ragas_traces = { 61 | "default": ChainRun( 62 | run_id="42", 63 | parent_run_id=None, 64 | name="root", 65 | inputs={"system": "null", "user": "null"}, 66 | outputs={"assistant": "null"}, 67 | metadata={"user_id": 1337}, 68 | ) 69 | } 70 | mock_evaluate.return_value = EvaluationResult( 71 | scores=[{self.metric: self.metric_score}], 72 | dataset=mocked_evaluation_ds, 73 | ragas_traces=_unimportant_ragas_traces, 74 | ) 75 | 76 | # Instantiate evaluator 77 | evaluator = RagasEvaluator() 78 | 79 | # Run test 80 | result = evaluator.run( 81 | dataset=self.base_ds, 82 | student_model=self.student_model, 83 | run_config=self.run_config, 84 | student_openai_client=MagicMock(), # We pass a mock client 85 | ) 86 | 87 | # Assertions 88 | self.assertIsInstance(result, EvaluationResult) 89 | mock_generate_answers_from_model.assert_called_once() 90 | mock_evaluate.assert_called_once() 91 | # we didn't provide an API key, so it expects to get `api_key=None` 92 | mock_ChatOpenAI.assert_called_once_with(model="gpt-4o", api_key=None) 93 | 94 | @patch("instructlab.eval.ragas.ChatOpenAI") 95 | @patch("instructlab.eval.ragas.read_json") 96 | @patch("instructlab.eval.ragas.evaluate") 97 | @patch.object(RagasEvaluator, "_generate_answers_from_model") 98 | @patch.object(RagasEvaluator, "_get_metrics") 99 | def test_run_with_dataset_via_path( 100 | self, 101 | mock_get_metrics: MagicMock, 102 | mock_generate_answers_from_model: MagicMock, 103 | mock_evaluate: MagicMock, 104 | mock_read_json: MagicMock, 105 | mock_ChatOpenAI: MagicMock, 106 | ): 107 | """ 108 | Test case 2: Passing a Path to a JSONL file (containing the dataset) to `RagasEvaluator.run()`. 
109 | """ 110 | # Prepare mocks 111 | mock_get_metrics.return_value = [self.metric] 112 | interim_df = DataFrame( 113 | { 114 | "user_input": [self.user_question], 115 | "response": [self.student_model_response], 116 | "reference": [self.golden_answer], 117 | } 118 | ) 119 | mock_generate_answers_from_model.return_value = interim_df 120 | mocked_evaluation_ds = EvaluationDataset.from_pandas(interim_df) 121 | _unimportant_ragas_traces = { 122 | "default": ChainRun( 123 | run_id="42", 124 | parent_run_id=None, 125 | name="root", 126 | inputs={"system": "null", "user": "null"}, 127 | outputs={"assistant": "null"}, 128 | metadata={"user_id": 1337}, 129 | ) 130 | } 131 | mock_evaluate.return_value = EvaluationResult( 132 | scores=[{self.metric: self.metric_score}], 133 | dataset=mocked_evaluation_ds, 134 | ragas_traces=_unimportant_ragas_traces, 135 | ) 136 | 137 | mock_read_json.return_value = DataFrame(self.base_ds) 138 | 139 | # Instantiate evaluator 140 | evaluator = RagasEvaluator() 141 | 142 | # Run test 143 | result = evaluator.run( 144 | dataset=Path("dummy_path.jsonl"), 145 | student_model=self.student_model, 146 | run_config=self.run_config, 147 | student_openai_client=MagicMock(), 148 | ) 149 | 150 | # Assertions 151 | self.assertIsInstance(result, EvaluationResult) 152 | mock_read_json.assert_called_once_with( 153 | Path("dummy_path.jsonl"), orient="records", lines=True 154 | ) 155 | mock_generate_answers_from_model.assert_called() 156 | mock_evaluate.assert_called() 157 | 158 | @patch("instructlab.eval.ragas.ChatOpenAI") 159 | @patch("instructlab.eval.ragas.read_json") 160 | @patch("instructlab.eval.ragas.evaluate") 161 | @patch.object(RagasEvaluator, "_generate_answers_from_model") 162 | @patch.object(RagasEvaluator, "_get_metrics") 163 | def test_run_with_instance_attributes( 164 | self, 165 | mock_get_metrics: MagicMock, 166 | mock_generate_answers_from_model: MagicMock, 167 | mock_evaluate: MagicMock, 168 | mock_read_json: MagicMock, 169 | mock_ChatOpenAI: MagicMock, 170 | ): 171 | """ 172 | Test case 3: Using `RagasEvaluator` instance attributes for `student_model`, `run_config`, 173 | and `student_openai_client` instead of passing them explicitly. 
174 | """ 175 | # Prepare mocks 176 | mock_get_metrics.return_value = [self.metric] 177 | interim_df = DataFrame( 178 | { 179 | "user_input": [self.user_question], 180 | "response": [self.student_model_response], 181 | "reference": [self.golden_answer], 182 | } 183 | ) 184 | mock_generate_answers_from_model.return_value = interim_df 185 | mocked_evaluation_ds = EvaluationDataset.from_pandas(interim_df) 186 | _unimportant_ragas_traces = { 187 | "default": ChainRun( 188 | run_id="42", 189 | parent_run_id=None, 190 | name="root", 191 | inputs={"system": "null", "user": "null"}, 192 | outputs={"assistant": "null"}, 193 | metadata={"user_id": 1337}, 194 | ) 195 | } 196 | mock_evaluate.return_value = EvaluationResult( 197 | scores=[{self.metric: self.metric_score}], 198 | dataset=mocked_evaluation_ds, 199 | ragas_traces=_unimportant_ragas_traces, 200 | ) 201 | 202 | mock_read_json.return_value = DataFrame(self.base_ds) 203 | 204 | # Instantiate evaluator with instance-level configs 205 | evaluator = RagasEvaluator( 206 | student_model=self.student_model, 207 | student_openai_client=MagicMock(), 208 | run_config=self.run_config, 209 | ) 210 | 211 | # Run test 212 | result = evaluator.run(dataset=Path("dummy_path.jsonl")) 213 | 214 | # Assertions 215 | self.assertIsInstance(result, EvaluationResult) 216 | mock_read_json.assert_called_with( 217 | Path("dummy_path.jsonl"), orient="records", lines=True 218 | ) 219 | mock_generate_answers_from_model.assert_called() 220 | mock_evaluate.assert_called() 221 | 222 | 223 | if __name__ == "__main__": 224 | unittest.main() 225 | -------------------------------------------------------------------------------- /tests/testdata/sdg/_default_template_yaml: -------------------------------------------------------------------------------- 1 | task: mmlu_pr 2 | dataset_path: json 3 | dataset_name: null 4 | test_split: test 5 | doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:" 6 | doc_to_choice: ["A", "B", "C", "D"] 7 | doc_to_target: answer 8 | output_type: multiple_choice 9 | metric_list: 10 | - metric: acc 11 | aggregation: mean 12 | higher_is_better: true 13 | -------------------------------------------------------------------------------- /tests/testdata/sdg/tonsil_task.yaml: -------------------------------------------------------------------------------- 1 | dataset_kwargs: 2 | data_files: 3 | test: tests/testdata/sdg/tonsil_data.jsonl 4 | group: mmlu_pr 5 | include: _default_template_yaml 6 | task: tonsils 7 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | [tox] 4 | # py3-unit runs unit tests with 'python3' 5 | # py311-unit runs the same tests with 'python3.11' 6 | envlist = ruff, lint, mypy, spellcheck, py3-{unit, functional} 7 | minversion = 4.4 8 | 9 | [testenv] 10 | description = run tests (unit, unitcov, functional) 11 | passenv = 12 | CMAKE_ARGS 13 | # Use PyTorch CPU build instead of CUDA build in test envs. CUDA dependencies 14 | # are huge. This reduces venv from 5.7 GB to 1.5 GB. 
15 | setenv = 16 | PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu 17 | CMAKE_ARGS={env:CMAKE_ARGS:-DLLAMA_NATIVE=off} 18 | ILAB_MAX_STABLE_VRAM_WAIT=0 19 | package = wheel 20 | wheel_build_env = pkg 21 | install_command = pip install \ 22 | --use-feature fast-deps \ 23 | -c constraints-dev.txt \ 24 | {opts} {packages} 25 | # equivalent to `pip install instructlab[cpu]` 26 | extras = 27 | cpu 28 | leaderboard 29 | deps = 30 | pytest 31 | pytest-asyncio 32 | pytest-cov 33 | pytest-html 34 | commands = 35 | unit: {envpython} -m pytest {posargs:tests} 36 | unitcov: {envpython} -W error::UserWarning -m pytest --cov=instructlab.eval --cov-report term --cov-report=html:coverage-{env_name} --cov-report=xml:coverage-{env_name}.xml --html=durations/{env_name}.html {posargs:tests -m "not (examples or slow)"} 37 | functional: ./scripts/functional-tests.sh 38 | allowlist_externals = 39 | functional: ./scripts/functional-tests.sh 40 | 41 | # format, check, and linting targets don't build and install the project to 42 | # speed up testing. 43 | [testenv:lint] 44 | description = lint with pylint 45 | skip_install = true 46 | skipsdist = true 47 | deps = -r requirements-dev.txt 48 | commands = 49 | {envpython} -m pylint --load-plugins pylint_pydantic src/instructlab/eval/ 50 | 51 | [testenv:fastlint] 52 | description = fast lint with pylint (without 3rd party modules) 53 | skip_install = true 54 | skipsdist = true 55 | deps = 56 | pylint 57 | pylint-pydantic 58 | commands = 59 | {envpython} -m pylint --load-plugins pylint_pydantic {posargs:--disable=import-error src/instructlab/eval/} 60 | 61 | [testenv:ruff] 62 | description = reformat and fix code with Ruff (and isort) 63 | skip_install = True 64 | skipsdist = true 65 | # keep in sync with .pre-commit-config.yaml 66 | deps = 67 | ruff 68 | isort 69 | # supports 'fix', 'check', or abitrary args to 'ruff' command 70 | commands = 71 | ./scripts/ruff.sh {posargs:fix} 72 | allowlist_externals = ./scripts/ruff.sh 73 | 74 | [testenv:spellcheck] 75 | description = spell check (needs 'aspell' command) 76 | skip_install = true 77 | skipsdist = true 78 | deps = 79 | pyspelling 80 | commands = 81 | sh -c 'command -v aspell || (echo "aspell is not installed. Please install it." && exit 1)' 82 | {envpython} -m pyspelling --config {toxinidir}/.spellcheck.yml --spellchecker aspell 83 | allowlist_externals = sh 84 | 85 | [testenv:mypy] 86 | description = Python type checking with mypy 87 | deps = 88 | mypy 89 | types-tqdm 90 | types-PyYAML 91 | pytest 92 | commands = 93 | mypy src 94 | 95 | [testenv:py3] 96 | basepython = python3.11 97 | 98 | [testenv:py3-unit] 99 | basepython = {[testenv:py3]basepython} 100 | 101 | [testenv:py3-functional] 102 | basepython = {[testenv:py3]basepython} 103 | passenv = 104 | {[testenv]passenv} 105 | TEST_DIR 106 | 107 | [gh] 108 | python = 109 | 3.11 = py311-{unitcov, functional} 110 | 111 | [testenv:constraints] 112 | description = Generate new constraints file(s) 113 | basepython = {[testenv:py3]basepython} 114 | skip_install = True 115 | skipsdist = true 116 | deps = 117 | uv==0.7.8 118 | commands = {posargs} 119 | allowlist_externals = * 120 | --------------------------------------------------------------------------------
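The note at the end of src/instructlab/eval/mt_bench_model_adapter.py says that registration order decides matching priority. A minimal sketch of what that means in practice, using only behavior that tests/test_mt_bench_model_adapter.py also exercises (it assumes the instructlab-eval package is importable in the current environment):

    # Sketch: adapter resolution follows registration order in
    # mt_bench_model_adapter.py; MistralAdapter is registered first, so a
    # Prometheus judge path matches it before any later adapter is tried.
    from instructlab.eval.mt_bench_model_adapter import (
        get_conversation_template,
        get_model_adapter,
    )

    adapter = get_model_adapter("prometheus-eval/prometheus-8x7b-v2.0", "mistral")
    print(type(adapter).__name__)  # MistralAdapter

    # Granite student models fall through to GraniteAdapter and use the
    # "ibm-generic" conversation template.
    conv = get_conversation_template("instructlab/granite-7b-lab", "granite")
    print(conv.name)  # ibm-generic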
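The comments in RulerEvaluator.process_lm_eval_results describe lm_eval results nested by RULER task and context length. A small sketch of that flattening, using a hypothetical payload whose key layout is only inferred from those comments (real lm_eval output may differ), and assuming the package and its lm_eval dependency are installed:

    # Sketch: a made-up nested payload in the shape described in
    # src/instructlab/eval/ruler.py, flattened into per-context-length
    # averages plus an overall "avg" score.
    from instructlab.eval.ruler import RulerEvaluator

    raw_results = {
        "results": {
            "ruler": {
                "niah_single_1": {"4096,none": 0.90, "8192,none": 0.80},
                "niah_single_2": {"4096,none": 0.70, "8192,none": 0.60},
            }
        }
    }

    scores = RulerEvaluator().process_lm_eval_results(raw_results=raw_results)
    # roughly {'4096': 0.8, '8192': 0.7, 'avg': 0.75}, up to float rounding
    print(scores)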
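The expected values in tests/test_mmlu.py are consistent with the overall MMLU score being the plain mean of the per-task accuracies in MMLU_EXAMPLE_OUTPUT; a quick arithmetic check of the 0.45 assertion:

    # Sketch: averaging the three mocked accuracies reproduces the asserted
    # overall score of 0.45 (rounded to two decimals).
    accs = [0.5592105263157895, 0.4444444444444444, 0.35]
    overall = sum(accs) / len(accs)
    print(round(overall, 2))  # 0.45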