├── .github ├── dependabot.yml ├── mergify.yml └── workflows │ ├── actionlint.dockerfile │ ├── actionlint.yml │ ├── docs.yml │ ├── matchers │ └── actionlint.json │ ├── spellcheck.yml │ └── stale_bot.yml ├── .gitignore ├── .markdownlint-cli2.yaml ├── .spellcheck-en-custom.txt ├── .spellcheck.yml ├── CONTRIBUTING.md ├── LICENSE ├── Makefile ├── README.md ├── SECURITY.md └── docs ├── adr-feature-gating.md ├── backend └── README.md ├── ci ├── ci-remove-dco-check.md ├── ci-universal-actions.md └── dependency-constraints.md ├── cli ├── ilab-model-engine.md ├── ilab-model-management-standardization.md ├── ilab-model-serve-backend.md └── ilab-processes.md ├── dependency-management.md ├── docs-instructlab-website.md ├── evaluation └── eval-repo.md ├── examples └── examples-repo.md ├── github-actions-providers.md ├── github-actions-use-policy.md ├── github-merge-strategy.md ├── github-taxonomy-automation.md ├── images ├── github-taxonomy-automation.excalidraw.png └── sdg-api-interface.png ├── instructlab-app.md ├── instructlab-cli-1.0.0.md ├── knowledge-submissions-past-wikipedia.md ├── library-release-strategy.md ├── license-headers.md ├── naming-conventions.md ├── publish-strategy.md ├── rag ├── adrs │ ├── README.md │ ├── adr-vectordb.md │ ├── granite-embeddings.md │ └── template.md ├── ilab-rag-retrieval.md ├── images │ ├── ingestion-mvp.png │ ├── rag-chat.png │ ├── rag-model-training.png │ ├── rag-plug-and-play.png │ └── rag-taxonomy-no-training.png └── rag-initial-code-location.md ├── retrieval-augmented-generation └── rag-framework.md ├── schema └── taxonomy-api.md ├── sdg ├── README.md ├── sdg-api-interface.md ├── sdg-api-simplification.md ├── sdg-flow-yaml.md ├── sdg-refactor.md └── sdg-repo.md ├── sdk └── instructlab-sdk.md ├── support-utils └── support-utils-repo.md ├── taxonomy-tree-organization.md ├── training └── training-accelerate-dep.md └── ui └── project-proposal.md /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | # GitHub Dependabot configuration file 4 | version: 2 5 | updates: 6 | 7 | # Maintain dependencies for GitHub Actions 8 | - package-ecosystem: "github-actions" 9 | directory: "/" 10 | schedule: 11 | interval: "daily" 12 | - package-ecosystem: "docker" 13 | directory: "/.github/workflows" 14 | schedule: 15 | interval: "daily" 16 | -------------------------------------------------------------------------------- /.github/mergify.yml: -------------------------------------------------------------------------------- 1 | pull_request_rules: 2 | - name: label-backend 3 | description: Automatically apply backend label 4 | conditions: 5 | - or: 6 | - files~=docs/backend/.* 7 | actions: 8 | label: 9 | add: 10 | - backend 11 | -------------------------------------------------------------------------------- /.github/workflows/actionlint.dockerfile: -------------------------------------------------------------------------------- 1 | # Since dependabot cannot update workflows using docker, 2 | # we use this indirection since dependabot can update this file. 
3 | FROM rhysd/actionlint:1.7.7@sha256:887a259a5a534f3c4f36cb02dca341673c6089431057242cdc931e9f133147e9 4 | -------------------------------------------------------------------------------- /.github/workflows/actionlint.yml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | name: Lint GitHub Actions workflows 4 | on: 5 | push: 6 | branches: 7 | - "main" 8 | paths: 9 | - '.github/workflows/*.ya?ml' 10 | - '.github/workflows/actionlint.*' # This workflow 11 | pull_request: 12 | branches: 13 | - "main" 14 | paths: 15 | - '.github/workflows/*.ya?ml' 16 | - '.github/workflows/actionlint.*' # This workflow 17 | 18 | env: 19 | LC_ALL: en_US.UTF-8 20 | 21 | defaults: 22 | run: 23 | shell: bash 24 | 25 | permissions: 26 | contents: read 27 | 28 | jobs: 29 | actionlint: 30 | runs-on: ubuntu-latest 31 | steps: 32 | - name: "Harden Runner" 33 | uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e # v2.10.4 34 | with: 35 | egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs 36 | 37 | - name: "Checkout" 38 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 39 | with: 40 | fetch-depth: 0 41 | 42 | - name: "Download actionlint" 43 | run: | 44 | docker build --tag actionlint - < .github/workflows/actionlint.dockerfile 45 | 46 | - name: "Check workflow files" 47 | run: | 48 | echo "::add-matcher::.github/workflows/matchers/actionlint.json" 49 | docker run --volume="${PWD}:/repo" --workdir=/repo actionlint -color 50 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | name: Lint Markdown documents 4 | 5 | on: 6 | push: 7 | branches: 8 | - "main" 9 | paths: 10 | - '**/*.md' 11 | - '.markdownlint-cli2.yaml' 12 | - '.github/workflows/docs.yml' # This workflow 13 | pull_request: 14 | branches: 15 | - "main" 16 | paths: 17 | - '**/*.md' 18 | - '.markdownlint-cli2.yaml' 19 | - '.github/workflows/docs.yml' # This workflow 20 | 21 | env: 22 | LC_ALL: en_US.UTF-8 23 | 24 | defaults: 25 | run: 26 | shell: bash 27 | 28 | permissions: 29 | contents: read 30 | 31 | jobs: 32 | markdown-lint: 33 | runs-on: ubuntu-latest 34 | steps: 35 | - name: "Harden Runner" 36 | uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e # v2.10.4 37 | with: 38 | egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs 39 | - name: "Checkout" 40 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 41 | with: 42 | fetch-depth: 0 43 | - name: "Check Markdown documents" 44 | uses: DavidAnson/markdownlint-cli2-action@05f32210e84442804257b2a6f20b273450ec8265 # v19.1.0 45 | with: 46 | globs: '**/*.md' 47 | -------------------------------------------------------------------------------- /.github/workflows/matchers/actionlint.json: -------------------------------------------------------------------------------- 1 | { 2 | "problemMatcher": [ 3 | { 4 | "owner": "actionlint", 5 | "pattern": [ 6 | { 7 | "regexp": "^(?:\\x1b\\[\\d+m)?(.+?)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*: (?:\\x1b\\[\\d+m)*(.+?)(?:\\x1b\\[\\d+m)* \\[(.+?)\\]$", 8 | "file": 1, 9 | "line": 2, 10 | "column": 3, 11 | "message": 4, 12 | "code": 5 13 | } 14 | ] 15 | } 16 | ] 17 | } 18 | 
-------------------------------------------------------------------------------- /.github/workflows/spellcheck.yml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | name: Spellcheck 4 | 5 | on: 6 | push: 7 | branches: 8 | - "main" 9 | paths: 10 | - '**.md' 11 | - '.github/workflows/spellcheck.yml' # This workflow 12 | pull_request: 13 | branches: 14 | - "main" 15 | paths: 16 | - '**.md' 17 | - '.github/workflows/spellcheck.yml' # This workflow 18 | 19 | env: 20 | LC_ALL: en_US.UTF-8 21 | 22 | defaults: 23 | run: 24 | shell: bash 25 | 26 | permissions: 27 | contents: read 28 | 29 | jobs: 30 | spellcheck: 31 | name: Spellcheck (en_US) 32 | runs-on: ubuntu-latest 33 | steps: 34 | - name: "Harden Runner" 35 | uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e # v2.10.4 36 | with: 37 | egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs 38 | 39 | - name: "Checkout" 40 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 41 | with: 42 | fetch-depth: 0 43 | 44 | - name: Spellcheck 45 | uses: rojopolis/spellcheck-github-actions@584b2ae95998967a53af7fbfb7f5b15352c38748 # v0.49.0 46 | -------------------------------------------------------------------------------- /.github/workflows/stale_bot.yml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | name: 'Close stale issues and PRs' 4 | 5 | on: 6 | schedule: 7 | - cron: '30 1 * * *' 8 | 9 | env: 10 | LC_ALL: en_US.UTF-8 11 | 12 | defaults: 13 | run: 14 | shell: bash 15 | 16 | permissions: 17 | contents: read 18 | 19 | jobs: 20 | stale: 21 | permissions: 22 | issues: write 23 | pull-requests: write 24 | runs-on: ubuntu-latest 25 | steps: 26 | - name: "Harden Runner" 27 | uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e # v2.10.4 28 | with: 29 | disable-sudo: true 30 | egress-policy: block 31 | allowed-endpoints: > 32 | api.github.com:443 33 | 34 | - name: "Stale Action" 35 | uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0 36 | with: 37 | stale-issue-label: 'stale' 38 | stale-issue-message: > 39 | This issue has been automatically marked as stale because it has not had activity within 60 days. 40 | It will be automatically closed if no further activity occurs within 30 days. 41 | close-issue-message: > 42 | This issue has been automatically closed due to inactivity. Please feel free to reopen if you feel it is still relevant! 43 | days-before-issue-stale: 60 44 | days-before-issue-close: 30 45 | stale-pr-label: 'stale' 46 | stale-pr-message: > 47 | This pull request has been automatically marked as stale because it has not had activity within 30 days. 48 | It will be automatically closed if no further activity occurs within 7 days. 49 | close-pr-message: > 50 | This pull request has been automatically closed due to inactivity. Please feel free to reopen if you intend to continue working on it! 
51 | days-before-pr-stale: 30 52 | days-before-pr-close: 7 53 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | 3 | # Spelling 4 | dictionary.dic 5 | 6 | # python virtualenv 7 | venv 8 | 9 | # Emacs 10 | *~ 11 | \#*\# 12 | .\#* 13 | .projectile 14 | .dir-locals.el 15 | -------------------------------------------------------------------------------- /.markdownlint-cli2.yaml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | config: 4 | line-length: false 5 | no-emphasis-as-header: false 6 | no-emphasis-as-heading: false 7 | first-line-heading: false 8 | code-block-style: false 9 | no-duplicate-header: false 10 | single-trailing-newline: false 11 | globs: 12 | - "**/*.md" 13 | ignores: 14 | - ".github/**" 15 | - ".tox/**" 16 | - "venv/**" 17 | - ".venv/**" 18 | -------------------------------------------------------------------------------- /.spellcheck-en-custom.txt: -------------------------------------------------------------------------------- 1 | # make spellcheck-sort 2 | # Please keep this file sorted: 3 | Abhishek 4 | ADR 5 | agentic 6 | Akash 7 | AMDGPU 8 | Anil 9 | API 10 | api 11 | arge 12 | args 13 | arXiv 14 | backend 15 | backends 16 | benchmarking 17 | Bhandwaldar 18 | brainer 19 | Cappi 20 | checkpointing 21 | chunkers 22 | CLI 23 | cli 24 | CLI's 25 | codebase 26 | Colab 27 | compositional 28 | Conda 29 | config 30 | configs 31 | Containerfile 32 | cpp 33 | ctrl 34 | cuBLAS 35 | CUDA 36 | customizations 37 | CVE 38 | CVEs 39 | CWD 40 | Cynefin 41 | Daniele 42 | dataset 43 | datasets 44 | dateutil 45 | DBs 46 | DCO 47 | Dependabot 48 | dev 49 | disambiguating 50 | ditaa 51 | Docling 52 | docling 53 | docstring 54 | DocumentSplitter 55 | downstreams 56 | dr 57 | Dropdown 58 | DSL 59 | embedder 60 | embeddings 61 | env 62 | EP 63 | Eval 64 | eval 65 | Excalidraw 66 | exfiltrate 67 | exfiltrating 68 | Filesystem 69 | filesystem 70 | Finetuning 71 | formedness 72 | FQN 73 | freeform 74 | FSDP 75 | GB 76 | gb 77 | GFX 78 | GGUF 79 | gguf 80 | GGUFs 81 | GiB 82 | github 83 | Gmail 84 | GPTDolomite 85 | GPU 86 | gpu 87 | Guang 88 | hacky 89 | hardcode 90 | hardcoded 91 | Hema 92 | hipBLAS 93 | HyDE 94 | ilab 95 | Ilya 96 | impactful 97 | Inferencing 98 | init 99 | instantiation 100 | instructlab 101 | integrations 102 | io 103 | ISA 104 | iters 105 | itertools 106 | Jie 107 | jinja 108 | JIT 109 | JSON 110 | json 111 | Jupyter 112 | KAGGLE 113 | Kaggle's 114 | Kai 115 | Kolchinsky 116 | Kubernetes 117 | Kumar 118 | Langchain 119 | Langgraph 120 | leaderboard 121 | lifecycle 122 | lignment 123 | linters 124 | LLM 125 | LLMs 126 | llms 127 | LLVM 128 | lora 129 | Makefiles 130 | Markdownlint 131 | Martinoli 132 | md 133 | Mergify 134 | Merlinite 135 | merlinite 136 | Milvus 137 | MilvusEmbeddingRetriever 138 | MilvusLite 139 | mimimum 140 | Miniforge 141 | MiniLM 142 | Mixtral 143 | mixtral 144 | MLX 145 | MMLU 146 | modularize 147 | modularized 148 | MTEB 149 | Murdock 150 | mvp 151 | Nakamura 152 | natively 153 | networkx 154 | num 155 | numpy 156 | NVidia 157 | OCI 158 | Ollama 159 | ollama 160 | onboarding 161 | OpenAI 162 | openai 163 | OpenStax 164 | optimizers 165 | orchestrator 166 | ots 167 | PaRAGon 168 | Params 169 | Pareja 170 | PEFT 171 | Pereira 172 | PID 173 | Plag 174 | PlantUML 175 | PLOS 176 | pluggable 177 | PNG 178 | POC 179 | Podman 180 | 
podman 181 | posthog 182 | postprocessing 183 | pre 184 | preprint 185 | preprocessing 186 | prereqs 187 | productize 188 | productized 189 | PR's 190 | PSFL 191 | Pydantic 192 | pyenv 193 | PyPI 194 | pyproject 195 | PyTorch 196 | pyyaml 197 | qlora 198 | qna 199 | quantized 200 | Quantizing 201 | Radeon 202 | RDNA 203 | README 204 | rebase 205 | Ren 206 | repo 207 | repos 208 | RHEL 209 | roadmapping 210 | ROCm 211 | RTX 212 | runtime 213 | RX 214 | SaaS 215 | safetensor 216 | safetensors 217 | Salawu 218 | scalable 219 | SDG 220 | sdg 221 | SDK 222 | sdk 223 | semvar 224 | sexualized 225 | SHA 226 | sharding 227 | Shellcheck 228 | Shivchander 229 | Signoff 230 | Sigstore 231 | specifiying 232 | splitter 233 | src 234 | Srivastava 235 | Staar 236 | Standup 237 | subcommand 238 | subcommands 239 | subdirectory 240 | subfolder 241 | subprocess 242 | Sudalairaj 243 | supportability 244 | systemd 245 | Taj 246 | tatsu 247 | TBD 248 | templating 249 | Tesla 250 | th 251 | tl 252 | TODO 253 | tox 254 | tqdm 255 | traigers 256 | triager 257 | Triagers 258 | triagers 259 | txt 260 | UI 261 | ui 262 | unquantized 263 | unstaged 264 | URI 265 | USM 266 | UUID 267 | UUIDs 268 | UX 269 | vectordbs 270 | Veeradhi 271 | venv 272 | Vishnoi 273 | vLLM 274 | vllm 275 | watsonx 276 | Wikisource 277 | wikisql 278 | WIP 279 | WSL 280 | xcode 281 | XNACK 282 | XT 283 | XTX 284 | Xu 285 | YAML 286 | yaml 287 | -------------------------------------------------------------------------------- /.spellcheck.yml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | matrix: 4 | - name: markdown 5 | aspell: 6 | lang: en 7 | d: en_US 8 | camel-case: true 9 | mode: markdown 10 | sources: 11 | - "**/*.md|!.tox/**" 12 | dictionary: 13 | wordlists: 14 | - .spellcheck-en-custom.txt 15 | pipeline: 16 | - pyspelling.filters.context: 17 | context_visible_first: true 18 | escapes: '\\[\\`~]' 19 | delimiters: 20 | # Ignore multiline content between fences (fences can have 3 or more back ticks) 21 | # ```language 22 | # content 23 | # ``` 24 | - open: '(?s)^(?P *`{3,}).*?$' 25 | close: '^(?P=open)$' 26 | # Ignore text between inline back ticks 27 | - open: '(?P`+)' 28 | close: '(?P=open)' 29 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | See [community/CONTRIBUTING.md](https://github.com/instruct-lab/community/blob/main/CONTRIBUTING.md). 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: help 2 | help: 3 | @awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m<target>\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf " \033[36m%-18s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST) 4 | 5 | .PHONY: spellcheck 6 | spellcheck: .spellcheck.yml ## Spellcheck markdown files 7 | pyspelling --config $< 8 | 9 | .PHONY: spellcheck-sort 10 | spellcheck-sort: .spellcheck-en-custom.txt ## Sort spellcheck dictionary 11 | sort -d -f -o $< $< 12 | 13 | # 14 | # If you want to see the full commands, run: 15 | # NOISY_BUILD=y make 16 | # 17 | ifeq ($(NOISY_BUILD),) 18 | ECHO_PREFIX=@ 19 | CMD_PREFIX=@ 20 | PIPE_DEV_NULL=> /dev/null 2> /dev/null 21 | else 22 | ECHO_PREFIX=@\# 23 | CMD_PREFIX= 24 | PIPE_DEV_NULL= 25 | endif 26 | 27 | .PHONY: md-lint 28 | md-lint: ## Lint markdown files 29 | $(ECHO_PREFIX) printf " %-12s ./...\n" "[MD LINT]" 30 | $(CMD_PREFIX) podman run --rm -v $(CURDIR):/workdir --security-opt label=disable docker.io/davidanson/markdownlint-cli2:latest > /dev/null 31 | 32 | .PHONY: md-lint-fix 33 | md-lint-fix: ## Fixes all fixable linting errors 34 | $(ECHO_PREFIX) printf " %-12s ./...\n" "[MD LINT]" 35 | $(CMD_PREFIX) podman run --rm -v $(CURDIR):/workdir --security-opt label=disable docker.io/davidanson/markdownlint-cli2:latest --fix > /dev/null 36 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # InstructLab Developer Documentation 2 | 3 | This repository contains design artifacts that define the architecture and 4 | design of components in the InstructLab project. The repo also serves as a home 5 | for technical policies that apply across all components. 6 | 7 | ## Governance 8 | 9 | Governance for InstructLab overall is documented in the [community 10 | repository](https://github.com/instructlab/community/blob/main/GOVERNANCE.md). 11 | 12 | The [InstructLab Oversight 13 | Committee](https://github.com/instructlab/community/blob/main/MAINTAINERS.md) is 14 | responsible for the contents of this repository. 15 | 16 | The rules for merging depend on the type of change in question and its scope of impact. If you 17 | are unsure about the scope of impact for a change, i.e. whether a change is trivial or non-trivial, 18 | please ping the Oversight Committee for help. 19 | 20 | * Trivial changes may be merged with 1 review from any InstructLab maintainer. 21 | * Examples of trivial changes include minor wording adjustments or typo fixes in 22 | documentation, CI fixes, CI dependency updates, etc. 23 | * Non-trivial changes have more loosely defined requirements. Input should be sought 24 | out from maintainers of relevant components. The broader the scope or the more 25 | controversial the change, the broader the consensus required for 26 | merging.
The final approval and merge (or action, e.g. deleting a repo) 27 | falls to two maintainers of any InstructLab Organization repository, plus an 28 | additional third maintainer of any InstructLab Organization repository who 29 | merges the PR after verifying that sufficient reviews have been given. If there are 30 | disputes on the design document that cannot be resolved, an Oversight Committee 31 | member can be consulted as an arbitrator. These approvals ensure that 32 | adequate opportunity and attention have been given by the affected parties. 33 | * Examples of non-trivial changes include approving proposals for new repositories, 34 | creation of new repositories, changes to organization-level GitHub settings, archiving 35 | or deleting repositories, design proposals, etc. 36 | * Any maintainer or Oversight Committee member may request that a change receive 37 | a full vote from the Oversight Committee. More substantial policy changes or a 38 | proposed new project under InstructLab are examples of when this may be 39 | appropriate. 40 | 41 | ## Formatting Guidelines 42 | 43 | Design documents should be placed in `docs/`. 44 | 45 | ### Text 46 | 47 | Files should be in [Markdown](https://github.github.com/gfm/) format. 48 | 49 | ### Images 50 | 51 | Diagrams are encouraged, but must be submitted in a format where they can be 52 | easily updated in the future as needed. Some options include: 53 | 54 | * [Mermaid](https://github.com/mermaid-js/mermaid#readme) 55 | * [Excalidraw](https://excalidraw.com/) 56 |   * Be sure to leave "Embed Scene" turned on when exporting the PNG. 57 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | You can find information on how to report a potential security vulnerability, as well as where to subscribe to receive security alerts, on the project's [Security Page](https://github.com/instructlab/.github/blob/main/SECURITY.md). 2 | -------------------------------------------------------------------------------- /docs/adr-feature-gating.md: -------------------------------------------------------------------------------- 1 | # InstructLab Feature Gating Patterns 2 | 3 | ## Context 4 | 5 | Motivated by the currently [in-progress RAG development](https://github.com/instructlab/dev-docs/pull/161) in [dev preview scope](https://access.redhat.com/support/offerings/devpreview), there is a use case for introducing feature gating (sometimes equivalently called "feature flags"). The specific use case desired is to have default settings for feature gating in the application that can be overridden using environment variables. 6 | 7 | One common pattern is per-feature gating, i.e., configuration per feature. This fine-grained control might be desirable in the future, especially to enable user segmentation for individual experimental features. We do not need that level of fine-grained control at this time. 8 | 9 | There is precedent [in OpenShift](https://docs.openshift.com/container-platform/4.17/nodes/clusters/nodes-cluster-enabling-features.html) for enabling sets of features based on support scope. Following this pattern is consistent with OpenShift terminology and meets our needs at this time. 10 | 11 | ## Decision 12 | 13 | InstructLab will adopt feature gating based on feature sets, using the OpenShift terminology of `DevPreviewNoUpgrade` and `TechPreviewNoUpgrade`, with the enabled feature set able to be overridden using an environment variable.
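As a rough illustration of the intent, the sketch below shows what a feature-set gate with an environment-variable override could look like. This is a minimal sketch, not a committed implementation: the `ILAB_FEATURE_SCOPE` variable name, the helper name, and the example feature name are all assumptions.

```python
import os

# Scope names borrowed from the OpenShift terminology above.
DEV_PREVIEW = "DevPreviewNoUpgrade"
TECH_PREVIEW = "TechPreviewNoUpgrade"

# Hypothetical mapping of scopes to the features they enable by default;
# the default (empty) scope enables no gated features.
_SCOPE_FEATURES = {
    TECH_PREVIEW: {"rag"},
    DEV_PREVIEW: {"rag"},  # dev preview would typically be a superset
}

def feature_enabled(feature: str) -> bool:
    # The environment variable overrides the application's default scope.
    scope = os.environ.get("ILAB_FEATURE_SCOPE", "")
    return feature in _SCOPE_FEATURES.get(scope, set())
```

For example, a user could run with `ILAB_FEATURE_SCOPE=DevPreviewNoUpgrade` to turn on the dev preview feature set without any code change.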
14 | 15 | ## Status 16 | 17 | Proposed 18 | 19 | ## Consequences 20 | 21 | * Feature gating concepts will be consistent with OpenShift, lowering the learning curve of one application's configuration when coming from the other. 22 | * We will not need to (*yet*) spend the time to develop our own taxonomy for feature gating. 23 | * Messaging to users about support commitments when using dev preview or tech preview will be clear - in particular, no version upgrade commitments are made, and there is no supported way to disable those scopes in order to revert to a supported application state. 24 | * We have a decision to make about whether to introduce a new dependency for feature flagging or make our own simple one. 25 | * There is a migration path to finer-grained feature gating via the [`CustomNoUpgrade`](https://github.com/openshift/api/blob/master/config/v1/types_feature.go#L54) scope. 26 | * We will have to make sure to communicate what these feature gate scopes mean to users and what commitments they entail, in documentation and/or otherwise. 27 | -------------------------------------------------------------------------------- /docs/backend/README.md: -------------------------------------------------------------------------------- 1 | # InstructLab Backend Working Group 2 | 3 | ## Introduction 4 | 5 | The InstructLab project to date has produced a 6 | [CLI](https://github.com/instructlab/instructlab) that allows the execution of 7 | the InstructLab workflow on a variety of single-machine footprints. 8 | 9 | Some members of the InstructLab project also operate backend services that 10 | produce regular full model builds using contributions to the 11 | [taxonomy](https://github.com/instructlab/taxonomy/). This runs in a larger 12 | clustered environment. This original backend is not built in a way that is 13 | easily reusable by others. It is a goal of the InstructLab project to produce an 14 | evolution of these backend capabilities that is more easily reusable by others. 15 | 16 | The InstructLab Backend Working Group is a group of contributors 17 | interested in evolving the InstructLab backend capabilities. The group is open 18 | to all contributors interested in this work. 19 | 20 | ## Group Membership 21 | 22 | There is no formal membership of this group. Those interested in contributing 23 | just need to show up and participate in the work of the group. 24 | 25 | ## Primary Artifacts Produced by the Group 26 | 27 | The primary artifacts produced by the InstructLab Backend Working Group are 28 | architecture and design docs in the `docs/backend` directory of this repository. 29 | This is where the group will work out high-level planning for the various components 30 | that should be created. 31 | 32 | Once individual components are created and implementation is in progress, code 33 | and associated documents for each component will be stored in that component's 34 | repository. 35 | 36 | ## Communication 37 | 38 | Communication among the group occurs in the following ways: 39 | 40 | - **Meetings**: The group meets regularly on a schedule determined by the group. 41 | Meetings are open to all interested contributors. Meeting times are posted in 42 | the [InstructLab Community 43 | Calendar](https://calendar.google.com/calendar/embed?src=c_23c2f092cd6d147c45a9d2b79f815232d6c3e550b56c3b49da24c4b5d2090e8f%40group.calendar.google.com).
44 | 45 | - **Discord**: The group has channels for the corresponding components, such as `#core`, `#sdg`, `#eval`, and `#infra`, which can all be found in the [InstructLab Discord](https://instructlab.ai/discord). 46 | 47 | - **Slack**: The group uses the `#backend` channel in the InstructLab Slack. 48 | 49 | - **Mailing Lists**: There is a [development mailing 50 | list](https://groups.google.com/a/instructlab.ai/g/dev) should the group desire 51 | to share or discuss topics by email. 52 | 53 | - **GitHub**: Artifacts produced by this group will be stored in the 54 | `docs/backend` directory of this repository and will be reviewed via GitHub 55 | pull requests. GitHub Issues may also be used to track work items as needed. 56 | -------------------------------------------------------------------------------- /docs/ci/ci-remove-dco-check.md: -------------------------------------------------------------------------------- 1 | # Remove DCO Sign-off Check 2 | 3 | ## Overview 4 | 5 | Presently, all repositories under our `instructlab` GitHub org require contributors to add a [Developer Certificate of Origin (DCO)](https://developercertificate.org/) sign-off in each commit to indicate that they agree to the DCO legal terms. While these DCO sign-offs can be a great way for us to validate that a contributor agrees to the DCO legal terms, many upstream projects actively use the DCO without explicitly requiring contributors to add "Signed-off-by" to their commits. In other words, we can still require contributors to agree to the DCO terms, but we don't necessarily _need_ contributors to add DCO sign-offs in each commit. 6 | 7 | We can look at the `gnuradio/volk` repository as an example of how to enforce the DCO terms without explicitly requiring contributors to add sign-offs. Essentially, they have added a copy of the DCO text to their repository under [docs/DCO.txt](https://github.com/gnuradio/volk/blob/d9d6abfcc4aaeec2d60abd482f74fded158818b5/docs/DCO.txt) and referenced that file [here in their CONTRIBUTING.md](https://github.com/gnuradio/volk/blob/308948abf8384bb4bf6467e14b585df708789782/docs/CONTRIBUTING.md?plain=1#L22) guide. 8 | 9 | ## Changes Going Forward 10 | 11 | We should mimic how other open source projects implicitly enforce the DCO terms. More specifically, we should make an exact copy of the [DCO agreement document](https://developercertificate.org/) and save that copy to a `DCO.txt` file located at the root of each InstructLab repository (i.e., on the same level as the main `README.md` file in each repository). While some open source repositories opt to place the DCO agreement terms under a `docs/` folder, placing `DCO.txt` at the root of each repository makes the DCO agreement visible and easily accessible for all contributors. Finally, we can then update each repository's `CONTRIBUTING.md` guide to clearly state that anyone who makes a contribution to InstructLab will be implicitly agreeing to the DCO terms located under `DCO.txt` -- and a relative link to that file shall be provided within each InstructLab repository. -------------------------------------------------------------------------------- /docs/ci/ci-universal-actions.md: -------------------------------------------------------------------------------- 1 | # Create Separate Repo for Custom GitHub Actions 2 | 3 | ## Idea Overview 4 | 5 | Create a separate repository within the `instructlab` GitHub org to house our custom CI GitHub Actions.
6 | 7 | ## Motivation for this Proposal 8 | 9 | Within our `instructlab` GitHub org, we have an in-house GitHub action that we use across several repos: `free-disk-space`. 10 | 11 | Examples: 12 | 13 | - [`free-disk-space` in the `eval` repo](https://github.com/instructlab/eval/blob/main/.github/actions/free-disk-space/action.yml) 14 | - [`free-disk-space` in the `instructlab` repo](https://github.com/instructlab/instructlab/blob/main/.github/actions/free-disk-space/action.yml) 15 | - [`free-disk-space` in the `sdg` repo](https://github.com/instructlab/sdg/blob/a532a8d99ffe447152948e9442464923aba62637/.github/actions/free-disk-space/action.yml) 16 | 17 | This GitHub action is universal as it is a simple script used to clean up disk space on GitHub runners and can be modified to free disk space on our CentOS-based EC2 runners. 18 | 19 | Note that all of these in-house `free-disk-space` action files are exactly the same, so we essentially have the same file stored in three different repos. 20 | 21 | ## Pros and Cons 22 | 23 | Below are some pros and cons of creating a separate repository to house our GitHub actions. 24 | 25 | ### Pros 26 | 27 | - We will have one action file in one location 28 | - Easier to make changes in one location instead of (our present) three locations 29 | - Contributors will know where to look for in-house action files 30 | - We can create releases and utilize version control 31 | 32 | ### Cons 33 | 34 | - Extra repository to maintain. 35 | - We can't publish any of our actions to the GitHub marketplace if we have multiple actions stored in one repository.\* 36 | 37 | \* In reference to the last bullet point, [the GitHub docs for publishing Actions](https://docs.github.com/en/actions/sharing-automations/creating-actions/publishing-actions-in-github-marketplace#about-publishing-actions) states: 38 | > Actions are published to GitHub Marketplace immediately and aren't reviewed by GitHub as long as they meet these requirements: 39 | > 40 | > - The action must be in a public repository. 41 | > - Each repository must contain a single action. 42 | 43 | If we do care about publishing our actions, then we should consider creating separate repositories for these actions. If we don't care to publish, then this is a non-issue. (See next section below.) 44 | 45 | ## Additional Info 46 | 47 | Even if we cannot publish our actions to the GitHub marketplace, we can _still_ use these actions in our repository. For example, if our repo was named `ci-actions` with this layout: 48 | 49 | ```bash 50 | . 
51 | ├── custom-action-1/ 52 | │   ├── action.yml 53 | ├── custom-action-2/ 54 | │   ├── action.yml 55 | ``` 56 | 57 | ...then we'd reference them in our other repos like so: 58 | 59 | ```yaml 60 | name: Some Name 61 | 62 | on: 63 | workflow_dispatch: 64 | 65 | jobs: 66 | some-job: 67 | runs-on: ubuntu-latest 68 | 69 | steps: 70 | - name: Checkout code 71 | uses: actions/checkout@v4 72 | 73 | - name: Use custom action 1 74 | uses: instructlab/ci-actions/custom-action-1@main 75 | 76 | - name: Use custom action 2 77 | uses: instructlab/ci-actions/custom-action-2@main 78 | ``` 79 | 80 | Reference: [StackOverflow - "How to Publish Multiple GitHub Actions from a Single Repo and Call Them from Another Repo"](https://stackoverflow.com/a/79100136) -------------------------------------------------------------------------------- /docs/ci/dependency-constraints.md: -------------------------------------------------------------------------------- 1 | # Dependency Constraints in InstructLab CI 2 | 3 | This document describes how Python dependencies should be managed in 4 | InstructLab CI. 5 | 6 | ## Goals 7 | 8 | 1. Ensure that the CI environment is consistent and reproducible. 9 | 2. Ensure that new dependency releases do not break the CI environment. 10 | 3. Ensure that we adopt new dependency releases in the CI environment in a 11 | timely manner. 12 | 13 | ## Approach 14 | 15 | ### Pin all versions with constraints files 16 | 17 | Each repository should contain a [pip 18 | constraints](https://pip.pypa.io/en/stable/user_guide/#constraints-files) file 19 | that lists the pinned versions of all dependencies used in the CI environment. 20 | 21 | In case a repository supports multiple platforms (`linux` vs `darwin`) or 22 | accelerators (`cuda`, `cpu`, `hpu`), multiple constraints files may be 23 | generated. 24 | 25 | The constraints files should be used in all CI jobs that install Python 26 | dependencies (using `pip` or otherwise), including linters, unit tests, 27 | integration tests, and functional tests. 28 | 29 | ### Update constraints files regularly 30 | 31 | Each repository should define a new `tox` target called `constraints` that will 32 | be used to re-generate all the constraints files in the project. 33 | 34 | Updates to these files should be generated automatically by the CI system using 35 | the [update-constraints](https://github.com/instructlab/ci-actions/tree/main/actions/update-constraints) 36 | action from the `ci-actions` repository and should not be modified manually 37 | (subject to rare exceptions). A periodic CI job should be added to ensure this 38 | happens on a regular basis (at least once a week). 39 | 40 | The job will update the constraints files and post the result as a PR for review. 41 | The PR will be validated by all the relevant CI jobs. The project core team is 42 | expected to review these PRs in a timely manner (within 3 business days). The 43 | team should make sure that all the relevant CI jobs are passing before merging 44 | the PR. 45 | 46 | ### Uncap all dependencies 47 | 48 | At this point, no new dependency releases should affect the CI environment. 49 | Projects should then uncap all their dependencies in requirements files, as per 50 | the [dependency management policy](../dependency-management.md). 51 | 52 | Note: In rare situations, a cap may be justified. For example, when we know for 53 | sure that a new release of the upstream dependency will break the project, and 54 | when we don't have capacity to deliver a fix in a timely manner.
These 55 | situations should be rare and a mitigation plan should be in place to uncap the 56 | dependency. 57 | 58 | Specifically: 59 | 60 | - A tracking issue should be reported in the issue tracking system capturing 61 | any known details about the issue. 62 | - A new **temporary** constraint should be added to the `constraints-dev.txt.in` 63 | file. The temporary constraint should refer to the tracking issue in a 64 | comment above it. 65 | - The tracking issue should be assigned a high priority and considered a 66 | blocker for an upcoming release. 67 | -------------------------------------------------------------------------------- /docs/cli/ilab-model-engine.md: -------------------------------------------------------------------------------- 1 | # Redesign `ilab` Command Structure to be Resource Based 2 | 3 | This document describes a new structure for `ilab`, consisting of sub-commands that act as parent or "group" commands for further sub-commands. Currently the only group command is `ilab`. If the models `ilab` produces are meant to be the gold standard for open source AI, the tool to use and manage these models needs to be fully-fledged and extensible, especially for models we have not produced but should be able to run. 4 | 5 | Open source container tools like [podman](https://podman.io/) and [docker](https://www.docker.com/) are commonly viewed as engines. `ilab` should be a model engine, managing the deployment and creation of AI models. To be clear, I will be using tools like podman as a structural analogy here with the full understanding that they have different implications and use cases. 6 | 7 | ## Key Component 8 | 9 | ### InstructLab Structure Redesign 10 | 11 | Here are two outlines. They represent the state of `ilab` before and after this enhancement. 12 | 13 | ```console 14 | ilab 15 | | 16 | |_____chat 17 | |_____check 18 | |_____convert 19 | |_____download 20 | |_____generate 21 | |_____init 22 | |_____list 23 | |_____serve 24 | |_____test 25 | |_____train 26 | ``` 27 | 28 | vs. after: 29 | 30 | ```console 31 | ilab 32 | | 33 | |_______model 34 | | | 35 | | |____convert 36 | | |____download 37 | | |____train 38 | | |____serve 39 | | |____chat 40 | | |____evaluate 41 | | | 42 | |_______data 43 | | | 44 | | |____generate 45 | | | 46 | |_______config 47 | | | 48 | | |____init 49 | | | 50 | |_______taxonomy 51 | | | 52 | | |____diff 53 | | |____check 54 | | |____download 55 | ``` 56 | 57 | The main point of this structure is to introduce a resource-based hierarchy and to improve the usability of the existing `ilab` structure. At the top level of commands, we are not thinking about verbs but about the resources commonly managed in `ilab`; under them fall some actions. Some key problems with `ilab` currently are: 58 | 59 | 1. Ambiguity in "what" we are generating, training, etc. 60 | - The current structure of `ilab` requires an in-depth knowledge of the ordering of commands and background knowledge of how our CLI is built. This is not the best way to grow. Commands that are not intuitive and require a deep dive into the docs just to get started might not engage users as we hope. Having an intuitive structure in which people can easily see from the docstring what each command does and how they might use it makes more sense. We are generating data, then training a model, running a server to interact with that model, and finally chatting with the model. 61 | 2.
Some commands like `ilab test` and `ilab list` should be renamed to show their true purpose (`ilab model evaluate` and `ilab taxonomy diff`). 62 | 3. We will hit a point where we want to add more commands to `ilab` due to expected integration with the backend, model registries, etc. This list will grow to a point where this form of top-level group organization is not maintainable. 63 | 64 | In general, moving to this structure allows for more growth when `ilab` reaches a stable release and encourages broader usage of the tool for all sorts of open-source AI models. 65 | 66 | #### Goals for 5/30/2024 milestone release of CLI 67 | 68 | For the next few milestones, it has been identified that the overall structure shift should be in place. We should aim to have the base hierarchy of commands in place that will exist for the future of `ilab` so that when users become familiar with the CLI, further drastic changes are not needed. 69 | 70 | This means we will now have commands that have a parent group `ilab` followed by a sub-command/group terminated by a grandchild command: `ilab model list`, for example. This workflow not only adds purposeful organization to this project, but will encourage contributors to add commands as they use `ilab` and file bugs in specific parts of the codebase that need work, as opposed to the current flow, which is hard to keep track of from a user's point of view. 71 | 72 | ### Necessity of an input -> configuration -> result mental model 73 | 74 | The commands uncovered by adding this structure resemble the typical architecture for an "engine". `ilab` is not complete without the mechanisms to create, list, delete, and inspect models. Models, as container images do in container engines, act as the configuration for the end result: the chat process. The interesting thing about this analogy is that there needs to be a precursor to configuration as well. There is the container image, the container, and the `Containerfile`. The `Containerfile` is the raw user input that leads to an image. 75 | 76 | `ilab` needs to have a clear source of information, a result of compiling this source, and a "running" end result. Having these three steps purposefully delineated creates the need for commands with which users can manipulate and act upon each of these steps. 77 | 78 | The mental model a user has impacts the way they use the application and whether they view it as a proof of concept. 79 | 80 | Here is a diagram of what I mean when I describe this mental model: 81 | 82 | ```console 83 | SOURCE SINK 84 | 85 | User has uncommitted changes in their taxonomy Data is generated and then the user runs A model is created, and the user can tag the model After converting the model, the 86 | or specifies `--skill` `--knowledge` in the ==> `ilab model train` using the newly generated data ==> or just list it using the new model commands ==> user runs the chat process by specifying `ilab model serve` followed by `ilab model chat` 87 | new `ilab data generate` 88 | 89 | ``` 90 | 91 | So in `ilab` the clear source of information is the taxonomy, or the new skills/knowledge brought into generate. This data then goes through processes that end up in the sink, which is a running/actionable process with which the user can interact. Providing commands that clearly display these different processes is key. Groups for these larger "buckets" of processes make sense as well due to the likelihood of more commands being needed to properly interact with the newly generated model.
92 | 93 | ### Alternatives 94 | 95 | An alternative flow to `ilab` -> `child-command` -> `grandchild-command` is: 96 | 97 | `ilab` -> `verb` -> `noun`. The positive of this approach is that commands would sound better: `ilab generate data`. However, there are a few negatives. While this is easier to say, it makes less sense from an organizational standpoint: 98 | 99 | ```console 100 | ilab 101 | | 102 | |_______download__| 103 | |_______convert___|-----model 104 | |_______train_____| 105 | ``` 106 | 107 | This structure works when commands are ALL linked to the same `verb`. What happens when a group like `download` gets a command that the others don't? Then it looks more like this: 108 | 109 | ```console 110 | ilab 111 | | 112 | |_______download 113 | | |____model 114 | | |____taxonomy 115 | |_______convert 116 | | |_____model 117 | |_______train 118 | | |_____model 119 | ``` 120 | 121 | This is opposed to the structure in this EP, which might have a duplicate here and there for something like `download`, but in the verb-first structure duplicate commands are the norm and will result in a larger and clunkier codebase that is confusing to read. 122 | 123 | This first structure looks nice, and `ilab download model` sounds nice. However, in terms of implementation, this makes little sense. In [click](https://click.palletsprojects.com/en/8.1.x/), the CLI library we use, `ilab` is a "group" that commands are currently "grouped under". In this model, `ilab` would be a group, `download`, `convert`, etc. would be sub-groups, and `model` would be a command under ALL of these groups. This would require an implementation of the `model` function in different packages, all linking to a different group parent. From the perspective of user contributions and general code readability, this doesn't make much sense. Part of this design is to encourage user contributions by making the structure of the codebase logical. While these commands sound better, they are harder to group and understand. 124 | 125 | ### General workflow as compared to alternatives 126 | 127 | Currently, one has to: 128 | 129 | 1. `ilab init` 130 | 2. `ilab download` 131 | 3. `ilab serve` 132 | 4. `ilab generate` 133 | 5. `ilab train` 134 | 6. `ilab convert` 135 | 7. `ilab serve --model XXX` 136 | 8. `ilab chat` 137 | 138 | How to use `ilab` with this new structure: 139 | 140 | 1. `ilab config init` 141 | 2. `ilab model download` 142 | 3. `ilab data generate` 143 | 4. `ilab model train` 144 | 5. `ilab model serve` 145 | 6. `ilab model chat` 146 | 147 | It is now clear what is happening. For example, one may ask: "What does `ilab init` do?", while `ilab config init` clearly initializes the CLI's config. A clearer example is `ilab generate`. Are we generating a model? A config? No, we are generating *data*: `ilab data generate`. 148 | 149 | ## Changes to Existing Flow 150 | 151 | The current `ilab` commands will still work. Users will be able to type commands like `ilab model train` or `ilab train` for the foreseeable future to ensure feature parity. Eventually, this alias should be removed, and only the sub-commands should remain. 152 | 153 | ## Known Issues 154 | 155 | ### click 156 | 157 | click doesn't like the setup we currently have in `ilab`. So, adding sub-parent commands won't be as easy as creating a new `click.group`. We will most likely need to make different libraries, each of which is a click group.
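For illustration only, here is a minimal sketch of the nested-group approach in click. The command names mirror the proposal above, but the single-file layout is just for brevity; as noted, the real sub-groups would likely live in separate libraries/packages:

```python
import click

@click.group()
def ilab():
    """Parent group: the `ilab` entry point."""

@ilab.group()
def model():
    """Sub-group for model commands (serve, train, chat, ...)."""

@ilab.group()
def data():
    """Sub-group for data commands (generate, ...)."""

@model.command()
def serve():
    """Placeholder body for `ilab model serve`."""
    click.echo("serving a model...")

@data.command()
def generate():
    """Placeholder body for `ilab data generate`."""
    click.echo("generating data...")

if __name__ == "__main__":
    ilab()
```

Invoking `python cli.py model serve` would then route through the `ilab` group to the `model` sub-group, which is exactly the parent group -> sub-group -> grandchild command shape described above.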
158 | -------------------------------------------------------------------------------- /docs/cli/ilab-model-management-standardization.md: -------------------------------------------------------------------------------- 1 | # Standardizing model storage, management and referencing in the CLI and libraries 2 | 3 | ## Introduction 4 | 5 | - **Purpose**: Standardizing how models are stored, managed and accessed via flags used in CLI tools and libraries to ensure consistency, usability, and maintainability. 6 | - **Scope**: Covers model directory structure in cache and all model-related flags used in the command-line interface (CLI) and libraries within the project. 7 | - **Audience**: Developers, maintainers, and contributors involved in the development and usage of the CLI tools and libraries. 8 | 9 | ## Summary 10 | 11 | This proposal aims to establish a consistent practice for how models are managed within InstructLab. It does so in two parts: one proposes a consolidation of all existing model-related flags, and the other proposes a change to how models are organized in the cache upon download, and thereby referenced within various operations. 12 | 13 | ## Motivation 14 | 15 | Currently, there are 10+ instances across all major commands and subcommands that accept either `--model`, `--model-path`, `--model-name`, or `--model-dir`; all of which serve slightly different purposes, and might handle different use cases including local vs. remote models, relative vs. absolute paths, etc. This leads to a significant amount of avoidable confusion among developers and users alike. Introducing some consolidation and consistency in this aspect would provide a big UX improvement. 16 | 17 | In addition to this, we currently support multiple sources for model downloads. There could be models that span multiple sources and also have different versions and tags that users may want to be able to work with. Currently, there is a lack of uniformity in how we organize models downloaded from Hugging Face vs. OCI registries. We do not have a way to differentiate and maintain unique identities for models in cache, based on their sources. We also currently don't have the ability to support version management. Users are only able to work with one version of a given model at a time, as downloading a different version just overwrites their existing model. 18 | 19 | ## Proposal 20 | 21 | ### Part 1: Narrowing down to a singular model-related flag 22 | 23 | #### Problem Statement - Section 1 24 | 25 | We want to streamline the number of model-related flags available, while establishing consistency and predictability between their uses, and handle three separate use cases: absolute paths, relative paths, and remote repository names. 26 | 27 | #### Suggested approach - Section 1 28 | 29 | - Retain `--model` and deprecate all other model-related flags 30 | - Alternatively, we could retain `--model-path`, but passing in a repo name to a flag called "model-path" seems a little more awkward than passing in a path to a field just called "model" 31 | - `--model` should accept both paths as well as strings (for repo names) 32 | - First, `--model` should simply check whether the supplied path exists. This would include determining whether the provided path is absolute, or relative to wherever the calling program is.
33 | - If it exists, we should run an additional check to determine if it points to a safetensors or GGUF model (use the existing `is_model_safetensors` and `is_model_gguf` checks for this) 34 | - If the path does not exist, or exists but is not determined to be a valid model, then move on 35 | - If the path is not found after step 1, check against all tracked models/checkpoints/adapters, etc. currently found in `~/.local/cache` and/or `~/.local/checkpoints`. If not found, move on 36 | - If the supplied content is neither an absolute path nor a relative path, assume it is the name of a remote repo on HF and download it 37 | - Alternatively, we can error out here and require that the user download the model explicitly via `ilab model download` 38 | 39 | This would standardize the behavior of the `--model` flag across all the commands that it appears in. There could be a dedicated model resolver function that implements the process described above (a sketch follows at the end of Part 2). 40 | 41 | The only exception may be `ilab model download`, which contains a `--model-dir` flag that acts as a sink rather than a source. This flag would benefit from being renamed to `--destination` instead. 42 | 43 | ### Part 2: Standardizing what gets passed _into_ the model flag (value) 44 | 45 | #### Problem Statement - Section 2 46 | 47 | In addition to standardizing the flag itself, we must also standardize what gets passed INTO the flag, i.e., the value passed to `--model`. The format used to reference models should work consistently regardless of whether the user is referencing 48 | a local model or a remote one. They should be able to use a consistent string to reference a given, specific model under all circumstances. 49 | 50 | #### Suggested approach - Section 2 51 | 52 | - Standardize around the usage of a model's full URL as the way to reference that model at all times (e.g., `quay.io/ai-lab/models/granite-7b-lab`). 53 | - Download logic is updated such that models are always downloaded into `~/.local/cache/instructlab/models` under sub-directories that follow the same structure as their URL. We also account for model versioning by doing the following: 54 | - Creation of dedicated sub-directories based on tag/branch/commit SHA 55 | - Creation of a `.metadata` file that records the tag/branch/commit information 56 | - Keeping these in mind, an example of what the final model path could look like is: `~/.local/cache/instructlab/models/quay.io/ai-lab/models/granite-7b-lab/v1.1` 57 | - Users use the full URL `quay.io/ai-lab/models/granite-7b-lab` when specifying where to download the model from. Thereafter, users continue to use this string (with the version included) to reference this model from cache, as this will now match the relative path of the model (based on point #4 of the suggested approach in Part 1) 58 | - E.g., `ilab model chat --model quay.io/ai-lab/models/granite-7b-lab/v1.1` 59 | - Alternatively, we could adopt standard convention and allow specification of the version through a colon (`ilab model chat --model quay.io/ai-lab/models/granite-7b-lab:v1.1`) and extract the version from this URL and use it to locate the right version folder 60 | - There could be some UX enhancements we add on top, such as allowing users to just specify "granite-7b-lab" and using some logic to determine which sub-folder and version to default to if multiple copies of that model exist across different sources and versions, similar to Podman 61 | 62 | An ongoing work-in-progress effort for this can be found at: [#1895](https://github.com/instructlab/instructlab/pull/1895).
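To tie the two parts together, here is a minimal sketch of the dedicated model resolver mentioned in Part 1, combined with the Part 2 cache layout. The helper name `is_valid_model` stands in for the existing `is_model_safetensors`/`is_model_gguf` checks, and the error-out behavior follows the stricter alternative above; both are assumptions for illustration:

```python
from pathlib import Path

# Cache location from the Part 2 layout above.
CACHE_DIR = Path.home() / ".local" / "cache" / "instructlab" / "models"


def is_valid_model(path: Path) -> bool:
    # Stand-in for the existing `is_model_safetensors` and
    # `is_model_gguf` checks referenced in Part 1.
    return path.exists()


def resolve_model(model: str) -> Path:
    # 1. Absolute path, or path relative to the calling program.
    candidate = Path(model).expanduser()
    if candidate.exists() and is_valid_model(candidate):
        return candidate

    # 2. Reference relative to the cache, matching the URL-shaped
    #    layout, e.g. "quay.io/ai-lab/models/granite-7b-lab/v1.1".
    cached = CACHE_DIR / model
    if cached.exists() and is_valid_model(cached):
        return cached

    # 3. Neither: treat it as a remote repo name and either download it
    #    or, as in the stricter alternative above, error out.
    raise FileNotFoundError(
        f"{model!r} is neither a local path nor a cached model; "
        "download it first with `ilab model download`"
    )
```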
63 | 64 | The proposed combined approach will resolve [#2200](https://github.com/instructlab/instructlab/issues/2200), [#1871](https://github.com/instructlab/instructlab/issues/1871), and all issues associated with them. 65 | 66 | #### Open question 67 | 68 | One issue with following the suggested approach would be that we might like to store Hugging Face models under `huggingface.co/` in the cache, similar to `quay.io/` for example. However, the Hugging Face API expects users to specify models following the pattern 69 | `<namespace>/<repo_name>` and does not accept `huggingface.co/<namespace>/<repo_name>`. As such, we could still store the models that way and have users specify `huggingface.co` for the sake of uniformity, and have some logic to strip out the `huggingface.co` from the URL before sending the API request. This seems rather clunky and unnecessary. On the other hand, if we continue storing Hugging Face models in the same `<namespace>/<repo_name>` format, it would break the pattern, with most models being 70 | collected under their host domains and Hugging Face models arbitrarily stored one level higher than the rest. What's an acceptable solution in this case? 71 | 72 | ## Follow-up work 73 | 74 | - `ilab model list` is updated to include a `version` column that displays all available versions for a given model. This logic should read from the `.metadata` files to pull the version info. Existing models already downloaded by users won't contain this file and hence should automatically have a version of `n/a`, since we cannot determine the version of those models after the fact 75 | - `ilab model list` is updated to accept `--adapters` and `--checkpoints` flags to act as filters and contain dedicated sections to list model adapters and checkpoints 76 | - The existing `--list-checkpoints` could be deprecated for uniformity reasons 77 | 78 | ## How would backwards-compatibility be handled? 79 | 80 | All other model flags will be deprecated for a couple of releases and called out in the release notes. They will eventually be removed. 81 | The fields in the config file will need to be updated to match `--model`, which might be a breaking change and may warrant bumping the config version. This might require implementation of some kind of automatic config conversion mechanism between versions. -------------------------------------------------------------------------------- /docs/cli/ilab-model-serve-backend.md: -------------------------------------------------------------------------------- 1 | # Design for `ilab model serve` command with backend support 2 | 3 | ## Background 4 | 5 | With the [request from the community](https://github.com/instructlab/instructlab/issues/1106) for `ilab` to serve different backends such as [vllm](https://docs.vllm.ai/en/stable/) and the [cli redesign](ilab-model-backend.md), this design doc's purpose is to flesh out the behavior of the `ilab model serve` command. 6 | 7 | Specifically, this doc addresses the design of subcommands of `ilab model serve` that apply to 8 | different serving backends. 9 | 10 | ## Design 11 | 12 | ### Backend 13 | 14 | Since the subject of the `ilab model serve` command is a model, regardless of the format of the model, every command takes in the `--model` flag or uses its default value in the config. 15 | 16 | `ilab model serve` has a new flag, `--backend`, that selects which backend to serve models with. As of this design, the two backends `ilab` would serve with are `llama-cpp` and `vllm`.
17 | 18 | This would lead to the commands: 19 | 20 | - `ilab model serve --backend llama-cpp` 21 | - `ilab model serve --backend vllm` 22 | 23 | There are specific flags for `ilab model serve` that would apply to all backends. These can be viewed by running `ilab model serve --help`. 24 | 25 | The following is an overview for the flags of `ilab model serve`: 26 | 27 | ```console 28 | ilab model serve 29 | | 30 | |_______ (backend agnostic flags) 31 | | 32 | |_______ --backend ['llama-cpp', 'vllm'] 33 | |_______ --backend-args 34 | ``` 35 | 36 | The `backend` flag will also be available as an option in the config file (`config.yaml`). This will allow users to 37 | set a default backend for `ilab model serve` in the config. Also, commands like `ilab model chat` 38 | and `ilab data generate` that serve models in the background will use the default backend specified 39 | in the config. Here is an example of what the config file would look like: 40 | 41 | ```yaml 42 | serve: 43 | gpu_layers: -1 44 | host_port: 127.0.0.1:8000 45 | max_ctx_size: 4096 46 | model_path: models/merlinite-7b-lab-Q4_K_M.gguf 47 | backend: llama-cpp 48 | ``` 49 | 50 | ### Backend flags 51 | 52 | The `--backend-args` flag is a string that will be passed to the backend as arguments. This flag is used to pass 53 | backend-specific arguments to the backend. Multiple values will be supported; however, the exact formatting will be 54 | defined in the implementation proposal. The backend will be responsible for parsing individual arguments. 55 | 56 | It will also be available as an option in the config file (`config.yaml`). This will allow users to set default backend arguments for `ilab model serve` in the config. Here is an example of what the config file would look like: 57 | 58 | ```yaml 59 | serve: 60 | backend: llama-cpp 61 | backend_args: 62 | num_gpu_layers: 4 63 | max_ctx_size: 1024 64 | ``` 65 | 66 | For clarity and ease of implementation, when using the `--backend-args` flag, the user must pass the 67 | `--backend` flag as well. This is to ensure that the backend-specific arguments are passed to the 68 | correct backend. Any backend-specific arguments that are not passed to the correct backend will be 69 | reported as an error. 70 | 71 | ## Command Examples 72 | 73 | ### Bare-bones but model-specific command 74 | 75 | ```shell 76 | ilab model serve --model <model-path> 77 | ``` 78 | 79 | - Serves the model at `<model-path>`. 80 | - If the `<model-path>` is the path for a model that can be run by `llama-cpp`, then `llama-cpp` is 81 | automatically used as the model serving backend. The current auto-detection logic will rely on a 82 | valid GGUF file format. If the model is a valid GGUF file, then `llama-cpp` will be used as the model serving backend. 83 | - If the `<model-path>` is the path for a model that can be run by `vllm`, then `vllm` is automatically used as the model serving backend. 84 | - If the model at `<model-path>` can be run by either backend, then a default backend defined in the 85 | config will be used as the model serving backend. In the case where there is ambiguity and a setting is not defined, a hardcoded preference will be used (none of the currently supported backends have this issue). A future profile specification will likely replace the hardcoded fallback. 86 | 87 | ### Bare-bones command 88 | 89 | ```shell 90 | ilab model serve 91 | ``` 92 | 93 | - This command has the same behavior as the one above, but the `--model` is whatever the default model path is in the config. This is the existing behavior of `ilab serve` today.
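As a rough illustration of the auto-detection described above: a GGUF check can be as simple as reading the file's magic bytes, since valid GGUF files begin with the ASCII magic `GGUF`. The function name and the directory heuristic for safetensors models are assumptions for this sketch, not the final implementation:

```python
from pathlib import Path

GGUF_MAGIC = b"GGUF"  # the first four bytes of a valid GGUF file


def detect_backend(model_path: Path) -> str:
    """Guess a serving backend from the on-disk model format."""
    if model_path.is_file():
        with model_path.open("rb") as f:
            # A GGUF model implies the llama-cpp backend.
            if f.read(4) == GGUF_MAGIC:
                return "llama-cpp"
    # Safetensors checkpoints are directories of shards; vllm serves those.
    if model_path.is_dir() and any(model_path.glob("*.safetensors")):
        return "vllm"
    raise ValueError(f"cannot determine a serving backend for {model_path}")
```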
94 | 95 | ### Llama-cpp backend-specific commands 96 | 97 | ```shell 98 | ilab model serve --model <model-path> --backend llama-cpp --backend-args '--num-gpu-layers 4' 99 | ``` 100 | 101 | - This command serves a model with `llama-cpp`. 102 | - If the model provided is not able to be served by `llama-cpp`, this command would error out and suggest an alternate backend to use. 103 | - The existing flags to `ilab serve` (besides `--model-path` & `--log-file`) are now specific to the `llama-cpp` backend. 104 | 105 | ### vllm backend-specific commands 106 | 107 | ```shell 108 | ilab model serve --model <model-path> --backend vllm --backend-args '--chat-template <chat-template-path>' 109 | ``` 110 | 111 | - This command serves a model with `vllm`. 112 | - If the path provided is not able to be served by `vllm`, this command would error out and suggest an alternate backend to use. 113 | - There are [dozens](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#command-line-arguments-for-the-server) of flags for vllm. Whichever arguments the community deems the most important to include will be added as flags to `ilab model serve`. 114 | - Any remaining arguments can be specified in the value of the flag `--backend-args`. 115 | 116 | ## Testing 117 | 118 | An additional end-to-end test will be added for each new backend for `ilab model serve`. This new test should be triggered whenever changes to the new backend serving code are made, or before a release. 119 | 120 | This new test will do the following: 121 | 122 | 1. Initialize `ilab` in a virtual env via `ilab config init`. 123 | 2. Download a model via `ilab model download`. 124 | 3. Serve the downloaded model with the new backend via `ilab model serve`. 125 | 4. Generate synthetic data using the served model via `ilab data generate`. 126 | 5. Chat with the served model via `ilab model chat`. 127 | 6. Any future commands that interact with a served model should be added to the test. 128 | 129 | Some commands, like `ilab model chat` and `ilab data generate`, serve models in the background as part of the command. If automatic serving of a new backend is implemented for a command, testing of that command will also be included in the new end-to-end test. 130 | 131 | ## Handling existing backend-specific commands 132 | 133 | The existing `ilab model serve` command has flags that are specific to the `llama-cpp` backend. The current list of flags is: 134 | 135 | - `--num-gpu-layers` 136 | - `--max-ctx-size` 137 | - `--num-threads` 138 | 139 | These flags will be moved to `--backend-args` and will be used as the default arguments for 140 | the `llama-cpp` backend. This will allow for a more consistent experience across backends. The flags will 141 | be supported for up to two releases after the release of the new backend. After that, the flags will be 142 | removed. During those two releases, a warning will be printed to the user when the flags are used. 143 | -------------------------------------------------------------------------------- /docs/cli/ilab-processes.md: -------------------------------------------------------------------------------- 1 | # Processes in InstructLab 2 | 3 | The ability to detach from processes is crucial to the user experience of InstructLab. However, the concept of multi-processing, process management, and the monitoring of processes is very complex. 4 | 5 | It is important to try to add this concept as simply as possible, expanding on the state reporting, logging, and other features as we go along.
6 | 7 | ## Phased approach to InstructLab Processes 8 | 9 | This document describes phase 1 of implementing processes in InstructLab. Phase 1 can be described as the "ilab simple process management system". It will depend purely on Python packages, PID tracking, and log files to create the experience of detachable processes. The key here is the concept of the UUID, allowing a future REST API to keep track of InstructLab processes using these unique identifiers. 10 | 11 | We can revisit all of this in phase 2, when we discuss whether we want to utilize something like systemd or a more in-depth process-monitor repo to track processes. 12 | 13 | ### Phase 1 14 | 15 | Phase 1 would focus on adding the concept of detaching from processes, re-attaching to them, and managing the various artifacts from the processes. 16 | 17 | Process management would only apply to `ilab data generate` and `ilab model train` in a first iteration. This would be followed by commands like `ilab model evaluate`, `ilab model serve`, and `ilab model download`. All of these commands have long-running processes that would benefit from detachment. 18 | 19 | The workflow would allow for: 20 | 21 | `ilab data generate -dt` (run a detached generation process) 22 | `ilab model train -dt` (run a detached training process) 23 | 24 | `ilab process list` 25 | 26 | ```console 27 | +------------+-------+--------------------------------------+------------------------------------------------------------------------------------------------------------------+----------+ 28 | | Type | PID | UUID | Log File | Runtime | 29 | +------------+-------+--------------------------------------+------------------------------------------------------------------------------------------------------------------+----------+ 30 | | Generation | 39832 | 82d00a5b-5ed5-4cfd-9a75-a87e4f420b27 | /Users/charliedoern/.local/share/instructlab/logs/generation/generation-82d00a5b-5ed5-4cfd-9a75-a87e4f420b27.log | 69:26:28 | 31 | | Generation | 40791 | 09f9d301-4fd9-4045-bfda-8a56f1d96016 | /Users/charliedoern/.local/share/instructlab/logs/generation/generation-09f9d301-4fd9-4045-bfda-8a56f1d96016.log | 68:45:40 | 32 | | Generation | 47390 | 4ccabfa5-604f-49c6-b5c3-730ce328d62a | /Users/charliedoern/.local/share/instructlab/logs/generation/generation-4ccabfa5-604f-49c6-b5c3-730ce328d62a.log | 67:26:33 | 33 | | Generation | 50872 | 093ac2e9-080c-45fe-89c5-43d508d6369c | /Users/charliedoern/.local/share/instructlab/logs/generation/generation-093ac2e9-080c-45fe-89c5-43d508d6369c.log | 05:24:56 | 34 | +------------+-------+--------------------------------------+------------------------------------------------------------------------------------------------------------------+----------+ 35 | ``` 36 | 37 | `ilab process attach <UUID>` 38 | 39 | This command would re-attach to the given process, allowing the user to view the live logs of the process. `attach` would tail the log file and listen for user input to kill the process. 40 | 41 | These commands will be done in a very simple way at first, using the following architecture: 42 | 43 | 1. A detached process will be re-attachable by tailing the log file and then allowing the user to Ctrl+C the process as normal using `KeyboardInterrupt` 44 | 2. The process registry will be maintained for tracking UUIDs created via the `uuid` Python package, the PID of the actual process, a `log_file` where the process will output its logs so that the user can re-attach, and the start time of the process.
The log file directory will be tracked using our `DEFAULTS` package and will be standard throughout releases. 45 | 46 | The general flow would be: 47 | 48 | 1. A user runs `ilab data generate -dt`. 49 | 2. A UUID, PID, and log file are added to the process registry. 50 | 3. The process would exit and print the UUID of the SDG run. 51 | 4. A user could attach to this process using `ilab process attach <UUID>`. 52 | 5. This command would look in the process registry for the PID and/or UUID, get the log file, tail the log file, and listen for a Ctrl+C keyboard interrupt. 53 | 54 | This allows us to detach from processes while still running them in the background and maintain log files, all without the use of anything other than `uuid` and `subprocess`. 55 | 56 | #### Log file management 57 | 58 | If log files from the various libraries already exist, those will be used in this scenario. If they do not, InstructLab will manage writing process logs to disk. Regardless of whether the libraries maintain their own log files, InstructLab will need to co-locate the log files in a centralized directory. 59 | 60 | If a log file exists, it will be copied and renamed into the following directory format: 61 | 62 | `~/.local/share/instructlab/logs/<command>/<command>-<UUID>.log` 63 | 64 | If the log file does not exist, InstructLab will create one with this format. Libraries are responsible for standardizing where their logs are stored, if they already exist, so the Core package can access them in a uniform fashion and copy them to the proper directory. -------------------------------------------------------------------------------- /docs/dependency-management.md: -------------------------------------------------------------------------------- 1 | # Dependency Management Policy for InstructLab 2 | 3 | This document describes the policies for adding and updating build and runtime dependencies of all InstructLab components. 4 | 5 | ## Best practices 6 | 7 | 8 | 1. Express dependencies by setting a minimum version (using `>=`) to ensure compatibility. 9 | 2. Do not "pin" a dependency to a single version (using `==`). 10 | 3. Exclude specific versions known to not work (using `!=`). 11 | 4. Only apply "caps" to dependencies (using `<`) when that dependency has established a pattern of producing new releases with breaking changes. 12 | 5. Pin versions in CI, with frequent automated updates. 13 | 14 | Best practices for Python dependencies call for using ranges in package requirements and [pinning versions only in CI jobs](./ci/dependency-constraints.md). 15 | 16 | Using pinned versions in a list of constraints used for tests allows us to know and advertise exactly what versions have been tested in CI. 17 | That information is useful for users and re-packagers to understand which versions of dependencies are compatible with more specificity than the ranges provide. 18 | Tools like Dependabot will submit PRs to automatically update those pins to help us keep up with new releases of all of our dependencies. 19 | 20 | Pinning to specific versions in the package dependencies so that installing a built package requires an exact version of a dependency is not a good practice. 21 | It makes it very easy for sets of packages that need to be installed together to have incompatible dependencies, which in turn makes it impossible to actually install them (for example, [instructlab/training #34](https://github.com/instructlab/training/issues/34)).
22 | Pinned dependencies also make it difficult to deal with CVEs or other critical bugs in those dependencies, which makes delivering products from this project more challenging. 23 | Do not pin to specific versions of libraries. 24 | 25 | Instead of pinning, we use version ranges. 26 | This ensures that re-packagers and installers have some flexibility in case a dependency of our package has a critical CVE and needs to be updated. 27 | Those ranges should include a minimum version, and in some cases a maximum version (a "cap"). 28 | 29 | Specifying the minimum value for the range (`foo>=x.y`) allows us to declare that we need features that only show up in or after that version of the dependency, which means we won't get bugs from users trying to use instructlab with an old dependency that has an incompatible API or is completely lacking a feature we need. 30 | 31 | Specifying a maximum value for the range (`foo>=x.y,<z`) caps the dependency to versions below `z`; as the best practices above note, such caps should only be applied when a dependency has established a pattern of producing new releases with breaking changes. -------------------------------------------------------------------------------- /docs/docs-instructlab-website.md: -------------------------------------------------------------------------------- repository. 13 | and create a `CNAME` to point to . 14 | 2) We migrate the different `README.md`s to this static site (most likely [Material for MkDocs](https://squidfunk.github.io/mkdocs-material/)), 15 | and iterate on organizing the documentation via PRs and a useful flow of information. 16 | 3) We leverage our TechWriters to have a single location to update official "downstream" documentation. 17 | 4) We have the dedicated `README.md`s for each project, but stripped down to very specific niche things for the sub-project, or fast feedback development notes. 18 | -------------------------------------------------------------------------------- /docs/evaluation/eval-repo.md: -------------------------------------------------------------------------------- 1 | # New Repository Proposal: eval 2 | 3 | ## Summary 4 | 5 | This document proposes a new repository under the `instructlab` GitHub organization: 6 | 7 | - `instructlab/eval` 8 | 9 | ## Background 10 | 11 | The `instructlab/instructlab` repository currently includes no real implementation 12 | of Evaluation as described by the [LAB paper](https://arxiv.org/abs/2403.01081). The 13 | closest implementation currently in `instructlab/instructlab` is via the `ilab test` command. 14 | 15 | `ilab test` as of this writing is only implemented for macOS with M-series chips. It uses 16 | a JSON Lines file and a LoRA adapter to compare output of a given model before and after 17 | LoRA training with MLX, thus the macOS M-series dependency. 18 | 19 | We desire to build out a library for methods that satisfy the evaluation described in the 20 | paper, using higher-level evaluation schemes such as 21 | [Multi-turn Benchmark](https://arxiv.org/abs/2306.05685) for skills and 22 | [Massive Multitask Language Understanding](https://arxiv.org/abs/2009.03300) (MMLU) for 23 | knowledge. We propose a new repository to house this code that publishes a new Python 24 | library called `instructlab-eval`. The reasoning for a new repository and library includes: 25 | 26 | - We expect multiple consumers of this code. The `ilab` CLI is one, but we also envision 27 | building a REST API around it to help support scaling out this functionality on a cluster. 28 | - We expect there is broader community interest in an open-source library and service for 29 | evaluation. We envision this library could support other evaluation techniques over time. 30 | - We also realize that much of model evaluation is generally useful outside the context of 31 | InstructLab.
Other libraries may emerge in the broader ecosystem that handle parts of what 32 | we need, while this library will always remain to handle the InstructLab-specific details 33 | of how evaluation works in our workflow. 34 | 35 | ## Maintainers 36 | 37 | The initial team of maintainers for this repository will be a copy of the 38 | `Backend Maintainers` GitHub team. 39 | 40 | ## Alternatives Considered 41 | 42 | ### Add to `instructlab/instructlab` 43 | 44 | We could add this code to the existing `instructlab/instructlab` repository. 45 | 46 | The primary argument against this approach is that we expect the scope of an 47 | `instructlab-eval` library to expand beyond the scope of what would be run by the 48 | `ilab` CLI. We instead envision a different community of contributors organizing 49 | around Evaluation specifically. 50 | -------------------------------------------------------------------------------- /docs/examples/examples-repo.md: -------------------------------------------------------------------------------- 1 | # Create A Repo for InstructLab Examples 2 | 3 | ## Idea Overview 4 | 5 | Create a separate repository within the `instructlab` GitHub org called `examples`. 6 | This repository would house [Jupyter notebooks](https://jupyter.org/) and other examples that illustrate parts of or the entire LAB methodology. 7 | The `examples` repo would be a place where the maintainers of the InstructLab project can collect notebooks for the benefit of the community. 8 | All notebooks submitted to this repository would be carefully documented and tested before being merged. 9 | 10 | ## Repository Structure 11 | 12 | The repository will start by housing notebooks in two categories. Notebooks live in either the `combined-stages` or the `use-cases` directory. 13 | 14 | ```bash 15 | examples 16 | | 17 | |- notebooks 18 | | 19 | |- combined-stages 20 | | |- training-with-eval 21 | | |- requirements.txt 22 | | |- training-with-eval.ipynb 23 | |- use-cases 24 | | |- policy-documents 25 | | | |- requirements.txt 26 | | | |- legislative-act.ipynb 27 | | |- instruction-manuals 28 | | | |- requirements.txt 29 | | | |- how-to-build-a-house.ipynb 30 | ``` 31 | 32 | ### Notebooks for Combined InstructLab stages 33 | 34 | Notebooks in the `combined-stages` directory go through parts of or the entire InstructLab workflow that users might want to reference or use. 35 | Some examples of combined stages are a notebook that runs through training and then evaluation, or a notebook that goes from document pre-processing to SDG. 36 | 37 | ### Notebooks for End-to-End (e2e) use cases 38 | 39 | Notebooks in the `use-cases` directory reflect real-world use cases from start to finish. 40 | 41 | ## Additional Info 42 | 43 | This repo would not be released as a package on PyPI but initially as just `.zip` and `.tar.gz` files on GitHub. 44 | Releases would serve the purpose of giving users specific versions of notebooks they could run reliably.
45 | -------------------------------------------------------------------------------- /docs/github-actions-providers.md: -------------------------------------------------------------------------------- 1 | # GitHub Actions Providers 2 | 3 | ## Approved Providers 4 | 5 | * `actions/*@*` - GitHub created actions 6 | * `aws-actions/*@*` - AWS actions 7 | * `DavidAnson/markdownlint-cli2-action@*` - Markdownlint-cli2 action 8 | * `docker/*@*` - Docker actions 9 | * `machulav/ec2-github-runner@*` - EC2 GitHub Runner action 10 | * `pypa/*@*` - Python Packaging actions 11 | * `rojopolis/spellcheck-github-actions@*` - Spellcheck action 12 | * `sigstore/gh-action-sigstore-python@*` - Sigstore Python action 13 | * `step-security/harden-runner@*` - Harden Runner action 14 | * `ludeeus/action-shellcheck@*` - Shellcheck action 15 | * `hynek/build-and-inspect-python-package@*` - Build and inspect Python package action 16 | * `andstor/file-existence-action@*` - File existence action 17 | 18 | ## Denied Providers 19 | 20 | There are currently no denied providers. 21 | -------------------------------------------------------------------------------- /docs/github-actions-use-policy.md: -------------------------------------------------------------------------------- 1 | # GitHub Actions Use Policy for InstructLab 2 | 3 | This document describes the use policy for GitHub Actions (actions) in workflows for all repositories in the [InstructLab](https://github.com/instructlab) organization. 4 | 5 | ## Background 6 | 7 | GitHub Actions are an important tool for CI/CD use within the repositories of the InstructLab project. 8 | One of the main values is to verify the quality of pull requests for things like tests passing, spelling checks, well-formedness of files, etc. 9 | Repositories may also use actions to build and publish releases for the project. 10 | 11 | Since actions play a critical role in the project, care must be taken in how they are used due to their place in the security of the software supply chain of the project. 12 | 13 | ## Dependabot 14 | 15 | Each repository using GitHub Actions must configure Dependabot to manage the action dependencies. 16 | The repository must contain a `.github/dependabot.yml` file with the following minimum configuration: 17 | 18 | ```yaml 19 | version: 2 20 | updates: 21 | # Maintain dependencies for GitHub Actions 22 | - package-ecosystem: "github-actions" 23 | directory: "/" 24 | schedule: 25 | interval: "daily" 26 | ``` 27 | 28 | Repository maintainers must review and respond to all pull requests submitted by Dependabot in a timely manner. 29 | 30 | ## SHAs instead of tags 31 | 32 | GitHub Actions must be referenced using SHA values for a specific commit. 33 | Use of other reference types such as tag or branch names is not permitted. 34 | 35 | ```yaml 36 | - name: Git Checkout 37 | uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4.1.4 38 | ``` 39 | 40 | instead of 41 | 42 | ```yaml 43 | - name: Git Checkout 44 | uses: actions/checkout@v4 45 | ``` 46 | 47 | The use of SHAs can avoid security issues if the tag or branch of the action is moved to a malicious commit. 48 | We also gain build repeatability for future builds by referring to a precise revision of the action. 49 | 50 | Since we use Dependabot to manage our action dependencies, Dependabot will handle the chore of using the proper SHA values in the submitted pull requests when action dependencies are updated. 
51 | 52 | ## Trusted Providers of GitHub Actions 53 | 54 | There are many GitHub Actions available in GitHub. 55 | Not all of them can necessarily be trusted. 56 | The InstructLab project must maintain [a list of allowed providers and a list of denied providers](github-actions-providers.md). 57 | 58 | Allowed providers will include all GitHub-created actions (`actions/*`) as well as other official actions such as Python Packaging actions (`pypa/*`). 59 | 60 | The InstructLab organization's Settings->Actions->General must be configured to allow select actions, including actions created by GitHub along with the allowed providers. 61 | 62 | Adding actions to the allowed providers or denied providers lists will require approval by the organization maintainers along with updating the organization's settings. This can be done by submitting a Pull Request to modify [`github-actions-providers.md`](github-actions-providers.md). 63 | 64 | ## Harden the GitHub Action runner 65 | 66 | To further increase security, the first step of each job in a workflow must be the [`step-security/harden-runner`](https://github.com/step-security/harden-runner) action. 67 | This action hardens the action runner by implementing network egress control and some infrastructure security. 68 | 69 | ```yaml 70 | - name: "Harden Runner" 71 | uses: step-security/harden-runner@a4aa98b93cab29d9b1101a6143fb8bce00e2eac4 # v2.7.1 72 | with: 73 | egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs 74 | ``` 75 | 76 | Each execution of the workflow job will produce a report showing endpoints accessed by the job and possible source file overwrites. 77 | The report will also suggest modifications to the harden-runner action's configuration to further increase security. 78 | 79 | ```yaml 80 | - name: "Harden Runner" 81 | uses: step-security/harden-runner@a4aa98b93cab29d9b1101a6143fb8bce00e2eac4 # v2.7.1 82 | with: 83 | disable-sudo: true 84 | egress-policy: block 85 | allowed-endpoints: > 86 | github.com:443 87 | ``` 88 | 89 | Hardening the action runner like this can prevent malicious or misbehaving actions, or the misuse of actions, from exfiltrating secrets. 90 | The article [Harden-Runner Defends Against Arbitrary Command Execution in `tj-actions/changed-files` GitHub Action](https://www.stepsecurity.io/blog/defend-against-arbitrary-command-execution-in-tj-actions-changed-files) shows how the misuse of an action could allow an attacker with a well-crafted pull request to exfiltrate secrets. 91 | -------------------------------------------------------------------------------- /docs/github-merge-strategy.md: -------------------------------------------------------------------------------- 1 | # GitHub Merge Strategy for InstructLab 2 | 3 | This document describes the merge strategy used for Pull Requests within all repositories in the [InstructLab](https://github.com/instruct-lab) organization. 4 | 5 | ## Requirements for Merging a Pull Request 6 | 7 | Every Pull Request that is made to an InstructLab repository should meet the below requirements - certain repositories such as [taxonomy](https://github.com/instruct-lab/taxonomy) may have additional requirements. 8 | 9 | ### CI checks 10 | 11 | We should require that all CI checks pass on a Pull Request before it can be considered for merge. Every repository should have at minimum the following checks: 12 | 13 | - Linting 14 | - Testing (Unit, Functional, etc.) 15 | - DCO Commit Signoff via a `Signed-off-by` header.
There is a DCO check enabled for all repositories in this GitHub organization. 16 | 17 | Additional checks might be required for repositories on a case-by-case basis. 18 | 19 | ### Approvals from Project Maintainers 20 | 21 | At least one Project Maintainer must have an approving review on a Pull Request for it to be considered for merge. Requiring more reviews is left up to the discretion and consensus of the application maintainers team for a repository. The definition of a Project Maintainer can be found [here](https://github.com/instruct-lab/community/blob/main/governance.md#project-maintainers-overview). 22 | 23 | Project Maintainers are given access permissions via [GitHub Teams](https://github.com/orgs/instruct-lab/teams) - you can see more details on the specific responsibilities of these teams [here](https://github.com/instruct-lab/community/blob/main/MAINTAINERS.md). 24 | 25 | ## Method for Merging a Pull Request 26 | 27 | There are [three different merge methods offered by GitHub](https://docs.github.com/en/repositories/configuring-branches-and-merges-in-your-repository/configuring-pull-request-merges/about-merge-methods-on-github) for merging Pull Requests. 28 | 29 | We use the default merge method of creating merge commits for PRs. This is to ensure we retain the full commit history as intentionally structured by the PR author while also retaining metadata about the PR itself in the merge commit. 30 | 31 | This requires project maintainers to include commit messages and the overall structure of the commit series as part of their review. When multiple commits are present, they should represent a logical series of changes that implement the overall change proposed in the PR. The commit message for each should clearly explain that step of the progression. 32 | 33 | It is common that a PR author may need to do a final rebase to clean up their proposed commit series before a PR can be merged. It is also fine for a project maintainer to perform this step when the changes necessary are straightforward enough to do so. This includes doing a final rebase on `main` if necessary. The PR itself should NOT include any merge commits of `main` back into the developer's branch. We expect the proposed commit series to be a clean set of commits against `main` without conflicts or merge commit history. We only use a merge commit to record the PR's inclusion into `main`. 34 | 35 | ## Merge Automation 36 | 37 | Repositories may use [Mergify](https://mergify.io/) to automate the merge 38 | process and enforcement of merge policies. Using this tool allows us to encode 39 | the merge requirements in a file stored in the git repository itself. Once all 40 | requirements are met, Mergify will automatically merge the PR. 41 | 42 | An example configuration can be found in the [instructlab-bot 43 | repo](https://github.com/instructlab/instructlab-bot/blob/main/.github/.mergify.yml). 44 | -------------------------------------------------------------------------------- /docs/github-taxonomy-automation.md: -------------------------------------------------------------------------------- 1 | # GitHub Automation for Taxonomy 2 | 3 | This document describes the GitHub automation used with the 4 | [Taxonomy](https://github.com/instruct-lab/taxonomy) repository.
5 | 6 | ![GitHub Automation for Taxonomy](images/github-taxonomy-automation.excalidraw.png) 7 | 8 | ## Key Components 9 | 10 | ### InstructLab Bot 11 | 12 | There is a [GitHub bot](https://github.com/instruct-lab/instruct-lab-bot) used 13 | to facilitate this automation. This bot receives GitHub events and performs 14 | actions on behalf of different components within Instruct Lab. More details on 15 | the architecture of the bot itself can be found in its repository. 16 | 17 | ### GitHub Actions 18 | 19 | Not everything needs to be orchestrated by an external bot. The bot is good for 20 | facilitating user interactions and managing long-running tasks on custom 21 | infrastructure. However, many simpler tasks should just be GitHub actions. These 22 | are easier to write and maintain, and they can be triggered by a variety of 23 | events. 24 | 25 | ### Mergify (future) 26 | 27 | [Mergify](https://docs.mergify.com) is a tool that can perform workflow 28 | automation for a GitHub repository. It can be used to implement a variety of 29 | automation tasks, such as automatically merging PRs when certain conditions are 30 | met. While not yet in use, it should be kept in mind for future needs. 31 | 32 | ## Points of Interaction 33 | 34 | ### Access Controls 35 | 36 | Team membership for triagers of the Taxonomy repository is managed using the 37 | [taxonomy-triagers GitHub 38 | team](https://github.com/orgs/instruct-lab/teams/taxonomy-triagers). 39 | 40 | The bot should allow configuring a list of GitHub teams that are allowed to 41 | enable its functionality on a PR. The first command will only be accessible to 42 | the following teams: 43 | 44 | * Taxonomy Triagers 45 | * Taxonomy Maintainers 46 | * Backend Maintainers 47 | * Instruct Lab Org Admins 48 | 49 | This first command is used to indicate that a PR has been reviewed enough to determine 50 | that it is safe to run automated tasks against it. This is to help avoid abuse 51 | of our backend resources with malicious PRs. 52 | 53 | ```text 54 | @instruct-lab-bot enable 55 | ``` 56 | 57 | If the bot determines the author of this command is in one of the required 58 | teams, it will set a [commit 59 | status](https://docs.github.com/en/rest/commits/statuses) on the PR indicating 60 | that the bot functionality is turned on. The use of commit status here is 61 | important because commit status will automatically be reset if the author of 62 | the PR makes further changes. Any changes should require another check by a 63 | triager or maintainer for the bot to be re-enabled. 64 | 65 | At this point, the bot should respond with a welcome message including a list of 66 | commands that have been enabled for this PR (`precheck`, `generate`, for example). 67 | 68 | * [Bot PR adding gating label support](https://github.com/instruct-lab/instruct-lab-bot/pull/120) 69 | 70 | Two labels will be used: 71 | 72 | * `skill` - This label indicates that the bot can run automation intended for skills PRs. 73 | * `knowledge` - This label indicates that the bot can run automation intended for knowledge PRs. 74 | 75 | ### Pre-Check Using the Existing Model 76 | 77 | The trigger for this step should be a PR comment with the following format: 78 | 79 | ```text 80 | @instruct-lab-bot precheck 81 | ``` 82 | 83 | At this point, the bot will kick off a process that will run the questions from 84 | the `qna.yaml` against the existing Merlinite model. 
The intent is to allow 85 | contributors and reviewers to see if the proposed addition is already a 86 | capability of the model by comparing the model's answers to the provided sample 87 | answers. 88 | 89 | When the process is complete, the bot will post a comment with instructions on 90 | how to access the results. 91 | 92 | * 93 | 94 | ### Post-Check Using the Trained Model 95 | 96 | The trigger for this step should be a PR comment with the following format: 97 | 98 | ```text 99 | @instruct-lab-bot postcheck 100 | ``` 101 | 102 | At this point, the bot will kick off a process that will run the questions from 103 | the `qna.yaml` against the new Merlinite model. This will allow contributors and 104 | triagers to see how the new model behaves for the sample questions. 105 | 106 | The GitHub bot will manage this process, but the actual model training will be 107 | done by the backend pipeline. The bot will hit an API endpoint that is serving 108 | the new candidate model. The exact API endpoint is a private endpoint hosted by 109 | backend infrastructure. 110 | 111 | When the process is complete, the bot will post a comment with instructions on 112 | how to access the results. 113 | 114 | * 115 | 116 | ### Synthetic Data Generation Check 117 | 118 | The trigger for this step should be a PR comment with the following format: 119 | 120 | ```text 121 | @instruct-lab-bot generate 122 | ``` 123 | 124 | At this point, the bot will kick off a process that will generate a sample of 125 | synthetic data using the Mixtral model. This is to allow contributors and 126 | reviewers to determine if the synthetic data generated as a result of the 127 | proposed addition is reasonable. 128 | 129 | When the process is complete, the bot will post a comment with instructions on 130 | how to access the results. 131 | 132 | * Working, but still needs remote endpoint support added: 133 | 134 | -------------------------------------------------------------------------------- /docs/images/github-taxonomy-automation.excalidraw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/instructlab/dev-docs/30cb9d62d9a13dc2eeeefe92db67feefce4582da/docs/images/github-taxonomy-automation.excalidraw.png -------------------------------------------------------------------------------- /docs/images/sdg-api-interface.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/instructlab/dev-docs/30cb9d62d9a13dc2eeeefe92db67feefce4582da/docs/images/sdg-api-interface.png -------------------------------------------------------------------------------- /docs/instructlab-app.md: -------------------------------------------------------------------------------- 1 | # InstructLab macOS App 2 | 3 | ## Scope 4 | 5 | This document is targeted at macOS applications, but the idea could easily be transferred to other operating systems. 6 | 7 | ## Problem statement 8 | 9 | Starting InstructLab on your local laptop is hard. It requires a significant amount of `python` knowledge and terminal 10 | work that is unrealistic for a non-technologist. Having to install `git`, specific versions of `python`, and 11 | `xcode` requires a level of expertise that will create barriers to adoption of the InstructLab project. 12 | 13 | ## Proposed solution 14 | 15 | [ollama][ollama] has a macOS application that is a double-click installation for their server to run the commands 16 | locally.
We propose creating the same "system bar" application, with the ability to run `ilab model serve` in the background 17 | and a possible way to do `ilab model chat` from said application. 18 | 19 | Having the `ilab` dog up in the system bar telling you that `ilab model serve` is running could open up the opportunity 20 | to ask a quick question of the local model, and even an ability to open up a "long-running" 21 | conversation via a web browser or the like. 22 | 23 | ## Next steps 24 | 25 | 1. Create a simple MVP of starting the `ilab model serve` application, with controls for the `serve` options, including 26 | what model you'd like to run, e.g., Granite or Merlinite. 27 | 2. Create an option to ask a quick question (`-qq` option) to the model via the drop-down 28 | 3. Create an `ilab model chat`-type interface via a window or web browser. 29 | 30 | [ollama]: https://ollama.com/download/mac 31 | -------------------------------------------------------------------------------- /docs/instructlab-cli-1.0.0.md: -------------------------------------------------------------------------------- 1 | # The Road to 1.0.0 2 | 3 | _Or: How I Learned to Stop Worrying and Love to GA_ 4 | 5 | ## Context and Goals 6 | 7 | The `instructlab/instructlab` repo started off as `instructlab/cli` - a basic Python Click-based command-line interface designed to prototype an application capable of 8 | running the LAB methodology created by IBM Research. As the project evolved and the organization looked into creating a proper PyPI package for it, the decision was made 9 | to rename the repo to `instructlab/instructlab` to keep the repo name consistent with the PyPI package name. The rest of this document will be using "InstructLab" to 10 | refer to this repo and Python package. 11 | 12 | Today, InstructLab has gone from a scrappy research project to an upstream community serving as the basis for multiple downstreams, with the goal 13 | of continuing to evolve the community to encourage more participation from additional stakeholders. To wit, it would behoove us to determine what exactly we should be 14 | roadmapping between now and a proper 1.0.0 release, which demonstrates the following to existing and potential community members: 15 | 16 | 1. An official goalpost for the community denoting the evolution of InstructLab from a pre-1.0 project to one with the stability and supportability typically seen from 1.0-and-beyond projects. 17 | 1. A dedicated set of V1 interfaces, both for internal configs and an API, that can be counted on for continuous usage of InstructLab 1.0, with future provisions made for backwards compatibility for subsequent Y-Streams and Z-Streams. 18 | 1. A commitment from the Oversight Committee and Maintainer teams to continue to maintain InstructLab throughout a 1.y cycle and work towards an eventual 2.0. 19 | 20 | ## MVP for an InstructLab 1.0.0 21 | 22 | At a high level, these are the items the Maintainer teams believe should serve as prereqs for releasing an InstructLab 1.0.0: 23 | 24 | ### Updating relevant references of "CLI" to "Core" 25 | 26 | As noted in the `Context and Goals` section, InstructLab started off as just a CLI - however, we are planning for this package to serve as a more general "Engine" - 27 | a place where a future REST API can be defined that is used by both the CLI aspect as well as an official GUI for orchestrating the entire LAB workflow. Despite 28 | this, the repo is often still referred to as "the CLI".
We as an organization need a better term to refer to this repo by, and should update the relevant documentation 29 | and meetings accordingly. 30 | 31 | An open community vote made as part of the drafting of this document decided that "Core" would be the new term used. You can see a record of the vote 32 | [here](https://github.com/instructlab/dev-docs/pull/159#issuecomment-2514885516). This name change will begin to go into effect after the merging of this document 33 | and should be completed by the time of a 1.0.0. 34 | 35 | ### A fully-realized configuration scheme, centered around the usage of system profiles 36 | 37 | The InstructLab configuration scheme has transformed in many ways since the project's inception, from the `config.yaml` file that initially served as the user's config, 38 | to the addition of code-based Pydantic defaults, to train profiles, to system profiles. We need to fully decouple this config from the Click library, remove the need for 39 | a `config.yaml` file, and have a consistent config scheme that can be easily extended. 40 | 41 | ### An official v1 REST API schema 42 | 43 | We need to have a defined v1 REST API schema - while this does not preclude future updates, something mature enough to serve as a v1 API throughout subsequent Y-Streams 44 | for an InstructLab 1.0 is a must for such a milestone. 45 | 46 | ### Integration of InstructLab with RAG 47 | 48 | RAG is currently planned to be integrated into InstructLab - that work should be in a stable state adhering to our v1 API standard. 49 | 50 | ### An upgrade path to subsequent Y-Streams and an eventual 2.0 51 | 52 | Any user wishing to install an InstructLab 1.0 must have an upgrade path to 1.1, 1.2, ..., 1.n. Upon being ready for an InstructLab 2.0, we should also expect to 53 | provide a path for users wishing to upgrade from our final 1.y stream to 2.0. 54 | 55 | ### Backwards compatibility across the 1.y stream 56 | 57 | Any user going down our upgrade path described above should expect that the release they upgrade to is backwards-compatible with the release they upgrade from. 58 | 59 | ### An official hardware support matrix 60 | 61 | We need to have a documented matrix of what hardware footprints we support and to what extent - this includes hardware we know will not work, hardware that we know might 62 | work, and hardware we have confirmed will work with regular CI testing. 63 | 64 | ### A robust CI ecosystem 65 | 66 | We should have a CI ecosystem that includes linting as well as unit, functional, and integration/end-to-end (E2E) tests in the InstructLab repo, along with proper documentation and Makefiles that allow developers to easily run subsets of them locally on their machines. 67 | 68 | ## Q&A 69 | 70 | **Q. What about the libraries? Will they go 1.0.0 as well?** 71 | 72 | A. It depends - we historically have not aligned the InstructLab and Library releases on a particular version numbering scheme, apart from matching Y-Streams to Y-Streams (e.g., InstructLab 0.20 used SDG 0.4, Training 0.5, and Eval 0.3). At this stage, this document scopes only the prereqs we want for the InstructLab package. 73 | 74 | ## Conclusions and Decision Outcome 75 | 76 | This document will be debated and updated as part of the Pull Request review process. Upon reaching a lazy consensus by the Oversight Committee and Maintainer teams, the author of this document (Nathan Weinberg) will merge the document, denoting the following: 77 | 78 | 1.
The items in the above section `MVP for an InstructLab 1.0.0` will become official prerequisites for the InstructLab CLI Maintainer team to release a `1.0.0` of InstructLab. 79 | 2. Any amendments to this list can only be made with a subsequent PR editing this document, subject to the same review process. 80 | -------------------------------------------------------------------------------- /docs/knowledge-submissions-past-wikipedia.md: -------------------------------------------------------------------------------- 1 | # Knowledge Submissions Past Wikipedia 2 | 3 | ## Problem Statement 4 | 5 | For us to continue building our knowledge taxonomy tree, we need more trusted places to 6 | teach our models about the world. Wikipedia can get us only so far, so we need safe 7 | places we can take the data from. 8 | 9 | ## Proposed Solution 10 | 11 | Due to the restrictions of copyright law and how the internet is put together, 12 | we need a list of possible places we can eventually approve as content providers. 13 | This document will propose places other than 14 | Wikipedia for our legal experts to start investigating whether they can be 15 | on the [approved list][approved]. We have questions and thoughts from 16 | community members on places that should be approved, so here is a place we 17 | can continue the discussion and track ideas from our community. 18 | 19 | ### Proposed Domains 20 | 21 | Status: 22 | 23 | - `approved`: Approved by the legal team, and posted on the [approved list][approved]. 24 | - `denied`: Denied by the legal team, and posted on the [avoided list][avoided]. 25 | - `submitted`: Sent to the legal team for review. 26 | - `proposed`: The community would like to propose this as a possible place to take knowledge submissions from. 27 | - `reviewed - manually verify`: Legal team has reviewed this domain, and while much of its source material meets our open licensing criteria, not all of it does. Each submission from this source must be manually verified to actually be under an appropriate content license or definitively in the public domain. 28 | 29 | For the purposes of Knowledge submissions to the InstructLab project, data sourced from items in the `approved` category requires no further vetting from the Triage and/or other Maintainer teams. Items in the `reviewed - manually verify` category will require vetting before the submission can be accepted. 30 | 31 | To ensure that the data you would like to include in your knowledge submission meets the project licensing criteria, please make sure to talk to the Taxonomy maintainer team *before* you begin work on your submission. We would hate for you to do a great deal of work only to be told that the data source you selected would not work for the project. Please make sure you review the [Getting Started with Knowledge Submissions](https://github.com/instructlab/taxonomy?tab=readme-ov-file#getting-started-with-knowledge-contributions) documentation prior to submitting your request.
32 | 33 | | Domain name | Status | Notes | 34 | | :-- | :-- | :-- | 35 | | Wikipedia: | approved | | 36 | | Project Gutenberg: | approved | Pre-1927 works; public domain under US copyright law | 37 | | Wikisource (library): | approved | "free library that anyone can improve" | 38 | | OpenStax textbooks family of publications | approved | | 39 | | The Open Organization publications | approved | | 40 | | The Scrum Guide | approved | | 41 | | | reviewed - manually verify | | 42 | | | reviewed - manually verify | | 43 | | | reviewed - manually verify | | 44 | | | reviewed - manually verify | | 45 | | NASA: | reviewed - manually verify | See guidelines: | 46 | | Smithsonian Libraries: | reviewed - manually verify | For any material marked "No Copyright - United States" or "CC0" as described here: | 47 | | European Union (EU): | reviewed - manually verify | Specifically documents submitted under "public registrars": | 48 | | Internet Archive: | reviewed - manually verify | Pre-1927 works; public domain under US copyright law | 49 | | PLOS family of open access journals: | reviewed - manually verify | | 50 | | Open Practice Library: | reviewed - manually verify | | 51 | | Cynefin.io wiki: | reviewed - manually verify | | 52 | | The Open Education Project: | reviewed - manually verify | | 53 | 54 | ### Process steps 55 | 56 | 1. Collect suggested places from the community by requesting they submit a pull request against this dev doc. 57 | 1. Work with our legal team to adjudicate. [@lhawthorn](https://github.com/lhawthorn) is currently the owner of this step, but is happy to educate & empower other folks to do this work. 58 | 1. Inform the triage team and triagers of the new locations we can or can not accept. This is currently done via an announcement in the [daily Triager Standup](https://github.com/instructlab/community/blob/main/Collaboration.md#triager-standup) and via a pull request to update the Knowledge Guide in one of the two locations listed below. 59 | 60 | - [Approved sources][approved] 61 | - [Rejected sources][avoided] 62 | 63 | [approved]: https://github.com/instructlab/taxonomy/blob/main/docs/KNOWLEDGE_GUIDE.md#accepted-knowledge 64 | [avoided]: https://github.com/instructlab/taxonomy/blob/main/docs/KNOWLEDGE_GUIDE.md#avoid-these-topics 65 | -------------------------------------------------------------------------------- /docs/library-release-strategy.md: -------------------------------------------------------------------------------- 1 | # Library Release Strategy for InstructLab 2 | 3 | This document describes the overarching release strategy and policies for Python libraries in the InstructLab organization. 4 | 5 | ## Background and Problem Statement 6 | 7 | The InstructLab organization features multiple code repositories that are tagged and released as Python libraries. 8 | Primarily this has been the [CLI repository](https://github.com/instructlab/instructlab) but has grown over time, 9 | including but not limited to: 10 | 11 | - [sdg](https://github.com/instructlab/sdg) 12 | - [training](https://github.com/instructlab/training) 13 | - [eval](https://github.com/instructlab/eval) 14 | - [schema](https://github.com/instructlab/schema) 15 | - [GPTDolomite](https://github.com/instructlab/GPTDolomite) 16 | - [instructlab-quantize](https://github.com/instructlab/instructlab-quantize) 17 | 18 | You can see the published versions of these libraries on PyPI [here](https://pypi.org/search/?q=instructlab).
19 | 20 | While these libraries are all owned and maintained by the InstructLab organization, only the CLI has an official 21 | [release strategy](https://github.com/instructlab/instructlab/blob/main/docs/release-strategy.md) documented. Other library 22 | releases have been handled directly by the Maintainer teams at their own discretion. The organization needs to have certain 23 | overarching principles around this topic, while still allowing for flexibility for each library on a case-by-case basis. 24 | 25 | ## Proposal 26 | 27 | By default, each existing and new library should have a `release-strategy.md` aligned with the CLI doc. This proposal recognizes that certain libraries may need flexibility 28 | on a case-by-case basis - therefore, Maintainer teams are empowered to modify these documents as they see fit, so long as the 29 | following tenets remain consistent: 30 | 31 | 1. Packages **must** be named `instructlab-` 32 | 1. Packages **must** follow the `X.Y.Z` numbering scheme (i.e. [semver](https://semver.org/)) 33 | 1. Packages **must** have GitHub tagged releases named `vX.Y.Z` 34 | 1. Packages **must** use release branches for Y-Streams of the form `release-vX.Y` 35 | 1. Packages **must** maintain a `CHANGELOG.md` 36 | 1. Maintainer teams **must** publicly communicate Y-Stream releases through official InstructLab channels such as Discord, Slack or Mailing Lists. Z-Stream release communication is up to Maintainer discretion. 37 | -------------------------------------------------------------------------------- /docs/license-headers.md: -------------------------------------------------------------------------------- 1 | # Guidance for License Headers in Source Files 2 | 3 | ## Code Originating from InstructLab 4 | 5 | * We prefer not to have a traditional full license header at the top of every 6 | file. We feel it is unnecessary. 7 | 8 | * We prefer to have a single-line license identifier at the top of our source 9 | files, though it is not strictly required. 10 | 11 | For example: 12 | 13 | ```text 14 | // SPDX-License-Identifier: Apache-2.0 15 | ``` 16 | 17 | * We do not include Copyright statements at the top of our source files. These 18 | statements are notoriously difficult to keep up-to-date and fully accurate and 19 | are deemed unnecessary. 20 | -------------------------------------------------------------------------------- /docs/naming-conventions.md: -------------------------------------------------------------------------------- 1 | # Naming Conventions 2 | 3 | ## InstructLab - The Overall Project Name 4 | 5 | * `instructlab` - in URLs, no hyphen 6 | * `instruct-lab` - in URLs if `instructlab` is not available for some reason 7 | (like Slack) 8 | * `InstructLab` - CamelCase when referring to the name of the overall project. 9 | * Never `Instruct Lab` with a space. 10 | 11 | ## Models 12 | 13 | * `Merlinite` and `Granite` are capitalized unless referring to a filename 14 | like `merlinite-7b-lab-Q4_K_M.gguf`. 15 | -------------------------------------------------------------------------------- /docs/publish-strategy.md: -------------------------------------------------------------------------------- 1 | # Publishing Strategy for InstructLab 2 | 3 | This document describes the publishing strategy used for all models in the InstructLab HuggingFace [organization](https://huggingface.co/instructlab) and the InstructLab Ollama [user account](https://ollama.com/instructlab). 4 | 5 | ## What are we publishing and why?
6 | 7 | The InstructLab team will be periodically training the full unquantized model with new Pull Requests to the [taxonomy](https://github.com/instructlab/taxonomy) repository. When the evaluation shows that the model has improved, the team will be publishing a 4-bit quantized GGUF model to a platform called Ollama, and both an unquantized model and a 4-bit quantized GGUF model to a platform called HuggingFace. 8 | 9 | ## What is Ollama? 10 | 11 | [Ollama](https://ollama.com/) is a platform that allows users to run open-source large language models locally on their machines. Ollama covers a broad spectrum of users, from seasoned AI professionals to people looking to explore the potential of AI, and makes it easier for users to leverage the power of LLMs without having to rely on a cloud infrastructure. Since Ollama only supports publishing GGUF models, we will only be publishing the 4-bit quantized versions of our Merlinite and Granite models to the InstructLab Ollama user account. 12 | 13 | ## What is HuggingFace? 14 | 15 | [HuggingFace](https://huggingface.co/) is a centralized web service platform, similar to GitHub, for hosting Git-based repositories related to data science and machine learning. In the context of InstructLab, HuggingFace is the platform where we will be publishing releases of our model for consumption by the community. 16 | 17 | We will be publishing two different kinds of model families - Merlinite and Granite. 18 | 19 | ## Merlinite 20 | 21 | The Merlinite model family is based on the [Mistral](https://mistral.ai/) model family and uses the [Large-scale Alignment for chatBots (LAB)](https://arxiv.org/abs/2403.01081) alignment. You can read more about it [here](https://huggingface.co/instructlab/merlinite-7b-lab). 22 | 23 | The InstructLab organization will be publishing a community version of the Merlinite 7B size model, in both unquantized and 4-bit quantized GGUF form to HuggingFace and just the 4-bit quantized GGUF form to Ollama. 24 | 25 | ## Granite 26 | 27 | The Granite model family is the [foundational model family](https://www.ibm.com/downloads/cas/X9W4O6BM) for the IBM watsonx AI platform, designed for usage in a business environment. You can read more about it [here](https://huggingface.co/instructlab/granite-7b-lab). 28 | 29 | The InstructLab organization will be publishing a community version of the Granite 7B size model using the [Large-scale Alignment for chatBots (LAB)](https://arxiv.org/abs/2403.01081) alignment, in both unquantized and 4-bit quantized GGUF form to HuggingFace and just the 4-bit quantized GGUF form to Ollama. 30 | 31 | ## HuggingFace Naming Scheme 32 | 33 | The naming scheme for both Merlinite and Granite will follow this generic scheme: 34 | 35 | ` - - - - ` 36 | 37 | The specific schemes that will be published are detailed below: 38 | 39 | | Model Family, Size, Alignment, etc.
| Release Branch Format | Purpose | 40 | | --- | --- | --- | 41 | | [`merlinite-7b-lab`](https://huggingface.co/instructlab/merlinite-7b-lab) | `release-yyyymmdd` | Where the full precision Merlinite safetensors live | 42 | | [`merlinite-7b-lab-GGUF`](https://huggingface.co/instructlab/merlinite-7b-lab-GGUF) | `release-yyyymmdd` | Where the full precision and quantized Merlinite GGUFs live | 43 | | [`granite-7b-lab`](https://huggingface.co/instructlab/granite-7b-lab) | `release-yyyymmdd` | Where the full precision Granite safetensors live | 44 | | [`granite-7b-lab-GGUF`](https://huggingface.co/instructlab/granite-7b-lab-GGUF) | `release-yyyymmdd` | Where the full precision and quantized Granite GGUFs live | 45 | 46 | ## Ollama Naming Scheme 47 | 48 | The naming scheme for both Merlinite and Granite will follow this generic scheme: 49 | 50 | ` - - - ` 51 | 52 | The specific schemes that will be published are detailed below: 53 | 54 | | Model Family, Size, Alignment, etc. | Release Tag Format | Purpose | 55 | | --- | --- | --- | 56 | | [`merlinite-7b-lab`](https://ollama.com/instructlab/merlinite-7b-lab) | `release-yyyymmdd` | Where the full precision and quantized Merlinite GGUFs live | 57 | | [`granite-7b-lab`](https://ollama.com/instructlab/granite-7b-lab) | `release-yyyymmdd` | Where the full precision and quantized Granite GGUFs live | 58 | 59 | ## Retention Policy 60 | 61 | The InstructLab team will maintain the most recent **10** published versions of the respective models. 62 | -------------------------------------------------------------------------------- /docs/rag/adrs/README.md: -------------------------------------------------------------------------------- 1 | # Architecture Decision Records 2 | 3 | The ADR is a lightweight record format intended to capture individual architecturally important decisions. They are meant to be easy to write - 10 minutes or less. They should be stored in the codebase they affect, go through peer review, and have a commit history. 4 | 5 | This simple format, which is described below, has a surprising number of functions: 6 | 7 | * **Decision making process**: by going through peer review, it includes the entire team and gives all perspectives a chance to be heard. There is a clear decision making process with a clear lifecycle - once an ADR meets whatever approval criteria the team chooses, it is merged and the decision is done. If new information comes to light that causes the team to reconsider the decision, then that is simply a new ADR. 8 | * **Institutional knowledge and transparency**: Not everyone will comment on every ADR, but the transparency of the mechanism should serve to keep everyone informed and encode tribal knowledge into writing. This also builds resilience - there should ideally never be decision making that is blocked by someone being sick or on vacation. The team should always be able to make significant decisions. 9 | * **Distribute design authority**: As a team becomes familiar and comfortable with the ADR mechanism, every team member has an equal tool to bring design decisions to the team. This encourages autonomy, accountability, and ownership. 10 | * **Onboarding and training material**: A natural consequence of it being easy to write an ADR and getting into the habit of doing so is that new team members can simply read the record of ADRs to onboard. 11 | * **Knowledge sharing**: The peer review phase allows sharing of expertise between team members. 
12 | * **Fewer meetings**: As decision making becomes asynchronous and as the team forms its social norms around the process, there should be less time required in meetings. 13 | 14 | ## When to write an ADR 15 | 16 | * A decision is being made that requires discussion between two or more people. 17 | * A decision is being made that requires significant investigation. 18 | * A decision is being proposed for feedback / discussion. 19 | * A decision is being proposed that affects multiple teams. 20 | 21 | ## Template 22 | 23 | [Here](template.md). 24 | 25 | ## Related Reading 26 | 27 | * [Suggestions for writing good ADRs](https://github.com/joelparkerhenderson/architecture-decision-record?tab=readme-ov-file#suggestions-for-writing-good-adrs) 28 | * [ADRs at Red Hat](https://www.redhat.com/architect/architecture-decision-records) 29 | * [ADRs at Amazon](https://docs.aws.amazon.com/prescriptive-guidance/latest/architectural-decision-records/adr-process.html) 30 | * [ADRs at GitHub](https://adr.github.io/) 31 | * [ADRs at Google](https://cloud.google.com/architecture/architecture-decision-records) -------------------------------------------------------------------------------- /docs/rag/adrs/adr-vectordb.md: -------------------------------------------------------------------------------- 1 | # Initial InstructLab Vector Store 2 | 3 | ## Context 4 | 5 | One of the first choices to make in implementing RAG is to choose an initial vector store to develop against. Though the usage of frameworks like LangChain or Haystack makes it easy to swap vector databases, we need a working end-to-end implementation for RAG that is tested against and available to install with InstructLab. There are many options (see [here](https://docs.haystack.deepset.ai/docs/choosing-a-document-store)). 6 | 7 | Our main long-term requirements are that our chosen store have fully-developed document update support (and thus some notion of a primary key), that it be scalable to cluster size, and that it have a permissive license (Apache, MIT, or similar). Among the available choices, [Milvus](https://milvus.io/) provides a strategic advantage due to its [investment from watsonx](https://www.ibm.com/new/announcements/ibm-watsonx-data-vector-database-ai-ready-data-management). 8 | 9 | Milvus can be used in-process ([Milvus Lite](https://milvus.io/docs/milvus_lite.md)), single-node ([Milvus](https://milvus.io/docs/prerequisite-docker.md)), or cluster-scale ([Milvus Distributed](https://milvus.io/docs/prerequisite-helm.md)). 10 | 11 | ## Decision 12 | 13 | InstructLab will initially integrate with and use Milvus Lite for vector storage and retrieval augmented generation. 14 | 15 | ## Status 16 | 17 | Accepted 18 | 19 | ## Consequences 20 | 21 | * Users will have a clear [upgrade path](https://milvus.io/docs/upgrade_milvus_cluster-operator.md) from the laptop use case to cluster scale. 22 | * We should be able to have access to expert resources with Milvus via IBM. 23 | * The laptop use case of InstructLab will have a minimally resource intensive option for prototyping. 24 | * Since Milvus is used in watsonx, we can have confidence that it can meet expected scaling requirements. 25 | * Document updates can be accommodated using well-established [primary key](https://milvus.io/docs/primary-field.md) and [partition key](https://milvus.io/docs/use-partition-key.md) functionality.
26 | * There is a risk that developing against a mature vector store leads to using functionality that is not available in some other vector store that a potential customer requires. 27 | -------------------------------------------------------------------------------- /docs/rag/adrs/granite-embeddings.md: -------------------------------------------------------------------------------- 1 | # InstructLab RAG will use Granite Embeddings as the default embedding model 2 | 3 | ## Context 4 | 5 | InstructLab RAG will be using dense vector retrieval via a vector database to select documents for use in response generation. Dense vector retrieval requires a model to provide dense vectors of the content at indexing time and dense vectors of the query at retrieval time. 6 | 7 | Some key considerations: 8 | 9 | - We want a model that is reasonably effective at driving accurate semantic search. 10 | - We want a model that runs quickly and doesn't require a lot of expensive hardware to run. 11 | - We want a model that does not require an inordinate amount of storage space for the vectors. 12 | - We want a model with license terms that are compatible with the license for InstructLab so that anyone using InstructLab under the terms of its license can use its default embedding model without any additional encumbrance. 13 | - We want a model where the license terms for all of the data used to train the model are compatible with the license terms of that model. 14 | - If possible, it would be nice for the embedding model used for RAG to be the same embedding model that is used for other purposes in InstructLab. At this time, there are no other uses of embedding models for InstructLab, but we understand there are experiments under way to use embedding models for subset selection for synthetic data. 15 | 16 | We don't need this to be one model that is the best possible fit for all users. There is no such model because many of the criteria are trade-offs, e.g., models that are bigger tend to be more effective but also slower and require more memory and storage space. Fortunately, we are just choosing a default value, and users will be free to override that default and plug in the embedding model of their choice. For the default, the main consideration is that the model be acceptable across all the criteria. 17 | 18 | Another consideration might be the maximum amount of text that the embedding model can produce embeddings for. Many popular models have a maximum input length of 512 tokens, but some have longer maxima. A larger maximum might be useful for a RAG solution in which the response generation model has a large context window. However, it might not be: the larger context window might be better used by getting a larger number of search results or by looking up the text that came before and after each search result. A larger maximum input length for the query might also be useful for encoding not just the current user utterance but also the history of the chat leading up to that utterance. We will continue to explore this space and if we find that maximum input length is an important consideration, it could result in selecting a different default embedding model in the future.
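To make the indexing-time/query-time split described above concrete, here is a minimal sketch of dense vector retrieval. It assumes the `sentence-transformers` library and uses the Granite model that this ADR ultimately selects; the sample chunks and query are invented for illustration and are not part of any real pipeline:

```python
# Minimal sketch of dense vector retrieval, not InstructLab's actual implementation.
# Assumes the sentence-transformers library; sample texts are illustrative only.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("ibm-granite/granite-embedding-125m-english")

# Indexing time: embed each content chunk into a dense vector.
chunks = [
    "InstructLab uses a taxonomy to generate synthetic training data.",
    "Milvus Lite is the initial vector store for InstructLab RAG.",
]
chunk_vectors = model.encode(chunks)

# Retrieval time: embed the query and rank chunks by cosine similarity.
query_vector = model.encode("Which vector store does InstructLab RAG use?")
scores = util.cos_sim(query_vector, chunk_vectors)
print(chunks[scores.argmax().item()])
```

In a real deployment the chunk vectors would be stored in and retrieved from the vector database rather than compared in memory.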
19 | 20 | ### Alternatives 21 | 22 | There are two IBM Granite English embedding models: 23 | 24 | - [IBM Granite-Embedding-30m-English](https://huggingface.co/ibm-granite/granite-embedding-30m-english) 25 | - [IBM Granite-Embedding-125m-English](https://huggingface.co/ibm-granite/granite-embedding-125m-english) 26 | 27 | The 30m model provides accuracy that is comparable to the 125m model on some data sets but significantly lower on other data sets. For use cases where there is a large amount of robust evaluation data and the ability to compute reliable metrics, it would generally be sensible to try both and measure the speed/size/accuracy trade-offs. On the other hand, for engagements where the quantity of data to be indexed is enormous, the advantages in indexing time and storage space from using a smaller model can be overwhelming. Such engagements would generally be better served by 30m-English. However, for a simple proof-of-concept where a user won't be able to measure accuracy robustly, 125m-English is probably a better choice over 30m-English because 125m is small enough for most purposes and is the one most likely to be accurate enough that a user will be happy with the results. The simple proof-of-concept use case seems like the most important one for determining the *default* model because users who are outside of the context of a simple proof-of-concept are much more likely to be overriding the default anyway. Both of these models have a maximum input length of 512 tokens. 28 | 29 | Other options available include: 30 | 31 | - IBM Granite multi-lingual embedding models seem like they could be very useful as part of a broader InstructLab multilingual strategy. For now, the defaults we have are focused on English, but in the future we might want users to provide the target language or languages they are working with during initial setup and then have defaults for various settings depend on that choice. Since this would involve a broader end-to-end change, it is out of scope for this ADR. The Granite multi-lingual models support 12 languages, but there are plenty of languages that they do not support. Other models support larger numbers of languages. 32 | - [NV-Embed-v2](https://huggingface.co/nvidia/NV-Embed-v2) has outstanding accuracy (for example, see the [MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard) overall English and retrieval English results). However, it is licensed for non-commercial use only and requires almost 30 GB of memory (so presumably it requires some expensive hardware to run at speed). For non-commercial users that prioritize accuracy over every other consideration, this might still be a fine model to choose, but it does not seem like a good default value because it doesn't meet all the criteria. 33 | - [all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) was proposed as a default in an [early draft of another dev doc in this repository](https://github.com/instructlab/dev-docs/pull/161/commits/7ca3ab624526a4c5a5c70d282f8a6be26c292020#diff-b103ed3331fbeb65d7569ea836c9fd4b53c845853d8c8e7567c34864eebcdfb8R51). It is a very popular, lightweight embedding model. However, its [list of training sources](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2#training-data) includes MS MARCO which is clearly marked as non-commercial use only.
In contrast, the [Granite embedding model card](https://huggingface.co/ibm-granite/granite-embedding-125m-english) asserts that "Notably, we do not use the popular MS-MARCO retrieval dataset in our training corpus due to its non-commercial license, while other open-source models train on this dataset due to its high quality." Note that all-MiniLM-L6-v2 is licensed as Apache 2.0 which authorizes commercial use of this model, but it seems potentially problematic for the creators of a model to authorize commercial use when they trained on data that was not authorized for commercial use. We would prefer to avoid this tricky legal concern. 34 | - There are many other open source models of comparable size to the Granite embedding models. However, most highly competitive models use MS-MARCO or other sources with problematic provenance. Furthermore, IBM has [published benchmark results](https://www.ibm.com/new/announcements/ibm-granite-3-1-powerful-performance-long-context-and-more#Granite+embedding+models) showing that Granite's accuracy is highly competitive with other popular open source options of comparable size. Note, however, that these results do not include some newer competing models. They also don't include any much larger, more resource intensive models; such comparisons wouldn't really be *fair* (i.e., a bigger model that uses more memory has a fundamental advantage) but they would be *useful* for someone who has flexibility on hardware requirements and would like to explore trade-offs. More benchmarking would be helpful. 35 | 36 | ## Decision 37 | 38 | InstructLab RAG will use [IBM Granite-Embedding-125m-English](https://huggingface.co/ibm-granite/granite-embedding-125m-english) as the default embedding model. 39 | 40 | ## Consequences 41 | 42 | - We will have consistency with the default options for model training (which are also based on IBM Granite). 43 | - Anyone using the default option will be able to rely on IBM's outstanding, well-earned reputation for respecting intellectual property rights and careful compliance with legal restrictions. 44 | - Anyone who wants a different balance between key considerations (hardware requirements, speed, effectiveness, terms of use, etc.) will want to override the default embedding model. For example, users who want the best possible accuracy and have a lot of expensive hardware available to run the model will probably want to choose a larger model, and users with extremely limited hardware who can tolerate lower accuracy will probably choose a smaller model. 45 | - We should consider strategies, such as documentation, for offsetting any perception that we are prioritizing IBM models over the needs of the product, in addition to relying on the clear motivation described here. 46 | - We may wish to change to a different default embedding model in the future if more benchmarking shows that a different model provides a better balance of accuracy, speed, and hardware requirements. 47 | - More work is needed to figure out whether a larger maximum input length would be better for our RAG solution. 48 | -------------------------------------------------------------------------------- /docs/rag/adrs/template.md: -------------------------------------------------------------------------------- 1 | # Succinct title 2 | 3 | ## Context 4 | 5 | _What is the context of this decision? What are the technical, social, and political factors? For example, the decision to use a particular library might be simply because most of the team is familiar with it; that is a social context.
A political factor might be influences from other teams or executive decisions_ 6 | 7 | ## Decision 8 | 9 | _a single decision statement, written in active voice, stated in a single sentence_ 10 | 11 | ## Status 12 | 13 | [Proposed | Accepted | Rejected] 14 | 15 | ## Consequences 16 | 17 | _A bulleted list; this might be the most important section. What are the consequences of this decision? Does it introduce design constraints into a codebase? Does it require further decisions or investigations to be made? Will it require training/onboarding for team members? Does it impact performance? What about cost? Does it impact development processes? What else? As a rule of thumb, there should usually be 4-6 identified consequences_ 18 | -------------------------------------------------------------------------------- /docs/rag/images/ingestion-mvp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/instructlab/dev-docs/30cb9d62d9a13dc2eeeefe92db67feefce4582da/docs/rag/images/ingestion-mvp.png -------------------------------------------------------------------------------- /docs/rag/images/rag-chat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/instructlab/dev-docs/30cb9d62d9a13dc2eeeefe92db67feefce4582da/docs/rag/images/rag-chat.png -------------------------------------------------------------------------------- /docs/rag/images/rag-model-training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/instructlab/dev-docs/30cb9d62d9a13dc2eeeefe92db67feefce4582da/docs/rag/images/rag-model-training.png -------------------------------------------------------------------------------- /docs/rag/images/rag-plug-and-play.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/instructlab/dev-docs/30cb9d62d9a13dc2eeeefe92db67feefce4582da/docs/rag/images/rag-plug-and-play.png -------------------------------------------------------------------------------- /docs/rag/images/rag-taxonomy-no-training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/instructlab/dev-docs/30cb9d62d9a13dc2eeeefe92db67feefce4582da/docs/rag/images/rag-taxonomy-no-training.png -------------------------------------------------------------------------------- /docs/rag/rag-initial-code-location.md: -------------------------------------------------------------------------------- 1 | # Code location for RAG 2 | 3 | | Created | Dec 5, 2024 | 4 | | -------- | -------- | 5 | | Authors | Bill Murdock | 6 | | Replaces | N/A | 7 | | Replaced by | N/A | 8 | 9 | ## What 10 | 11 | We want a retrieval-augmented generation (RAG) capability that provides outstanding results with minimal effort, is seamlessly integrated with InstructLab, and is also general enough to be used in other applications as well. 12 | 13 | ## Why 14 | 15 | Many InstructLab users want to train a model and then use it for RAG. Often they build something simple themselves for this purpose. There are two problems with this approach: 16 | 17 | - Building their own RAG is extra work. 18 | - Users who are not experts on RAG might not build a RAG that provides outstanding results. 19 | 20 | There is a very simple RAG capability at . It is not tightly integrated with InstructLab and it does not use any advanced RAG capabilities.
However, we have a request from a stakeholder to not just unilaterally delete it or replace it with something radically different. 21 | 22 | ## Goals 23 | 24 | Provide a built-in alternative for users who do not want to build their own RAG. Keep the existing capability at somewhere, but potentially somewhere other than where it is now (e.g., in a new branch of the existing repository). 25 | 26 | ## Non-goals 27 | 28 | Evaluation of RAG will be addressed in one or more other development documents. That topic is out of scope for this document. 29 | 30 | ## Decision 31 | 32 | - For now, RAG will be located in its own directory, `src/instructlab/rag`, in the core InstructLab repository (). 33 | 34 | ## How 35 | 36 | ### Phase 1 37 | 38 | - RAG will be located in its own directory, `src/instructlab/rag`, in the core InstructLab repository (). 39 | - This directory will include all of the following: 40 | - Loading the content from Docling-format JSON files (that are produced by SDG preprocessing). 41 | - Chunking that content to sizes that fit the requirements of the selected embedding model for vector database storage and retrieval. 42 | - Storing those chunks with their vector representations in a vector database. 43 | - End-to-end runtime RAG. The initial version of this includes the following: 44 | - Taking as input a session history (including a current user query) and providing a response (e.g., something along the lines of the [OpenAI chat completion API](https://platform.openai.com/docs/api-reference/chat/create)). 45 | - During that processing, it retrieves relevant search results from the vector database, it converts those into a prompt to send to the response generation model, it prompts that model, and it returns the response from that model. 46 | - This will be invoked from the existing `ilab` CLI, as described in the [RAG ingestion and chat pipelines](https://github.com/instructlab/dev-docs/pull/161) dev doc. 47 | 48 | ### Future phases 49 | 50 | - In the near future, RAG might be moved to the existing repository. 51 | - If so, something will be done with the existing code in , e.g., moving it to a branch of that repository or moving it to a different repository. 52 | - Alternatively, some or all of it might move to a new repository. 53 | - For example, maybe the indexing and retrieval portions move to a separate retrieval repository while the rest of end-to-end runtime RAG might move somewhere else. 54 | - If/when we move ahead with any of these options, *we will open a new ADR for that decision*. 55 | - Also, the capabilities will keep improving and more functionality will be added. 56 | 57 | ## Alternatives 58 | 59 | - Put the indexing and run-time RAG code in a new repository. 60 | - Pro: Having a dedicated repository gives the RAG team the most freedom and flexibility to make technical decisions that work for that team. 61 | - Pro: Starting with a new repository provides a blank slate that can be set up in whatever way makes the most sense for that functionality. 62 | - Pro: Having the capability in one repository makes it easier for consumers such as RamaLama to reuse it for their purposes too. 63 | - Con: Creating and configuring a new repository is some work. (This is a fairly small con, but a real one.) 64 | - Con: Integrating a new repository into the continuous integration and delivery capabilities for both upstream InstructLab and downstream consumers is a *lot* of work. This is a much bigger con.
65 | - Con: All that extra work would almost certainly result in slower time to market. This risks missing some market opportunities. 66 | - Put the indexing code in (SDG) and the run-time RAG code in (core) 67 | - Pro: This has the advantage of not adding any new dependencies. 68 | - Pro: The document processing is already in SDG and chat functionality is already in core so this would require the fewest code changes. 69 | - Con: Splitting the RAG functionality across multiple repositories makes it more complicated to reuse in other applications outside of InstructLab. 70 | - Con: Many things we will want to do to add advanced functionality to make RAG more effective will require changes to both indexing and run-time RAG. If those components are split across multiple repositories, that will make delivering such changes more complicated. 71 | - Start by putting the code into existing InstructLab repositories (either of the above options) and then split it off into its own repository later. 72 | - Pro: Gets us integrated into InstructLab sooner. 73 | - Con: Adds extra work to the second phase where we have to split it off into its own repository. 74 | - Con: There is a risk that we never get around to splitting it off and we wind up stuck with the cons of being jammed into other components indefinitely. 75 | - Put the indexing and run-time RAG code in a new repo outside . 76 | - Pro: This signals that this is not specific to InstructLab but is instead intended to be useful in a variety of applications. That makes it more likely the work could have broader impact. 77 | - Con: If we put this out there as something that is intended to be useful in a variety of applications, the pressure is on us to make sure it is differentiated from other broadly applicable RAG capabilities. Hopefully that will be true eventually, but it probably won't be true for a while. It might make more sense to give this some time to mature as a local component of InstructLab before trying to spin it off as its own thing. 78 | - Con: If we put it out there as its own open source project, that project needs all of the infrastructure of a full open source activity (governing structures, communication tools and protocols, etc.). That's a lot of work to set up. Keeping it inside InstructLab for now lets us keep using the infrastructure that InstructLab has for this purpose. 79 | - Con: If we put it out there as its own open source project, it needs a name. It is a lot of work to come up with a good name and there will be a lot of stakeholders with an interest in the name that comes up. 80 | - Keep the indexing and run-time RAG code in which is an emerging technologies prototype for this work. 81 | - Mostly the same pros and cons as putting it in a new repo outside InstructLab plus the following: 82 | - Pro: A prototype for the code we want is already there. 83 | - Pro: It already has its own distinctive name (PaRAGon). 84 | - Con: The existing repository has its own simple command-line interface which is useful for the prototype but we don't want it in the capability we release because too many command-line interfaces will confuse users. 85 | - Con: The name PaRAGon seems fine to me, but probably more stakeholders need to weigh in on what a name would be. 86 | - Con: The `redhat-et` label suggests that this is something "owned" by Red Hat which makes sense for the prototype but not so much for something we want a community to own in the long run.
87 | - Put the indexing and run-time RAG code in AND keep the existing RAG functionality in that repository intact. 88 | - Pro: It already exists. 89 | - Pro: It avoids the confusion of having two different RAG repositories in . 90 | - Con: It creates the confusion of having two different RAG solutions in the same repository. We could mitigate that with developer documentation and marking legacy stuff as "deprecated". 91 | - Put the indexing and run-time RAG code in AND eliminate the existing RAG functionality in that repository. 92 | - Pro: It already exists. 93 | - Pro: It avoids the confusion of having two different RAG repositories in . 94 | - Pro: It avoids the confusion of having two different RAG solutions since we'd be eliminating the old one. 95 | - Con: There is still some interest in keeping this around. 96 | 97 | ## Risks 98 | 99 | - Putting the RAG functionality in the core repository requires any application that wants to use this functionality to bring in the entire core which then brings in all of the libraries it depends on, so this becomes an enormous dependency. This discourages reuse in other applications. It *encourages* either of the following behaviors that would be unfortunate: 100 | - Other applications pull directly from and in doing so duplicate the ongoing effort to harden that code base. 101 | - Other applications may implement their own RAG solutions or pull from some other upstream unrelated to ours. 102 | - As noted earlier, putting the capability inside signals that this is a component of InstructLab and not a generally useful feature. That creates a risk that the work could miss out on additional opportunities for impact. We hope to mitigate that risk by spinning it off to its own open source project when it is mature enough, but there is a risk that we will get distracted by other things and never get around to this. 103 | - The flow for document processing for InstructLab winds up being quite complicated in this proposal. Since the existing document processing is in SDG, the flow for indexing for RAG winds up being a bit complicated (i.e., it starts with a CLI call handled by the core repo, then goes to SDG for some of the document processing, and then back to the core `/data` directory, which then calls out to the `core/rag` directory for chunking and vector database indexing). Having the document processing move from core to SDG and back to core and forward to RAG makes that capability more difficult to understand and maintain. This complexity will be partially mitigated when the preprocessing code moves from SDG to core. It will be further mitigated by having a clear, well-documented contract between core and the RAG repository indicating the responsibilities of each. 104 | 105 | ## References 106 | 107 | - 108 | - 109 | - 110 | -------------------------------------------------------------------------------- /docs/retrieval-augmented-generation/rag-framework.md: -------------------------------------------------------------------------------- 1 | # RAG will use Haystack 2 | 3 | | Created | Dec 4, 2024 | 4 | | -------- | -------- | 5 | | Authors | Ryan Cook, Ilya Kolchinsky, Hema Veeradhi | 6 | | Replaces | N/A | 7 | | Replaced by | N/A | 8 | 9 | ## What 10 | 11 | This ADR defines the decision on the framework used to support the RAG pipeline for InstructLab. The Red Hat Emerging Technologies team is pushing for the adoption of Haystack as the framework for the RAG offering.
Haystack will handle the data ingestion and retrieval processes for this to-be-productized RAG solution. 12 | 13 | NOTE: Per a conversation with Peter Staar on Dec 3, 2024, [the Docling team](https://github.com/DS4SD/docling) is aware of the efforts to potentially use Haystack and is already looking into adding functionality for [support of Haystack](https://github.com/DS4SD/docling-haystack). 14 | 15 | ## Why 16 | 17 | Multiple options for frameworks currently exist. During our initial analysis, the following options were the only ones to satisfy the basic requirements in terms of functionality, reliability and open-source availability: 18 | 19 | - [Haystack](https://haystack.deepset.ai/) 20 | - [Langchain/Langgraph](https://www.langchain.com/) 21 | - [LlamaIndex](https://www.llamaindex.ai/) 22 | - [RAGFlow](https://ragflow.io/) 23 | 24 | All of the above offer a variation of a modular pipeline architecture, where users can chain together components (like retrievers, readers, and generators) to process data in different stages. 25 | 26 | Out of those, we propose to use Haystack for the following reasons: 27 | 28 | 1. **Focus on RAG.** Haystack is a framework specifically targeting RAG use cases and sophisticated RAG indexing and retrieval pipelines. While Langchain and LlamaIndex shine in their own areas, the former is a generalist framework and the latter has a different focus, namely building custom indices over data. Haystack provides functionality that is strongly tailored for RAG and includes a comprehensive library of out-of-the-box solutions for advanced RAG scenarios. As a result, many essential or soon-to-be-essential RAG capabilities can be implemented in a few lines in Haystack but require considerable work to be supported over Langchain or LlamaIndex. Some examples include hybrid retrieval, iterative RAG, HyDE, combining multiple ingestion sources, custom data preprocessing and metadata augmentation. As the decision discussed in this document involves only the RAG component of InstructLab, we believe that choosing the best RAG framework, as opposed to the best general LLM serving framework, would be more strategically correct. 29 | 30 | 2. **Maturity and stability.** Haystack is the most mature, established and stable product among the considered alternatives. It has been around for more time overall (since 2017) and has accumulated more mileage. Haystack has an active, sizable and steadily growing community. 31 | 32 | 3. **Extensive vendor support.** Haystack natively supports all currently popular vector DBs and provides dedicated backends for incorporating them into its pipelines. Additionally, Haystack supports multiple models and model providers out-of-the-box. 33 | 34 | 4. **Enterprise-level performance.** Haystack is designed for production-grade scalability, supporting distributed systems and high-throughput applications. Moreover, and in contrast to the alternatives (of which only LlamaIndex showcases similar performance and scalability), Haystack is specifically optimized for efficient search and retrieval in the RAG setting. 35 | 36 | 5. **Ease of use and documentation.** Being strictly focused on RAG as opposed to taking a generalist approach, the learning curve of Haystack is less steep than that of Langchain. At the same time, Haystack offers extensive documentation and tutorials which are better organized and easier to use than those of LlamaIndex. 37 | 38 | 6.
**Architecture.** Extending the previous point, Haystack can be seen as a middle ground between Langchain and LlamaIndex, sharing their benefits while only partially inheriting their drawbacks. Like the former, Haystack enables building custom flows and pipelines. Unlike Langchain though, Haystack does not try to be too abstract and general, strictly focusing on RAG and document search instead. As a result, Haystack is more straightforward to use, especially for users looking to implement custom and highly non-standard scenarios. On the other hand, like LlamaIndex, Haystack's performance is optimized towards data retrieval and indexing, but it offers a higher degree of flexibility and better interfaces for custom use cases. 39 | 40 | 7. **Actively maintained open source project under permissive license.** Haystack is very [actively](https://github.com/deepset-ai/haystack/pulse/monthly) [maintained](https://github.com/deepset-ai/haystack/issues?q=is%3Aissue+is%3Aclosed) and [supported](https://github.com/deepset-ai/haystack/discussions). [Tagged versions](https://github.com/deepset-ai/haystack/releases) are released on a regular basis and [trusted publishing automation](https://github.com/deepset-ai/haystack/actions/workflows/pypi_release.yml) is used. Haystack is licensed under Apache 2.0, and all of its dependencies (jinja2, lazy-imports, more-itertools, networkx, numpy, openai, pandas, posthog, python-dateutil, pyyaml, requests, tenacity, tqdm, typing-extensions) are licensed under Apache, MIT, BSD or PSFL. 41 | 42 | 8. **Published to PyPI via Trusted Publishing.** 43 | See [trusted-publishers](https://docs.pypi.org/trusted-publishers/) for more information on trusted publishing. Haystack satisfies the requirements for trusted publishing with this [GitHub Actions workflow](https://github.com/deepset-ai/haystack/actions/workflows/pypi_release.yml). 44 | 45 | ## Goals 46 | 47 | - The primary goal is to provide a framework optimized specifically for RAG; this decision is not focused on choosing the best solution for agentic patterns. However, Haystack can potentially implement many [agentic patterns](https://docs.haystack.deepset.ai/v1.22/docs/agent). Since Haystack provides the functionality [to build arbitrarily complex pipelines](https://docs.haystack.deepset.ai/docs/pipelines), it is possible to extend our RAG architecture to introduce nodes for tools, reasoning and agentic actions. 48 | 49 | ## Non-goals 50 | 51 | - What can we ignore when making this decision? 52 | 53 | ## Decision 54 | 55 | Upon acceptance of this integration, our next step is to include additional ADRs for the subsequent components required for the RAG pipeline. Next up will be decisions on Milvus and containerized/non-containerized offerings of that solution. 56 | 57 | Upon denial of this integration, the team will need to go back and evaluate alternative technologies, ensuring they meet both the needs of the project and the larger project plan goal of a configurable RAG pipeline. 58 | 59 | ## How 60 | 61 | A downstream of the [https://github.com/deepset-ai/haystack](https://github.com/deepset-ai/haystack) project should be generated.
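To illustrate what the proposed framework usage could look like, the following is a minimal sketch of indexing and retrieval pipelines written against Haystack's 2.x component API. The in-memory document store, the default embedding components, and the sample document are illustrative stand-ins only; the production vector store (e.g., Milvus) is the subject of a separate ADR:

```python
# Minimal sketch of Haystack (2.x) indexing and retrieval pipelines.
# The in-memory store and sample document are illustrative stand-ins.
from haystack import Document, Pipeline
from haystack.components.embedders import (
    SentenceTransformersDocumentEmbedder,
    SentenceTransformersTextEmbedder,
)
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.writers import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore

store = InMemoryDocumentStore()

# Ingestion: embed documents and write them to the document store.
indexing = Pipeline()
indexing.add_component("embedder", SentenceTransformersDocumentEmbedder())
indexing.add_component("writer", DocumentWriter(document_store=store))
indexing.connect("embedder.documents", "writer.documents")
docs = [Document(content="InstructLab tunes LLMs with taxonomy-driven synthetic data.")]
indexing.run({"embedder": {"documents": docs}})

# Retrieval: embed the query and fetch the closest documents.
retrieval = Pipeline()
retrieval.add_component("text_embedder", SentenceTransformersTextEmbedder())
retrieval.add_component("retriever", InMemoryEmbeddingRetriever(document_store=store))
retrieval.connect("text_embedder.embedding", "retriever.query_embedding")
results = retrieval.run({"text_embedder": {"text": "What does InstructLab do?"}})
print(results["retriever"]["documents"][0].content)
```

Swapping the document store for a different backend changes only the store and retriever components; the pipeline structure stays the same, which is part of the plug-and-play motivation described above.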
62 | 63 | ## Alternatives 64 | 65 | - Langchain/Langgraph 66 | - A generalist framework (as opposed to a RAG-focused solution) 67 | - Complicated as compared to the alternatives, steep learning curve 68 | - Lower performance in large-scale production environments than LlamaIndex and Haystack 69 | 70 | - LlamaIndex 71 | - Limited flexibility and customization options as compared to the alternatives 72 | - Limited out-of-the-box support for building complex, multi-component pipelines as compared to Langchain and Haystack 73 | - Documentation is less well-maintained and more difficult to use as compared to the alternatives 74 | 75 | - RAGFlow 76 | - Limited support for many of the mainstream vector DB providers 77 | - Limited scalability as compared to the alternatives 78 | 79 | - Do not use a framework; write everything directly in Python or some other programming language 80 | - This would take longer to get started. 81 | - This would make it a lot more work to add more vector DBs since we'd need to add additional code for each vector DB we want to support. We would hide that work behind an abstraction layer the same way the frameworks do, but it is work to build and maintain the abstraction layer, and the frameworks do that for us (and have put a lot of time and effort into doing it well). 82 | - This would make it harder to bring in advanced functionality that the frameworks already provide. For example, Haystack provides support for RAG self-correction loops which we might want some day. 83 | - This might make it easier to bring in advanced functionality that the frameworks do not already provide. Frameworks provide an abstraction layer that is generally useful when you want to do things that the framework developers support but often counterproductive when you want to do things that the frameworks do not support. For example, if there is a call to the framework that collapses multiple atomic steps into a single function call, that generally makes it harder to insert your own logic in between those atomic steps. 84 | 85 | ## Risks 86 | 87 | Future versions of Haystack can potentially introduce new dependencies that could be: 88 | 89 | 1. Distributed under a non-permissive license (or not open source at all) 90 | 2. Not regularly and/or properly maintained 91 | 92 | If such a situation arises, the following actions can be taken on our end: 93 | 94 | 1. Pin to the old version that doesn't have that dependency. That's often OK for a while, but eventually we're likely to run into updates that we need (e.g., critical fixes, compatibility with new vector DBs, etc.). 95 | 2. Fork the project to avoid the problematic dependencies. 96 | 3. Move off of Haystack completely. 97 | 98 | ## References 99 | 100 | - [https://github.com/deepset-ai/haystack](https://github.com/deepset-ai/haystack) 101 | -------------------------------------------------------------------------------- /docs/schema/taxonomy-api.md: -------------------------------------------------------------------------------- 1 | # Central API for taxonomy reading and validation 2 | 3 | Currently there are multiple places where the taxonomy `qna.yaml` files are read, parsed, and validated. There is a `check_yaml.py` script in the `taxonomy` repository and there are methods in the `instructlab` repository in the `src/instructlab/util.py` file. 4 | 5 | The methods in `utils` are used by both the `ilab taxonomy diff` command and the SDG code which has been moved to the `sdg` repository.
This arrangement results in a circular dependency: the `instructlab` package needs to access the SDG code in the `instructlab-sdg` package, and the SDG code needs to access the `utils` methods in `instructlab` to read and validate the taxonomy files. 6 | 7 | ## Use instructlab-schema package for the central API 8 | 9 | We now have an `instructlab-schema` package on PyPI which holds the JSON schema files for the taxonomy `qna.yaml` files. This is now used by `instructlab` to access these schema files for taxonomy file validation. 10 | 11 | We should relocate the taxonomy reading and validation code from `instructlab` to `instructlab-schema`. This will provide a central place, near the JSON schema it uses, for a shared API for reading, parsing, and validating taxonomy `qna.yaml` files. 12 | 13 | Then we can modify the `instructlab` and `instructlab-sdg` packages to depend upon the `instructlab-schema` package for these APIs, which will remove the circular dependency. We can also use these APIs in the taxonomy repository's `check_yaml.py` script. 14 | -------------------------------------------------------------------------------- /docs/sdg/README.md: -------------------------------------------------------------------------------- 1 | # SDG Design Docs 2 | 3 | Design docs associated with the `instructlab/sdg` repository 4 | -------------------------------------------------------------------------------- /docs/sdg/sdg-api-interface.md: -------------------------------------------------------------------------------- 1 | # SDG Library Design 2 | 3 | ## Objective 4 | 5 | A library called `instructlab-sdg` that can be called per seed example, where each seed example includes question and answer pairs and, for grounded skills, context. 6 | 7 | ## Structure of the SDG Library 8 | 9 | We propose the following structure for the SDG library. There will be config files that contain all the prompt templates for the pipelines. 10 | 11 | ```markdown
12 | - src/instructlab/sdg/
13 |   - configs/
14 |     - gen_q.yaml
15 |     - gen_a.yaml
16 |     - ...
17 |   - __init__.py
18 |   - block.py
19 |   - llmblock.py
20 |   - pipeline.py
21 |   - sdg.py
22 | ``` 23 | 24 | ![example API interface](../images/sdg-api-interface.png) 25 | 26 | ## CLI 27 | 28 | The CLI client uses the InstructLab SDG library and provides it a run configuration with input parameters. The following represents a sample of what library usage could look like. 29 | 30 | ```python 31 | # cli_driver.py 32 | 33 | from sdg import SDG 34 | from run_config import SynthDataFlow 35 | from pipeline import Pipeline 36 | import yaml 37 | 38 | client = openai_client(endpoint) 39 | model = "model-version" 40 | 41 | synth_skills_flow = SynthDataFlow(client, model).get_flow() 42 | skills_pipe = Pipeline(synth_skills_flow) 43 | 44 | cli_sdg = SDG([skills_pipe])  # run config has all the variables like num_samples, pipeline steps etc 45 | generated_samples = cli_sdg.generate() 46 | ``` 47 | 48 | As an initial integration milestone, we will modify the `generate_data` function in `instructlab.sdg.generate_data` to make use of the updated SDG API. This is the function the `ilab` CLI already uses, so modifying this implementation will allow us to get the updated SDG API in place without disrupting the CLI integration. 49 | 50 | CLI integration will require additional changes later to allow passing in customizations to the SDG pipeline, but we will treat that as a follow-up implementation milestone. 51 | 52 | The run configuration includes the necessary parameters for executing the SDG code library.
It specifies the templates required for running the SDG code, the prompt template, and the default model system prompt template. 53 | 54 | * `num_samples` is the number of synthetic samples that you wish to generate per seed example. 55 | * `num_procs` is the number of parallel processes that you want to run. 56 | * `max_retry` is the maximum number of non-greedy retries you want to make if the `num_samples` is not reached. The number of samples in the generated output will be the samples achieved until `max_retry` is reached. 57 | * Pipeline steps contain the steps that you want to invoke in the SDG pipeline and the prompt configurations per step. The variable names of the blocks can be anything and the prompt configurations must be compatible with the teacher model. 58 | * `max_new_tokens` is the maximum number of tokens we want to generate. In other words, the size of the output sequence, not including the tokens in the prompt. 59 | * `model` is the name of the served teacher model we want to use to generate the synthetic data. 60 | * `model_prompt`: the default model prompt for the model. 61 | * `client` points to an OpenAI client used to interface with the model. Example of a client: 62 | 63 | ```python 64 | client = OpenAI( 65 |     api_key=openai_api_key, 66 |     base_url=openai_api_base, 67 | ) 68 | ``` 69 | 70 | ```python 71 | # run_config.py 72 | class Flow(ABC): 73 |     def __init__(self, client, model_id) -> None: 74 |         self.client = client 75 |         self.model_id = model_id 76 | 77 |     @abstractmethod 78 |     def get_flow(self) -> list: 79 |         pass 80 | 81 | 82 | class SynthDataFlow(Flow): 83 |     def get_flow(self) -> list: 84 |         return [ 85 |             { 86 |                 'block_type': LLMBlock, 87 |                 'block_config': { 88 |                     'block_name': "gen_q", 89 |                     'config_path': "configs/gen_q.yaml", 90 |                     'client': self.client, 91 |                     'model_id': self.model_id, 92 |                     'model_prompt': ' [INST] {prompt} [/INST]', 93 |                     'output_cols': ['question'], 94 |                     'batch_kwargs': { 95 |                         'num_procs': 8, 96 |                         'num_samples': 30, 97 |                         'batched': True, 98 |                     }, 99 |                     'max_retry': 5, 100 |                     'max_new_tokens': 10000, 101 |                 }, 102 |             }, 103 |         ] 104 | ``` 105 | -------------------------------------------------------------------------------- /docs/sdg/sdg-api-simplification.md: -------------------------------------------------------------------------------- 1 | # SDG API Simplification 2 | 3 | ## Objective 4 | 5 | Identify simplifications to [the original SDG API design](sdg-api-interface.md) based on retrospective insights from working with the implementation of that design.
## Original API Design

Consider the original API sketch:

```python
from sdg import SDG
from run_config import SynthDataFlow
from pipeline import Pipeline
import yaml

client = openai_client(endpoint)
model = "model-version"

synth_skills_flow = SynthDataFlow(client, model).get_flow()
skills_pipe = Pipeline(synth_skills_flow)

cli_sdg = SDG([skills_pipe])  # run config has all the variables like num_samples, pipeline steps, etc.
generated_samples = cli_sdg.generate()
```

The nouns above are:

* Dataset - this is from Hugging Face's datasets library - used for the return value from `SDG.generate()`, but also what is passed between elements of the data generation pipeline
* Block - not shown in the code above, but required to understand a pipeline - a block provides a `generate()` method that transforms an input dataset and returns an output dataset
* Block config - a description of how to instantiate and invoke a block; a sequence of these is returned from `get_flow()` above
* Flow - a class which describes how to render a sequence of block configs for a pipeline
* Pipeline - a pipeline is created from a sequence of block configs, and provides a `generate()` method in which it instantiates and invokes blocks in turn, passing the input dataset and collecting the output
* SDG - an SDG is created from a list of pipelines, and its `generate()` method calls pipelines in turn

## Simplification Proposals

### Remove `SDG`

We don't need both `SDG` and `Pipeline`, since `Pipeline` can already do everything `SDG` can do. If more advanced orchestration of multiple pipelines is required later, an orchestration abstraction can be added then.

### Remove `Flow`

With flows migrating to a YAML file format (#109), their purpose becomes clearer - they are simply an expression of the configuration of a sequence of blocks, used to create a pipeline. We can simply refer to these YAML files as pipeline descriptions.

### Add PipelineContext

A number of runtime parameters are required by blocks in a pipeline - e.g., every `LLMBlock` requires an OpenAI API client and a model name. These parameters are distinct from configuration that is specified by a pipeline author.

It would be much more straightforward if a `Block` were able to access these runtime parameters via its parent `Pipeline`.

In the case where multiple pipelines with the same runtime context are desired, it would also be beneficial to abstract these runtime parameters into a `PipelineContext` class.

## New API Design

```python
ds = Dataset.from_list(samples)

ctx = PipelineContext(client, "mixtral", teacher_model)

knowledge_pipe = Pipeline.from_configs(ctx, [MMLU_BENCH_PIPELINE, SYNTH_KNOWLEDGE_PIPELINE])

gen_data = knowledge_pipe.generate(ds)
```
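A minimal sketch of what such a context object might look like (assuming dataclass-style fields; the exact attribute set would track whatever runtime parameters blocks need):

```python
from dataclasses import dataclass
from typing import Any


@dataclass
class PipelineContext:
    client: Any        # OpenAI-compatible client for the teacher model
    model_family: str  # e.g. "mixtral"
    model_id: str      # path/name of the specific teacher model


# Blocks would reach runtime parameters through their parent pipeline's
# context (e.g. self.ctx.client) instead of carrying a client per block config.
```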
--------------------------------------------------------------------------------
/docs/sdg/sdg-flow-yaml.md:
--------------------------------------------------------------------------------
# SDG API - Add a file format for defining custom Flows

## Problem Statement

The `instructlab/sdg` library is introducing more extensive data generation pipelines. To enable customization, we should allow users of the library to provide a configuration file which defines a custom pipeline or extends an existing pipeline.

In terms of the API constructs, a Pipeline is created from a sequence of "block configurations" which express how to instantiate and invoke the individual steps (aka blocks) in the pipeline. A Flow construct serves as a template from which a sequence of block configs can be generated.

## Objective

- Library users can specify a custom flow using a well-defined file format.
- Library users can either use a custom flow standalone, or combine a custom flow with existing flows.
- The file format and library can evolve substantially without making breaking changes.
- Incompatible changes can be introduced while retaining support for existing custom flows for a deprecation period.

## Proposal

### Existing API Review

The current `Pipeline` API allows instantiation with a list of `Block` configurations.
These configurations could come from one or many sources. In its simplest form:

```python
pipeline = Pipeline(block_configs)
```

or if you had two separate lists of block configurations to append together:

```python
pipeline = Pipeline(block_configs1 + block_configs2)
```

### API Additions

We will add an API that instantiates a `Pipeline` object from a YAML file:

```python
pipeline = Pipeline.from_file(ctx, 'mycustomflow.yaml')
```

The YAML file format will mirror the API and look like this:

```yaml
version: "1.0"
blocks:
  - name: gen_knowledge
    type: LLMBlock
    config: # LLMBlock constructor kwargs
      output_cols: ["question", "response"]
    gen_kwargs: # kwargs for block.generate()
      max_tokens: 2048
    drop_duplicates: ["question"]
  - name: filter_faithfulness
    type: FilterByValueBlock
    config:
      filter_column: judgment
      filter_value: "YES"
      operation: eq
    drop_columns: ["judgment", "explanation"]
```

## Versioning

A mandatory `version` field in the YAML file expresses major and minor versions (e.g., 1.0, 1.1, 2.0).

Compatibility rules:

1. If the major version of the YAML file is higher than the parser can handle, the parser should reject the file.
2. If the minor version of the YAML file is higher than the highest version the parser is aware of, the parser should read the file but ignore any unrecognized content.
3. If the file's version is lower than the parser version, the parser should provide default values for any configuration introduced in later versions.

Example parsing logic:

```python
def parse_custom_flow(content):
    version = content['version']
    major, minor = map(int, version.split('.'))

    if major > PARSER_MAJOR:
        raise IncompatibleVersionError("The custom flow file format is from a future major version.")
    elif major <= PARSER_MAJOR and minor > PARSER_MINOR:
        logger.warning("The custom flow file may have new features that will be ignored.")
    # Rule 3 needs no explicit branch here: defaults for configuration
    # introduced in later versions are applied wherever those fields are read.
```

### Pipeline Context

The following runtime parameters will no longer be part of the pipeline configuration definition and will instead be available to blocks via a `PipelineContext` object:

- client - an OpenAI completions API client for talking to the teacher model via the serving backend (i.e. llama-cpp or vLLM)
- model_family - e.g. mixtral or merlinite
- model_id - a path name for the specific teacher model being used
- num_instructions_to_generate - how many samples to generate

For now, we assume there is no need to do any sort of templating in the custom pipelines based on these runtime parameters.

### Model Prompts

Based on whether `model_family` is mixtral or merlinite, a different prompt is used with the teacher model:

```python
_MODEL_PROMPT_MIXTRAL = " [INST] {prompt} [/INST]"
_MODEL_PROMPT_MERLINITE = "'<|system|>\nYou are an AI language model developed by IBM Research. You are a cautious assistant. You carefully follow instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior.\n<|user|>\n{prompt}\n<|assistant|>\n'"
```

For now, we assume that the `LLMBlock` class will choose the appropriate model prompt based on the family and that there is no need to specify a custom prompt.
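The selection logic this implies is tiny; a sketch (illustrative only, the real `LLMBlock` may structure this differently):

```python
# Illustrative: pick the model prompt from the family in the PipelineContext
def _model_prompt_for_family(model_family: str) -> str:
    if model_family == "mixtral":
        return _MODEL_PROMPT_MIXTRAL
    return _MODEL_PROMPT_MERLINITE
```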
### Prompt Config Files

Every `LLMBlock` references a separate prompt config file, and presumably a custom pipeline will provide custom prompt configs too.

These prompt config files are quite simple YAML files - they contain a single object with system, introduction, principles, examples, and generation keys. See e.g. [src/instructlab/sdg/configs/skills/freeform_questions.yaml](https://github.com/instructlab/sdg/blob/main/src/instructlab/sdg/configs/skills/freeform_questions.yaml).

We will continue to use these config files unchanged, and custom files can be specified with an absolute path. Relative paths are assumed to be relative to the Python package, e.g. `configs/skills/...`.

### Model Serving

Custom pipelines may have more specialized model serving requirements. Instead of serving just one model, we may need to launch the model server with a model and an additional model with an adapter. vLLM, for example, can host both a model and a model+adapter under two different model IDs.

The pipeline author needs some way of disambiguating between these multiple models - i.e. the definition of each `LLMBlock` needs to specify a particular model.

Right now the `Pipeline` constructor takes two relevant parameters - the OpenAI client instance, and the model ID for the default model. It's important to note that this model ID is defined by the user at runtime, and it may not match the model IDs that the pipeline author used.

The use cases will be:

1. Most `LLMBlock` definitions will use the default teacher model - and we can make the semantic that if the pipeline author doesn't specify a model in an `LLMBlock`, the default in `PipelineContext.model_id` is used.
2. In cases where a model+adapter is to be served, the pipeline author should choose a descriptive model ID using `block.gen_kwargs.model_id`, and the user should ensure that this is the model ID that is served.
For example, a pipeline author might define:

```yaml
version: "1.0"
blocks:
  - name: gen_questions
    type: LLMBlock
    config:
      config_path: configs/skills/freeform_questions.yaml
      add_num_samples: True
    gen_kwargs:
      model_id: mycustomadapter
    output_cols:
      - question
    drop_duplicates:
      - question
```

and the user will be required to define a serving configuration like:

```bash
--lora-modules=mycustomadapter=path/to/my_custom_adapter
```

### Re-use of Built-in Pipelines

A custom pipeline may want to extend an existing built-in pipeline. In that case, a new block type, `ImportBlock`, may be used to import the blocks from another configuration file.

```yaml
version: "1.0"
blocks:
  - name: import_from_full
    type: ImportBlock
    path: configs/full/synth_freeform_skills.yaml
  - name: custom_post_processing_block
    type: LLMBlock
    ...
```

### CLI Integration

As of the current version of `ilab`, it supports `simple` and `full` as parameters to `--pipeline` to select one of the two types of built-in pipelines included in the library.

Once we have support for loading custom pipelines, we need a way for these to be specified with the CLI. We believe the most common case for custom pipelines is for them to extend the `full` pipeline and, as such, we should support extending existing pipelines with a custom pipeline rather than simply specifying a single pipeline.

Here is a proposed CLI UX for this:

> `ilab data generate`

Use the default pipeline, `simple`.

> `ilab data generate --pipeline full`

Use the built-in `full` pipeline.

> `ilab data generate --pipeline path/to/custom_pipeline_directory/`

Use a custom pipeline configuration. The custom pipeline may include references to the built-in flows to be used in conjunction with custom ones, but those details are contained within the YAML files in the custom directory.

### File and Directory Structure

The existing contents of `default_flows.py` will become these files in the source tree:

```text
src/
  instructlab/
    sdg/
      pipelines/
        simple/
          knowledge.yaml
          freeform_skills.yaml
          grounded_skills.yaml
        full/
          knowledge.yaml  # also contains the current contents of mmlu_bench.yaml
          freeform_skills.yaml
          grounded_skills.yaml
```

When the `--pipeline` option to `ilab data generate` is used to point to a custom directory, we will assume that the same 3 files are present. All three files will be loaded and used according to the type of taxonomy additions present when running `ilab data generate`.

### Future CLI Improvements

A possible improvement would be to have a well-defined place on the filesystem where custom pipeline configs can be automatically loaded and included as options to the `--pipeline` parameter.

For example, if the config format included new parameters, `name: full-extended` and `extends: full`, and the CLI discovered and loaded it automatically, we could support `--pipeline full-extended` without needing an additional `--pipeline-extend` option.
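For illustration, such a discoverable pipeline config might start like this (the `name` and `extends` fields are hypothetical, per the idea above):

```yaml
version: "1.0"
name: full-extended  # hypothetical: the value offered to --pipeline
extends: full        # hypothetical: build on the built-in "full" pipeline
blocks:
  - name: custom_post_processing_block
    type: LLMBlock
    ...
```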
`/usr/share/instructlab/sdg/` is a proposed location for this, as a place for a distribution of InstructLab to include pre-defined custom pipelines, at least for Linux. See the [Filesystem Hierarchy Standard](https://refspecs.linuxfoundation.org/FHS_3.0/fhs/ch04s11.html) for more details on why this path is appropriate for this use case.

It would also make sense to support a configuration directory for users' own custom pipeline configurations. Assuming there is a base config directory, these could go in an `sdg` subdirectory. There is a separate proposal that discusses a proposed configuration location. Note this is separate from the distribution-provided, read-only pipelines discussed above, which have a different location.

If we have a location with pipeline examples, then a nice-to-have would be an `ilab data generate --list-pipelines` option.

## Alternative Approaches

Alternatives already considered and discarded are listed below.

### No Custom Flows

It would be preferable to not support custom flows, especially so early in the project, because:

- We will need an extensive API to support this customization, and we will need to be careful about making incompatible changes to that API once it has been adopted.
- We would learn more about the pipelines that users are creating if they were added to the library.

This approach was discarded because of strong demand from downstream users to define custom flows to encapsulate proprietary pipeline configuration.

### Custom Flows as Code

If we have an API for creating flows, users could define these custom flows in Python rather than with a configuration file format.

This approach was discarded because of a desire by downstream users to separate reusable logic from proprietary pipeline configuration.

The initial version of the SDG library API design (#98) proposed using YAML files, and this was changed to Python code based on this feedback:

> Does this need to be a yaml file?
>
> or is it actually a Python dict passed to the library?
>
> I actually think it would be a nice simplification to not worry about config files at all, and from the library perspective, assume configuration is passed in via data structures.
>
> How that config is constructed could be a problem of the library consumer. Maybe they hardcode it. maybe they allow a subset to be considered. Some could be driven by CLI args, for example.

Since adopting YAML may now appear contradictory to that feedback, it is useful to understand how the feedback relates to this new design:

1. The feedback assumes that YAML will be used for custom pipelines, but wonders whether it would be better to implement that in the CLI instead of the library.
2. Not called out is that, at the time, it was unclear whether custom pipeline definitions would also need to include custom model serving configuration - if so, the model serving configuration would not belong in the SDG library. It is now better understood that no model serving configuration needs to be included in the pipeline definitions. (See above.)
3. The POC implementation of this format makes it clear - in a way that wasn't clear from an API design - that using the YAML format within the library is an improvement.
--------------------------------------------------------------------------------
/docs/sdg/sdg-repo.md:
--------------------------------------------------------------------------------
# New Repository Proposal: sdg

## Summary

This document proposes a new repository under the `instructlab` GitHub organization:

- `instructlab/sdg`

## Background

The `instructlab/instructlab` repository includes a basic implementation of Synthetic Data Generation (SDG). This implementation does not implement the full approach as described by the [LAB paper](https://arxiv.org/abs/2403.01081).

We desire to build out a more complete implementation of SDG that is more in line with the LAB methodology. We propose a new repository to house this code that publishes a new Python library called `instructlab-sdg`. The reasoning for a new repository and library includes:

- We expect multiple consumers of this code. The `ilab` CLI is one, but we also envision building a REST API around it to help support scaling out this functionality on a cluster.
- We expect there is broader community interest in an open-source library and service for synthetic data generation. We envision this library could support other data generation techniques over time.

## Maintainers

The initial team of maintainers for this repository will be a copy of the `Backend Maintainers` GitHub team.

## Alternatives Considered

### Add to `instructlab/instructlab`

We could add this code to the existing `instructlab/instructlab` repository.

The primary argument against this approach is that we expect the scope of an `instructlab-sdg` library to expand beyond the scope of what would be run by the `ilab` CLI. We instead envision a different community of contributors organizing around SDG specifically.
--------------------------------------------------------------------------------
/docs/sdk/instructlab-sdk.md:
--------------------------------------------------------------------------------
# InstructLab Python SDK

## Motivation

Today, the only way to "drive" the InstructLab opinionated workflow is via the `ilab` CLI. While this process provides a succinct way for everyday users to initialize a config, generate synthetic data, train a model, and evaluate it, the guardrails are quite limiting, both in what a user can do and in what the development team can add as features exposed directly to the user over time.

Additionally, current consumers of InstructLab are finding themselves importing a mix of private and public library APIs in combination with CLI functionality to achieve the workflows they want. While these more advanced usage patterns are not for everyone, providing standardized and safe ways for the community to run bespoke and piecemeal workflows is a necessity.

Unifying this range of advanced workflows under an overarching `InstructLab Python SDK` will allow for new usage patterns and a clearer story on what InstructLab can and should provide as user-accessible endpoints.

While each library can and _should_ have its own publicly accessible SDK, not all functionality being added to SDG, Training, and Eval needs to be correlated directly to the "InstructLab workflow". This Python SDK should, as the CLI does, expose an opinionated flow that uses functionality from the various libraries.
The InstructLab SDK should be derived from the library APIs, not the other way around. SDG, for example, currently has a `generate_data` method meant to be accessed only by InstructLab. This method simply calls other publicly available SDG functionality. Orchestration of the InstructLab flow like this should not be a concern of the individual libraries; it should instead be handled by the overarching InstructLab SDK, which will maintain the user contracts. The InstructLab SDK will need to work within the bounds of what the libraries expose as public APIs.

The benefit of the above is that the opinionated flow can be accessed in a more nuanced and piecemeal way while also gaining the potential for more advanced features. Say a consumer wants to:

1. Set up a custom config file for ilab (optional)
2. Initialize a taxonomy
3. Ensure their taxonomy is valid
4. Ingest some data for RAG and SDG (SDG coming soon)
5. Generate synthetic data using an InstructLab pipeline
6. Do some custom handling per their use case
7. Fine-tune a model using the custom config they initialized for their hardware
8. Evaluate their model after training using various benchmarks

A user could do this if they had an SDK.

(The structure of the SDK and the actual arguments are discussed below.)

Today, however, users are forced to run a sequence of commands tailored to only work with the proper directory structure on the system.

## Major Goals

1. Modularize the InstructLab workflow such that any part can be run independently
2. Allow users to choose whether or not to take advantage of the config/system-profile method of running InstructLab, meaning they do not need any pre-existing configuration to run the SDK
3. Standardize user contracts for the existing functionality of the InstructLab workflow. Existing CLI commands should be using the SDK once past click parsing, not separate code
4. Define contracts loosely enough that functionality can be expanded as more advanced features are released
5. Document SDK usage in upcoming InstructLab releases

## Non-Goals

1. Exposing all library functionality immediately
2. Replacing the CLI
3. Shipping an SDK that is generally available, as opposed to v1alpha1 or v1beta1

## Design

### Versioning

The SDK would start at version v1alpha1 such that it can change/break at any time for the first few iterations as libraries adjust their API surface.

### Structure

This SDK should preferably live in a net-new package inside of `instructlab/instructlab`, rather than in a new repository, to limit unnecessary imports. The SDK could be imported as `instructlab.core...`

The user surface initially should look like this:

`instructlab.core` contains all SDK definitions.
Users can `from instructlab.core import ...` to use specific SDK classes.

For most of the existing InstructLab command groups, there should be a class:

`from instructlab.core import Config, Taxonomy, Data, Model, RAG, System`

The full list of classes and their methods for now (subject to change during the development process):

```console
instructlab.core.Config
instructlab.core.Config.init
instructlab.core.Config.show (get)
instructlab.core.Taxonomy
instructlab.core.Taxonomy.diff
instructlab.core.System
instructlab.core.System.info
instructlab.core.Data
instructlab.core.Data.ingest
instructlab.core.Data.generate_data
instructlab.core.Model
instructlab.core.Model.serve
instructlab.core.Model.train_model
instructlab.core.Model.process_data (calling the training library's data process class in a safe way)
instructlab.core.Model.evaluate_mt_bench
instructlab.core.Model.evaluate_dk_bench
instructlab.core.Model.evaluate_mmlu_bench
instructlab.core.RAG.ingest
instructlab.core.RAG.convert
```

a brief example:

```python
from instructlab.core import Config, Taxonomy, Data, Model

config_object = Config.init(...)
diff = Taxonomy.diff()

if diff:
    data_client = Data(data_path="", teacher_model="", num_cpus="", taxonomy_path="")

    # not in v1alpha1: data ingestion lands in a later release
    data_path = data_client.ingest()

    openai_compat_client = some_server()

    data_jsonls = data_client.generate_data(client=openai_compat_client, data=data_path)

    some_custom_handling(data_jsonls)

    # you can either use a config object or pass trainer args
    model_client = Model(student_model=path_to_student_model, configuration=config_object)

    model_path = model_client.train_model()

    # since we initialized the model client with the config, the training args are passed implicitly
    eval_output = model_client.evaluate_mt_bench(model_path=model_path)
```

The above example utilizes the configuration object to instantiate the `Model` class. However, a user could also pass `training_args=` directly to `model_client.train_model` to override the configuration class defaults. This allows the SDK to utilize the System Profiles of the ilab CLI without relying on them too heavily.
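To make the override concrete, a config-free invocation might look like the following sketch (the exact `TrainingArgs` surface and its import path are assumptions to be negotiated with the training library):

```python
from instructlab.core import Model
# assumed import path for the training library's argument class
from instructlab.training import TrainingArgs

model_client = Model(student_model="path/to/student-model")
model_path = model_client.train_model(
    # explicit args take precedence over any config.yaml defaults
    training_args=TrainingArgs(max_seq_len=4096, num_epochs=8),
)
```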
Presumably, the distinct methods under each class will grow, which is why I am opting to make very distinct classes per command group. Another benefit of the parent classes is that individual methods can inherit defaults from the instantiation of the object.

These initial exposed functions can expand to include any new functionality from the various libraries that is more SDK-oriented. For example, if SDG adds something like subset selection, teacher-as-annotator, data mixing, etc., we could expose an `instructlab.core.Data.annotate` or `instructlab.core.Data.mix` that could be invoked in sequence in a user's script with other parts of the ilab workflow. Some things make _less_ sense to be exposed via a CLI but are still critical to ensuring users get a good model and properly generated data.

There are certain things that currently exist only in `ilab`, as well as functionality that is going to be moving there, such as data ingestion, RAG, etc. Forming an SDK for `instructlab` allows us to capture all of these concerns under one API.

These endpoints, in combination with the curated InstructLab config file, will open up these workflows to users and allow InstructLab to be easily incorporated into other projects. Allowing people to run things like data generation and full fine-tuning via an SDK that pulls in their pre-existing `config.yaml`, but can also be run independently, will open new avenues for InstructLab adoption and extensibility.

## Changes to the CLI

The `ilab` CLI will need to adapt to this new structure. Commands like `ilab data generate` should, in terms of code, follow this flow:

1. `src/instructlab/cli/data/generate.py`
2. `src/instructlab/data/generate.py`
3. `src/instructlab/process.py`
4. `src/instructlab/core/data/generate.py`

So generally the flow is: cli -> process management package to kick off a sub-process -> internal handling package -> core SDK (public definitions) -> library code.

The flow of the CLI today is such that the cli package for a command (`src/instructlab/cli/data/generate.py`) parses the command line options, manages creating a sub-process, and passes control to the core code (`src/instructlab/core/data/generate.py`). This then calls out to the library APIs.

The internal handling package is necessary as it allows us to split off a sub-process when it makes the most sense for us, before calling the library code directly. This is how the CLI works today.

The difference with an SDK is that we would eventually want to end up executing `core/data/generate.py`, the actual publicly consumable Python SDK. This will ensure that the CLI can do whatever custom handling it needs to do on top, but eventually it must boil down to the `core` package, which uses publicly available methods from the various libraries.

## Scope of work

In upcoming releases the InstructLab team should aim to:

1. Design the SDK given the structure above
2. Converse with library maintainers to negotiate user contracts
3. Begin work to re-architect how the CLI works using the SDK
4. Publish an alpha SDK for public consumption

After this initial work, the team can scope adding net-new functionality to the SDK that is not in the CLI.
--------------------------------------------------------------------------------
/docs/support-utils/support-utils-repo.md:
--------------------------------------------------------------------------------
# Create Separate Repo for User Utilities

## Idea Overview

Create a separate repository within the `instructlab` GitHub org called `support-utils`.
This repository would house scripts and notebooks that enhance the InstructLab experience but fall outside the scope of the LAB Methodology implemented in the [InstructLab Core](https://github.com/instructlab/instructlab) repository.
Many users and community members already have such scripts they use day to day.
The `support-utils` repo would be a place where the maintainers of the InstructLab project can collect and curate them for the benefit of the community.
Scripts in this repository may become features or be incorporated into the InstructLab Core repository after use and review by users and developers.

## Repository Structure

The repository will have two categories of scripts. Scripts live in either the `hack` or the `beta` directory.
```bash
support-utils
|
|
|- beta
|
|- hack
```

The `hack` directory is open for the contribution of scripts of any quality.

Scripts in the `beta` directory will be required to have documentation and automated functional testing.
These scripts are meant to be run by users for feedback and may graduate into full-blown features in other InstructLab repos.

Beyond this initial structure, the structure within those two directories will evolve as scripts are contributed to each.

## Additional Info

A few areas of focus for the first scripts that will be added to the repository are:

- Automating qna.yaml creation
- Assessing document readiness, given the limitations of Docling
- Visualizing synthetically generated data for inspection

This repo would not be released as a package on PyPI, but initially just as `.zip` and `.tar.gz` files on GitHub.
Releases would serve to give users specific versions of the scripts in `beta`, and to support development project management.
--------------------------------------------------------------------------------
/docs/taxonomy-tree-organization.md:
--------------------------------------------------------------------------------
# Taxonomy Tree Organization

This document describes the justification and decision to emulate the [Wikipedia taxonomy](https://en.wikipedia.org/wiki/Wikipedia:Contents) structure for our `knowledge/` tree in the taxonomy repository.

The definition of taxonomy from Wikipedia:

> A taxonomy is a scheme of classification, especially a hierarchical classification, in which things are organized into groups or types. Among other things, a taxonomy can be used to organize and index knowledge (stored as documents, articles, videos, etc.), such as in the form of a library classification system, or a search engine taxonomy, so that users can more easily find the information they are searching for. Many taxonomies are hierarchies (and thus, have an intrinsic tree structure), but not all are.

## Why do we need this?

Our taxonomy tree is not only how humans will place the different `qna.yaml`s, but also how people will look for, and propose updates to, specific questions and answers. Given the challenge of organizing this, copying Wikipedia's tree is a good default standard.

## What are we going to do to enforce this?

The triage team will take into consideration the location of the directory and how it pertains to the tree that Wikipedia publishes. The merging of [this PR](https://github.com/instructlab/taxonomy/pull/780) will build the initial tree, which we can then work with the backend team to solidify going forward.
Creating a new "top level" directory will be understood to be a special, rare case, while contributors are empowered to place lower leaf nodes of the tree themselves, as long as they are put in a logical place.
A side effect of leveraging this structure is that we can verify where the knowledge is placed on Wikipedia and reinforce that location in the taxonomy tree.

## Conflicts and Resolutions

With the adoption of this format and structure, there will be conflicts and debate about the placement of the `qna.yaml`.
The triage team will do their best to take into consideration the challenges that may arise, and will work with the contributor to hear and engage with that conflict. The triage team has the ultimate decision on the location of the directory and file.
--------------------------------------------------------------------------------
/docs/training/training-accelerate-dep.md:
--------------------------------------------------------------------------------
# Design Proposal - HF Accelerate

## Overview

InstructLab training is currently required to support two distributed sharding frameworks: DeepSpeed and FSDP1. Additionally, there is a future plan to adopt FSDP2 once the project matures. Each of these frameworks has its own process for preparing models, optimizers, and sharding, and its own set of custom options.

With all this in mind, in order to maintain a simple common codebase, an abstraction for sharding frameworks is required. This is why we have pulled in Hugging Face Accelerate as a lightweight sharding abstraction: to enable a common interface for sharding frameworks with pluggable configs, and to avoid overly-branching code paths.

## How it is being used

(maybe insert diagram here)

### How it is implemented in code

Accelerate usage consists of a single import:

```python
from accelerate import Accelerator
```

We begin by setting up our accelerator object via our `setup_accelerator` function:

```python
accelerator = setup_accelerator(args, model, grad_accum)
```

This checks the selected sharding framework and sets up the appropriate config:

```python
def setup_accelerator(args, model, grad_accum):
    if args.distributed_training_framework == "deepspeed":
        # Third Party
        from deepspeed import DeepSpeedEngine

        ...
        accel_args = {
            "deepspeed_plugin": get_ds_plugin(...),
        }
    elif args.distributed_training_framework == "fsdp":
        accel_args = {
            "fsdp_plugin": get_fsdp_config(args, model),
        }
    else:
        raise ValueError(
            f"Unknown sharding framework: {args.distributed_training_framework}"
        )
    accelerator = Accelerator(
        **accel_args,
    )
    accelerator.even_batches = False
    return accelerator
```

Now this Accelerator object can act as a universal sharding framework config, and we can prepare our training objects accordingly:

```python
model, optimizer, _, lr_scheduler = accelerator.prepare(
    model,
    optimizer,
    deepcopy(train_loader),
    lr_scheduler,
)
```

As an additional bonus, the accelerator object allows us to do universal checkpoint saving and resuming as well. For saving model checkpoints, we can simply use:

```python
accelerator.save_model(
    model,
    save_directory=output_dir,
    max_shard_size="5GB",
    safe_serialization=True,
)
```

and to save full state for resuming training, we can use:

```python
accelerator.save_state(
    output_dir=output_dir,
    safe_serialization=True,
)
```
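For the resume side (not shown above), Accelerate provides the counterpart `load_state` call; a sketch assuming the same `output_dir` used at save time:

```python
# restore model/optimizer/lr_scheduler state written by accelerator.save_state()
accelerator.load_state(output_dir)
```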
## The immediate benefit

The inclusion of Accelerate in this manner drastically simplifies the process of supporting multiple sharding frameworks. Rather than having diverging code paths for model setup, optimizer setup, sharding configuration, distributed initialization, state saving, and model checkpointing, all of these training steps can be supported with the same code. The common abstraction allows us to maintain both DeepSpeed and FSDP, and prepares us for the inclusion of additional sharding frameworks.

## Impact on overhead

### Performance

There has been no noticeable performance impact observed during development, but we will defer to the final measurements from comprehensive performance testing and benchmarking.

### Usability

This makes the library easier to read, but it also requires knowledge of an additional package. While there is [documentation](https://huggingface.co/docs/accelerate/v1.0.0rc1/en/index), it does require some additional code exploration to actually understand what some functions (like `prepare`, `save_model`, etc.) are doing behind the scenes, making some processes less transparent than if they were implemented directly with `torch`.

While sharding framework code will not need to be reviewed as thoroughly, this does not change the fact that one still needs to understand the configuration options per framework and how they behave.

### Package Management

This inclusion requires one additional Python package to be managed as a dependency. Currently, that package is `accelerate==0.34.2`, but we plan to upgrade to `accelerate==1.0.0` immediately once it moves from pre-release to official release.

## Long-Term Bonuses and Risks

The two clearest long-term bonuses are:

- Simplifies the process of onboarding additional sharding frameworks in the future, as well as deprecating existing sharding frameworks
- Vastly improves the readability and maintenance of our code by avoiding diverging paths for various sharding frameworks

There are, however, important risks to be considered. Ultimately, if we are confident that we wish to eventually stick with a single sharding framework and contribute directly to that project, the risks of including Accelerate may outweigh the benefits. Please read the next section carefully to understand why Accelerate, if it no longer provides a significant benefit to us, should be removed.

### Why Stop There?

With the inclusion of a Hugging Face abstraction library, this begs the question: "Why not keep going and pull in more of the HF stack? Won't it simplify things further?"

It is important to reiterate here the clear downsides of HF Accelerate, and what we wish to avoid moving forward. Accelerate makes our code more opaque and hinders our ability to easily understand and customize it as needed without being reliant upon a third-party dependency. We become reliant upon HF bug fixes, release cycles, and documentation, and without an explicit member of the HF community to manage these for us, this introduces risk that has to be heavily considered.

For example, if we wanted to change how model saving worked, or a user wanted to simply understand how model saving worked, they would no longer be able to do so through our library directly, but instead would need to read HF documentation and contribute back to HF repos with the hope that the customization is considered generally useful and approved.
More niche or experimental customizations may not be approved at all, and would have to be patched in directly or overridden in a rather hacky manner within our library. This, as described, is not the most dire of circumstances, as one could argue that if something is not ready to be included upstream, it may not be ready to be included in our library. Where things get more serious, however, is with the introduction of bugs. At this very moment, there is a bug in `v0.34.2` that means FSDP model saving does not behave as expected. The fix landed in `v1.0.0rc1`, but there is no official release yet that can be pulled in with this fix. So for instructlab-training `v0.5.x`, we are currently maintaining a patch directly in our library to fix the bug while working with `v0.34.2`, and it will sit there temporarily until Accelerate has a new official release.

This expands well into the wider topic of dependency management. A package like Accelerate will see frequent updates and changes moving forward, and we will need to ensure consistent compatibility between our training library releases and the required Accelerate versions. A new dependency also means shipping with another package, with a large body of code. While in most cases this is harmless, a package that is used directly on top of our work, in place of our existing code, adds a significant amount of bloat for developers and users alike. The requirement shifts from understanding a line of code to understanding the full code path behind a line of code, the quality of which cannot be guaranteed by our engineers.

Ultimately, these risks and overhead management costs make sense for Accelerate, because the package provides significant benefits that far outweigh our concerns. When considering additional HF packages, however, there is currently **no obvious benefit** to their inclusion, which only further underscores the associated risks. It must be made clear that the risks of Accelerate are non-negligible, and that the conveniences and positive impact provided are what push it over the line of inclusion. It is by no means a "no brainer" to further include any additional HF packages at this time, unless they afford us a similar benefit.
--------------------------------------------------------------------------------
/docs/ui/project-proposal.md:
--------------------------------------------------------------------------------
# New Repository Proposal: ui

## Summary

This document proposes a new repository under the `instructlab` GitHub organization:

- `instructlab/ui`

## Background

Currently we don't have a public repository under InstructLab that hosts UI-related work. The goal of this project is to provide a space for hacking on a UI for InstructLab. The scope of this project is to support the following personas through the UI:

- External contributors who want to contribute skills and knowledge to the taxonomy repo
- Triagers who will be evaluating the taxonomy contributions

These personas are not limited to the above list, but the initial focus will be on these two.

The intent is to build this UI as a SaaS service for the upstream project and its taxonomy repo, but to build it in a way that someone could deploy the same service on their own infrastructure for managing their own taxonomy repo.
## Maintainers

The initial team of maintainers (GitHub Team - `UI Maintainers`) for this repository will be:

- Anil Kumar Vishnoi
- Brent Salisbury
- Taiga Nakamura
- Guang-Jie Ren
- Juan Cappi
- Daniel Tan
- Gregory Pereira

## Seed Code Contribution

We are planning to seed this repository with the code from the `instructlab/instructlab-bot` repository [here](https://github.com/instructlab/instructlab-bot/tree/main/ui). This code will be used as a starting point for the UI work. It is already open source and licensed under the Apache 2.0 license.

## Alternatives Considered

### Use `instructlab/instructlab-bot`

We currently have a very early version of InstructLab UI-related code present in the `instructlab/instructlab-bot` repo (in its "ui" directory). We could continue hacking on the code in that repo, but I believe that is not the right place for the UI work, as instructlab-bot is supposed to be a backend component, scoped to work as a triager's assistant tool with no direct interfacing with contributors or triagers. To keep these separate work streams cleanly scoped, it makes sense to separate the UI work into its own repository.
--------------------------------------------------------------------------------