├── .github ├── dependabot.yml ├── mergify.yml └── workflows │ ├── actionlint.dockerfile │ ├── actionlint.yml │ ├── docs.yml │ ├── matchers │ └── actionlint.json │ ├── spellcheck.yml │ └── stale_bot.yml ├── .gitignore ├── .markdownlint-cli2.yaml ├── .spellcheck-en-custom.txt ├── .spellcheck.yml ├── CONTRIBUTING.md ├── LICENSE ├── Makefile ├── README.md ├── SECURITY.md └── docs ├── adr-feature-gating.md ├── backend └── README.md ├── ci ├── ci-remove-dco-check.md ├── ci-universal-actions.md └── dependency-constraints.md ├── cli ├── ilab-model-engine.md ├── ilab-model-management-standardization.md ├── ilab-model-serve-backend.md └── ilab-processes.md ├── dependency-management.md ├── docs-instructlab-website.md ├── evaluation └── eval-repo.md ├── examples └── examples-repo.md ├── github-actions-providers.md ├── github-actions-use-policy.md ├── github-merge-strategy.md ├── github-taxonomy-automation.md ├── images ├── github-taxonomy-automation.excalidraw.png └── sdg-api-interface.png ├── instructlab-app.md ├── instructlab-cli-1.0.0.md ├── knowledge-submissions-past-wikipedia.md ├── library-release-strategy.md ├── license-headers.md ├── naming-conventions.md ├── publish-strategy.md ├── rag ├── adrs │ ├── README.md │ ├── adr-vectordb.md │ ├── granite-embeddings.md │ └── template.md ├── ilab-rag-retrieval.md ├── images │ ├── ingestion-mvp.png │ ├── rag-chat.png │ ├── rag-model-training.png │ ├── rag-plug-and-play.png │ └── rag-taxonomy-no-training.png └── rag-initial-code-location.md ├── retrieval-augmented-generation └── rag-framework.md ├── schema └── taxonomy-api.md ├── sdg ├── README.md ├── sdg-api-interface.md ├── sdg-api-simplification.md ├── sdg-flow-yaml.md ├── sdg-refactor.md └── sdg-repo.md ├── sdk └── instructlab-sdk.md ├── support-utils └── support-utils-repo.md ├── taxonomy-tree-organization.md ├── training └── training-accelerate-dep.md └── ui └── project-proposal.md /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | # GitHub Dependabot configuration file 4 | version: 2 5 | updates: 6 | 7 | # Maintain dependencies for GitHub Actions 8 | - package-ecosystem: "github-actions" 9 | directory: "/" 10 | schedule: 11 | interval: "daily" 12 | - package-ecosystem: "docker" 13 | directory: "/.github/workflows" 14 | schedule: 15 | interval: "daily" 16 | -------------------------------------------------------------------------------- /.github/mergify.yml: -------------------------------------------------------------------------------- 1 | pull_request_rules: 2 | - name: label-backend 3 | description: Automatically apply backend label 4 | conditions: 5 | - or: 6 | - files~=docs/backend/.* 7 | actions: 8 | label: 9 | add: 10 | - backend 11 | -------------------------------------------------------------------------------- /.github/workflows/actionlint.dockerfile: -------------------------------------------------------------------------------- 1 | # Since dependabot cannot update workflows using docker, 2 | # we use this indirection since dependabot can update this file. 
3 | FROM rhysd/actionlint:1.7.7@sha256:887a259a5a534f3c4f36cb02dca341673c6089431057242cdc931e9f133147e9 4 | -------------------------------------------------------------------------------- /.github/workflows/actionlint.yml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | name: Lint GitHub Actions workflows 4 | on: 5 | push: 6 | branches: 7 | - "main" 8 | paths: 9 | - '.github/workflows/*.ya?ml' 10 | - '.github/workflows/actionlint.*' # This workflow 11 | pull_request: 12 | branches: 13 | - "main" 14 | paths: 15 | - '.github/workflows/*.ya?ml' 16 | - '.github/workflows/actionlint.*' # This workflow 17 | 18 | env: 19 | LC_ALL: en_US.UTF-8 20 | 21 | defaults: 22 | run: 23 | shell: bash 24 | 25 | permissions: 26 | contents: read 27 | 28 | jobs: 29 | actionlint: 30 | runs-on: ubuntu-latest 31 | steps: 32 | - name: "Harden Runner" 33 | uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e # v2.10.4 34 | with: 35 | egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs 36 | 37 | - name: "Checkout" 38 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 39 | with: 40 | fetch-depth: 0 41 | 42 | - name: "Download actionlint" 43 | run: | 44 | docker build --tag actionlint - < .github/workflows/actionlint.dockerfile 45 | 46 | - name: "Check workflow files" 47 | run: | 48 | echo "::add-matcher::.github/workflows/matchers/actionlint.json" 49 | docker run --volume="${PWD}:/repo" --workdir=/repo actionlint -color 50 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | name: Lint Markdown documents 4 | 5 | on: 6 | push: 7 | branches: 8 | - "main" 9 | paths: 10 | - '**/*.md' 11 | - '.markdownlint-cli2.yaml' 12 | - '.github/workflows/docs.yml' # This workflow 13 | pull_request: 14 | branches: 15 | - "main" 16 | paths: 17 | - '**/*.md' 18 | - '.markdownlint-cli2.yaml' 19 | - '.github/workflows/docs.yml' # This workflow 20 | 21 | env: 22 | LC_ALL: en_US.UTF-8 23 | 24 | defaults: 25 | run: 26 | shell: bash 27 | 28 | permissions: 29 | contents: read 30 | 31 | jobs: 32 | markdown-lint: 33 | runs-on: ubuntu-latest 34 | steps: 35 | - name: "Harden Runner" 36 | uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e # v2.10.4 37 | with: 38 | egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs 39 | - name: "Checkout" 40 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 41 | with: 42 | fetch-depth: 0 43 | - name: "Check Markdown documents" 44 | uses: DavidAnson/markdownlint-cli2-action@05f32210e84442804257b2a6f20b273450ec8265 # v19.1.0 45 | with: 46 | globs: '**/*.md' 47 | -------------------------------------------------------------------------------- /.github/workflows/matchers/actionlint.json: -------------------------------------------------------------------------------- 1 | { 2 | "problemMatcher": [ 3 | { 4 | "owner": "actionlint", 5 | "pattern": [ 6 | { 7 | "regexp": "^(?:\\x1b\\[\\d+m)?(.+?)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*: (?:\\x1b\\[\\d+m)*(.+?)(?:\\x1b\\[\\d+m)* \\[(.+?)\\]$", 8 | "file": 1, 9 | "line": 2, 10 | "column": 3, 11 | "message": 4, 12 | "code": 5 13 | } 14 | ] 15 | } 16 | ] 17 | } 18 | 
-------------------------------------------------------------------------------- /.github/workflows/spellcheck.yml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | name: Spellcheck 4 | 5 | on: 6 | push: 7 | branches: 8 | - "main" 9 | paths: 10 | - '**.md' 11 | - '.github/workflows/spellcheck.yml' # This workflow 12 | pull_request: 13 | branches: 14 | - "main" 15 | paths: 16 | - '**.md' 17 | - '.github/workflows/spellcheck.yml' # This workflow 18 | 19 | env: 20 | LC_ALL: en_US.UTF-8 21 | 22 | defaults: 23 | run: 24 | shell: bash 25 | 26 | permissions: 27 | contents: read 28 | 29 | jobs: 30 | spellcheck: 31 | name: Spellcheck (en_US) 32 | runs-on: ubuntu-latest 33 | steps: 34 | - name: "Harden Runner" 35 | uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e # v2.10.4 36 | with: 37 | egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs 38 | 39 | - name: "Checkout" 40 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 41 | with: 42 | fetch-depth: 0 43 | 44 | - name: Spellcheck 45 | uses: rojopolis/spellcheck-github-actions@584b2ae95998967a53af7fbfb7f5b15352c38748 # v0.49.0 46 | -------------------------------------------------------------------------------- /.github/workflows/stale_bot.yml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | name: 'Close stale issues and PRs' 4 | 5 | on: 6 | schedule: 7 | - cron: '30 1 * * *' 8 | 9 | env: 10 | LC_ALL: en_US.UTF-8 11 | 12 | defaults: 13 | run: 14 | shell: bash 15 | 16 | permissions: 17 | contents: read 18 | 19 | jobs: 20 | stale: 21 | permissions: 22 | issues: write 23 | pull-requests: write 24 | runs-on: ubuntu-latest 25 | steps: 26 | - name: "Harden Runner" 27 | uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e # v2.10.4 28 | with: 29 | disable-sudo: true 30 | egress-policy: block 31 | allowed-endpoints: > 32 | api.github.com:443 33 | 34 | - name: "Stale Action" 35 | uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0 36 | with: 37 | stale-issue-label: 'stale' 38 | stale-issue-message: > 39 | This issue has been automatically marked as stale because it has not had activity within 60 days. 40 | It will be automatically closed if no further activity occurs within 30 days. 41 | close-issue-message: > 42 | This issue has been automatically closed due to inactivity. Please feel free to reopen if you feel it is still relevant! 43 | days-before-issue-stale: 60 44 | days-before-issue-close: 30 45 | stale-pr-label: 'stale' 46 | stale-pr-message: > 47 | This pull request has been automatically marked as stale because it has not had activity within 30 days. 48 | It will be automatically closed if no further activity occurs within 7 days. 49 | close-pr-message: > 50 | This pull request has been automatically closed due to inactivity. Please feel free to reopen if you intend to continue working on it! 
51 | days-before-pr-stale: 30 52 | days-before-pr-close: 7 53 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | 3 | # Spelling 4 | dictionary.dic 5 | 6 | # python virtualenv 7 | venv 8 | 9 | # Emacs 10 | *~ 11 | \#*\# 12 | .\#* 13 | .projectile 14 | .dir-locals.el 15 | -------------------------------------------------------------------------------- /.markdownlint-cli2.yaml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | config: 4 | line-length: false 5 | no-emphasis-as-header: false 6 | no-emphasis-as-heading: false 7 | first-line-heading: false 8 | code-block-style: false 9 | no-duplicate-header: false 10 | single-trailing-newline: false 11 | globs: 12 | - "**/*.md" 13 | ignores: 14 | - ".github/**" 15 | - ".tox/**" 16 | - "venv/**" 17 | - ".venv/**" 18 | -------------------------------------------------------------------------------- /.spellcheck-en-custom.txt: -------------------------------------------------------------------------------- 1 | # make spellcheck-sort 2 | # Please keep this file sorted: 3 | Abhishek 4 | ADR 5 | agentic 6 | Akash 7 | AMDGPU 8 | Anil 9 | API 10 | api 11 | arge 12 | args 13 | arXiv 14 | backend 15 | backends 16 | benchmarking 17 | Bhandwaldar 18 | brainer 19 | Cappi 20 | checkpointing 21 | chunkers 22 | CLI 23 | cli 24 | CLI's 25 | codebase 26 | Colab 27 | compositional 28 | Conda 29 | config 30 | configs 31 | Containerfile 32 | cpp 33 | ctrl 34 | cuBLAS 35 | CUDA 36 | customizations 37 | CVE 38 | CVEs 39 | CWD 40 | Cynefin 41 | Daniele 42 | dataset 43 | datasets 44 | dateutil 45 | DBs 46 | DCO 47 | Dependabot 48 | dev 49 | disambiguating 50 | ditaa 51 | Docling 52 | docling 53 | docstring 54 | DocumentSplitter 55 | downstreams 56 | dr 57 | Dropdown 58 | DSL 59 | embedder 60 | embeddings 61 | env 62 | EP 63 | Eval 64 | eval 65 | Excalidraw 66 | exfiltrate 67 | exfiltrating 68 | Filesystem 69 | filesystem 70 | Finetuning 71 | formedness 72 | FQN 73 | freeform 74 | FSDP 75 | GB 76 | gb 77 | GFX 78 | GGUF 79 | gguf 80 | GGUFs 81 | GiB 82 | github 83 | Gmail 84 | GPTDolomite 85 | GPU 86 | gpu 87 | Guang 88 | hacky 89 | hardcode 90 | hardcoded 91 | Hema 92 | hipBLAS 93 | HyDE 94 | ilab 95 | Ilya 96 | impactful 97 | Inferencing 98 | init 99 | instantiation 100 | instructlab 101 | integrations 102 | io 103 | ISA 104 | iters 105 | itertools 106 | Jie 107 | jinja 108 | JIT 109 | JSON 110 | json 111 | Jupyter 112 | KAGGLE 113 | Kaggle's 114 | Kai 115 | Kolchinsky 116 | Kubernetes 117 | Kumar 118 | Langchain 119 | Langgraph 120 | leaderboard 121 | lifecycle 122 | lignment 123 | linters 124 | LLM 125 | LLMs 126 | llms 127 | LLVM 128 | lora 129 | Makefiles 130 | Markdownlint 131 | Martinoli 132 | md 133 | Mergify 134 | Merlinite 135 | merlinite 136 | Milvus 137 | MilvusEmbeddingRetriever 138 | MilvusLite 139 | mimimum 140 | Miniforge 141 | MiniLM 142 | Mixtral 143 | mixtral 144 | MLX 145 | MMLU 146 | modularize 147 | modularized 148 | MTEB 149 | Murdock 150 | mvp 151 | Nakamura 152 | natively 153 | networkx 154 | num 155 | numpy 156 | NVidia 157 | OCI 158 | Ollama 159 | ollama 160 | onboarding 161 | OpenAI 162 | openai 163 | OpenStax 164 | optimizers 165 | orchestrator 166 | ots 167 | PaRAGon 168 | Params 169 | Pareja 170 | PEFT 171 | Pereira 172 | PID 173 | Plag 174 | PlantUML 175 | PLOS 176 | pluggable 177 | PNG 178 | POC 179 | Podman 180 | 
podman 181 | posthog 182 | postprocessing 183 | pre 184 | preprint 185 | preprocessing 186 | prereqs 187 | productize 188 | productized 189 | PR's 190 | PSFL 191 | Pydantic 192 | pyenv 193 | PyPI 194 | pyproject 195 | PyTorch 196 | pyyaml 197 | qlora 198 | qna 199 | quantized 200 | Quantizing 201 | Radeon 202 | RDNA 203 | README 204 | rebase 205 | Ren 206 | repo 207 | repos 208 | RHEL 209 | roadmapping 210 | ROCm 211 | RTX 212 | runtime 213 | RX 214 | SaaS 215 | safetensor 216 | safetensors 217 | Salawu 218 | scalable 219 | SDG 220 | sdg 221 | SDK 222 | sdk 223 | semvar 224 | sexualized 225 | SHA 226 | sharding 227 | Shellcheck 228 | Shivchander 229 | Signoff 230 | Sigstore 231 | specifiying 232 | splitter 233 | src 234 | Srivastava 235 | Staar 236 | Standup 237 | subcommand 238 | subcommands 239 | subdirectory 240 | subfolder 241 | subprocess 242 | Sudalairaj 243 | supportability 244 | systemd 245 | Taj 246 | tatsu 247 | TBD 248 | templating 249 | Tesla 250 | th 251 | tl 252 | TODO 253 | tox 254 | tqdm 255 | traigers 256 | triager 257 | Triagers 258 | triagers 259 | txt 260 | UI 261 | ui 262 | unquantized 263 | unstaged 264 | URI 265 | USM 266 | UUID 267 | UUIDs 268 | UX 269 | vectordbs 270 | Veeradhi 271 | venv 272 | Vishnoi 273 | vLLM 274 | vllm 275 | watsonx 276 | Wikisource 277 | wikisql 278 | WIP 279 | WSL 280 | xcode 281 | XNACK 282 | XT 283 | XTX 284 | Xu 285 | YAML 286 | yaml 287 | -------------------------------------------------------------------------------- /.spellcheck.yml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | matrix: 4 | - name: markdown 5 | aspell: 6 | lang: en 7 | d: en_US 8 | camel-case: true 9 | mode: markdown 10 | sources: 11 | - "**/*.md|!.tox/**" 12 | dictionary: 13 | wordlists: 14 | - .spellcheck-en-custom.txt 15 | pipeline: 16 | - pyspelling.filters.context: 17 | context_visible_first: true 18 | escapes: '\\[\\`~]' 19 | delimiters: 20 | # Ignore multiline content between fences (fences can have 3 or more back ticks) 21 | # ```language 22 | # content 23 | # ``` 24 | - open: '(?s)^(?P *`{3,}).*?$' 25 | close: '^(?P=open)$' 26 | # Ignore text between inline back ticks 27 | - open: '(?P`+)' 28 | close: '(?P=open)' 29 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | See [community/CONTRIBUTING.md](https://github.com/instruct-lab/community/blob/main/CONTRIBUTING.md). 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: help 2 | help: 3 | @awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m<target>\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf " \033[36m%-18s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST) 4 | 5 | .PHONY: spellcheck 6 | spellcheck: .spellcheck.yml ## Spellcheck markdown files 7 | pyspelling --config $< 8 | 9 | .PHONY: spellcheck-sort 10 | spellcheck-sort: .spellcheck-en-custom.txt ## Sort spellcheck dictionary 11 | sort -d -f -o $< $< 12 | 13 | # 14 | # If you want to see the full commands, run: 15 | # NOISY_BUILD=y make 16 | # 17 | ifeq ($(NOISY_BUILD),) 18 | ECHO_PREFIX=@ 19 | CMD_PREFIX=@ 20 | PIPE_DEV_NULL=> /dev/null 2> /dev/null 21 | else 22 | ECHO_PREFIX=@\# 23 | CMD_PREFIX= 24 | PIPE_DEV_NULL= 25 | endif 26 | 27 | .PHONY: md-lint 28 | md-lint: ## Lint markdown files 29 | $(ECHO_PREFIX) printf " %-12s ./...\n" "[MD LINT]" 30 | $(CMD_PREFIX) podman run --rm -v $(CURDIR):/workdir --security-opt label=disable docker.io/davidanson/markdownlint-cli2:latest > /dev/null 31 | 32 | .PHONY: md-lint-fix 33 | md-lint-fix: ## Fixes all fixable linting errors 34 | $(ECHO_PREFIX) printf " %-12s ./...\n" "[MD LINT]" 35 | $(CMD_PREFIX) podman run --rm -v $(CURDIR):/workdir --security-opt label=disable docker.io/davidanson/markdownlint-cli2:latest --fix > /dev/null 36 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # InstructLab Developer Documentation 2 | 3 | This repository contains design artifacts that define the architecture and 4 | design of components in the InstructLab project. The repo also serves as a home 5 | for technical policies that apply across all components. 6 | 7 | ## Governance 8 | 9 | Governance for InstructLab overall is documented in the [community 10 | repository](https://github.com/instructlab/community/blob/main/GOVERNANCE.md). 11 | 12 | The [InstructLab Oversight 13 | Committee](https://github.com/instructlab/community/blob/main/MAINTAINERS.md) is 14 | responsible for the contents of this repository. 15 | 16 | The rules for merging depend on the type of change in question and its scope of impact. If you 17 | are unsure about the scope of impact for a change, i.e. whether a change is trivial or non-trivial, 18 | please ping the Oversight Committee for help. 19 | 20 | * Trivial changes may be merged with 1 review from any InstructLab maintainer. 21 | * Examples of trivial changes include minor wording adjustments or typo fixes in 22 | documentation, CI fixes, CI dependency updates, etc. 23 | * Non-trivial changes have more loosely defined requirements. Input should be sought 24 | out from maintainers of relevant components. The broader the scope or the more 25 | controversial the change, the broader the consensus required for 26 | merging.
The final approval and merge (or action, e.g. deleting a repo) 27 | falls to two maintainers of any InstructLab Organization repository, plus an 28 | additional third maintainer of any InstructLab Organization repository who 29 | merges the PR after verifying that sufficient reviews have been given. If there are 30 | disputes on the design document that cannot be resolved, an Oversight Committee 31 | member can be consulted as an arbitrator. These approvals ensure that 32 | adequate opportunity and attention have been given by the affected parties. 33 | * Examples of non-trivial changes include approving proposals for new repositories, 34 | creation of new repositories, changes to organization-level GitHub settings, archiving 35 | or deleting repositories, design proposals, etc. 36 | * Any maintainer or Oversight Committee member may request that a change receive 37 | a full vote from the Oversight Committee. More substantial policy changes or a 38 | proposed new project under InstructLab are examples of when this may be 39 | appropriate. 40 | 41 | ## Formatting Guidelines 42 | 43 | Design documents should be placed in `docs/`. 44 | 45 | ### Text 46 | 47 | Files should be in [Markdown](https://github.github.com/gfm/) format. 48 | 49 | ### Images 50 | 51 | Diagrams are encouraged, but must be submitted in a format where they can be 52 | easily updated in the future as needed. Some options include: 53 | 54 | * [Mermaid](https://github.com/mermaid-js/mermaid#readme) 55 | * [Excalidraw](https://excalidraw.com/) 56 |   * Be sure to leave "Embed Scene" turned on when exporting the PNG. 57 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | You can find information on how to report a potential security vulnerability, as well as where to subscribe to receive security alerts, on the project's [Security Page](https://github.com/instructlab/.github/blob/main/SECURITY.md). 2 | -------------------------------------------------------------------------------- /docs/adr-feature-gating.md: -------------------------------------------------------------------------------- 1 | # InstructLab Feature Gating Patterns 2 | 3 | ## Context 4 | 5 | Motivated by the currently [in-progress RAG development](https://github.com/instructlab/dev-docs/pull/161) in [dev preview scope](https://access.redhat.com/support/offerings/devpreview), there is a use case for introducing feature gating (sometimes equivalently called "feature flags"). The specific use case desired is to have default settings for feature gating in the application that can be overridden using environment variables. 6 | 7 | One common pattern is per-feature gating, i.e., configuration per feature. This fine-grained control might be desirable in the future, especially to enable user segmentation for individual experimental features. We do not need that level of fine-grained control at this time. 8 | 9 | There is precedent [in OpenShift](https://docs.openshift.com/container-platform/4.17/nodes/clusters/nodes-cluster-enabling-features.html) for enabling sets of features based on support scope. Following this pattern is consistent with OpenShift terminology and meets our needs at this time. 10 | 11 | ## Decision 12 | 13 | InstructLab will adopt feature gating based on feature sets, using the OpenShift terminology of `DevPreviewNoUpgrade` and `TechPreviewNoUpgrade`, with the enabled feature set able to be overridden using an environment variable.
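As a rough illustration of the intent, the sketch below shows what a feature-set gate with an environment-variable override could look like. This is a minimal sketch, not a committed implementation: the `ILAB_FEATURE_SCOPE` variable name, the helper name, and the example feature name are all assumptions.

```python
import os

# Scope names borrowed from the OpenShift terminology above.
DEV_PREVIEW = "DevPreviewNoUpgrade"
TECH_PREVIEW = "TechPreviewNoUpgrade"

# Hypothetical mapping of scopes to the features they enable by default;
# the default (empty) scope enables no gated features.
_SCOPE_FEATURES = {
    TECH_PREVIEW: {"rag"},
    DEV_PREVIEW: {"rag"},  # dev preview would typically be a superset
}

def feature_enabled(feature: str) -> bool:
    # The environment variable overrides the application's default scope.
    scope = os.environ.get("ILAB_FEATURE_SCOPE", "")
    return feature in _SCOPE_FEATURES.get(scope, set())
```

For example, a user could run with `ILAB_FEATURE_SCOPE=DevPreviewNoUpgrade` to turn on the dev preview feature set without any code change.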
14 | 15 | ## Status 16 | 17 | Proposed 18 | 19 | ## Consequences 20 | 21 | * Feature gating concepts will be consistent with OpenShift, lowering the learning curve of one application's configuration when coming from the other. 22 | * We will not need to (*yet*) spend the time to develop our own taxonomy for feature gating. 23 | * Messaging to users about support commitments when using dev preview or tech preview will be clear - in particular, no version upgrade commitments are made, and there is no supported way to disable those scopes in order to revert to a supported application state. 24 | * We have a decision to make about whether to introduce a new dependency for feature flagging or make our own simple one. 25 | * There is a migration path to finer-grained feature gating via the [`CustomNoUpgrade`](https://github.com/openshift/api/blob/master/config/v1/types_feature.go#L54) scope. 26 | * We will have to make sure to communicate what these feature gate scopes mean to users and what commitments they entail, in documentation and/or otherwise. 27 | -------------------------------------------------------------------------------- /docs/backend/README.md: -------------------------------------------------------------------------------- 1 | # InstructLab Backend Working Group 2 | 3 | ## Introduction 4 | 5 | The InstructLab project to date has produced a 6 | [CLI](https://github.com/instructlab/instructlab) that allows the execution of 7 | the InstructLab workflow on a variety of single-machine footprints. 8 | 9 | Some members of the InstructLab project also operate backend services that 10 | produce regular full model builds using contributions to the 11 | [taxonomy](https://github.com/instructlab/taxonomy/). This runs in a larger 12 | clustered environment. This original backend is not built in a way that is 13 | easily reusable by others. It is a goal of the InstructLab project to produce an 14 | evolution of these backend capabilities that is more easily reusable by others. 15 | 16 | The InstructLab Backend Working Group is a group of contributors 17 | interested in evolving the InstructLab backend capabilities. The group is open 18 | to all contributors interested in this work. 19 | 20 | ## Group Membership 21 | 22 | There is no formal membership of this group. Those interested in contributing 23 | just need to show up and participate in the work of the group. 24 | 25 | ## Primary Artifacts Produced by the Group 26 | 27 | The primary artifacts produced by the InstructLab Backend Working Group are 28 | architecture and design docs in the `docs/backend` directory of this repository. 29 | This is where the group will work out high-level planning for the various components 30 | that should be created. 31 | 32 | Once individual components are created and implementation is in progress, code 33 | and associated documents for each component will be stored in that component's 34 | repository. 35 | 36 | ## Communication 37 | 38 | Communication among the group occurs in the following ways: 39 | 40 | - **Meetings**: The group meets regularly on a schedule determined by the group. 41 | Meetings are open to all interested contributors. Meeting times are posted in 42 | the [InstructLab Community 43 | Calendar](https://calendar.google.com/calendar/embed?src=c_23c2f092cd6d147c45a9d2b79f815232d6c3e550b56c3b49da24c4b5d2090e8f%40group.calendar.google.com).
44 | 45 | - **Discord**: The group has channels for the corresponding components, such as `#core`, `#sdg`, `#eval`, and `#infra`, which can all be found in the [InstructLab Discord](https://instructlab.ai/discord). 46 | 47 | - **Slack**: The group uses the `#backend` channel in the InstructLab Slack. 48 | 49 | - **Mailing Lists**: There is a [development mailing 50 | list](https://groups.google.com/a/instructlab.ai/g/dev) should the group desire 51 | to share or discuss topics by email. 52 | 53 | - **GitHub**: Artifacts produced by this group will be stored in the 54 | `docs/backend` directory of this repository and will be reviewed via GitHub 55 | pull requests. GitHub Issues may also be used to track work items as needed. 56 | -------------------------------------------------------------------------------- /docs/ci/ci-remove-dco-check.md: -------------------------------------------------------------------------------- 1 | # Remove DCO Sign-off Check 2 | 3 | ## Overview 4 | 5 | Presently, all repositories under our `instructlab` GitHub org require contributors to add a [Developer Certificate of Origin (DCO)](https://developercertificate.org/) sign-off in each commit to indicate that they agree to the DCO legal terms. While these DCO sign-offs can be a great way for us to validate that a contributor agrees to the DCO legal terms, many upstream projects actively use the DCO without explicitly requiring contributors to add "Signed-off-by" to their commits. In other words, we can still require contributors to agree to the DCO terms, but we don't necessarily _need_ contributors to add DCO sign-offs in each commit. 6 | 7 | We can look at the `gnuradio/volk` repository as an example of how to enforce the DCO terms without explicitly requiring contributors to add sign-offs. Essentially, they have added a copy of the DCO text to their repository under [docs/DCO.txt](https://github.com/gnuradio/volk/blob/d9d6abfcc4aaeec2d60abd482f74fded158818b5/docs/DCO.txt) and referenced that file [here in their CONTRIBUTING.md](https://github.com/gnuradio/volk/blob/308948abf8384bb4bf6467e14b585df708789782/docs/CONTRIBUTING.md?plain=1#L22) guide. 8 | 9 | ## Changes Going Forward 10 | 11 | We should mimic how other open source projects implicitly enforce the DCO terms. More specifically, we should make an exact copy of the [DCO agreement document](https://developercertificate.org/) and save that copy to a `DCO.txt` file located at the root of each InstructLab repository (i.e., on the same level as the main `README.md` file in each repository). While some open source repositories opt to place the DCO agreement terms under a `docs/` folder, placing `DCO.txt` at the root of each repository makes the DCO agreement visible and easily accessible for all contributors. Finally, we can then update each repository's `CONTRIBUTING.md` guide to clearly state that anyone who makes a contribution to InstructLab will be implicitly agreeing to the DCO terms located under `DCO.txt` -- and a relative link to that file shall be provided within each InstructLab repository. -------------------------------------------------------------------------------- /docs/ci/ci-universal-actions.md: -------------------------------------------------------------------------------- 1 | # Create Separate Repo for Custom GitHub Actions 2 | 3 | ## Idea Overview 4 | 5 | Create a separate repository within the `instructlab` GitHub org to house our custom CI GitHub Actions.
6 | 7 | ## Motivation for this Proposal 8 | 9 | Within our `instructlab` GitHub org, we have an in-house GitHub action that we use across several repos: `free-disk-space`. 10 | 11 | Examples: 12 | 13 | - [`free-disk-space` in the `eval` repo](https://github.com/instructlab/eval/blob/main/.github/actions/free-disk-space/action.yml) 14 | - [`free-disk-space` in the `instructlab` repo](https://github.com/instructlab/instructlab/blob/main/.github/actions/free-disk-space/action.yml) 15 | - [`free-disk-space` in the `sdg` repo](https://github.com/instructlab/sdg/blob/a532a8d99ffe447152948e9442464923aba62637/.github/actions/free-disk-space/action.yml) 16 | 17 | This GitHub action is universal as it is a simple script used to clean up disk space on GitHub runners and can be modified to free disk space on our CentOS-based EC2 runners. 18 | 19 | Note that all of these in-house `free-disk-space` action files are exactly the same, so we essentially have the same file stored in three different repos. 20 | 21 | ## Pros and Cons 22 | 23 | Below are some pros and cons of creating a separate repository to house our GitHub actions. 24 | 25 | ### Pros 26 | 27 | - We will have one action file in one location 28 | - Easier to make changes in one location instead of (our present) three locations 29 | - Contributors will know where to look for in-house action files 30 | - We can create releases and utilize version control 31 | 32 | ### Cons 33 | 34 | - Extra repository to maintain. 35 | - We can't publish any of our actions to the GitHub marketplace if we have multiple actions stored in one repository.\* 36 | 37 | \* In reference to the last bullet point, [the GitHub docs for publishing Actions](https://docs.github.com/en/actions/sharing-automations/creating-actions/publishing-actions-in-github-marketplace#about-publishing-actions) states: 38 | > Actions are published to GitHub Marketplace immediately and aren't reviewed by GitHub as long as they meet these requirements: 39 | > 40 | > - The action must be in a public repository. 41 | > - Each repository must contain a single action. 42 | 43 | If we do care about publishing our actions, then we should consider creating separate repositories for these actions. If we don't care to publish, then this is a non-issue. (See next section below.) 44 | 45 | ## Additional Info 46 | 47 | Even if we cannot publish our actions to the GitHub marketplace, we can _still_ use these actions in our repository. For example, if our repo was named `ci-actions` with this layout: 48 | 49 | ```bash 50 | . 
51 | ├── custom-action-1/ 52 | │   ├── action.yml 53 | ├── custom-action-2/ 54 | │   ├── action.yml 55 | ``` 56 | 57 | ...then we'd reference them in our other repos like so: 58 | 59 | ```yaml 60 | name: Some Name 61 | 62 | on: 63 | workflow_dispatch: 64 | 65 | jobs: 66 | some-job: 67 | runs-on: ubuntu-latest 68 | 69 | steps: 70 | - name: Checkout code 71 | uses: actions/checkout@v4 72 | 73 | - name: Use custom action 1 74 | uses: instructlab/ci-actions/custom-action-1@main 75 | 76 | - name: Use custom action 2 77 | uses: instructlab/ci-actions/custom-action-2@main 78 | ``` 79 | 80 | Reference: [StackOverflow - "How to Publish Multiple GitHub Actions from a Single Repo and Call Them from Another Repo"](https://stackoverflow.com/a/79100136) -------------------------------------------------------------------------------- /docs/ci/dependency-constraints.md: -------------------------------------------------------------------------------- 1 | # Dependency Constraints in InstructLab CI 2 | 3 | This document describes how Python dependencies should be managed in 4 | InstructLab CI. 5 | 6 | ## Goals 7 | 8 | 1. Ensure that the CI environment is consistent and reproducible. 9 | 2. Ensure that new dependency releases do not break the CI environment. 10 | 3. Ensure that we adopt new dependency releases in the CI environment in a 11 | timely manner. 12 | 13 | ## Approach 14 | 15 | ### Pin all versions with constraints files 16 | 17 | Each repository should contain a [pip 18 | constraints](https://pip.pypa.io/en/stable/user_guide/#constraints-files) file 19 | that lists the pinned versions of all dependencies used in the CI environment. 20 | 21 | In case a repository supports multiple platforms (`linux` vs `darwin`) or 22 | accelerators (`cuda`, `cpu`, `hpu`), multiple constraints files may be 23 | generated. 24 | 25 | The constraints files should be used in all CI jobs that install Python 26 | dependencies (using `pip` or otherwise), including linters, unit tests, 27 | integration tests, and functional tests. 28 | 29 | ### Update constraints files regularly 30 | 31 | Each repository should define a new `tox` target called `constraints` that will 32 | be used to re-generate all the constraints files in the project. 33 | 34 | Updates to these files should be generated automatically by the CI system using 35 | the [update-constraints](https://github.com/instructlab/ci-actions/tree/main/actions/update-constraints) 36 | action from the `ci-actions` repository and should not be modified manually 37 | (subject to rare exceptions). A periodic CI job should be added to ensure this 38 | happens on a regular basis (at least once a week). 39 | 40 | The job will update the constraints files and post the result as a PR for review. 41 | The PR will be validated by all the relevant CI jobs. The project core team is 42 | expected to review these PRs in a timely manner (within 3 business days). The 43 | team should make sure that all the relevant CI jobs are passing before merging 44 | the PR. 45 | 46 | ### Uncap all dependencies 47 | 48 | At this point, no new dependency releases should affect the CI environment. 49 | Projects should then uncap all their dependencies in requirements files, as per 50 | the [dependency management policy](../dependency-management.md). 51 | 52 | Note: In rare situations, a cap may be justified. For example, when we know for 53 | sure that a new release of the upstream dependency will break the project, and 54 | when we don't have capacity to deliver a fix in a timely manner.
These 55 | situations should be rare and a mitigation plan should be in place to uncap the 56 | dependency. 57 | 58 | Specifically: 59 | 60 | - A tracking issue should be reported in the issue tracking system capturing 61 | any known details about the issue. 62 | - A new **temporary** constraint should be added to the `constraints-dev.txt.in` 63 | file. The temporary constraint should refer to the tracking issue in a 64 | comment above it. 65 | - The tracking issue should be assigned a high priority and considered a 66 | blocker for an upcoming release. 67 | -------------------------------------------------------------------------------- /docs/cli/ilab-model-engine.md: -------------------------------------------------------------------------------- 1 | # Redesign `ilab` Command Structure to be Resource Based 2 | 3 | This document describes a new structure for `ilab`, consisting of sub-commands that act as parent or "group" commands for further sub-commands. Currently the only group command is `ilab`. If the models `ilab` produces are meant to be the gold standard for open source AI, the tool to use and manage these models needs to be fully-fledged and extensible, especially for models we have not produced but should be able to run. 4 | 5 | Open source container tools like [podman](https://podman.io/) and [docker](https://www.docker.com/) are commonly viewed as engines. `ilab` should be a model engine, managing the deployment and creation of AI models. To be clear, I will be using tools like podman as a structural analogy here with the full understanding that they have different implications and use cases. 6 | 7 | ## Key Component 8 | 9 | ### InstructLab Structure Redesign 10 | 11 | Here are two outlines. They represent the state of `ilab` before and after this enhancement. 12 | 13 | ```console 14 | ilab 15 | | 16 | |_____chat 17 | |_____check 18 | |_____convert 19 | |_____download 20 | |_____generate 21 | |_____init 22 | |_____list 23 | |_____serve 24 | |_____test 25 | |_____train 26 | ``` 27 | 28 | vs. after: 29 | 30 | ```console 31 | ilab 32 | | 33 | |_______model 34 | | | 35 | | |____convert 36 | | |____download 37 | | |____train 38 | | |____serve 39 | | |____chat 40 | | |____evaluate 41 | | | 42 | |_______data 43 | | | 44 | | |____generate 45 | | | 46 | |_______config 47 | | | 48 | | |____init 49 | | | 50 | |_______taxonomy 51 | | | 52 | | |____diff 53 | | |____check 54 | | |____download 55 | ``` 56 | 57 | The main point of this structure is to introduce a resource-based hierarchy and to improve the usability of the existing `ilab` structure. At the top level of commands, we are not thinking about verbs but about the resources commonly managed in `ilab`; under them fall some actions. Some key problems with `ilab` currently are: 58 | 59 | 1. Ambiguity in "what" we are generating, training, etc. 60 | - The current structure of `ilab` requires an in-depth knowledge of the ordering of commands and background knowledge of how our CLI is built. This is not the best way to grow. Commands that are not intuitive and require a deep dive into the docs just to get started might not engage users as we hope. Having an intuitive structure in which people can easily see from the docstring what each command does and how they might use it makes more sense. We are generating data, then training a model, running a server to interact with that model, and finally chatting with the model. 61 | 2.
Some commands like `ilab test` and `ilab list` should be renamed to show their true purpose (`ilab model evaluate` and `ilab taxonomy diff`). 62 | 3. We will hit a point where we want to add more commands to `ilab` due to expected integration with the backend, model registries, etc. This list will grow to a point where this form of top-level group organization is not maintainable. 63 | 64 | In general, moving to this structure allows for more growth when `ilab` reaches a stable release and encourages broader usage of the tool for all sorts of open-source AI models. 65 | 66 | #### Goals for 5/30/2024 milestone release of CLI 67 | 68 | For the next few milestones, it has been identified that the overall structure shift should be in place. We should aim to have the base hierarchy of commands in place that will exist for the future of `ilab` so that when users become familiar with the CLI, further drastic changes are not needed. 69 | 70 | This means we will now have commands that have a parent group `ilab` followed by a sub-command/group terminated by a grandchild command: `ilab model list`, for example. This workflow not only adds purposeful organization to this project, but will encourage contributors to add commands as they use `ilab` and file bugs in specific parts of the codebase that need work, as opposed to the current flow, which is hard to keep track of from a user's point of view. 71 | 72 | ### Necessity of an input -> configuration -> result mental model 73 | 74 | The commands uncovered by adding this structure resemble the typical architecture for an "engine". `ilab` is not complete without the mechanisms to create, list, delete, and inspect models. Models, as container images do in container engines, act as the configuration for the end result: the chat process. The interesting thing about this analogy is that there needs to be a precursor to configuration as well. There is the container image, the container, and the `Containerfile`. The `Containerfile` is the raw user input that leads to an image. 75 | 76 | `ilab` needs to have a clear source of information, a result of compiling this source, and a "running" end result. Having these three steps purposefully delineated creates the need for commands with which users can manipulate and act upon each of these steps. 77 | 78 | The mental model a user has impacts the way they use the application and whether they view it as a proof of concept. 79 | 80 | Here is a diagram of what I mean when I describe this mental model: 81 | 82 | ```console 83 | SOURCE SINK 84 | 85 | User has uncommitted changes in their taxonomy Data is generated and then the user runs A model is created, and the user can tag the model After converting the model, the 86 | or specifies `--skill` `--knowledge` in the ==> `ilab model train` using the newly generated data ==> or just list it using the new model commands ==> user runs the chat process by specifying `ilab model serve` followed by `ilab model chat` 87 | new `ilab data generate` 88 | 89 | ``` 90 | 91 | So in `ilab` the clear source of information is the taxonomy, or the new skills/knowledge brought into generate. This data then goes through processes that end up in the sink, which is a running/actionable process with which the user can interact. Providing commands that clearly display these different processes is key. Groups for these larger "buckets" of processes make sense as well due to the likelihood of more commands being needed to properly interact with the newly generated model.
92 | 93 | ### Alternatives 94 | 95 | An alternative flow to `ilab` -> `child-command` -> `grandchild-command` is: 96 | 97 | `ilab` -> `verb` -> `noun`. The positive of this approach is that commands would sound better: `ilab generate data`. However, there are a few negatives. While this is easier to say, it makes less sense from an organizational standpoint: 98 | 99 | ```console 100 | ilab 101 | | 102 | |_______download__| 103 | |_______convert___|-----model 104 | |_______train_____| 105 | ``` 106 | 107 | This structure works when commands are ALL linked to the same `verb`. What happens when a group like `download` gets a command that the others don't? Then it looks more like this: 108 | 109 | ```console 110 | ilab 111 | | 112 | |_______download 113 | | |____model 114 | | |____taxonomy 115 | |_______convert 116 | | |_____model 117 | |_______train 118 | | |_____model 119 | ``` 120 | 121 | This is opposed to the structure in this EP, which might have a duplicate here and there for something like `download`, but in the verb-first structure duplicate commands are the norm and will result in a larger and clunkier codebase that is confusing to read. 122 | 123 | This first structure looks nice, and `ilab download model` sounds nice. However, in terms of implementation, this makes little sense. In [click](https://click.palletsprojects.com/en/8.1.x/), the CLI library we use, `ilab` is a "group" that commands are currently "grouped under". In this model, `ilab` would be a group, `download`, `convert`, etc. would be sub-groups, and `model` would be a command under ALL of these groups. This would require an implementation of the `model` function in different packages, all linking to a different group parent. From the perspective of user contributions and general code readability, this doesn't make much sense. Part of this design is to encourage user contributions by making the structure of the codebase logical. While these commands sound better, they are harder to group and understand. 124 | 125 | ### General workflow as compared to alternatives 126 | 127 | Currently, one has to: 128 | 129 | 1. `ilab init` 130 | 2. `ilab download` 131 | 3. `ilab serve` 132 | 4. `ilab generate` 133 | 5. `ilab train` 134 | 6. `ilab convert` 135 | 7. `ilab serve --model XXX` 136 | 8. `ilab chat` 137 | 138 | How to use `ilab` with this new structure: 139 | 140 | 1. `ilab config init` 141 | 2. `ilab model download` 142 | 3. `ilab data generate` 143 | 4. `ilab model train` 144 | 5. `ilab model serve` 145 | 6. `ilab model chat` 146 | 147 | It is now clear what is happening. For example, one may ask: "What does `ilab init` do?", while `ilab config init` clearly initializes the CLI's config. A clearer example is `ilab generate`. Are we generating a model? A config? No, we are generating *data*: `ilab data generate`. 148 | 149 | ## Changes to Existing Flow 150 | 151 | The current `ilab` commands will still work. Users will be able to type commands like `ilab model train` or `ilab train` for the foreseeable future to ensure feature parity. Eventually, this alias should be removed, and only the sub-commands should remain. 152 | 153 | ## Known Issues 154 | 155 | ### click 156 | 157 | click doesn't like the setup we currently have in `ilab`. So, adding sub-parent commands won't be as easy as creating a new `click.group`. We will most likely need to make different libraries, each of which is a click group.
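For illustration only, here is a minimal sketch of the nested-group approach in click. The command names mirror the proposal above, but the single-file layout is just for brevity; as noted, the real sub-groups would likely live in separate libraries/packages:

```python
import click

@click.group()
def ilab():
    """Parent group: the `ilab` entry point."""

@ilab.group()
def model():
    """Sub-group for model commands (serve, train, chat, ...)."""

@ilab.group()
def data():
    """Sub-group for data commands (generate, ...)."""

@model.command()
def serve():
    """Placeholder body for `ilab model serve`."""
    click.echo("serving a model...")

@data.command()
def generate():
    """Placeholder body for `ilab data generate`."""
    click.echo("generating data...")

if __name__ == "__main__":
    ilab()
```

Invoking `python cli.py model serve` would then route through the `ilab` group to the `model` sub-group, which is exactly the parent group -> sub-group -> grandchild command shape described above.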
158 | -------------------------------------------------------------------------------- /docs/cli/ilab-model-management-standardization.md: -------------------------------------------------------------------------------- 1 | # Standardizing model storage, management and referencing in the CLI and libraries 2 | 3 | ## Introduction 4 | 5 | - **Purpose**: Standardizing how models are stored, managed and accessed via flags used in CLI tools and libraries to ensure consistency, usability, and maintainability. 6 | - **Scope**: Covers model directory structure in cache and all model-related flags used in the command-line interface (CLI) and libraries within the project. 7 | - **Audience**: Developers, maintainers, and contributors involved in the development and usage of the CLI tools and libraries. 8 | 9 | ## Summary 10 | 11 | This proposal aims to establish a consistent practice for how models are managed within InstructLab. It does so in two parts: one proposes a consolidation of all existing model-related flags, and the other proposes a change to how models are organized in the cache upon download, and thereby referenced within various operations. 12 | 13 | ## Motivation 14 | 15 | Currently, there are 10+ instances across all major commands and subcommands that accept either `--model`, `--model-path`, `--model-name`, or `--model-dir`; all of which serve slightly different purposes, and might handle different use cases including local vs. remote models, relative vs. absolute paths, etc. This leads to a significant amount of avoidable confusion among developers and users alike. Introducing some consolidation and consistency in this aspect would provide a big UX improvement. 16 | 17 | In addition to this, we currently support multiple sources for model downloads. There could be models that span multiple sources and also have different versions and tags that users may want to be able to work with. Currently, there is a lack of uniformity in how we organize models downloaded from Hugging Face vs. OCI registries. We do not have a way to differentiate and maintain unique identities for models in cache, based on their sources. We also currently don't have the ability to support version management. Users are only able to work with one version of a given model at a time, as downloading a different version just overwrites their existing model. 18 | 19 | ## Proposal 20 | 21 | ### Part 1: Narrowing down to a singular model-related flag 22 | 23 | #### Problem Statement - Section 1 24 | 25 | We want to streamline the number of model-related flags available, while establishing consistency and predictability between their uses, and handle three separate use cases: absolute paths, relative paths, and remote repository names. 26 | 27 | #### Suggested approach - Section 1 28 | 29 | - Retain `--model` and deprecate all other model-related flags 30 | - Alternatively, we could retain `--model-path`, but passing in a repo name to a flag called "model-path" seems a little more awkward than passing in a path to a field just called "model" 31 | - `--model` should accept both paths as well as strings (for repo names) 32 | - First, `--model` should simply check whether the supplied path exists. This would include determining whether the provided path is absolute, or relative to wherever the calling program is.
33 | - If it exists, we should run an additional check to determine if it points to a safetensors or GGUF model (use the existing `is_model_safetensors` and `is_model_gguf` checks for this) 34 | - If the path does not exist, or exists but is not determined to be a valid model, then move on 35 | - If the path is not found after step 1, check against all tracked models/checkpoints/adapters, etc. currently found in `~/.local/cache` and/or `~/.local/checkpoints`. If not found, move on 36 | - If the supplied content is neither an absolute path nor a relative path, assume it is the name of a remote repo on HF and download it 37 | - Alternatively, we can error out here and require that the user download the model explicitly via `ilab model download` 38 | 39 | This would standardize the behavior of the `--model` flag across all the commands that it appears in. There could be a dedicated model resolver function that implements the process described above (a sketch follows at the end of Part 2). 40 | 41 | The only exception may be `ilab model download`, which contains a `--model-dir` flag that acts as a sink rather than a source. This flag would benefit from being renamed to `--destination` instead. 42 | 43 | ### Part 2: Standardizing what gets passed _into_ the model flag (value) 44 | 45 | #### Problem Statement - Section 2 46 | 47 | In addition to standardizing the flag itself, we must also standardize what gets passed INTO the flag, i.e., the value passed to `--model`. The format used to reference models should work consistently regardless of whether the user is referencing 48 | a local model or a remote one. They should be able to use a consistent string to reference a given, specific model under all circumstances. 49 | 50 | #### Suggested approach - Section 2 51 | 52 | - Standardize around the usage of a model's full URL as the way to reference that model at all times (e.g., `quay.io/ai-lab/models/granite-7b-lab`). 53 | - Download logic is updated such that models are always downloaded into `~/.local/cache/instructlab/models` under sub-directories that follow the same structure as their URL. We also account for model versioning by doing the following: 54 | - Creation of dedicated sub-directories based on tag/branch/commit SHA 55 | - Creation of a `.metadata` file that records the tag/branch/commit information 56 | - Keeping these in mind, an example of what the final model path could look like is: `~/.local/cache/instructlab/models/quay.io/ai-lab/models/granite-7b-lab/v1.1` 57 | - Users use the full URL `quay.io/ai-lab/models/granite-7b-lab` when specifying where to download the model from. Thereafter, users continue to use this string (with the version included) to reference this model from cache, as this will now match the relative path of the model (based on point #4 of the suggested approach in Part 1) 58 | - E.g., `ilab model chat --model quay.io/ai-lab/models/granite-7b-lab/v1.1` 59 | - Alternatively, we could adopt standard convention and allow specification of the version through a colon (`ilab model chat --model quay.io/ai-lab/models/granite-7b-lab:v1.1`) and extract the version from this URL and use it to locate the right version folder 60 | - There could be some UX enhancements we add on top, such as allowing users to just specify "granite-7b-lab" and using some logic to determine which sub-folder and version to default to if multiple copies of that model exist across different sources and versions, similar to Podman 61 | 62 | An ongoing work-in-progress effort for this can be found at: [#1895](https://github.com/instructlab/instructlab/pull/1895).
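To tie the two parts together, here is a minimal sketch of the dedicated model resolver mentioned in Part 1, combined with the Part 2 cache layout. The helper name `is_valid_model` stands in for the existing `is_model_safetensors`/`is_model_gguf` checks, and the error-out behavior follows the stricter alternative above; both are assumptions for illustration:

```python
from pathlib import Path

# Cache location from the Part 2 layout above.
CACHE_DIR = Path.home() / ".local" / "cache" / "instructlab" / "models"


def is_valid_model(path: Path) -> bool:
    # Stand-in for the existing `is_model_safetensors` and
    # `is_model_gguf` checks referenced in Part 1.
    return path.exists()


def resolve_model(model: str) -> Path:
    # 1. Absolute path, or path relative to the calling program.
    candidate = Path(model).expanduser()
    if candidate.exists() and is_valid_model(candidate):
        return candidate

    # 2. Reference relative to the cache, matching the URL-shaped
    #    layout, e.g. "quay.io/ai-lab/models/granite-7b-lab/v1.1".
    cached = CACHE_DIR / model
    if cached.exists() and is_valid_model(cached):
        return cached

    # 3. Neither: treat it as a remote repo name and either download it
    #    or, as in the stricter alternative above, error out.
    raise FileNotFoundError(
        f"{model!r} is neither a local path nor a cached model; "
        "download it first with `ilab model download`"
    )
```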
63 | 64 | The proposed combined approach will resolve [#2200](https://github.com/instructlab/instructlab/issues/2200), [#1871](https://github.com/instructlab/instructlab/issues/1871), and all issues associated with them. 65 | 66 | #### Open question 67 | 68 | One issue with following the suggested approach would be that we might like to store Hugging Face models under `huggingface.co/` in the cache, similar to `quay.io/` for example. However, the Hugging Face API expects users to specify models following the pattern 69 | `<namespace>/<repo_name>` and does not accept `huggingface.co/<namespace>/<repo_name>`. As such, we could still store the models that way and have users specify `huggingface.co` for the sake of uniformity, and have some logic to strip out the `huggingface.co` from the URL before sending the API request. This seems rather clunky and unnecessary. On the other hand, if we continue storing Hugging Face models in the same `<namespace>/<repo_name>` format, it would break the pattern, with most models being 70 | collected under their host domains and Hugging Face models arbitrarily stored one level higher than the rest. What's an acceptable solution in this case? 71 | 72 | ## Follow-up work 73 | 74 | - `ilab model list` is updated to include a `version` column that displays all available versions for a given model. This logic should read from the `.metadata` files to pull the version info. Existing models already downloaded by users won't contain this file and hence should automatically have a version of `n/a`, since we cannot determine the version of those models after the fact 75 | - `ilab model list` is updated to accept `--adapters` and `--checkpoints` flags to act as filters and contain dedicated sections to list model adapters and checkpoints 76 | - The existing `--list-checkpoints` could be deprecated for uniformity reasons 77 | 78 | ## How would backwards-compatibility be handled? 79 | 80 | All other model flags will be deprecated for a couple of releases and called out in the release notes. They will eventually be removed. 81 | The fields in the config file will need to be updated to match `--model`, which might be a breaking change and may warrant bumping the config version. This might require implementation of some kind of automatic config conversion mechanism between versions. -------------------------------------------------------------------------------- /docs/cli/ilab-model-serve-backend.md: -------------------------------------------------------------------------------- 1 | # Design for `ilab model serve` command with backend support 2 | 3 | ## Background 4 | 5 | With the [request from the community](https://github.com/instructlab/instructlab/issues/1106) for `ilab` to serve different backends such as [vllm](https://docs.vllm.ai/en/stable/) and the [cli redesign](ilab-model-backend.md), this design doc's purpose is to flesh out the behavior of the `ilab model serve` command. 6 | 7 | Specifically, this doc addresses the design of subcommands of `ilab model serve` that apply to 8 | different serving backends. 9 | 10 | ## Design 11 | 12 | ### Backend 13 | 14 | Since the subject of the `ilab model serve` command is a model, regardless of the format of the model, every command takes in the `--model` flag or uses its default value in the config. 15 | 16 | `ilab model serve` has a new flag, `--backend`, that selects which backend to serve models with. As of this design, the two backends `ilab` would serve with are `llama-cpp` and `vllm`.
17 | 18 | This would lead to the commands: 19 | 20 | - `ilab model serve --backend llama-cpp` 21 | - `ilab model serve --backend vllm` 22 | 23 | There are specific flags for `ilab model serve` that would apply to all backends. These can be viewed by running `ilab model serve --help`. 24 | 25 | The following is an overview for the flags of `ilab model serve`: 26 | 27 | ```console 28 | ilab model serve 29 | | 30 | |_______ (backend agnostic flags) 31 | | 32 | |_______ --backend ['llama-cpp', 'vllm'] 33 | |_______ --backend-args 34 | ``` 35 | 36 | The `backend` flag will also be available as an option in the config file (`config.yaml`). This will allow users to 37 | set a default backend for `ilab model serve` in the config. Also, commands like `ilab model chat` 38 | and `ilab data generate` that serve models in the background will use the default backend specified 39 | in the config. Here is an example of what the config file would look like: 40 | 41 | ```yaml 42 | serve: 43 | gpu_layers: -1 44 | host_port: 127.0.0.1:8000 45 | max_ctx_size: 4096 46 | model_path: models/merlinite-7b-lab-Q4_K_M.gguf 47 | backend: llama-cpp 48 | ``` 49 | 50 | ### Backend flags 51 | 52 | The `--backend-args` flag is a string that will be passed to the backend as arguments. This flag is used to pass 53 | backend-specific arguments to the backend. Multiple values will be supported; however, the exact formatting will be 54 | defined in the implementation proposal. The backend will be responsible for parsing individual arguments. 55 | 56 | It will also be available as an option in the config file (`config.yaml`). This will allow users to set default backend arguments for `ilab model serve` in the config. Here is an example of what the config file would look like: 57 | 58 | ```yaml 59 | serve: 60 | backend: llama-cpp 61 | backend_args: 62 | num_gpu_layers: 4 63 | max_ctx_size: 1024 64 | ``` 65 | 66 | For clarity and ease of implementation, when using the `--backend-args` flag, the user must pass the 67 | `--backend` flag as well. This is to ensure that the backend-specific arguments are passed to the 68 | correct backend. Any backend-specific arguments that are not passed to the correct backend will be 69 | reported as an error. 70 | 71 | ## Command Examples 72 | 73 | ### Bare-bones but model-specific command 74 | 75 | ```shell 76 | ilab model serve --model <model-path> 77 | ``` 78 | 79 | - Serves the model at `<model-path>`. 80 | - If the `<model-path>` is the path for a model that can be run by `llama-cpp`, then `llama-cpp` is 81 | automatically used as the model serving backend. The current auto-detection logic will rely on a 82 | valid GGUF file format. If the model is a valid GGUF file, then `llama-cpp` will be used as the model serving backend. 83 | - If the `<model-path>` is the path for a model that can be run by `vllm`, then `vllm` is automatically used as the model serving backend. 84 | - If the model at `<model-path>` can be run by either backend, then a default backend defined in the 85 | config will be used as the model serving backend. In the case where there is ambiguity and a setting is not defined, a hardcoded preference will be used (none of the currently supported backends have this issue). A future profile specification will likely replace the hardcoded fallback. 86 | 87 | ### Bare-bones command 88 | 89 | ```shell 90 | ilab model serve 91 | ``` 92 | 93 | - This command has the same behavior as the one above, but the `--model` is whatever the default model path is in the config. This is the existing behavior of `ilab serve` today.
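As a rough illustration of the auto-detection described above: a GGUF check can be as simple as reading the file's magic bytes, since valid GGUF files begin with the ASCII magic `GGUF`. The function name and the directory heuristic for safetensors models are assumptions for this sketch, not the final implementation:

```python
from pathlib import Path

GGUF_MAGIC = b"GGUF"  # the first four bytes of a valid GGUF file


def detect_backend(model_path: Path) -> str:
    """Guess a serving backend from the on-disk model format."""
    if model_path.is_file():
        with model_path.open("rb") as f:
            # A GGUF model implies the llama-cpp backend.
            if f.read(4) == GGUF_MAGIC:
                return "llama-cpp"
    # Safetensors checkpoints are directories of shards; vllm serves those.
    if model_path.is_dir() and any(model_path.glob("*.safetensors")):
        return "vllm"
    raise ValueError(f"cannot determine a serving backend for {model_path}")
```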
94 | 95 | ### Llama-cpp backend-specific commands 96 | 97 | ```shell 98 | ilab model serve --model <model-path> --backend llama-cpp --backend-args '--num-gpu-layers 4' 99 | ``` 100 | 101 | - This command serves a model with `llama-cpp`. 102 | - If the model provided is not able to be served by `llama-cpp`, this command would error out and suggest an alternate backend to use. 103 | - The existing flags to `ilab serve` (besides `--model-path` & `--log-file`) are now specific to the `llama-cpp` backend. 104 | 105 | ### vllm backend-specific commands 106 | 107 | ```shell 108 | ilab model serve --model <model-path> --backend vllm --backend-args '--chat-template <chat-template-path>' 109 | ``` 110 | 111 | - This command serves a model with `vllm`. 112 | - If the path provided is not able to be served by `vllm`, this command would error out and suggest an alternate backend to use. 113 | - There are [dozens](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#command-line-arguments-for-the-server) of flags for vllm. Whichever arguments the community deems the most important to include will be added as flags to `ilab model serve`. 114 | - Any remaining arguments can be specified in the value of the flag `--backend-args`. 115 | 116 | ## Testing 117 | 118 | An additional end-to-end test will be added for each new backend for `ilab model serve`. This new test should be triggered whenever changes to the new backend serving code are made, or before a release. 119 | 120 | This new test will do the following: 121 | 122 | 1. Initialize `ilab` in a virtual env via `ilab config init`. 123 | 2. Download a model via `ilab model download`. 124 | 3. Serve the downloaded model with the new backend via `ilab model serve`. 125 | 4. Generate synthetic data using the served model via `ilab data generate`. 126 | 5. Chat with the served model via `ilab model chat`. 127 | 6. Any future commands that interact with a served model should be added to the test. 128 | 129 | Some commands, like `ilab model chat` and `ilab data generate`, serve models in the background as part of the command. If automatic serving of a new backend is implemented for a command, testing of that command will also be included in the new end-to-end test. 130 | 131 | ## Handling existing backend-specific commands 132 | 133 | The existing `ilab model serve` command has flags that are specific to the `llama-cpp` backend. The current list of flags is: 134 | 135 | - `--num-gpu-layers` 136 | - `--max-ctx-size` 137 | - `--num-threads` 138 | 139 | These flags will be moved to `--backend-args` and will be used as the default arguments for 140 | the `llama-cpp` backend. This will allow for a more consistent experience across backends. The flags will 141 | be supported for up to two releases after the release of the new backend. After that, the flags will be 142 | removed. During those two releases, a warning will be printed to the user when the flags are used. 143 | -------------------------------------------------------------------------------- /docs/cli/ilab-processes.md: -------------------------------------------------------------------------------- 1 | # Processes in InstructLab 2 | 3 | The ability to detach from processes is crucial to the user experience of InstructLab. However, the concept of multi-processing, process management, and the monitoring of processes is very complex. 4 | 5 | It is important to try to add this concept as simply as possible, expanding on the state reporting, logging, and other features as we go along.
6 | 7 | ## Phased approach to InstructLab Processes 8 | 9 | This document describes phase 1 of implementing processes in InstructLab. Phase 1 can be described as the "ilab simple process management system". It will depend purely on Python packages, PID tracking, and log files to create the experience of detachable processes. The key here is the concept of the UUID, allowing a future REST API to keep track of InstructLab processes using these unique identifiers. 10 | 11 | We can revisit all of this in phase 2, when we discuss whether we want to utilize something like systemd or a more in-depth process-monitor repo to track processes. 12 | 13 | ### Phase 1 14 | 15 | Phase 1 would focus on adding the concept of detaching from processes, re-attaching to them, and managing the various artifacts from the processes. 16 | 17 | Process management would only apply to `ilab data generate` and `ilab model train` in a first iteration. This would be followed by commands like `ilab model evaluate`, `ilab model serve`, and `ilab model download`. All of these commands have long-running processes that would benefit from detachment. 18 | 19 | The workflow would allow for: 20 | 21 | `ilab data generate -dt` (run a detached generation process) 22 | `ilab model train -dt` (run a detached training process) 23 | 24 | `ilab process list` 25 | 26 | ```console 27 | +------------+-------+--------------------------------------+------------------------------------------------------------------------------------------------------------------+----------+ 28 | | Type | PID | UUID | Log File | Runtime | 29 | +------------+-------+--------------------------------------+------------------------------------------------------------------------------------------------------------------+----------+ 30 | | Generation | 39832 | 82d00a5b-5ed5-4cfd-9a75-a87e4f420b27 | /Users/charliedoern/.local/share/instructlab/logs/generation/generation-82d00a5b-5ed5-4cfd-9a75-a87e4f420b27.log | 69:26:28 | 31 | | Generation | 40791 | 09f9d301-4fd9-4045-bfda-8a56f1d96016 | /Users/charliedoern/.local/share/instructlab/logs/generation/generation-09f9d301-4fd9-4045-bfda-8a56f1d96016.log | 68:45:40 | 32 | | Generation | 47390 | 4ccabfa5-604f-49c6-b5c3-730ce328d62a | /Users/charliedoern/.local/share/instructlab/logs/generation/generation-4ccabfa5-604f-49c6-b5c3-730ce328d62a.log | 67:26:33 | 33 | | Generation | 50872 | 093ac2e9-080c-45fe-89c5-43d508d6369c | /Users/charliedoern/.local/share/instructlab/logs/generation/generation-093ac2e9-080c-45fe-89c5-43d508d6369c.log | 05:24:56 | 34 | +------------+-------+--------------------------------------+------------------------------------------------------------------------------------------------------------------+----------+ 35 | ``` 36 | 37 | `ilab process attach <UUID>` 38 | 39 | This command would re-attach to the given process, allowing the user to view the live logs of the process. `attach` would tail the log file and listen for user input to kill the process. 40 | 41 | These commands will be done in a very simple way at first, using the following architecture: 42 | 43 | 1. A detached process will be re-attachable by tailing the log file and then allowing the user to Ctrl+C the process as normal using `KeyboardInterrupt` 44 | 2. The process registry will be maintained for tracking UUIDs created via the `uuid` Python package, the PID of the actual process, a `log_file` where the process will output its logs so that the user can re-attach, and the start time of the process.
The log file directory will be tracked using our `DEFAULTS` package and will be standard throughout releases. 45 | 46 | The general flow would be: 47 | 48 | 1. A user runs `ilab data generate -dt`. 49 | 2. A UUID, PID, and log file are added to the process registry. 50 | 3. The process would exit and print the UUID of the SDG run. 51 | 4. A user could attach to this process using `ilab process attach <UUID>`. 52 | 5. This command would look in the process registry for the PID and/or UUID, get the log file, tail the log file, and listen for a Ctrl+C keyboard interrupt. 53 | 54 | This allows us to detach from processes while still running them in the background and maintain log files, all without the use of anything other than `uuid` and `subprocess`. 55 | 56 | #### Log file management 57 | 58 | If log files from the various libraries already exist, those will be used in this scenario. If they do not, InstructLab will manage writing process logs to disk. Regardless of whether the libraries maintain their own log files, InstructLab will need to co-locate the log files in a centralized directory. 59 | 60 | If a log file exists, it will be copied and renamed into the following directory format: 61 | 62 | `~/.local/share/instructlab/logs/<command>/<command>-<UUID>.log` 63 | 64 | If the log file does not exist, InstructLab will create one with this format. Libraries are responsible for standardizing where their logs are stored, if they already exist, so the Core package can access them in a uniform fashion and copy them to the proper directory. -------------------------------------------------------------------------------- /docs/dependency-management.md: -------------------------------------------------------------------------------- 1 | # Dependency Management Policy for InstructLab 2 | 3 | This document describes the policies for adding and updating build and runtime dependencies of all InstructLab components. 4 | 5 | ## Best practices 6 | 7 | 8 | 1. Express dependencies by setting a minimum version (using `>=`) to ensure compatibility. 9 | 2. Do not "pin" a dependency to a single version (using `==`). 10 | 3. Exclude specific versions known to not work (using `!=`). 11 | 4. Only apply "caps" to dependencies (using `<`) when that dependency has established a pattern of producing new releases with breaking changes. 12 | 5. Pin versions in CI, with frequent automated updates. 13 | 14 | Best practices for Python dependencies call for using ranges in package requirements and [pinning versions only in CI jobs](./ci/dependency-constraints.md). 15 | 16 | Using pinned versions in a list of constraints used for tests allows us to know and advertise exactly what versions have been tested in CI. 17 | That information is useful for users and re-packagers to understand which versions of dependencies are compatible with more specificity than the ranges provide. 18 | Tools like Dependabot will submit PRs to automatically update those pins to help us keep up with new releases of all of our dependencies. 19 | 20 | Pinning to specific versions in the package dependencies so that installing a built package requires an exact version of a dependency is not a good practice. 21 | It makes it very easy for sets of packages that need to be installed together to have incompatible dependencies, which in turn makes it impossible to actually install them (for example, [instructlab/training #34](https://github.com/instructlab/training/issues/34)).
22 | Pinned dependencies also make it difficult to deal with CVEs or other critical bugs in those dependencies, which makes delivering products from this project more challenging. 23 | Do not pin to specific versions of libraries. 24 | 25 | Instead of pinning, we use version ranges. 26 | This ensures that re-packagers and installers have some flexibility in case a dependency of our package has a critical CVE and needs to be updated. 27 | Those ranges should include a minimum version, and in some cases a maximum version (a "cap"). 28 | 29 | Specifying the minimum value for the range (`foo>=x.y`) allows us to declare that we need features that only show up in or after that version of the dependency, which means we won't get bugs from users trying to use instructlab with an old dependency that has an incompatible API or is completely lacking a feature we need. 30 | 31 | Specifying a maximum value for the range (`foo>=x.y,<z`) caps the dependency to versions below `z`; as the best practices above note, such caps should only be applied when a dependency has established a pattern of producing new releases with breaking changes. -------------------------------------------------------------------------------- /docs/docs-instructlab-website.md: -------------------------------------------------------------------------------- repository. 13 | and create a `CNAME` to point to . 14 | 2) We migrate the different `README.md`s to this static site (most likely [Material for MkDocs](https://squidfunk.github.io/mkdocs-material/)), 15 | and iterate on organizing the documentation via PRs and a useful flow of information. 16 | 3) We leverage our TechWriters to have a single location to update official "downstream" documentation. 17 | 4) We have the dedicated `README.md`s for each project, but stripped down to very specific niche things for the sub-project, or fast feedback development notes. 18 | -------------------------------------------------------------------------------- /docs/evaluation/eval-repo.md: -------------------------------------------------------------------------------- 1 | # New Repository Proposal: eval 2 | 3 | ## Summary 4 | 5 | This document proposes a new repository under the `instructlab` GitHub organization: 6 | 7 | - `instructlab/eval` 8 | 9 | ## Background 10 | 11 | The `instructlab/instructlab` repository currently includes no real implementation 12 | of Evaluation as described by the [LAB paper](https://arxiv.org/abs/2403.01081). The 13 | closest implementation currently in `instructlab/instructlab` is via the `ilab test` command. 14 | 15 | `ilab test` as of this writing is only implemented for macOS with M-series chips. It uses 16 | a JSON Lines file and a LoRA adapter to compare output of a given model before and after 17 | LoRA training with MLX, thus the macOS M-series dependency. 18 | 19 | We desire to build out a library for methods that satisfy the evaluation described in the 20 | paper, using higher-level evaluation schemes such as 21 | [Multi-turn Benchmark](https://arxiv.org/abs/2306.05685) for skills and 22 | [Massive Multitask Language Understanding](https://arxiv.org/abs/2009.03300) (MMLU) for 23 | knowledge. We propose a new repository to house this code that publishes a new Python 24 | library called `instructlab-eval`. The reasoning for a new repository and library includes: 25 | 26 | - We expect multiple consumers of this code. The `ilab` CLI is one, but we also envision 27 | building a REST API around it to help support scaling out this functionality on a cluster. 28 | - We expect there is broader community interest in an open-source library and service for 29 | evaluation. We envision this library could support other evaluation techniques over time. 30 | - We also realize that much of model evaluation is generally useful outside the context of 31 | InstructLab.
Other libraries may emerge in the broader ecosystem that handle parts of what 32 | we need, while this library will always remain to handle the InstructLab-specific details 33 | of how evaluation works in our workflow. 34 | 35 | ## Maintainers 36 | 37 | The initial team of maintainers for this repository will be a copy of the 38 | `Backend Maintainers` GitHub team. 39 | 40 | ## Alternatives Considered 41 | 42 | ### Add to `instructlab/instructlab` 43 | 44 | We could add this code to the existing `instructlab/instructlab` repository. 45 | 46 | The primary argument against this approach is that we expect the scope of an 47 | `instructlab-eval` library to expand beyond the scope of what would be run by the 48 | `ilab` CLI. We instead envision a different community of contributors organizing 49 | around Evaluation specifically. 50 | -------------------------------------------------------------------------------- /docs/examples/examples-repo.md: -------------------------------------------------------------------------------- 1 | # Create A Repo for InstructLab Examples 2 | 3 | ## Idea Overview 4 | 5 | Create a separate repository within the `instructlab` GitHub org called `examples`. 6 | This repository would house [Jupyter notebooks](https://jupyter.org/) and other examples that illustrate parts of or the entire LAB methodology. 7 | The `examples` repo would be a place where the maintainers of the InstructLab project can collect notebooks for the benefit of the community. 8 | All notebooks submitted to this repository would be carefully documented and tested before being merged. 9 | 10 | ## Repository Structure 11 | 12 | The repository will start by housing notebooks in two categories. Notebooks live in either the `combined-stages` or the `use-cases` directory. 13 | 14 | ```bash 15 | examples 16 | | 17 | |- notebooks 18 | | 19 | |- combined-stages 20 | | |- training-with-eval 21 | | |- requirements.txt 22 | | |- training-with-eval.ipynb 23 | |- use-cases 24 | | |- policy-documents 25 | | | |- requirements.txt 26 | | | |- legislative-act.ipynb 27 | | |- instruction-manuals 28 | | | |- requirements.txt 29 | | | |- how-to-build-a-house.ipynb 30 | ``` 31 | 32 | ### Notebooks for Combined InstructLab stages 33 | 34 | Notebooks in the `combined-stages` directory go through parts of or the entire InstructLab workflow that users might want to reference or use. 35 | Some examples of combined stages are a notebook that runs through training and then evaluation, or a notebook that goes from document pre-processing to SDG. 36 | 37 | ### Notebooks for End-to-End (e2e) use cases 38 | 39 | Notebooks in the `use-cases` directory reflect real-world use cases from start to finish. 40 | 41 | ## Additional Info 42 | 43 | This repo would not be released as a package on PyPI but initially as just `.zip` and `.tar.gz` files on GitHub. 44 | Releases would serve the purpose of giving users specific versions of notebooks they could run reliably.
45 | -------------------------------------------------------------------------------- /docs/github-actions-providers.md: -------------------------------------------------------------------------------- 1 | # GitHub Actions Providers 2 | 3 | ## Approved Providers 4 | 5 | * `actions/*@*` - GitHub created actions 6 | * `aws-actions/*@*` - AWS actions 7 | * `DavidAnson/markdownlint-cli2-action@*` - Markdownlint-cli2 action 8 | * `docker/*@*` - Docker actions 9 | * `machulav/ec2-github-runner@*` - EC2 GitHub Runner action 10 | * `pypa/*@*` - Python Packaging actions 11 | * `rojopolis/spellcheck-github-actions@*` - Spellcheck action 12 | * `sigstore/gh-action-sigstore-python@*` - Sigstore Python action 13 | * `step-security/harden-runner@*` - Harden Runner action 14 | * `ludeeus/action-shellcheck@*` - Shellcheck action 15 | * `hynek/build-and-inspect-python-package@*` - Build and inspect Python package action 16 | * `andstor/file-existence-action@*` - File existence action 17 | 18 | ## Denied Providers 19 | 20 | There are currently no denied providers. 21 | -------------------------------------------------------------------------------- /docs/github-actions-use-policy.md: -------------------------------------------------------------------------------- 1 | # GitHub Actions Use Policy for InstructLab 2 | 3 | This document describes the use policy for GitHub Actions (actions) in workflows for all repositories in the [InstructLab](https://github.com/instructlab) organization. 4 | 5 | ## Background 6 | 7 | GitHub Actions are an important tool for CI/CD use within the repositories of the InstructLab project. 8 | One of the main values is to verify the quality of pull requests for things like tests passing, spelling checks, well-formedness of files, etc. 9 | Repositories may also use actions to build and publish releases for the project. 10 | 11 | Since actions play a critical role in the project, care must be taken in how they are used due to their place in the security of the software supply chain of the project. 12 | 13 | ## Dependabot 14 | 15 | Each repository using GitHub Actions must configure Dependabot to manage the action dependencies. 16 | The repository must contain a `.github/dependabot.yml` file with the following minimum configuration: 17 | 18 | ```yaml 19 | version: 2 20 | updates: 21 | # Maintain dependencies for GitHub Actions 22 | - package-ecosystem: "github-actions" 23 | directory: "/" 24 | schedule: 25 | interval: "daily" 26 | ``` 27 | 28 | Repository maintainers must review and respond to all pull requests submitted by Dependabot in a timely manner. 29 | 30 | ## SHAs instead of tags 31 | 32 | GitHub Actions must be referenced using SHA values for a specific commit. 33 | Use of other reference types such as tag or branch names is not permitted. 34 | 35 | ```yaml 36 | - name: Git Checkout 37 | uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4.1.4 38 | ``` 39 | 40 | instead of 41 | 42 | ```yaml 43 | - name: Git Checkout 44 | uses: actions/checkout@v4 45 | ``` 46 | 47 | The use of SHAs can avoid security issues if the tag or branch of the action is moved to a malicious commit. 48 | We also gain build repeatability for future builds by referring to a precise revision of the action. 49 | 50 | Since we use Dependabot to manage our action dependencies, Dependabot will handle the chore of using the proper SHA values in the submitted pull requests when action dependencies are updated. 
51 | 52 | ## Trusted Providers of GitHub Actions 53 | 54 | There are many GitHub Actions available in GitHub. 55 | Not all of them can necessarily be trusted. 56 | The InstructLab project must maintain [a list of allowed providers and a list of denied providers](github-actions-providers.md). 57 | 58 | Allowed providers will include all GitHub-created actions (`actions/*`) as well as other official actions such as Python Packaging actions (`pypa/*`). 59 | 60 | The InstructLab organization's Settings->Actions->General must be configured to allow select actions, including actions created by GitHub along with the allowed providers. 61 | 62 | Adding actions to the allowed providers or denied providers lists will require approval by the organization maintainers along with updating the organization's settings. This can be done by submitting a Pull Request to modify [`github-actions-providers.md`](github-actions-providers.md). 63 | 64 | ## Harden the GitHub Action runner 65 | 66 | To further increase security, the first step of each job in a workflow must be the [`step-security/harden-runner`](https://github.com/step-security/harden-runner) action. 67 | This action hardens the action runner by implementing network egress control and some infrastructure security. 68 | 69 | ```yaml 70 | - name: "Harden Runner" 71 | uses: step-security/harden-runner@a4aa98b93cab29d9b1101a6143fb8bce00e2eac4 # v2.7.1 72 | with: 73 | egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs 74 | ``` 75 | 76 | Each execution of the workflow job will produce a report showing endpoints accessed by the job and possible source file overwrites. 77 | The report will also suggest modifications to the harden-runner action's configuration to further increase security. 78 | 79 | ```yaml 80 | - name: "Harden Runner" 81 | uses: step-security/harden-runner@a4aa98b93cab29d9b1101a6143fb8bce00e2eac4 # v2.7.1 82 | with: 83 | disable-sudo: true 84 | egress-policy: block 85 | allowed-endpoints: > 86 | github.com:443 87 | ``` 88 | 89 | Hardening the action runner like this can prevent malicious or misbehaving actions, or the misuse of actions, from exfiltrating secrets. 90 | The article [Harden-Runner Defends Against Arbitrary Command Execution in `tj-actions/changed-files` GitHub Action](https://www.stepsecurity.io/blog/defend-against-arbitrary-command-execution-in-tj-actions-changed-files) shows how the misuse of an action could allow an attacker with a well-crafted pull request to exfiltrate secrets. 91 | -------------------------------------------------------------------------------- /docs/github-merge-strategy.md: -------------------------------------------------------------------------------- 1 | # GitHub Merge Strategy for InstructLab 2 | 3 | This document describes the merge strategy used for Pull Requests within all repositories in the [InstructLab](https://github.com/instruct-lab) organization. 4 | 5 | ## Requirements for Merging a Pull Request 6 | 7 | Every Pull Request that is made to an InstructLab repository should meet the below requirements - certain repositories such as [taxonomy](https://github.com/instruct-lab/taxonomy) may have additional requirements. 8 | 9 | ### CI checks 10 | 11 | We should require that all CI checks pass on a Pull Request before it can be considered for merge. Every repository should have at minimum the following checks: 12 | 13 | - Linting 14 | - Testing (Unit, Functional, etc.) 15 | - DCO Commit Signoff via a `Signed-off-by` header.
There is a DCO check enabled for all repositories in this GitHub organization. 16 | 17 | Additional checks might be required for repositories on a case-by-case basis. 18 | 19 | ### Approvals from Project Maintainers 20 | 21 | At least one Project Maintainer must have an approving review on a Pull Request for it to be considered for merge. Requiring more reviews is left up to the discretion and consensus of the application maintainers team for a repository. The definition of a Project Maintainer can be found [here](https://github.com/instruct-lab/community/blob/main/governance.md#project-maintainers-overview). 22 | 23 | Project Maintainers are given access permissions via [GitHub Teams](https://github.com/orgs/instruct-lab/teams) - you can see more details on the specific responsibilities of these teams [here](https://github.com/instruct-lab/community/blob/main/MAINTAINERS.md). 24 | 25 | ## Method for Merging a Pull Request 26 | 27 | There are [three different merge methods offered by GitHub](https://docs.github.com/en/repositories/configuring-branches-and-merges-in-your-repository/configuring-pull-request-merges/about-merge-methods-on-github) for merging Pull Requests. 28 | 29 | We use the default merge method of creating merge commits for PRs. This is to ensure we retain the full commit history as intentionally structured by the PR author while also retaining metadata about the PR itself in the merge commit. 30 | 31 | This requires project maintainers to include commit messages and the overall structure of the commit series as part of their review. When multiple commits are present, they should represent a logical series of changes that implement the overall change proposed in the PR. The commit message for each should clearly explain that step of the progression. 32 | 33 | It is common that a PR author may need to do a final rebase to clean up their proposed commit series before a PR can be merged. It is also fine for a project maintainer to perform this step when the changes necessary are straightforward enough to do so. This includes doing a final rebase on `main` if necessary. The PR itself should NOT include any merge commits of `main` back into the developer's branch. We expect the proposed commit series to be a clean set of commits against `main` without conflicts or merge commit history. We only use a merge commit to record the PR's inclusion into `main`. 34 | 35 | ## Merge Automation 36 | 37 | Repositories may use [Mergify](https://mergify.io/) to automate the merge 38 | process and enforcement of merge policies. Using this tool allows us to encode 39 | the merge requirements in a file stored in the git repository itself. Once all 40 | requirements are met, Mergify will automatically merge the PR. 41 | 42 | An example configuration can be found in the [instructlab-bot 43 | repo](https://github.com/instructlab/instructlab-bot/blob/main/.github/.mergify.yml). 44 | -------------------------------------------------------------------------------- /docs/github-taxonomy-automation.md: -------------------------------------------------------------------------------- 1 | # GitHub Automation for Taxonomy 2 | 3 | This document describes the GitHub automation used with the 4 | [Taxonomy](https://github.com/instruct-lab/taxonomy) repository.
5 | 6 | ![GitHub Automation for Taxonomy](images/github-taxonomy-automation.excalidraw.png) 7 | 8 | ## Key Components 9 | 10 | ### InstructLab Bot 11 | 12 | There is a [GitHub bot](https://github.com/instruct-lab/instruct-lab-bot) used 13 | to facilitate this automation. This bot receives GitHub events and performs 14 | actions on behalf of different components within Instruct Lab. More details on 15 | the architecture of the bot itself can be found in its repository. 16 | 17 | ### GitHub Actions 18 | 19 | Not everything needs to be orchestrated by an external bot. The bot is good for 20 | facilitating user interactions and managing long-running tasks on custom 21 | infrastructure. However, many simpler tasks should just be GitHub actions. These 22 | are easier to write and maintain, and they can be triggered by a variety of 23 | events. 24 | 25 | ### Mergify (future) 26 | 27 | [Mergify](https://docs.mergify.com) is a tool that can perform workflow 28 | automation for a GitHub repository. It can be used to implement a variety of 29 | automation tasks, such as automatically merging PRs when certain conditions are 30 | met. While not yet in use, it should be kept in mind for future needs. 31 | 32 | ## Points of Interaction 33 | 34 | ### Access Controls 35 | 36 | Team membership for triagers of the Taxonomy repository is managed using the 37 | [taxonomy-triagers GitHub 38 | team](https://github.com/orgs/instruct-lab/teams/taxonomy-triagers). 39 | 40 | The bot should allow configuring a list of GitHub teams that are allowed to 41 | enable its functionality on a PR. The first command will only be accessible to 42 | the following teams: 43 | 44 | * Taxonomy Triagers 45 | * Taxonomy Maintainers 46 | * Backend Maintainers 47 | * Instruct Lab Org Admins 48 | 49 | This first command is used to indicate that a PR has been reviewed enough to determine 50 | that it is safe to run automated tasks against it. This is to help avoid abuse 51 | of our backend resources with malicious PRs. 52 | 53 | ```text 54 | @instruct-lab-bot enable 55 | ``` 56 | 57 | If the bot determines the author of this command is in one of the required 58 | teams, it will set a [commit 59 | status](https://docs.github.com/en/rest/commits/statuses) on the PR indicating 60 | that the bot functionality is turned on. The use of commit status here is 61 | important because commit status will automatically be reset if the author of 62 | the PR makes further changes. Any changes should require another check by a 63 | triager or maintainer for the bot to be re-enabled. 64 | 65 | At this point, the bot should respond with a welcome message including a list of 66 | commands that have been enabled for this PR (`precheck`, `generate`, for example). 67 | 68 | * [Bot PR adding gating label support](https://github.com/instruct-lab/instruct-lab-bot/pull/120) 69 | 70 | Two labels will be used: 71 | 72 | * `skill` - This label indicates that the bot can run automation intended for skills PRs. 73 | * `knowledge` - This label indicates that the bot can run automation intended for knowledge PRs. 74 | 75 | ### Pre-Check Using the Existing Model 76 | 77 | The trigger for this step should be a PR comment with the following format: 78 | 79 | ```text 80 | @instruct-lab-bot precheck 81 | ``` 82 | 83 | At this point, the bot will kick off a process that will run the questions from 84 | the `qna.yaml` against the existing Merlinite model. 
The intent is to allow 85 | contributors and reviewers to see if the proposed addition is already a 86 | capability of the model by comparing the model's answers to the provided sample 87 | answers. 88 | 89 | When the process is complete, the bot will post a comment with instructions on 90 | how to access the results. 91 | 92 | * 93 | 94 | ### Post-Check Using the Trained Model 95 | 96 | The trigger for this step should be a PR comment with the following format: 97 | 98 | ```text 99 | @instruct-lab-bot postcheck 100 | ``` 101 | 102 | At this point, the bot will kick off a process that will run the questions from 103 | the `qna.yaml` against the new Merlinite model. This will allow contributors and 104 | triagers to see how the new model behaves for the sample questions. 105 | 106 | The GitHub bot will manage this process, but the actual model training will be 107 | done by the backend pipeline. The bot will hit an API endpoint that is serving 108 | the new candidate model. The exact API endpoint is a private endpoint hosted by 109 | backend infrastructure. 110 | 111 | When the process is complete, the bot will post a comment with instructions on 112 | how to access the results. 113 | 114 | * 115 | 116 | ### Synthetic Data Generation Check 117 | 118 | The trigger for this step should be a PR comment with the following format: 119 | 120 | ```text 121 | @instruct-lab-bot generate 122 | ``` 123 | 124 | At this point, the bot will kick off a process that will generate a sample of 125 | synthetic data using the Mixtral model. This is to allow contributors and 126 | reviewers to determine if the synthetic data generated as a result of the 127 | proposed addition is reasonable. 128 | 129 | When the process is complete, the bot will post a comment with instructions on 130 | how to access the results. 131 | 132 | * Working, but still needs remote endpoint support added: 133 | 134 | -------------------------------------------------------------------------------- /docs/images/github-taxonomy-automation.excalidraw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/instructlab/dev-docs/30cb9d62d9a13dc2eeeefe92db67feefce4582da/docs/images/github-taxonomy-automation.excalidraw.png -------------------------------------------------------------------------------- /docs/images/sdg-api-interface.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/instructlab/dev-docs/30cb9d62d9a13dc2eeeefe92db67feefce4582da/docs/images/sdg-api-interface.png -------------------------------------------------------------------------------- /docs/instructlab-app.md: -------------------------------------------------------------------------------- 1 | # InstructLab macOS App 2 | 3 | ## Scope 4 | 5 | This document is targeted at macOS applications, but the idea could easily be transferred to other operating systems. 6 | 7 | ## Problem statement 8 | 9 | Starting InstructLab on your local laptop is hard. It requires a significant amount of `python` knowledge and terminal 10 | work that is unrealistic for a non-technologist. Having to install `git`, specific versions of `python`, and 11 | `xcode` requires a level of expertise that will create barriers to adoption of the InstructLab project. 12 | 13 | ## Proposed solution 14 | 15 | [ollama][ollama] has a macOS application that is a double-click installation for their server to run the commands 16 | locally.
We propose creating the same "system bar" application, with the ability to run `ilab model serve` in the background 17 | and a possible way to do `ilab model chat` from said application. 18 | 19 | Having the `ilab` dog up in the system bar telling you that `ilab model serve` is running could open up the opportunity 20 | to ask a quick question of the local model, and even an ability to open up a "long-running" 21 | conversation via a web browser or the like. 22 | 23 | ## Next steps 24 | 25 | 1. Create a simple MVP of starting the `ilab model serve` application, with controls for the `serve` options, including 26 | what model you'd like to run, e.g., Granite or Merlinite. 27 | 2. Create an option to ask a quick question (`-qq` option) to the model via the drop-down 28 | 3. Create an `ilab model chat`-type interface via a window or web browser. 29 | 30 | [ollama]: https://ollama.com/download/mac 31 | -------------------------------------------------------------------------------- /docs/instructlab-cli-1.0.0.md: -------------------------------------------------------------------------------- 1 | # The Road to 1.0.0 2 | 3 | _Or: How I Learned to Stop Worrying and Love to GA_ 4 | 5 | ## Context and Goals 6 | 7 | The `instructlab/instructlab` repo started off as `instructlab/cli` - a basic Python Click-based command-line interface designed to prototype an application capable of 8 | running the LAB methodology created by IBM Research. As the project evolved and the organization looked into creating a proper PyPI package for it, the decision was made 9 | to rename the repo to `instructlab/instructlab` to keep the repo name consistent with the PyPI package name. The rest of this document will be using "InstructLab" to 10 | refer to this repo and Python package. 11 | 12 | Today, InstructLab has gone from a scrappy research project to an upstream community serving as the basis for multiple downstreams, with the goal 13 | of continuing to evolve the community to encourage more participation from additional stakeholders. To wit, it would behoove us to determine what exactly we should be 14 | roadmapping between now and a proper 1.0.0 release, which demonstrates the following to existing and potential community members: 15 | 16 | 1. An official goalpost for the community denoting the evolution of InstructLab from a pre-1.0 project to one with the stability and supportability typically seen from 1.0-and-beyond projects. 17 | 1. A dedicated set of V1 interfaces, both for internal configs and an API, that can be counted on for continuous usage of InstructLab 1.0, with future provisions made for backwards compatibility for subsequent Y-Streams and Z-Streams. 18 | 1. A commitment from the Oversight Committee and Maintainer teams to continue to maintain InstructLab throughout a 1.y cycle and work towards an eventual 2.0. 19 | 20 | ## MVP for an InstructLab 1.0.0 21 | 22 | At a high level, these are the items the Maintainer teams believe should serve as prereqs for releasing an InstructLab 1.0.0: 23 | 24 | ### Updating relevant references of "CLI" to "Core" 25 | 26 | As noted in the `Context and Goals` section, InstructLab started off as just a CLI - however, we are planning for this package to serve as a more general "Engine" - 27 | a place where a future REST API can be defined that is used by both the CLI aspect as well as an official GUI for orchestrating the entire LAB workflow. Despite 28 | this, the repo is often still referred to as "the CLI".
We as an organization need a better term to refer to this repo by, and should update the relevant documentation 29 | and meetings accordingly. 30 | 31 | An open community vote made as part of the drafting of this document decided that "Core" would be the new term used. You can see a record of the vote 32 | [here](https://github.com/instructlab/dev-docs/pull/159#issuecomment-2514885516). This name change will begin to go into effect after the merging of this document 33 | and should be completed by the time of a 1.0.0. 34 | 35 | ### A fully-realized configuration scheme, centered around the usage of system profiles 36 | 37 | The InstructLab configuration scheme has transformed in many ways since the project's inception, from the `config.yaml` file that initially served as the user's config, 38 | to the addition of code-based Pydantic defaults, to train profiles, to system profiles. We need to fully decouple this config from the Click library, remove the need for 39 | a `config.yaml` file, and have a consistent config scheme that can be easily extended. 40 | 41 | ### An official v1 REST API schema 42 | 43 | We need to have a defined v1 REST API schema - while this does not preclude future updates, something mature enough to serve as a v1 API throughout subsequent Y-Streams 44 | for an InstructLab 1.0 is a must for such a milestone. 45 | 46 | ### Integration of InstructLab with RAG 47 | 48 | RAG is currently planned to be integrated into InstructLab - that work should be in a stable state adhering to our v1 API standard. 49 | 50 | ### An upgrade path to subsequent Y-Streams and an eventual 2.0 51 | 52 | Any user wishing to install an InstructLab 1.0 must have an upgrade path to 1.1, 1.2, ..., 1.n. Upon being ready for an InstructLab 2.0, we should also expect to 53 | provide a path for users wishing to upgrade from our final 1.y stream to 2.0. 54 | 55 | ### Backwards compatibility across the 1.y stream 56 | 57 | Any user going down our upgrade path described above should expect that the release they upgrade to is backwards-compatible with the release they upgrade from. 58 | 59 | ### An official hardware support matrix 60 | 61 | We need to have a documented matrix of what hardware footprints we support and to what extent - this includes hardware we know will not work, hardware that we know might 62 | work, and hardware we have confirmed will work with regular CI testing. 63 | 64 | ### A robust CI ecosystem 65 | 66 | We should have a CI ecosystem that includes linting as well as unit, functional, and integration/end-to-end (E2E) tests in the InstructLab repo, along with proper documentation and Makefiles that allow developers to easily run subsets of them locally on their machines. 67 | 68 | ## Q&A 69 | 70 | **Q. What about the libraries? Will they go 1.0.0 as well?** 71 | 72 | A. It depends - we historically have not aligned the InstructLab and Library releases on a particular version numbering scheme, apart from matching Y-Streams to Y-Streams (e.g., InstructLab 0.20 used SDG 0.4, Training 0.5, and Eval 0.3). At this stage, this document scopes only the prereqs we want for the InstructLab package. 73 | 74 | ## Conclusions and Decision Outcome 75 | 76 | This document will be debated and updated as part of the Pull Request review process. Upon reaching a lazy consensus by the Oversight Committee and Maintainer teams, the author of this document (Nathan Weinberg) will merge the document, denoting the following: 77 | 78 | 1.
The items in the above section `MVP for an InstructLab 1.0.0` will become official prerequisites for the InstructLab CLI Maintainer team to release a `1.0.0` of InstructLab. 79 | 2. Any amendments to this list can only be made with a subsequent PR editing this document, subject to the same review process. 80 | -------------------------------------------------------------------------------- /docs/knowledge-submissions-past-wikipedia.md: -------------------------------------------------------------------------------- 1 | # Knowledge Submissions Past Wikipedia 2 | 3 | ## Problem Statement 4 | 5 | For us to continue building our knowledge taxonomy tree, we need more trusted places to 6 | teach our models about the world. Wikipedia can get us only so far, so we need safe 7 | places we can take the data from. 8 | 9 | ## Proposed Solution 10 | 11 | Due to the restrictions of copyright law and how the internet is put together, 12 | we need a list of possible places we can eventually approve as content providers. 13 | This document will propose places other than 14 | Wikipedia for our legal experts to start investigating whether they can be 15 | on the [approved list][approved]. We have questions and thoughts from 16 | community members on places that should be approved, so here is a place we 17 | can continue the discussion and track ideas from our community. 18 | 19 | ### Proposed Domains 20 | 21 | Status: 22 | 23 | - `approved`: Approved by the legal team, and posted on the [approved list][approved]. 24 | - `denied`: Denied by the legal team, and posted on the [avoided list][avoided]. 25 | - `submitted`: Sent to the legal team for review. 26 | - `proposed`: The community would like to propose this as a possible place to take knowledge submissions from. 27 | - `reviewed - manually verify`: Legal team has reviewed this domain, and while much of its source material meets our open licensing criteria, not all of it does. Each submission from this source must be manually verified to actually be under an appropriate content license or definitively in the public domain. 28 | 29 | For the purposes of Knowledge submissions to the InstructLab project, data sourced from items in the `approved` category requires no further vetting from the Triage and/or other Maintainer teams. Items in the `reviewed - manually verify` category will require vetting before the submission can be accepted. 30 | 31 | To ensure that the data you would like to include in your knowledge submission meets the project licensing criteria, please make sure to talk to the Taxonomy maintainer team *before* you begin work on your submission. We would hate for you to do a great deal of work only to be told that the data source you selected would not work for the project. Please make sure you review the [Getting Started with Knowledge Submissions](https://github.com/instructlab/taxonomy?tab=readme-ov-file#getting-started-with-knowledge-contributions) documentation prior to submitting your request.
32 | 33 | | Domain name | Status | Notes | 34 | | :-- | :-- | :-- | 35 | | Wikipedia: | approved | | 36 | | Project Gutenberg: | approved | Pre-1927 works; public domain under US copyright law | 37 | | Wikisource (library): | approved | "free library that anyone can improve" | 38 | | OpenStax textbooks family of publications | approved | | 39 | | The Open Organization publications | approved | | 40 | | The Scrum Guide | approved | | 41 | | | reviewed - manually verify | | 42 | | | reviewed - manually verify | | 43 | | | reviewed - manually verify | | 44 | | | reviewed - manually verify | | 45 | | NASA: | reviewed - manually verify | See guidelines: | 46 | | Smithsonian Libraries: | reviewed - manually verify | For any material marked "No Copyright - United States" or "CC0" as described here: | 47 | | European Union (EU): | reviewed - manually verify | Specifically documents submitted under "public registrars": | 48 | | Internet Archive: | reviewed - manually verify | Pre-1927 works; public domain under US copyright law | 49 | | PLOS family of open access journals: | reviewed - manually verify | | 50 | | Open Practice Library: | reviewed - manually verify | | 51 | | Cynefin.io wiki: | reviewed - manually verify | | 52 | | The Open Education Project: | reviewed - manually verify | | 53 | 54 | ### Process steps 55 | 56 | 1. Collect suggested places from the community by requesting they submit a pull request against this dev doc. 57 | 1. Work with our legal team to adjudicate. [@lhawthorn](https://github.com/lhawthorn) is currently the owner of this step, but is happy to educate & empower other folks to do this work. 58 | 1. Inform the triage team and triagers of the new locations we can or can not accept. This is currently done via an announcement in the [daily Triager Standup](https://github.com/instructlab/community/blob/main/Collaboration.md#triager-standup) and via a pull request to update the Knowledge Guide in one of the two locations listed below. 59 | 60 | - [Approved sources][approved] 61 | - [Rejected sources][avoided] 62 | 63 | [approved]: https://github.com/instructlab/taxonomy/blob/main/docs/KNOWLEDGE_GUIDE.md#accepted-knowledge 64 | [avoided]: https://github.com/instructlab/taxonomy/blob/main/docs/KNOWLEDGE_GUIDE.md#avoid-these-topics 65 | -------------------------------------------------------------------------------- /docs/library-release-strategy.md: -------------------------------------------------------------------------------- 1 | # Library Release Strategy for InstructLab 2 | 3 | This document describes the overarching release strategy and policies for Python libraries in the InstructLab organization. 4 | 5 | ## Background and Problem Statement 6 | 7 | The InstructLab organization features multiple code repositories that are tagged and released as Python libraries. 8 | Primarily this has been the [CLI repository](https://github.com/instructlab/instructlab) but has grown over time, 9 | including but not limited to: 10 | 11 | - [sdg](https://github.com/instructlab/sdg) 12 | - [training](https://github.com/instructlab/training) 13 | - [eval](https://github.com/instructlab/eval) 14 | - [schema](https://github.com/instructlab/schema) 15 | - [GPTDolomite](https://github.com/instructlab/GPTDolomite) 16 | - [instructlab-quantize](https://github.com/instructlab/instructlab-quantize) 17 | 18 | You can see the published versions of these libraries on PyPI [here](https://pypi.org/search/?q=instructlab).
19 | 20 | While these libraries are all owned and maintained by the InstructLab organization, only the CLI has an official 21 | [release strategy](https://github.com/instructlab/instructlab/blob/main/docs/release-strategy.md) documented. Other library 22 | releases have been handled directly by the Maintainer teams at their own discretion. The organization needs to have certain 23 | overarching principles around this topic, while still allowing for flexibility for each library on a case-by-case basis. 24 | 25 | ## Proposal 26 | 27 | By default, each existing and new library should have a `release-strategy.md` aligned with the CLI doc. This proposal recognizes that certain libraries may need flexibility 28 | on a case-by-case basis - therefore, Maintainer teams are empowered to modify these documents as they see fit, so long as the 29 | following tenets remain consistent: 30 | 31 | 1. Packages **must** be named `instructlab-` 32 | 1. Packages **must** follow the `X.Y.Z` numbering scheme (i.e. [semver](https://semver.org/)) 33 | 1. Packages **must** have GitHub tagged releases named `vX.Y.Z` 34 | 1. Packages **must** use release branches for Y-Streams of the form `release-vX.Y` 35 | 1. Packages **must** maintain a `CHANGELOG.md` 36 | 1. Maintainer teams **must** publicly communicate Y-Stream releases through official InstructLab channels such as Discord, Slack or Mailing Lists. Z-Stream release communication is up to Maintainer discretion. 37 | -------------------------------------------------------------------------------- /docs/license-headers.md: -------------------------------------------------------------------------------- 1 | # Guidance for License Headers in Source Files 2 | 3 | ## Code Originating from InstructLab 4 | 5 | * We prefer not to have a traditional full license header at the top of every 6 | file. We feel it is unnecessary. 7 | 8 | * We prefer to have a single-line license identifier at the top of our source 9 | files, though it is not strictly required. 10 | 11 | For example: 12 | 13 | ```text 14 | // SPDX-License-Identifier: Apache-2.0 15 | ``` 16 | 17 | * We do not include Copyright statements at the top of our source files. These 18 | statements are notoriously difficult to keep up-to-date and fully accurate and 19 | are deemed unnecessary. 20 | -------------------------------------------------------------------------------- /docs/naming-conventions.md: -------------------------------------------------------------------------------- 1 | # Naming Conventions 2 | 3 | ## InstructLab - The Overall Project Name 4 | 5 | * `instructlab` - in URLs, no hyphen 6 | * `instruct-lab` - in URLs if `instructlab` is not available for some reason 7 | (like Slack) 8 | * `InstructLab` - CamelCase when referring to the name of the overall project. 9 | * Never `Instruct Lab` with a space. 10 | 11 | ## Models 12 | 13 | * `Merlinite` and `Granite` are capitalized unless referring to a filename 14 | like `merlinite-7b-lab-Q4_K_M.gguf`. 15 | -------------------------------------------------------------------------------- /docs/publish-strategy.md: -------------------------------------------------------------------------------- 1 | # Publishing Strategy for InstructLab 2 | 3 | This document describes the publishing strategy used for all models in the InstructLab HuggingFace [organization](https://huggingface.co/instructlab) and the InstructLab Ollama [user account](https://ollama.com/instructlab). 4 | 5 | ## What are we publishing and why?
6 | 7 | The InstructLab team will be periodically training the full unquantized model with new Pull Requests to the [taxonomy](https://github.com/instructlab/taxonomy) repository. When the evaluation shows that the model has improved, the team will be publishing a 4-bit quantized GGUF model to a platform called Ollama, and both an unquantized model and a 4-bit quantized GGUF model to a platform called HuggingFace. 8 | 9 | ## What is Ollama? 10 | 11 | [Ollama](https://ollama.com/) is a platform that allows users to run open-source large language models locally on their machines. Ollama covers a broad spectrum of users, from seasoned AI professionals to people looking to explore the potential of AI, and makes it easier for users to leverage the power of LLMs without having to rely on a cloud infrastructure. Since Ollama only supports publishing GGUF models, we will only be publishing the 4-bit quantized versions of our Merlinite and Granite models to the InstructLab Ollama user account. 12 | 13 | ## What is HuggingFace? 14 | 15 | [HuggingFace](https://huggingface.co/) is a centralized web service platform, similar to GitHub, for hosting Git-based repositories related to data science and machine learning. In the context of InstructLab, HuggingFace is the platform where we will be publishing releases of our model for consumption by the community. 16 | 17 | We will be publishing two different kinds of model families - Merlinite and Granite. 18 | 19 | ## Merlinite 20 | 21 | The Merlinite model family is based on the [Mistral](https://mistral.ai/) model family and uses the [Large-scale Alignment for chatBots (LAB)](https://arxiv.org/abs/2403.01081) alignment. You can read more about it [here](https://huggingface.co/instructlab/merlinite-7b-lab). 22 | 23 | The InstructLab organization will be publishing a community version of the Merlinite 7B size model, in both unquantized and 4-bit quantized GGUF form to HuggingFace and just the 4-bit quantized GGUF form to Ollama. 24 | 25 | ## Granite 26 | 27 | The Granite model family is the [foundational model family](https://www.ibm.com/downloads/cas/X9W4O6BM) for the IBM watsonx AI platform, designed for usage in a business environment. You can read more about it [here](https://huggingface.co/instructlab/granite-7b-lab). 28 | 29 | The InstructLab organization will be publishing a community version of the Granite 7B size model using the [Large-scale Alignment for chatBots (LAB)](https://arxiv.org/abs/2403.01081) alignment, in both unquantized and 4-bit quantized GGUF form to HuggingFace and just the 4-bit quantized GGUF form to Ollama. 30 | 31 | ## HuggingFace Naming Scheme 32 | 33 | The naming scheme for both Merlinite and Granite will follow this generic scheme: 34 | 35 | ` - - - - ` 36 | 37 | The specific schemes that will be published are detailed below: 38 | 39 | | Model Family, Size, Alignment, etc.
| Release Branch Format | Purpose | 40 | | --- | --- | --- | 41 | | [`merlinite-7b-lab`](https://huggingface.co/instructlab/merlinite-7b-lab) | `release-yyyymmdd` | Where the full precision Merlinite safetensors live | 42 | | [`merlinite-7b-lab-GGUF`](https://huggingface.co/instructlab/merlinite-7b-lab-GGUF) | `release-yyyymmdd` | Where the full precision and quantized Merlinite GGUFs live | 43 | | [`granite-7b-lab`](https://huggingface.co/instructlab/granite-7b-lab) | `release-yyyymmdd` | Where the full precision Granite safetensors live | 44 | | [`granite-7b-lab-GGUF`](https://huggingface.co/instructlab/granite-7b-lab-GGUF) | `release-yyyymmdd` | Where the full precision and quantized Granite GGUFs live | 45 | 46 | ## Ollama Naming Scheme 47 | 48 | The naming scheme for both Merlinite and Granite will follow this generic scheme: 49 | 50 | ` - - - ` 51 | 52 | The specific schemes that will be published are detailed below: 53 | 54 | | Model Family, Size, Alignment, etc. | Release Tag Format | Purpose | 55 | | --- | --- | --- | 56 | | [`merlinite-7b-lab`](https://ollama.com/instructlab/merlinite-7b-lab) | `release-yyyymmdd` | Where the full precision and quantized Merlinite GGUFs live | 57 | | [`granite-7b-lab`](https://ollama.com/instructlab/granite-7b-lab) | `release-yyyymmdd` | Where the full precision and quantized Granite GGUFs live | 58 | 59 | ## Retention Policy 60 | 61 | The InstructLab team will maintain the most recent **10** published versions of the respective models. 62 | -------------------------------------------------------------------------------- /docs/rag/adrs/README.md: -------------------------------------------------------------------------------- 1 | # Architecture Decision Records 2 | 3 | The ADR is a lightweight record format intended to capture individual architecturally important decisions. They are meant to be easy to write - 10 minutes or less. They should be stored in the codebase they affect, go through peer review, and have a commit history. 4 | 5 | This simple format, which is described below, has a surprising number of functions: 6 | 7 | * **Decision making process**: by going through peer review, it includes the entire team and gives all perspectives a chance to be heard. There is a clear decision making process with a clear lifecycle - once an ADR meets whatever approval criteria the team chooses, it is merged and the decision is done. If new information comes to light that causes the team to reconsider the decision, then that is simply a new ADR. 8 | * **Institutional knowledge and transparency**: Not everyone will comment on every ADR, but the transparency of the mechanism should serve to keep everyone informed and encode tribal knowledge into writing. This also builds resilience - there should ideally never be decision making that is blocked by someone being sick or on vacation. The team should always be able to make significant decisions. 9 | * **Distribute design authority**: As a team becomes familiar and comfortable with the ADR mechanism, every team member has an equal tool to bring design decisions to the team. This encourages autonomy, accountability, and ownership. 10 | * **Onboarding and training material**: A natural consequence of it being easy to write an ADR and getting into the habit of doing so is that new team members can simply read the record of ADRs to onboard. 11 | * **Knowledge sharing**: The peer review phase allows sharing of expertise between team members. 
12 | * **Fewer meetings**: As decision making becomes asynchronous and as the team forms its social norms around the process, there should be less time required in meetings. 13 | 14 | ## When to write an ADR 15 | 16 | * A decision is being made that requires discussion between two or more people. 17 | * A decision is being made that requires significant investigation. 18 | * A decision is being proposed for feedback / discussion. 19 | * A decision is being proposed that affects multiple teams. 20 | 21 | ## Template 22 | 23 | [Here](template.md). 24 | 25 | ## Related Reading 26 | 27 | * [Suggestions for writing good ADRs](https://github.com/joelparkerhenderson/architecture-decision-record?tab=readme-ov-file#suggestions-for-writing-good-adrs) 28 | * [ADRs at Red Hat](https://www.redhat.com/architect/architecture-decision-records) 29 | * [ADRs at Amazon](https://docs.aws.amazon.com/prescriptive-guidance/latest/architectural-decision-records/adr-process.html) 30 | * [ADRs at GitHub](https://adr.github.io/) 31 | * [ADRs at Google](https://cloud.google.com/architecture/architecture-decision-records) -------------------------------------------------------------------------------- /docs/rag/adrs/adr-vectordb.md: -------------------------------------------------------------------------------- 1 | # Initial InstructLab Vector Store 2 | 3 | ## Context 4 | 5 | One of the first choices to make in implementing RAG is to choose an initial vector store to develop against. Though the usage of frameworks like LangChain or Haystack makes it easy to swap vector databases, we need a working end-to-end implementation for RAG that is tested against and available to install with InstructLab. There are many options (see [here](https://docs.haystack.deepset.ai/docs/choosing-a-document-store)). 6 | 7 | Our main long-term requirements are that our chosen store have fully-developed document update support (and thus some notion of a primary key), that it be scalable to cluster size, and that it have a permissive license (Apache, MIT, or similar). Among the available choices, [Milvus](https://milvus.io/) provides a strategic advantage due to its [investment from watsonx](https://www.ibm.com/new/announcements/ibm-watsonx-data-vector-database-ai-ready-data-management). 8 | 9 | Milvus can be used in-process ([Milvus Lite](https://milvus.io/docs/milvus_lite.md)), single-node ([Milvus](https://milvus.io/docs/prerequisite-docker.md)), or cluster-scale ([Milvus Distributed](https://milvus.io/docs/prerequisite-helm.md)). 10 | 11 | ## Decision 12 | 13 | InstructLab will initially integrate with and use Milvus Lite for vector storage and retrieval augmented generation. 14 | 15 | ## Status 16 | 17 | Accepted 18 | 19 | ## Consequences 20 | 21 | * Users will have a clear [upgrade path](https://milvus.io/docs/upgrade_milvus_cluster-operator.md) from the laptop use case to cluster scale. 22 | * We should be able to have access to expert resources with Milvus via IBM. 23 | * The laptop use case of InstructLab will have a minimally resource intensive option for prototyping. 24 | * Since Milvus is used in watsonx, we can have confidence that it can meet expected scaling requirements. 25 | * Document updates can be accommodated using well-established [primary key](https://milvus.io/docs/primary-field.md) and [partition key](https://milvus.io/docs/use-partition-key.md) functionality.
26 | * There is a risk that developing against a mature vector store leads to using functionality that is not available in some other vector store that a potential customer requires. 27 | -------------------------------------------------------------------------------- /docs/rag/adrs/granite-embeddings.md: -------------------------------------------------------------------------------- 1 | # InstructLab RAG will use Granite Embeddings as the default embedding model 2 | 3 | ## Context 4 | 5 | InstructLab RAG will be using dense vector retrieval via a vector database to select documents for use in response generation. Dense vector retrieval requires a model to provide dense vectors of the content at indexing time and dense vectors of the query at retrieval time. 6 | 7 | Some key considerations: 8 | 9 | - We want a model that is reasonably effective at driving accurate semantic search. 10 | - We want a model that runs quickly and doesn't require a lot of expensive hardware to run. 11 | - We want a model that does not require an inordinate amount of storage space for the vectors. 12 | - We want a model with license terms that are compatible with the license for InstructLab so that anyone using InstructLab under the terms of its license can use its default embedding model without any additional encumbrance. 13 | - We want a model where the license terms for all of the data used to train the model are compatible with the license terms of that model. 14 | - If possible, it would be nice for the embedding model used for RAG to be the same embedding model that is used for other purposes in InstructLab. At this time, there are no other uses of embedding models for InstructLab, but we understand there are experiments under way to use embedding models for subset selection for synthetic data. 15 | 16 | We don't need this to be one model that is the best possible fit for all users. There is no such model because many of the criteria are trade-offs, e.g., models that are bigger tend to be more effective but also slower and require more memory and storage space. Fortunately, we are just choosing a default value, and users will be free to override that default and plug in the embedding model of their choice. For the default, the main consideration is that the model be acceptable across all the criteria. 17 | 18 | Another consideration might be the maximum amount of text that the embedding model can produce embeddings for. Many popular models have a maximum input length of 512 tokens, but some have longer maxima. A larger maximum might be useful for a RAG solution in which the response generation model has a large context window. However, it might not be: the larger context window might be better used by getting a larger number of search results or by looking up the text that came before and after each search result. A larger maximum input length for the query might also be useful for encoding not just the current user utterance but also the history of the chat leading up to that utterance. We will continue to explore this space and if we find that maximum input length is an important consideration, it could result in selecting a different default embedding model in the future.
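To make the indexing-time/query-time split described above concrete, here is a minimal sketch of dense vector retrieval. It assumes the `sentence-transformers` library and uses the Granite model that this ADR ultimately selects; the sample chunks and query are invented for illustration and are not part of any real pipeline:

```python
# Minimal sketch of dense vector retrieval, not InstructLab's actual implementation.
# Assumes the sentence-transformers library; sample texts are illustrative only.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("ibm-granite/granite-embedding-125m-english")

# Indexing time: embed each content chunk into a dense vector.
chunks = [
    "InstructLab uses a taxonomy to generate synthetic training data.",
    "Milvus Lite is the initial vector store for InstructLab RAG.",
]
chunk_vectors = model.encode(chunks)

# Retrieval time: embed the query and rank chunks by cosine similarity.
query_vector = model.encode("Which vector store does InstructLab RAG use?")
scores = util.cos_sim(query_vector, chunk_vectors)
print(chunks[scores.argmax().item()])
```

In a real deployment the chunk vectors would be stored in and retrieved from the vector database rather than compared in memory.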
19 | 20 | ### Alternatives 21 | 22 | There are two IBM Granite English embedding models: 23 | 24 | - [IBM Granite-Embedding-30m-English](https://huggingface.co/ibm-granite/granite-embedding-30m-english) 25 | - [IBM Granite-Embedding-125m-English](https://huggingface.co/ibm-granite/granite-embedding-125m-english) 26 | 27 | The 30m model provides accuracy that is comparable to the 125m model on some data sets but significantly lower on other data sets. For use cases where there is a large amount of robust evaluation data and the ability to compute reliable metrics, it would generally be sensible to try both and measure the speed/size/accuracy trade-offs. On the other hand, for engagements where the quantity of data to be indexed is enormous, the advantages in indexing time and storage space from using a smaller model can be overwhelming. Such engagements would generally be better served by 30m-English. However, for a simple proof-of-concept where a user won't be able to measure accuracy robustly, 125m-English is probably a better choice over 30m-English because 125m is small enough for most purposes and is the one most likely to be accurate enough that a user will be happy with the results. The simple proof-of-concept use case seems like the most important one for determining the *default* model because users who are outside of the context of a simple proof-of-concept are much more likely to be overriding the default anyway. Both of these models have a maximum input length of 512 tokens. 28 | 29 | Other options available include: 30 | 31 | - IBM Granite multi-lingual embedding models seem like they could be very useful as part of a broader InstructLab multilingual strategy. For now, the defaults we have are focused on English, but in the future we might want users to provide the target language or languages they are working with during initial setup and then have defaults for various settings depend on that choice. Since this would involve a broader end-to-end change, it is out of scope for this ADR. The Granite multi-lingual models support 12 languages, but there are plenty of languages that they do not support. Other models support larger numbers of languages. 32 | - [NV-Embed-v2](https://huggingface.co/nvidia/NV-Embed-v2) has outstanding accuracy (for example, see the [MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard) overall English and retrieval English results). However, it is licensed for non-commercial use only and requires almost 30 GB of memory (so presumably it requires some expensive hardware to run at speed). For non-commercial users that prioritize accuracy over every other consideration, this might still be a fine model to choose, but it does not seem like a good default value because it doesn't meet all the criteria. 33 | - [all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) was proposed as a default in an [early draft of another dev doc in this repository](https://github.com/instructlab/dev-docs/pull/161/commits/7ca3ab624526a4c5a5c70d282f8a6be26c292020#diff-b103ed3331fbeb65d7569ea836c9fd4b53c845853d8c8e7567c34864eebcdfb8R51). It is a very popular, lightweight embedding model. However, its [list of training sources](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2#training-data) includes MS MARCO which is clearly marked as non-commercial use only.
In contrast, the [Granite embedding model card](https://huggingface.co/ibm-granite/granite-embedding-125m-english) asserts that "Notably, we do not use the popular MS-MARCO retrieval dataset in our training corpus due to its non-commercial license, while other open-source models train on this dataset due to its high quality." Note that all-MiniLM-L6-v2 is licensed as Apache 2.0 which authorizes commercial use of this model, but it seems potentially problematic for the creators of a model to authorize commercial use when they trained on data that was not authorized for commercial use. We would prefer to avoid this tricky legal concern. 34 | - There are many other open source models of comparable size to the Granite embedding models. However, most highly competitive models use MS-MARCO or other sources with problematic provenance. Furthermore, IBM has [published benchmark results](https://www.ibm.com/new/announcements/ibm-granite-3-1-powerful-performance-long-context-and-more#Granite+embedding+models) showing that Granite's accuracy is highly competitive with other popular open source options of comparable size. Note, however, that these results do not include some newer competing models. They also don't include any much larger, more resource intensive models; such comparisons wouldn't really be *fair* (i.e., a bigger model that uses more memory has a fundamental advantage) but they would be *useful* for someone who has flexibility on hardware requirements and would like to explore trade-offs. More benchmarking would be helpful. 35 | 36 | ## Decision 37 | 38 | InstructLab RAG will use [IBM Granite-Embedding-125m-English](https://huggingface.co/ibm-granite/granite-embedding-125m-english) as the default embedding model. 39 | 40 | ## Consequences 41 | 42 | - We will have consistency with the default options for model training (which are also based on IBM Granite). 43 | - Anyone using the default option will be able to rely on IBM's outstanding, well-earned reputation for respecting intellectual property rights and careful compliance with legal restrictions. 44 | - Anyone who wants a different balance between key considerations (hardware requirements, speed, effectiveness, terms of use, etc.) will want to override the default embedding model. For example, users who want the best possible accuracy and have a lot of expensive hardware available to run the model will probably want to choose a larger model, and users with extremely limited hardware who can tolerate lower accuracy will probably choose a smaller model. 45 | - We should consider strategies, such as documentation, for offsetting any perception that we are prioritizing IBM models over the needs of the product, in addition to relying on the clear motivation described here. 46 | - We may wish to change to a different default embedding model in the future if more benchmarking shows that a different model provides a better balance of accuracy, speed, and hardware requirements. 47 | - More work is needed to figure out whether a larger maximum input length would be better for our RAG solution. 48 | -------------------------------------------------------------------------------- /docs/rag/adrs/template.md: -------------------------------------------------------------------------------- 1 | # Succinct title 2 | 3 | ## Context 4 | 5 | _What is the context of this decision? What are the technical, social, and political factors? For example, the decision to use a particular library might be simply because most of the team is familiar with it; that is a social context.
A political factor might be influences from other teams or executive decisions_ 6 | 7 | ## Decision 8 | 9 | _a single decision statement, written in active voice, stated in a single sentence_ 10 | 11 | ## Status 12 | 13 | [Proposed | Accepted | Rejected] 14 | 15 | ## Consequences 16 | 17 | _A bulleted list; this might be the most important section. What are the consequences of this decision? Does it introduce design constraints into a codebase? Does it require further decisions or investigations to be made? Will it require training/onboarding for team members? Does it impact performance? What about cost? Does it impact development processes? What else? As a rule of thumb, there should usually be 4-6 identified consequences_ 18 | -------------------------------------------------------------------------------- /docs/rag/images/ingestion-mvp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/instructlab/dev-docs/30cb9d62d9a13dc2eeeefe92db67feefce4582da/docs/rag/images/ingestion-mvp.png -------------------------------------------------------------------------------- /docs/rag/images/rag-chat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/instructlab/dev-docs/30cb9d62d9a13dc2eeeefe92db67feefce4582da/docs/rag/images/rag-chat.png -------------------------------------------------------------------------------- /docs/rag/images/rag-model-training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/instructlab/dev-docs/30cb9d62d9a13dc2eeeefe92db67feefce4582da/docs/rag/images/rag-model-training.png -------------------------------------------------------------------------------- /docs/rag/images/rag-plug-and-play.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/instructlab/dev-docs/30cb9d62d9a13dc2eeeefe92db67feefce4582da/docs/rag/images/rag-plug-and-play.png -------------------------------------------------------------------------------- /docs/rag/images/rag-taxonomy-no-training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/instructlab/dev-docs/30cb9d62d9a13dc2eeeefe92db67feefce4582da/docs/rag/images/rag-taxonomy-no-training.png -------------------------------------------------------------------------------- /docs/rag/rag-initial-code-location.md: -------------------------------------------------------------------------------- 1 | # Code location for RAG 2 | 3 | | Created | Dec 5, 2024 | 4 | | -------- | -------- | 5 | | Authors | Bill Murdock | 6 | | Replaces | N/A | 7 | | Replaced by | N/A | 8 | 9 | ## What 10 | 11 | We want a retrieval-augmented generation (RAG) capability that provides outstanding results with minimal effort, is seamlessly integrated with InstructLab, and is also general enough to be used in other applications as well. 12 | 13 | ## Why 14 | 15 | Many InstructLab users want to train a model and then use it for RAG. Often they build something simple themselves for this purpose. There are two problems with this approach: 16 | 17 | - Building their own RAG is extra work. 18 | - Users who are not experts on RAG might not build a RAG that provides outstanding results. 19 | 20 | There is a very simple RAG capability at . It is not tightly integrated with InstructLab and it does not use any advanced RAG capabilities.
However, we have a request from a stakeholder to not just unilaterally delete it or replace it with something radically different. 21 | 22 | ## Goals 23 | 24 | Provide a built-in alternative for users who do not want to build their own RAG. Keep the existing capability at somewhere, but potentially somewhere other than where it is now (e.g., in a new branch of the existing repository). 25 | 26 | ## Non-goals 27 | 28 | Evaluation of RAG will be addressed in one or more other development documents. That topic is out of scope for this document. 29 | 30 | ## Decision 31 | 32 | - For now, RAG will be located in its own directory, `src/instructlab/rag`, in the core InstructLab repository (). 33 | 34 | ## How 35 | 36 | ### Phase 1 37 | 38 | - RAG will be located in its own directory, `src/instructlab/rag`, in the core InstructLab repository (). 39 | - This directory will include all of the following: 40 | - Loading the content from Docling-format JSON files (that are produced by SDG preprocessing). 41 | - Chunking that content to sizes that fit the requirements of the selected embedding model for vector database storage and retrieval. 42 | - Storing those chunks with their vector representations in a vector database. 43 | - End-to-end runtime RAG. The initial version of this includes the following: 44 | - Taking as input a session history (including a current user query) and providing a response (e.g., something along the lines of the [OpenAI chat completion API](https://platform.openai.com/docs/api-reference/chat/create)). 45 | - During that processing, it retrieves relevant search results from the vector database, it converts those into a prompt to send to the response generation model, it prompts that model, and it returns the response from that model. 46 | - This will be invoked from the existing `ilab` CLI, as described in the [RAG ingestion and chat pipelines](https://github.com/instructlab/dev-docs/pull/161) dev doc. 47 | 48 | ### Future phases 49 | 50 | - In the near future, RAG might be moved to the existing repository. 51 | - If so, something will be done with the existing code in , e.g., moving it to a branch of that repository or moving it to a different repository. 52 | - Alternatively, some or all of it might move to a new repository. 53 | - For example, maybe the indexing and retrieval portions move to a separate retrieval repository while the rest of end-to-end runtime RAG might move somewhere else. 54 | - If/when we move ahead with any of these options, *we will open a new ADR for that decision*. 55 | - Also, the capabilities will keep improving and more functionality will be added. 56 | 57 | ## Alternatives 58 | 59 | - Put the indexing and run-time RAG code in a new repository. 60 | - Pro: Having a dedicated repository gives the RAG team the most freedom and flexibility to make technical decisions that work for that team. 61 | - Pro: Starting with a new repository provides a blank slate that can be set up in whatever way makes the most sense for that functionality. 62 | - Pro: Having the capability in one repository makes it easier for consumers such as RamaLama to reuse it for their purposes too. 63 | - Con: Creating and configuring a new repository is some work. (This is a fairly small con, but a real one.) 64 | - Con: Integrating a new repository into the continuous integration and delivery capabilities for both upstream InstructLab and downstream consumers is a *lot* of work. This is a much bigger con.
65 | - Con: All that extra work would almost certainly result in slower time to market. This risks missing some market opportunities. 66 | - Put the indexing code in (SDG) and the run-time RAG code in (core) 67 | - Pro: This has the advantage of not adding any new dependencies. 68 | - Pro: The document processing is already in SDG and chat functionality is already in core so this would require the fewest code changes. 69 | - Con: Splitting the RAG functionality across multiple repositories makes it more complicated to reuse in other applications outside of InstructLab. 70 | - Con: Many things we will want to do to add advanced functionality to make RAG more effective will require changes to both indexing and run-time RAG. If those components are split across multiple repositories, that will make delivering such changes more complicated. 71 | - Start by putting the code into existing InstructLab repositories (either of the above options) and then split it off into its own repository later. 72 | - Pro: Gets us integrated into InstructLab sooner. 73 | - Con: Adds extra work to the second phase where we have to split it off into its own repository. 74 | - Con: There is a risk that we never get around to splitting it off and we wind up stuck with the cons of being jammed into other components indefinitely. 75 | - Put the indexing and run-time RAG code in a new repo outside . 76 | - Pro: This signals that this is not specific to InstructLab but is instead intended to be useful in a variety of applications. That makes it more likely the work could have broader impact. 77 | - Con: If we put this out there as something that is intended to be useful in a variety of applications, the pressure is on us to make sure it is differentiated from other broadly applicable RAG capabilities. Hopefully that will be true eventually, but it probably won't be true for a while. It might make more sense to give this some time to mature as a local component of InstructLab before trying to spin it off as its own thing. 78 | - Con: If we put it out there as its own open source project, that project needs all of the infrastructure of a full open source activity (governing structures, communication tools and protocols, etc.). That's a lot of work to set up. Keeping it inside InstructLab for now lets us keep using the infrastructure that InstructLab has for this purpose. 79 | - Con: If we put it out there as its own open source project, it needs a name. It is a lot of work to come up with a good name and there will be a lot of stakeholders with an interest in the name that comes up. 80 | - Keep the indexing and run-time RAG code in which is an emerging technologies prototype for this work. 81 | - Mostly the same pros and cons as putting it in a new repo outside InstructLab plus the following: 82 | - Pro: A prototype for the code we want is already there. 83 | - Pro: It already has its own distinctive name (PaRAGon). 84 | - Con: The existing repository has its own simple command-line interface which is useful for the prototype but we don't want it in the capability we release because too many command-line interfaces will confuse users. 85 | - Con: The name PaRAGon seems fine to me, but probably more stakeholders need to weigh in on what a name would be. 86 | - Con: The `redhat-et` label suggests that this is something "owned" by Red Hat which makes sense for the prototype but not so much for something we want a community to own in the long run.
87 | - Put the indexing and run-time RAG code in AND keep the existing RAG functionality in that repository intact. 88 | - Pro: It already exists. 89 | - Pro: It avoids the confusion of having two different RAG repositories in . 90 | - Con: It creates the confusion of having two different RAG solutions in the same repository. We could mitigate that with developer documentation and marking legacy stuff as "deprecated". 91 | - Put the indexing and run-time RAG code in AND eliminate the existing RAG functionality in that repository. 92 | - Pro: It already exists. 93 | - Pro: It avoids the confusion of having two different RAG repositories in . 94 | - Pro: It avoids the confusion of having two different RAG solutions since we'd be eliminating the old one. 95 | - Con: There is still some interest in keeping this around. 96 | 97 | ## Risks 98 | 99 | - Putting the RAG functionality in the core repository requires any application that wants to use this functionality to bring in the entire core which then brings in all of the libraries it depends on, so this becomes an enormous dependency. This discourages reuse in other applications. It *encourages* either of the following behaviors that would be unfortunate: 100 | - Other applications pull directly from and in doing so duplicate the ongoing effort to harden that code base. 101 | - Other applications may implement their own RAG solutions or pull from some other upstream unrelated to ours. 102 | - As noted earlier, putting the capability inside signals that this is a component of InstructLab and not a generally useful feature. That creates a risk that the work could miss out on additional opportunities for impact. We hope to mitigate that risk by spinning it off to its own open source project when it is mature enough, but there is a risk that we will get distracted by other things and never get around to this. 103 | - The flow for document processing for InstructLab winds up being quite complicated in this proposal. Since the existing document processing is in SDG, the flow for indexing for RAG winds up being a bit complicated (i.e., it starts with a CLI call handled by the core repo, then goes to SDG for some of the document processing, and then back to the core `/data` directory, which then calls out to the `core/rag` directory for chunking and vector database indexing). Having the document processing move from core to SDG and back to core and forward to RAG makes that capability more difficult to understand and maintain. This complexity will be partially mitigated when the preprocessing code moves from SDG to core. It will be further mitigated by having a clear, well-documented contract between core and the RAG repository indicating the responsibilities of each. 104 | 105 | ## References 106 | 107 | - 108 | - 109 | - 110 | -------------------------------------------------------------------------------- /docs/retrieval-augmented-generation/rag-framework.md: -------------------------------------------------------------------------------- 1 | # RAG will use Haystack 2 | 3 | | Created | Dec 4, 2024 | 4 | | -------- | -------- | 5 | | Authors | Ryan Cook, Ilya Kolchinsky, Hema Veeradhi | 6 | | Replaces | N/A | 7 | | Replaced by | N/A | 8 | 9 | ## What 10 | 11 | This ADR defines the decision on the framework used to support the RAG pipeline for InstructLab. The Red Hat Emerging Technologies team is pushing for the adoption of Haystack as the framework for the RAG offering.
Haystack will handle the data ingestion and retrieval processes for this to-be-productized RAG solution. 12 | 13 | NOTE: Per a conversation with Peter Staar on Dec 3, 2024, [the Docling team](https://github.com/DS4SD/docling) is aware of the efforts to potentially use Haystack and is already looking into adding functionality for [support of Haystack](https://github.com/DS4SD/docling-haystack). 14 | 15 | ## Why 16 | 17 | Multiple options for frameworks currently exist. During our initial analysis, the following options were the only ones to satisfy the basic requirements in terms of functionality, reliability and open-source availability: 18 | 19 | - [Haystack](https://haystack.deepset.ai/) 20 | - [Langchain/Langgraph](https://www.langchain.com/) 21 | - [LlamaIndex](https://www.llamaindex.ai/) 22 | - [RAGFlow](https://ragflow.io/) 23 | 24 | All of the above offer a variation of a modular pipeline architecture, where users can chain together components (like retrievers, readers, and generators) to process data in different stages. 25 | 26 | Out of those, we propose to use Haystack for the following reasons: 27 | 28 | 1. **Focus on RAG.** Haystack is a framework specifically targeting RAG use cases and sophisticated RAG indexing and retrieval pipelines. While Langchain and LlamaIndex shine in their own areas, the former is a generalist framework and the latter has a different focus, namely building custom indices over data. Haystack provides functionality that is strongly tailored for RAG and includes a comprehensive library of out-of-the-box solutions for advanced RAG scenarios. As a result, many essential or soon-to-be-essential RAG capabilities can be implemented in a few lines in Haystack but require considerable work to be supported over Langchain or LlamaIndex. Some examples include hybrid retrieval, iterative RAG, HyDE, combining multiple ingestion sources, custom data preprocessing and metadata augmentation. As the decision discussed in this document involves only the RAG component of InstructLab, we believe that choosing the best RAG framework, as opposed to the best general LLM serving framework, would be more strategically correct. 29 | 30 | 2. **Maturity and stability.** Haystack is the most mature, established and stable product among the considered alternatives. It has been around for more time overall (since 2017) and has accumulated more mileage. Haystack has an active, sizable and steadily growing community. 31 | 32 | 3. **Extensive vendor support.** Haystack natively supports all currently popular vector DBs and provides dedicated backends for incorporating them into its pipelines. Additionally, Haystack supports multiple models and model providers out-of-the-box. 33 | 34 | 4. **Enterprise-level performance.** Haystack is designed for production-grade scalability, supporting distributed systems and high-throughput applications. Moreover, and in contrast to the alternatives (of which only LlamaIndex showcases similar performance and scalability), Haystack is specifically optimized for efficient search and retrieval in the RAG setting. 35 | 36 | 5. **Ease of use and documentation.** Being strictly focused on RAG as opposed to taking a generalist approach, the learning curve of Haystack is less steep than that of Langchain. At the same time, Haystack offers extensive documentation and tutorials which are better organized and easier to use than those of LlamaIndex. 37 | 38 | 6.
**Architecture.** Extending the previous point, Haystack can be seen as a middle ground between Langchain and LlamaIndex, sharing their benefits while only partially inheriting their drawbacks. Like the former, Haystack enables building custom flows and pipelines. Unlike Langchain though, Haystack does not try to be too abstract and general, strictly focusing on RAG and document search instead. As a result, Haystack is more straightforward to use, especially for users looking to implement custom and highly non-standard scenarios. On the other hand, like LlamaIndex, Haystack's performance is optimized towards data retrieval and indexing, but it offers a higher degree of flexibility and better interfaces for custom use cases. 39 | 40 | 7. **Actively maintained open source project under permissive license.** Haystack is very [actively](https://github.com/deepset-ai/haystack/pulse/monthly) [maintained](https://github.com/deepset-ai/haystack/issues?q=is%3Aissue+is%3Aclosed) and [supported](https://github.com/deepset-ai/haystack/discussions). [Tagged versions](https://github.com/deepset-ai/haystack/releases) are released on a regular basis and [trusted publishing automation](https://github.com/deepset-ai/haystack/actions/workflows/pypi_release.yml) is used. Haystack is licensed under Apache 2.0, and all of its dependencies (jinja2, lazy-imports, more-itertools, networkx, numpy, openai, pandas, posthog, python-dateutil, pyyaml, requests, tenacity, tqdm, typing-extensions) are licensed under Apache, MIT, BSD or PSFL. 41 | 42 | 8. **Published to PyPI via Trusted Publishing.** 43 | See [trusted-publishers](https://docs.pypi.org/trusted-publishers/) for more information on trusted publishing. Haystack satisfies the requirements for trusted publishing with this [GitHub Actions workflow](https://github.com/deepset-ai/haystack/actions/workflows/pypi_release.yml). 44 | 45 | ## Goals 46 | 47 | - The primary goal is to provide a framework optimized specifically for RAG; this decision is not focused on choosing the best solution for agentic patterns. However, Haystack can potentially implement many [agentic patterns](https://docs.haystack.deepset.ai/v1.22/docs/agent). Since Haystack provides the functionality [to build arbitrarily complex pipelines](https://docs.haystack.deepset.ai/docs/pipelines), it is possible to extend our RAG architecture to introduce nodes for tools, reasoning and agentic actions. 48 | 49 | ## Non-goals 50 | 51 | - What can we ignore when making this decision? 52 | 53 | ## Decision 54 | 55 | Upon acceptance of this integration, our next step is to include additional ADRs for the subsequent components required for the RAG pipeline. Next up will be decisions on Milvus and containerized/non-containerized offerings of that solution. 56 | 57 | Upon denial of this integration, the team will need to go back and evaluate alternative technologies, ensuring they meet both the needs of the project and the larger project plan goal of a configurable RAG pipeline. 58 | 59 | ## How 60 | 61 | A downstream of the [https://github.com/deepset-ai/haystack](https://github.com/deepset-ai/haystack) project should be generated.
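To illustrate what the proposed framework usage could look like, the following is a minimal sketch of indexing and retrieval pipelines written against Haystack's 2.x component API. The in-memory document store, the default embedding components, and the sample document are illustrative stand-ins only; the production vector store (e.g., Milvus) is the subject of a separate ADR:

```python
# Minimal sketch of Haystack (2.x) indexing and retrieval pipelines.
# The in-memory store and sample document are illustrative stand-ins.
from haystack import Document, Pipeline
from haystack.components.embedders import (
    SentenceTransformersDocumentEmbedder,
    SentenceTransformersTextEmbedder,
)
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.writers import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore

store = InMemoryDocumentStore()

# Ingestion: embed documents and write them to the document store.
indexing = Pipeline()
indexing.add_component("embedder", SentenceTransformersDocumentEmbedder())
indexing.add_component("writer", DocumentWriter(document_store=store))
indexing.connect("embedder.documents", "writer.documents")
docs = [Document(content="InstructLab tunes LLMs with taxonomy-driven synthetic data.")]
indexing.run({"embedder": {"documents": docs}})

# Retrieval: embed the query and fetch the closest documents.
retrieval = Pipeline()
retrieval.add_component("text_embedder", SentenceTransformersTextEmbedder())
retrieval.add_component("retriever", InMemoryEmbeddingRetriever(document_store=store))
retrieval.connect("text_embedder.embedding", "retriever.query_embedding")
results = retrieval.run({"text_embedder": {"text": "What does InstructLab do?"}})
print(results["retriever"]["documents"][0].content)
```

Swapping the document store for a different backend changes only the store and retriever components; the pipeline structure stays the same, which is part of the plug-and-play motivation described above.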
62 | 63 | ## Alternatives 64 | 65 | - Langchain/Langgraph 66 | - A generalist framework (as opposed to a RAG-focused solution) 67 | - Complicated as compared to the alternatives, steep learning curve 68 | - Lower performance in large-scale production environments than LlamaIndex and Haystack 69 | 70 | - LlamaIndex 71 | - Limited flexibility and customization options as compared to the alternatives 72 | - Limited out-of-the-box support for building complex, multi-component pipelines as compared to Langchain and Haystack 73 | - Documentation is less well-maintained and more difficult to use as compared to the alternatives 74 | 75 | - RAGFlow 76 | - Limited support for many of the mainstream vector DB providers 77 | - Limited scalability as compared to the alternatives 78 | 79 | - Do not use a framework; write everything directly in Python or some other programming language 80 | - This would take longer to get started. 81 | - This would make it a lot more work to add more vector DBs since we'd need to add additional code for each vector DB we want to support. We would hide that work behind an abstraction layer the same way the frameworks do, but it is work to build and maintain the abstraction layer, and the frameworks do that for us (and have put a lot of time and effort into doing it well). 82 | - This would make it harder to bring in advanced functionality that the frameworks already provide. For example, Haystack provides support for RAG self-correction loops which we might want some day. 83 | - This might make it easier to bring in advanced functionality that the frameworks do not already provide. Frameworks provide an abstraction layer that is generally useful when you want to do things that the framework developers support but often counterproductive when you want to do things that the frameworks do not support. For example, if there is a call to the framework that collapses multiple atomic steps into a single function call, that generally makes it harder to insert your own logic in between those atomic steps. 84 | 85 | ## Risks 86 | 87 | Future versions of Haystack can potentially introduce new dependencies that could be: 88 | 89 | 1. Distributed under a non-permissive license (or not open source at all) 90 | 2. Not regularly and/or properly maintained 91 | 92 | If such a situation arises, the following actions can be taken on our end: 93 | 94 | 1. Pin to the old version that doesn't have that dependency. That's often OK for a while, but eventually we're likely to run into updates that we need (e.g., critical fixes, compatibility with new vector DBs, etc.). 95 | 2. Fork the project to avoid the problematic dependencies. 96 | 3. Move off of Haystack completely. 97 | 98 | ## References 99 | 100 | - [https://github.com/deepset-ai/haystack](https://github.com/deepset-ai/haystack) 101 | -------------------------------------------------------------------------------- /docs/schema/taxonomy-api.md: -------------------------------------------------------------------------------- 1 | # Central API for taxonomy reading and validation 2 | 3 | Currently there are multiple places where the taxonomy `qna.yaml` files are read, parsed, and validated. There is a `check_yaml.py` script in the `taxonomy` repository and there are methods in the `instructlab` repository in the `src/instructlab/util.py` file. 4 | 5 | The methods in `utils` are used by both the `ilab taxonomy diff` command and the SDG code which has been moved to the `sdg` repository.
This arrangement results in a circular dependency: the `instructlab` package needs to access the SDG code in the `instructlab-sdg` package, and the SDG code needs to access the `utils` methods in `instructlab` to read and validate the taxonomy files. 6 | 7 | ## Use instructlab-schema package for the central API 8 | 9 | We now have an `instructlab-schema` package on PyPI which holds the JSON schema files for the taxonomy `qna.yaml` files. This is now used by `instructlab` to access these schema files for taxonomy file validation. 10 | 11 | We should relocate the taxonomy reading and validation code from `instructlab` to `instructlab-schema`. This will provide a central place, near the JSON schema it uses, for a shared API for reading, parsing, and validating taxonomy `qna.yaml` files. 12 | 13 | Then we can modify the `instructlab` and `instructlab-sdg` packages to depend upon the `instructlab-schema` package for these APIs, which will remove the circular dependency. We can also use these APIs in the taxonomy repository's `check_yaml.py` script. 14 | -------------------------------------------------------------------------------- /docs/sdg/README.md: -------------------------------------------------------------------------------- 1 | # SDG Design Docs 2 | 3 | Design docs associated with the `instructlab/sdg` repository 4 | -------------------------------------------------------------------------------- /docs/sdg/sdg-api-interface.md: -------------------------------------------------------------------------------- 1 | # SDG Library Design 2 | 3 | ## Objective 4 | 5 | A library called `instructlab-sdg` that can be called per seed example, where each seed example includes question and answer pairs and, for grounded skills, context. 6 | 7 | ## Structure of the SDG Library 8 | 9 | We propose the following structure for the SDG library. There will be config files that contain all the prompt templates for the pipelines. 10 | 11 | ```markdown
12 | - src/instructlab/sdg/
13 |   - configs/
14 |     - gen_q.yaml
15 |     - gen_a.yaml
16 |     - ...
17 |   - __init__.py
18 |   - block.py
19 |   - llmblock.py
20 |   - pipeline.py
21 |   - sdg.py
22 | ``` 23 | 24 | ![example API interface](../images/sdg-api-interface.png) 25 | 26 | ## CLI 27 | 28 | The CLI client uses the InstructLab SDG library and provides it a run configuration with input parameters. The following represents a sample of what library usage could look like. 29 | 30 | ```python 31 | # cli_driver.py 32 | 33 | from sdg import SDG 34 | from run_config import SynthDataFlow 35 | from pipeline import Pipeline 36 | import yaml 37 | 38 | client = openai_client(endpoint) 39 | model = "model-version" 40 | 41 | synth_skills_flow = SynthDataFlow(client, model).get_flow() 42 | skills_pipe = Pipeline(synth_skills_flow) 43 | 44 | cli_sdg = SDG([skills_pipe])  # run config has all the variables like num_samples, pipeline steps etc 45 | generated_samples = cli_sdg.generate() 46 | ``` 47 | 48 | As an initial integration milestone, we will modify the `generate_data` function in `instructlab.sdg.generate_data` to make use of the updated SDG API. This is the function the `ilab` CLI already uses, so modifying this implementation will allow us to get the updated SDG API in place without disrupting the CLI integration. 49 | 50 | CLI integration will require additional changes later to allow passing in customizations to the SDG pipeline, but we will treat that as a follow-up implementation milestone. 51 | 52 | The run configuration includes the necessary parameters for executing the SDG code library.
It specifies the templates required for running the SDG code, the prompt template, and the default model system prompt template. 53 | 54 | * `num_samples` is the number of synthetic samples that you wish to generate per seed example. 55 | * `num_procs` is the number of parallel processes that you want to run. 56 | * `max_retry` is the maximum number of non-greedy retries you want to make if the `num_samples` is not reached. The number of samples in the generated output will be the samples achieved until `max_retry` is reached. 57 | * Pipeline steps contain the steps that you want to invoke in the SDG pipeline and the prompt configurations per step. The variable names of the blocks can be anything and the prompt configurations must be compatible with the teacher model. 58 | * `max_new_tokens` is the maximum number of tokens we want to generate. In other words, the size of the output sequence, not including the tokens in the prompt. 59 | * `model` is the name of the served teacher model we want to use to generate the synthetic data. 60 | * `model_prompt`: the default model prompt for the model. 61 | * `client` points to an OpenAI client used to interface with the model. Example of a client: 62 | 63 | ```python 64 | client = OpenAI( 65 |     api_key=openai_api_key, 66 |     base_url=openai_api_base, 67 | ) 68 | ``` 69 | 70 | ```python 71 | # run_config.py 72 | class Flow(ABC): 73 |     def __init__(self, client, model_id) -> None: 74 |         self.client = client 75 |         self.model_id = model_id 76 | 77 |     @abstractmethod 78 |     def get_flow(self) -> list: 79 |         pass 80 | 81 | 82 | class SynthDataFlow(Flow): 83 |     def get_flow(self) -> list: 84 |         return [ 85 |             { 86 |                 'block_type': LLMBlock, 87 |                 'block_config': { 88 |                     'block_name': "gen_q", 89 |                     'config_path': "configs/gen_q.yaml", 90 |                     'client': self.client, 91 |                     'model_id': self.model_id, 92 |                     'model_prompt': ' [INST] {prompt} [/INST]', 93 |                     'output_cols': ['question'], 94 |                     'batch_kwargs': { 95 |                         'num_procs': 8, 96 |                         'num_samples': 30, 97 |                         'batched': True, 98 |                     }, 99 |                     'max_retry': 5, 100 |                     'max_new_tokens': 10000, 101 |                 }, 102 |             }, 103 |         ] 104 | ``` 105 | -------------------------------------------------------------------------------- /docs/sdg/sdg-api-simplification.md: -------------------------------------------------------------------------------- 1 | # SDG API Simplification 2 | 3 | ## Objective 4 | 5 | Identify simplifications to [the original SDG API design](sdg-api-interface.md) based on retrospective insights from working with the implementation of that design.
## Original API Design

Consider the original API sketch:

```python
from sdg import SDG
from run_config import SynthDataFlow
from pipeline import Pipeline
import yaml

client = openai_client(endpoint)
model = "model-version"

synth_skills_flow = SynthDataFlow(client, model).get_flow()
skills_pipe = Pipeline(synth_skills_flow)

cli_sdg = SDG([skills_pipe])  # run config has all the variables like num_samples, pipeline steps, etc.
generated_samples = cli_sdg.generate()
```

The nouns above are:

* Dataset - this is from Hugging Face's datasets library - used for the return value from `SDG.generate()`, but also what is passed between elements of the data generation pipeline
* Block - not shown in the code above, but required to understand a pipeline - a block provides a `generate()` method that transforms an input dataset and returns an output dataset
* Block config - a description of how to instantiate and invoke a block; a sequence of these is returned from `get_flow()` above
* Flow - a class which describes how to render a sequence of block configs for a pipeline
* Pipeline - a pipeline is created from a sequence of block configs, and provides a `generate()` method in which it instantiates and invokes blocks in turn, passing the input dataset and collecting the output
* SDG - an SDG is created from a list of pipelines, and its `generate()` method calls pipelines in turn

## Simplification Proposals

### Remove `SDG`

We don't need both `SDG` and `Pipeline`, since `Pipeline` can already do everything `SDG` can do. If more advanced orchestration of multiple pipelines is required later, an orchestration abstraction can be added then.

### Remove `Flow`

With flows migrating to a YAML file format (#109), their purpose becomes clearer - they are simply an expression of the configuration of a sequence of blocks, used to create a pipeline. We can simply refer to these YAML files as pipeline descriptions.

### Add PipelineContext

A number of runtime parameters are required by blocks in a pipeline - e.g., every `LLMBlock` requires an OpenAI API client and a model name. These parameters are distinct from configuration that is specified by a pipeline author.

It would be much more straightforward if a `Block` were able to access these runtime parameters via its parent `Pipeline`.

In the case where multiple pipelines with the same runtime context are desired, it would also be beneficial to abstract these runtime parameters into a `PipelineContext` class.

## New API Design

```python
ds = Dataset.from_list(samples)

ctx = PipelineContext(client, "mixtral", teacher_model)

knowledge_pipe = Pipeline.from_configs(ctx, [MMLU_BENCH_PIPELINE, SYNTH_KNOWLEDGE_PIPELINE])

gen_data = knowledge_pipe.generate(ds)
```
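A minimal sketch of what such a context object might look like (assuming dataclass-style fields; the exact attribute set would track whatever runtime parameters blocks need):

```python
from dataclasses import dataclass
from typing import Any


@dataclass
class PipelineContext:
    client: Any        # OpenAI-compatible client for the teacher model
    model_family: str  # e.g. "mixtral"
    model_id: str      # path/name of the specific teacher model


# Blocks would reach runtime parameters through their parent pipeline's
# context (e.g. self.ctx.client) instead of carrying a client per block config.
```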
--------------------------------------------------------------------------------
/docs/sdg/sdg-flow-yaml.md:
--------------------------------------------------------------------------------
# SDG API - Add a file format for defining custom Flows

## Problem Statement

The `instructlab/sdg` library is introducing more extensive data generation pipelines. To enable customization, we should allow users of the library to provide a configuration file which defines a custom pipeline or extends an existing pipeline.

In terms of the API constructs, a Pipeline is created from a sequence of "block configurations" which express how to instantiate and invoke the individual steps (aka blocks) in the pipeline. A Flow construct serves as a template from which a sequence of block configs can be generated.

## Objective

- Library users can specify a custom flow using a well-defined file format.
- Library users can either use a custom flow standalone, or combine a custom flow with existing flows.
- The file format and library can evolve substantially without making breaking changes.
- Incompatible changes can be introduced while retaining support for existing custom flows for a deprecation period.

## Proposal

### Existing API Review

The current `Pipeline` API allows instantiation with a list of `Block` configurations.
These configurations could come from one or many sources. In its simplest form:

```python
pipeline = Pipeline(block_configs)
```

or if you had two separate lists of block configurations to append together:

```python
pipeline = Pipeline(block_configs1 + block_configs2)
```

### API Additions

We will add an API that instantiates a `Pipeline` object from a YAML file:

```python
pipeline = Pipeline.from_file(ctx, 'mycustomflow.yaml')
```

The YAML file format will mirror the API and look like this:

```yaml
version: "1.0"
blocks:
  - name: gen_knowledge
    type: LLMBlock
    config: # LLMBlock constructor kwargs
      output_cols: ["question", "response"]
    gen_kwargs: # kwargs for block.generate()
      max_tokens: 2048
    drop_duplicates: ["question"]
  - name: filter_faithfulness
    type: FilterByValueBlock
    config:
      filter_column: judgment
      filter_value: "YES"
      operation: eq
    drop_columns: ["judgment", "explanation"]
```

## Versioning

A mandatory `version` field in the YAML file expresses major and minor versions (e.g., 1.0, 1.1, 2.0).

Compatibility rules:

1. If the major version of the YAML file is higher than the parser can handle, the parser should reject the file.
2. If the minor version of the YAML file is higher than the highest version the parser is aware of, the parser should read the file but ignore any unrecognized content.
3. If the file's version is lower than the parser version, the parser should provide default values for any configuration introduced in later versions.

Example parsing logic:

```python
def parse_custom_flow(content):
    version = content['version']
    major, minor = map(int, version.split('.'))

    if major > PARSER_MAJOR:
        raise IncompatibleVersionError("The custom flow file format is from a future major version.")
    elif major <= PARSER_MAJOR and minor > PARSER_MINOR:
        logger.warning("The custom flow file may have new features that will be ignored.")
    # Rule 3 needs no explicit branch here: defaults for configuration
    # introduced in later versions are applied wherever those fields are read.
```

### Pipeline Context

The following runtime parameters will no longer be part of the pipeline configuration definition and will instead be available to blocks via a `PipelineContext` object:

- client - an OpenAI completions API client for talking to the teacher model via the serving backend (i.e. llama-cpp or vLLM)
- model_family - e.g. mixtral or merlinite
- model_id - a path name for the specific teacher model being used
- num_instructions_to_generate - how many samples to generate

For now, we assume there is no need to do any sort of templating in the custom pipelines based on these runtime parameters.

### Model Prompts

Based on whether `model_family` is mixtral or merlinite, a different prompt is used with the teacher model:

```python
_MODEL_PROMPT_MIXTRAL = " [INST] {prompt} [/INST]"
_MODEL_PROMPT_MERLINITE = "'<|system|>\nYou are an AI language model developed by IBM Research. You are a cautious assistant. You carefully follow instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior.\n<|user|>\n{prompt}\n<|assistant|>\n'"
```

For now, we assume that the `LLMBlock` class will choose the appropriate model prompt based on the family and that there is no need to specify a custom prompt.
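The selection logic this implies is tiny; a sketch (illustrative only, the real `LLMBlock` may structure this differently):

```python
# Illustrative: pick the model prompt from the family in the PipelineContext
def _model_prompt_for_family(model_family: str) -> str:
    if model_family == "mixtral":
        return _MODEL_PROMPT_MIXTRAL
    return _MODEL_PROMPT_MERLINITE
```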
### Prompt Config Files

Every `LLMBlock` references a separate prompt config file, and presumably a custom pipeline will provide custom prompt configs too.

These prompt config files are quite simple YAML files - they contain a single object with system, introduction, principles, examples, and generation keys. See e.g. [src/instructlab/sdg/configs/skills/freeform_questions.yaml](https://github.com/instructlab/sdg/blob/main/src/instructlab/sdg/configs/skills/freeform_questions.yaml).

We will continue to use these config files unchanged, and custom files can be specified with an absolute path. Relative paths are assumed to be relative to the Python package, e.g. `configs/skills/...`.

### Model Serving

Custom pipelines may have more specialized model serving requirements. Instead of serving just one model, we may need to launch the model server with a model and an additional model with an adapter. vLLM, for example, can host both a model and a model+adapter under two different model IDs.

The pipeline author needs some way of disambiguating between these multiple models - i.e. the definition of each `LLMBlock` needs to specify a particular model.

Right now the `Pipeline` constructor takes two relevant parameters - the OpenAI client instance, and the model ID for the default model. It's important to note that this model ID is defined by the user at runtime, and it may not match the model IDs that the pipeline author used.

The use cases will be:

1. Most `LLMBlock` definitions will use the default teacher model - and we can make the semantic that if the pipeline author doesn't specify a model in an `LLMBlock`, the default in `PipelineContext.model_id` is used.
2. In cases where a model+adapter is to be served, the pipeline author should choose a descriptive model ID using `block.gen_kwargs.model_id`, and the user should ensure that this is the model ID that is served.
For example, a pipeline author might define:

```yaml
version: "1.0"
blocks:
  - name: gen_questions
    type: LLMBlock
    config:
      config_path: configs/skills/freeform_questions.yaml
      add_num_samples: True
    gen_kwargs:
      model_id: mycustomadapter
    output_cols:
      - question
    drop_duplicates:
      - question
```

and the user will be required to define a serving configuration like:

```bash
--lora-modules=mycustomadapter=path/to/my_custom_adapter
```

### Re-use of Built-in Pipelines

A custom pipeline may want to extend an existing built-in pipeline. In that case, a new block type, `ImportBlock`, may be used to import the blocks from another configuration file.

```yaml
version: "1.0"
blocks:
  - name: import_from_full
    type: ImportBlock
    path: configs/full/synth_freeform_skills.yaml
  - name: custom_post_processing_block
    type: LLMBlock
    ...
```

### CLI Integration

As of the current version of `ilab`, it supports `simple` and `full` as parameters to `--pipeline` to select one of the two types of built-in pipelines included in the library.

Once we have support for loading custom pipelines, we need a way for these to be specified with the CLI. We believe the most common case for custom pipelines is for them to extend the `full` pipeline and, as such, we should support extending existing pipelines with a custom pipeline rather than simply specifying a single pipeline.

Here is a proposed CLI UX for this:

> `ilab data generate`

Use the default pipeline, `simple`.

> `ilab data generate --pipeline full`

Use the built-in `full` pipeline.

> `ilab data generate --pipeline path/to/custom_pipeline_directory/`

Use a custom pipeline configuration. The custom pipeline may include references to the built-in flows to be used in conjunction with custom ones, but those details are contained within the YAML files in the custom directory.

### File and Directory Structure

The existing contents of `default_flows.py` will become these files in the source tree:

```text
src/
  instructlab/
    sdg/
      pipelines/
        simple/
          knowledge.yaml
          freeform_skills.yaml
          grounded_skills.yaml
        full/
          knowledge.yaml  # also contains the current contents of mmlu_bench.yaml
          freeform_skills.yaml
          grounded_skills.yaml
```

When the `--pipeline` option to `ilab data generate` is used to point to a custom directory, we will assume that the same 3 files are present. All three files will be loaded and used according to the type of taxonomy additions present when running `ilab data generate`.

### Future CLI Improvements

A possible improvement would be to have a well-defined place on the filesystem where custom pipeline configs can be automatically loaded and included as options to the `--pipeline` parameter.

For example, if the config format included new parameters, `name: full-extended` and `extends: full`, and the CLI discovered and loaded it automatically, we could support `--pipeline full-extended` without needing an additional `--pipeline-extend` option.
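For illustration, such a discoverable pipeline config might start like this (the `name` and `extends` fields are hypothetical, per the idea above):

```yaml
version: "1.0"
name: full-extended  # hypothetical: the value offered to --pipeline
extends: full        # hypothetical: build on the built-in "full" pipeline
blocks:
  - name: custom_post_processing_block
    type: LLMBlock
    ...
```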
`/usr/share/instructlab/sdg/` is a proposed location for this, as a place for a distribution of InstructLab to include pre-defined custom pipelines, at least for Linux. See the [Filesystem Hierarchy Standard](https://refspecs.linuxfoundation.org/FHS_3.0/fhs/ch04s11.html) for more details on why this path is appropriate for this use case.

It would also make sense to support a configuration directory for users' own custom pipeline configurations. Assuming there is a base config directory, these could go in an `sdg` subdirectory. There is a separate proposal that discusses a proposed configuration location. Note this is separate from the distribution-provided, read-only pipelines discussed above, which have a different location.

If we have a location with pipeline examples, then a nice-to-have would be an `ilab data generate --list-pipelines` option.

## Alternative Approaches

Alternatives already considered and discarded are listed below.

### No Custom Flows

It would be preferable to not support custom flows, especially so early in the project, because:

- We will need an extensive API to support this customization, and we will need to be careful about making incompatible changes to that API once it has been adopted.
- We would learn more about the pipelines that users are creating if they were added to the library.

This approach was discarded because of strong demand from downstream users to define custom flows to encapsulate proprietary pipeline configuration.

### Custom Flows as Code

If we have an API for creating flows, users could define these custom flows in Python rather than with a configuration file format.

This approach was discarded because of a desire by downstream users to separate reusable logic from proprietary pipeline configuration.

The initial version of the SDG library API design (#98) proposed using YAML files, and this was changed to Python code based on this feedback:

> Does this need to be a yaml file?
>
> or is it actually a Python dict passed to the library?
>
> I actually think it would be a nice simplification to not worry about config files at all, and from the library perspective, assume configuration is passed in via data structures.
>
> How that config is constructed could be a problem of the library consumer. Maybe they hardcode it. maybe they allow a subset to be considered. Some could be driven by CLI args, for example.

Since adopting YAML may now appear contradictory to that feedback, it is useful to understand how the feedback relates to this new design:

1. The feedback assumes that YAML will be used for custom pipelines, but wonders whether it would be better to implement that in the CLI instead of the library.
2. Not called out is that, at the time, it was unclear whether custom pipeline definitions would also need to include custom model serving configuration - if so, the model serving configuration would not belong in the SDG library. It is now better understood that no model serving configuration needs to be included in the pipeline definitions. (See above.)
3. The POC implementation of this format makes it clear - in a way that wasn't clear from an API design - that using the YAML format within the library is an improvement.
--------------------------------------------------------------------------------
/docs/sdg/sdg-repo.md:
--------------------------------------------------------------------------------
# New Repository Proposal: sdg

## Summary

This document proposes a new repository under the `instructlab` GitHub organization:

- `instructlab/sdg`

## Background

The `instructlab/instructlab` repository includes a basic implementation of Synthetic Data Generation (SDG). This implementation does not implement the full approach as described by the [LAB paper](https://arxiv.org/abs/2403.01081).

We desire to build out a more complete implementation of SDG that is more in line with the LAB methodology. We propose a new repository to house this code that publishes a new Python library called `instructlab-sdg`. The reasoning for a new repository and library includes:

- We expect multiple consumers of this code. The `ilab` CLI is one, but we also envision building a REST API around it to help support scaling out this functionality on a cluster.
- We expect there is broader community interest in an open-source library and service for synthetic data generation. We envision this library could support other data generation techniques over time.

## Maintainers

The initial team of maintainers for this repository will be a copy of the `Backend Maintainers` GitHub team.

## Alternatives Considered

### Add to `instructlab/instructlab`

We could add this code to the existing `instructlab/instructlab` repository.

The primary argument against this approach is that we expect the scope of an `instructlab-sdg` library to expand beyond the scope of what would be run by the `ilab` CLI. We instead envision a different community of contributors organizing around SDG specifically.
--------------------------------------------------------------------------------
/docs/sdk/instructlab-sdk.md:
--------------------------------------------------------------------------------
# InstructLab Python SDK

## Motivation

Today, the only way to "drive" the InstructLab opinionated workflow is via the `ilab` CLI. While this process provides a succinct way for everyday users to initialize a config, generate synthetic data, train a model, and evaluate it, the guardrails are quite limiting, both in what a user can do and in what the development team can add as features exposed directly to the user over time.

Additionally, current consumers of InstructLab are finding themselves importing a mix of private and public library APIs in combination with CLI functionality to achieve the workflows they want. While these more advanced usage patterns are not for everyone, providing standardized and safe ways for the community to run bespoke and piecemeal workflows is a necessity.

Unifying this range of advanced workflows under an overarching `InstructLab Python SDK` will allow for new usage patterns and a clearer story on what InstructLab can and should provide as user-accessible endpoints.

While each library can and _should_ have its own publicly accessible SDK, not all functionality being added to SDG, Training, and Eval needs to be correlated directly to the "InstructLab workflow". This Python SDK should, as the CLI does, expose an opinionated flow that uses functionality from the various libraries.
The InstructLab SDK should be derived from the library APIs, not the other way around. SDG, for example, currently has a `generate_data` method meant to be accessed only by InstructLab. This method simply calls other publicly available SDG functionality. Orchestration of the InstructLab flow like this should not be a concern of the individual libraries; it should instead be handled by the overarching InstructLab SDK, which will maintain the user contracts. The InstructLab SDK will need to work within the bounds of what the libraries expose as public APIs.

The benefit of the above is that the opinionated flow can be accessed in a more nuanced and piecemeal way while also gaining the potential for more advanced features. Say a consumer wants to:

1. Set up a custom config file for ilab (optional)
2. Initialize a taxonomy
3. Ensure their taxonomy is valid
4. Ingest some data for RAG and SDG (SDG coming soon)
5. Generate synthetic data using an InstructLab pipeline
6. Do some custom handling per their use case
7. Fine-tune a model using the custom config they initialized for their hardware
8. Evaluate their model after training using various benchmarks

A user could do this if they had an SDK.

(The structure of the SDK and the actual arguments are discussed below.)

Today, however, users are forced to run a sequence of commands tailored to only work with the proper directory structure on the system.

## Major Goals

1. Modularize the InstructLab workflow such that any part can be run independently
2. Allow users to choose whether or not to take advantage of the config/system-profile method of running InstructLab, meaning they do not need any pre-existing configuration to run the SDK
3. Standardize user contracts for the existing functionality of the InstructLab workflow. Existing CLI commands should be using the SDK once past click parsing, not separate code
4. Define contracts loosely enough that functionality can be expanded as more advanced features are released
5. Document SDK usage in upcoming InstructLab releases

## Non-Goals

1. Exposing all library functionality immediately
2. Replacing the CLI
3. Shipping an SDK that is generally available, as opposed to v1alpha1 or v1beta1

## Design

### Versioning

The SDK would start at version v1alpha1 such that it can change/break at any time for the first few iterations as libraries adjust their API surface.

### Structure

This SDK should preferably live in a net-new package inside of `instructlab/instructlab`, rather than in a new repository, to limit unnecessary imports. The SDK could be imported as `instructlab.core...`

The user surface initially should look like this:

`instructlab.core` contains all SDK definitions.
Users can `from instructlab.core import ...` to use specific SDK classes.

For most of the existing InstructLab command groups, there should be a class:

`from instructlab.core import Config, Taxonomy, Data, Model, RAG, System`

The full list of classes and their methods for now (subject to change during the development process):

```console
instructlab.core.Config
instructlab.core.Config.init
instructlab.core.Config.show (get)
instructlab.core.Taxonomy
instructlab.core.Taxonomy.diff
instructlab.core.System
instructlab.core.System.info
instructlab.core.Data
instructlab.core.Data.ingest
instructlab.core.Data.generate_data
instructlab.core.Model
instructlab.core.Model.serve
instructlab.core.Model.train_model
instructlab.core.Model.process_data (calling the training library's data process class in a safe way)
instructlab.core.Model.evaluate_mt_bench
instructlab.core.Model.evaluate_dk_bench
instructlab.core.Model.evaluate_mmlu_bench
instructlab.core.RAG.ingest
instructlab.core.RAG.convert
```

a brief example:

```python
from instructlab.core import Config, Taxonomy, Data, Model

config_object = Config.init(...)
diff = Taxonomy.diff()

if diff:
    data_client = Data(data_path="", teacher_model="", num_cpus="", taxonomy_path="")

    # not in v1alpha1: data ingestion lands in a later release
    data_path = data_client.ingest()

    openai_compat_client = some_server()

    data_jsonls = data_client.generate_data(client=openai_compat_client, data=data_path)

    some_custom_handling(data_jsonls)

    # you can either use a config object or pass trainer args
    model_client = Model(student_model=path_to_student_model, configuration=config_object)

    model_path = model_client.train_model()

    # since we initialized the model client with the config, the training args are passed implicitly
    eval_output = model_client.evaluate_mt_bench(model_path=model_path)
```

The above example utilizes the configuration object to instantiate the `Model` class. However, a user could also pass `training_args=` directly to `model_client.train_model` to override the configuration class defaults. This allows the SDK to utilize the System Profiles of the ilab CLI without relying on them too heavily.
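To make the override concrete, a config-free invocation might look like the following sketch (the exact `TrainingArgs` surface and its import path are assumptions to be negotiated with the training library):

```python
from instructlab.core import Model
# assumed import path for the training library's argument class
from instructlab.training import TrainingArgs

model_client = Model(student_model="path/to/student-model")
model_path = model_client.train_model(
    # explicit args take precedence over any config.yaml defaults
    training_args=TrainingArgs(max_seq_len=4096, num_epochs=8),
)
```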
Presumably, the distinct methods under each class will grow, which is why I am opting to make very distinct classes per command group. Another benefit of the parent classes is that individual methods can inherit defaults from the instantiation of the object.

These initial exposed functions can expand to include any new functionality from the various libraries that is more SDK-oriented. For example, if SDG adds something like subset selection, teacher-as-annotator, data mixing, etc., we could expose an `instructlab.core.Data.annotate` or `instructlab.core.Data.mix` that could be invoked in sequence in a user's script with other parts of the ilab workflow. Some things make _less_ sense to be exposed via a CLI but are still critical to ensuring users get a good model and properly generated data.

There are certain things that currently exist only in `ilab`, as well as functionality that is going to be moving there, such as data ingestion, RAG, etc. Forming an SDK for `instructlab` allows us to capture all of these concerns under one API.

These endpoints, in combination with the curated InstructLab config file, will open up these workflows to users and allow InstructLab to be easily incorporated into other projects. Allowing people to run things like data generation and full fine-tuning via an SDK that pulls in their pre-existing `config.yaml`, but can also be run independently, will open new avenues for InstructLab adoption and extensibility.

## Changes to the CLI

The `ilab` CLI will need to adapt to this new structure. Commands like `ilab data generate` should, in terms of code, follow this flow:

1. `src/instructlab/cli/data/generate.py`
2. `src/instructlab/data/generate.py`
3. `src/instructlab/process.py`
4. `src/instructlab/core/data/generate.py`

So generally the flow is: cli -> process management package to kick off a sub-process -> internal handling package -> core SDK (public definitions) -> library code.

The flow of the CLI today is such that the cli package for a command (`src/instructlab/cli/data/generate.py`) parses the command line options, manages creating a sub-process, and passes control to the core code (`src/instructlab/core/data/generate.py`). This then calls out to the library APIs.

The internal handling package is necessary as it allows us to split off a sub-process when it makes the most sense for us, before calling the library code directly. This is how the CLI works today.

The difference with an SDK is that we would eventually want to end up executing `core/data/generate.py`, the actual publicly consumable Python SDK. This will ensure that the CLI can do whatever custom handling it needs to do on top, but eventually it must boil down to the `core` package, which uses publicly available methods from the various libraries.

## Scope of work

In upcoming releases the InstructLab team should aim to:

1. Design the SDK given the structure above
2. Converse with library maintainers to negotiate user contracts
3. Begin work to re-architect how the CLI works using the SDK
4. Publish an alpha SDK for public consumption

After this initial work, the team can scope adding net-new functionality to the SDK that is not in the CLI.
--------------------------------------------------------------------------------
/docs/support-utils/support-utils-repo.md:
--------------------------------------------------------------------------------
# Create Separate Repo for User Utilities

## Idea Overview

Create a separate repository within the `instructlab` GitHub org called `support-utils`.
This repository would house scripts and notebooks that enhance the InstructLab experience but fall outside the scope of the LAB Methodology implemented in the [InstructLab Core](https://github.com/instructlab/instructlab) repository.
Many users and community members already have such scripts they use day to day.
The `support-utils` repo would be a place where the maintainers of the InstructLab project can collect and curate them for the benefit of the community.
Scripts in this repository may become features or be incorporated into the InstructLab Core repository after use and review by users and developers.

## Repository Structure

The repository will have two categories of scripts. Scripts live in either the `hack` or the `beta` directory.
```bash
support-utils
|
|
|- beta
|
|- hack
```

The `hack` directory is open for the contribution of scripts of any quality.

Scripts in the `beta` directory will be required to have documentation and automated functional testing.
These scripts are meant to be run by users for feedback and may graduate into full-blown features in other InstructLab repos.

Beyond this initial structure, the structure within those two directories will evolve as scripts are contributed to each.

## Additional Info

A few areas of focus for the first scripts that will be added to the repository are:

- Automating qna.yaml creation
- Assessing document readiness, given the limitations of Docling
- Visualizing synthetically generated data for inspection

This repo would not be released as a package on PyPI, but initially just as `.zip` and `.tar.gz` files on GitHub.
Releases would serve to give users specific versions of the scripts in `beta`, and to support development project management.
--------------------------------------------------------------------------------
/docs/taxonomy-tree-organization.md:
--------------------------------------------------------------------------------
# Taxonomy Tree Organization

This document describes the justification and decision to emulate the [Wikipedia taxonomy](https://en.wikipedia.org/wiki/Wikipedia:Contents) structure for our `knowledge/` tree in the taxonomy repository.

The definition of taxonomy from Wikipedia:

> A taxonomy is a scheme of classification, especially a hierarchical classification, in which things are organized into groups or types. Among other things, a taxonomy can be used to organize and index knowledge (stored as documents, articles, videos, etc.), such as in the form of a library classification system, or a search engine taxonomy, so that users can more easily find the information they are searching for. Many taxonomies are hierarchies (and thus, have an intrinsic tree structure), but not all are.

## Why do we need this?

Our taxonomy tree is not only how humans will place the different `qna.yaml`s, but also how people will look for, and propose updates to, specific questions and answers. Given the challenge of organizing this, copying Wikipedia's tree is a good default standard.

## What are we going to do to enforce this?

The triage team will take into consideration the location of the directory and how it pertains to the tree that Wikipedia publishes. The merging of [this PR](https://github.com/instructlab/taxonomy/pull/780) will build the initial tree, which we can then work with the backend team to solidify going forward.
Creating a new "top level" directory will be understood to be a special, rare case, while contributors are empowered to place lower leaf nodes of the tree themselves, as long as they are put in a logical place.
A side effect of leveraging this structure is that we can verify where the knowledge is placed on Wikipedia and reinforce that location in the taxonomy tree.

## Conflicts and Resolutions

With the adoption of this format and structure, there will be conflicts and debate about the placement of the `qna.yaml`.
The triage team will do their best to take into consideration the challenges that may arise, and will work with the contributor to hear and engage with that conflict. The triage team has the ultimate decision on the location of the directory and file.
--------------------------------------------------------------------------------
/docs/training/training-accelerate-dep.md:
--------------------------------------------------------------------------------
# Design Proposal - HF Accelerate

## Overview

InstructLab training is currently required to support two distributed sharding frameworks: DeepSpeed and FSDP1. Additionally, there is a future plan to adopt FSDP2 once the project matures. Each of these frameworks has its own process for preparing models, optimizers, and sharding, and its own set of custom options.

With all this in mind, in order to maintain a simple common codebase, an abstraction for sharding frameworks is required. This is why we have pulled in Hugging Face Accelerate as a lightweight sharding abstraction: to enable a common interface for sharding frameworks with pluggable configs, and to avoid overly-branching code paths.

## How it is being used

(maybe insert diagram here)

### How it is implemented in code

Accelerate usage consists of a single import:

```python
from accelerate import Accelerator
```

We begin by setting up our accelerator object via our `setup_accelerator` function:

```python
accelerator = setup_accelerator(args, model, grad_accum)
```

This checks the selected sharding framework and sets up the appropriate config:

```python
def setup_accelerator(args, model, grad_accum):
    if args.distributed_training_framework == "deepspeed":
        # Third Party
        from deepspeed import DeepSpeedEngine

        ...
        accel_args = {
            "deepspeed_plugin": get_ds_plugin(...),
        }
    elif args.distributed_training_framework == "fsdp":
        accel_args = {
            "fsdp_plugin": get_fsdp_config(args, model),
        }
    else:
        raise ValueError(
            f"Unknown sharding framework: {args.distributed_training_framework}"
        )
    accelerator = Accelerator(
        **accel_args,
    )
    accelerator.even_batches = False
    return accelerator
```

Now this Accelerator object can act as a universal sharding framework config, and we can prepare our training objects accordingly:

```python
model, optimizer, _, lr_scheduler = accelerator.prepare(
    model,
    optimizer,
    deepcopy(train_loader),
    lr_scheduler,
)
```

As an additional bonus, the accelerator object allows us to do universal checkpoint saving and resuming as well. For saving model checkpoints, we can simply use:

```python
accelerator.save_model(
    model,
    save_directory=output_dir,
    max_shard_size="5GB",
    safe_serialization=True,
)
```

and to save full state for resuming training, we can use:

```python
accelerator.save_state(
    output_dir=output_dir,
    safe_serialization=True,
)
```
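For the resume side (not shown above), Accelerate provides the counterpart `load_state` call; a sketch assuming the same `output_dir` used at save time:

```python
# restore model/optimizer/lr_scheduler state written by accelerator.save_state()
accelerator.load_state(output_dir)
```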
## The immediate benefit

The inclusion of Accelerate in this manner drastically simplifies the process of supporting multiple sharding frameworks. Rather than having diverging code paths for model setup, optimizer setup, sharding configuration, distributed initialization, state saving, and model checkpointing, all of these training steps can be supported with the same code. The common abstraction allows us to maintain both DeepSpeed and FSDP, and prepares us for the inclusion of additional sharding frameworks.

## Impact on overhead

### Performance

There has been no noticeable performance impact observed during development, but we will defer to the final measurements from comprehensive performance testing and benchmarking.

### Usability

This makes the library easier to read, but it also requires knowledge of an additional package. While there is [documentation](https://huggingface.co/docs/accelerate/v1.0.0rc1/en/index), it does require some additional code exploration to actually understand what some functions (like `prepare`, `save_model`, etc.) are doing behind the scenes, making some processes less transparent than if they were implemented directly with `torch`.

While sharding framework code will not need to be reviewed as thoroughly, this does not change the fact that one still needs to understand the configuration options per framework and how they behave.

### Package Management

This inclusion requires one additional Python package to be managed as a dependency. Currently, that package is `accelerate==0.34.2`, but we plan to upgrade to `accelerate==1.0.0` immediately once it moves from pre-release to official release.

## Long-Term Bonuses and Risks

The two clearest long-term bonuses are:

- Simplifies the process of onboarding additional sharding frameworks in the future, as well as deprecating existing sharding frameworks
- Vastly improves the readability and maintenance of our code by avoiding diverging paths for various sharding frameworks

There are, however, important risks to be considered. Ultimately, if we are confident that we wish to eventually stick with a single sharding framework and contribute directly to that project, the risks of including Accelerate may outweigh the benefits. Please read the next section carefully to understand why Accelerate, if it no longer provides a significant benefit to us, should be removed.

### Why Stop There?

With the inclusion of a Hugging Face abstraction library, this begs the question: "Why not keep going and pull in more of the HF stack? Won't it simplify things further?"

It is important to reiterate here the clear downsides of HF Accelerate, and what we wish to avoid moving forward. Accelerate makes our code more opaque and hinders our ability to easily understand and customize it as needed without being reliant upon a third-party dependency. We become reliant upon HF bug fixes, release cycles, and documentation, and without an explicit member of the HF community to manage these for us, this introduces risk that has to be heavily considered.

For example, if we wanted to change how model saving worked, or a user wanted to simply understand how model saving worked, they would no longer be able to do so through our library directly, but instead would need to read HF documentation and contribute back to HF repos with the hope that the customization is considered generally useful and approved.
More niche or experimental customizations may not be approved at all, and would have to be patched in directly or overridden in a rather hacky manner within our library. This, as described, is not the most dire of circumstances, as one could argue that if something is not ready to be included upstream, it may not be ready to be included in our library. Where things get more serious, however, is with the introduction of bugs. At this very moment, there is a bug in `v0.34.2` that means FSDP model saving does not behave as expected. The fix landed in `v1.0.0rc1`, but there is no official release yet that can be pulled in with this fix. So for instructlab-training `v0.5.x`, we are currently maintaining a patch directly in our library to fix the bug while working with `v0.34.2`, and it will sit there temporarily until Accelerate has a new official release.

This expands well into the wider topic of dependency management. A package like Accelerate will see frequent updates and changes moving forward, and we will need to ensure consistent compatibility between our training library releases and the required Accelerate versions. A new dependency also means shipping with another package, with a large body of code. While in most cases this is harmless, a package that is used directly on top of our work, in place of our existing code, adds a significant amount of bloat for developers and users alike. The requirement shifts from understanding a line of code to understanding the full code path behind a line of code, the quality of which cannot be guaranteed by our engineers.

Ultimately, these risks and overhead management costs make sense for Accelerate, because the package provides significant benefits that far outweigh our concerns. When considering additional HF packages, however, there is currently **no obvious benefit** to their inclusion, which only further underscores the associated risks. It must be made clear that the risks of Accelerate are non-negligible, and that the conveniences and positive impact provided are what push it over the line of inclusion. It is by no means a "no brainer" to further include any additional HF packages at this time, unless they afford us a similar benefit.
--------------------------------------------------------------------------------
/docs/ui/project-proposal.md:
--------------------------------------------------------------------------------
# New Repository Proposal: ui

## Summary

This document proposes a new repository under the `instructlab` GitHub organization:

- `instructlab/ui`

## Background

Currently we don't have a public repository under InstructLab that hosts UI-related work. The goal of this project is to provide a space for hacking on a UI for InstructLab. The scope of this project is to support the following personas through the UI:

- External contributors who want to contribute skills and knowledge to the taxonomy repo
- Triagers who will be evaluating the taxonomy contributions

These personas are not limited to the above list, but the initial focus will be on these two.

The intent is to build this UI as a SaaS service for the upstream project and its taxonomy repo, but to build it in a way that someone could deploy the same service on their own infrastructure for managing their own taxonomy repo.
## Maintainers

The initial team of maintainers (GitHub Team - `UI Maintainers`) for this repository will be:

- Anil Kumar Vishnoi
- Brent Salisbury
- Taiga Nakamura
- Guang-Jie Ren
- Juan Cappi
- Daniel Tan
- Gregory Pereira

## Seed Code Contribution

We are planning to seed this repository with the code from the `instructlab/instructlab-bot` repository [here](https://github.com/instructlab/instructlab-bot/tree/main/ui). This code will be used as a starting point for the UI work. It is already open source and licensed under the Apache 2.0 license.

## Alternatives Considered

### Use `instructlab/instructlab-bot`

We currently have a very early version of InstructLab UI-related code present in the `instructlab/instructlab-bot` repo (in its "ui" directory). We could continue hacking on the code in that repo, but I believe that is not the right place for the UI work, as instructlab-bot is supposed to be a backend component, scoped to work as a triager's assistant tool with no direct interfacing with contributors or triagers. To keep these separate work streams cleanly scoped, it makes sense to separate the UI work into its own repository.
--------------------------------------------------------------------------------