├── .devcontainer.json ├── .github ├── ISSUE_TEMPLATE │ └── content_request.yml ├── pull_request_template.md └── workflows │ └── pages.yml ├── .gitignore ├── .idea ├── copilot │ └── chatSessions │ │ ├── 00000000000.xd │ │ ├── blobs │ │ └── version │ │ └── xd.lck ├── inspectionProfiles │ └── Project_Default.xml ├── jsLinters │ └── eslint.xml ├── prettier.xml └── vcs.xml ├── .mailmap ├── .vscode ├── Dockerfile ├── jupyterbook.code-snippets └── settings.json ├── .zenodo.json ├── CITATION.cff ├── CNAME ├── LICENCE ├── README.md ├── _config.yml ├── _static ├── external_target.css ├── external_target.js ├── font.css ├── main.css └── main.js ├── _templates └── page.html ├── _toc.yml ├── assets └── favicon.ico ├── badges.py ├── bibliography.py ├── committers.py ├── desktop-apps.md ├── eval-datasets.md ├── fine-tuning.md ├── hardware.md ├── index.md ├── licences.md ├── mlops-engines.md ├── model-formats.md ├── models.md ├── prem_theme ├── __init__.py ├── layout.html └── theme.conf ├── references.bib ├── references.md ├── requirements.txt ├── sdk.md ├── unaligned-models.md └── vector-db.md /.devcontainer.json: -------------------------------------------------------------------------------- 1 | // format details: https://aka.ms/devcontainer.json 2 | { 3 | "name": "book.premai.io", 4 | "build": {"dockerfile": ".vscode/Dockerfile", "context": "."}, 5 | "customizations": {"vscode": { 6 | "extensions": [ 7 | "DavidAnson.vscode-markdownlint", 8 | "streetsidesoftware.code-spell-checker"]}}, 9 | // live reload https://github.com/executablebooks/jupyter-book/issues/1455 10 | "onCreateCommand": "pip install sphinx-autobuild", 11 | "postStartCommand": "jupyter-book config sphinx . && sphinx-autobuild -b dirhtml --re-ignore='\\.(github|devcontainer)' -n . _build/dirhtml", 12 | "portsAttributes": {"8000": {"label": "Webserver", "onAutoForward": "notify"}} 13 | } 14 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/content_request.yml: -------------------------------------------------------------------------------- 1 | name: Content Request 2 | description: Ask to add/fix any content, e.g. a URL/table row/paragraph/chapter 3 | labels: [content] 4 | assignees: [casperdcl, premAI-io/writer] 5 | body: 6 | - type: markdown 7 | attributes: 8 | value: | 9 | :heart: Thanks for suggesting something! 10 | 11 | Feel free to [open a pull request (PR) instead](https://book.premai.io/state-of-open-source-ai/#contributing) 12 | if you'd like to be automatically added to the list of [co-authors/contributors](https://book.premai.io/state-of-open-source-ai/#contributors) 13 | (don't worry if that's too difficult; it's perfectly fine to open an issue using this form). 14 | - type: dropdown 15 | id: type 16 | attributes: 17 | label: Type 18 | multiple: true 19 | options: 20 | - new URL/reference/table row 21 | - new chapter 22 | - other (e.g. typos, factual errors, etc.) 
23 | - type: dropdown 24 | id: chapter 25 | attributes: 26 | label: Chapter/Page 27 | multiple: true 28 | options: 29 | - licences 30 | - eval-datasets 31 | - models 32 | - unaligned-models 33 | - fine-tuning 34 | - model-formats 35 | - mlops-engines 36 | - vector-db 37 | - sdk 38 | - desktop-apps 39 | - hardware 40 | - index (landing/home) 41 | - Something else 42 | - type: textarea 43 | attributes: {label: Description} 44 | validations: {required: false} 45 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | # Review checklist 2 | 3 | > Don't worry about satisfying all items, it's fine to open a (draft) PR. 4 | 5 | - [ ] chapter content 6 | + [ ] only one top-level `# h1-Title` 7 | + [ ] summary (e.g. table or TL;DR overview), no need for an explicit `## Summary/Introduction` title or equivalent 8 | + [ ] main content focus: recent developments in open source AI 9 | + general context/background (brief) 10 | + current pros/cons 11 | + in-depth insights (not yet widely known) 12 | + [ ] likely `## Future` developments 13 | + [ ] end with `{{ comments }}` 14 | - [ ] appropriate citations 15 | + [ ] BibTeX references 16 | + [ ] Glossary terms 17 | + [ ] cross-references (figures/chapters) 18 | + [ ] (if `new-chapter.md`), add `_toc.yml` entry & `index.md` table row 19 | + [ ] If CI URL checks have false-positives, append to `_config.yml:sphinx.config.linkcheck*` 20 | - [ ] images & data not committed to this repo (e.g. use https://github.com/premAI-io/static.premai.io instead) 21 | -------------------------------------------------------------------------------- /.github/workflows/pages.yml: -------------------------------------------------------------------------------- 1 | name: site 2 | on: 3 | push: {branches: [main], tags-ignore: ['**']} 4 | pull_request: 5 | schedule: [{cron: '0 10 * * 6'}] # M H d m w (Sat 10:00) 6 | permissions: 7 | contents: read 8 | pages: write 9 | id-token: write 10 | concurrency: {group: "${{ github.ref }}-pages", cancel-in-progress: true} 11 | env: 12 | SITE_PREFIX: state-of-open-source-ai 13 | jobs: 14 | check: 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v4 18 | with: {fetch-depth: 0} 19 | - uses: actions/setup-python@v4 20 | with: {python-version: '3.11'} 21 | - run: pip install -r requirements.txt pyyaml 22 | - name: Check CITATION.cff & .zenodo.json 23 | run: | 24 | python < _site/index.html 76 | - uses: actions/upload-pages-artifact@v2 77 | deploy: 78 | if: github.ref == 'refs/heads/main' 79 | environment: 80 | name: github-pages 81 | url: ${{ steps.deployment.outputs.page_url }} 82 | runs-on: ubuntu-latest 83 | needs: [check, build] 84 | steps: 85 | - id: deployment 86 | uses: actions/deploy-pages@v2 87 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # python 2 | __pycache__/ 3 | *.py[cod] 4 | .ipynb_checkpoints/ 5 | # jupyter-book 6 | /_build/ 7 | /conf.py 8 | .vercel 9 | 10 | ### JetBrains template 11 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 12 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 13 | 14 | # User-specific stuff 15 | .idea/**/workspace.xml 16 | .idea/**/tasks.xml 17 | .idea/**/usage.statistics.xml 18 | .idea/**/dictionaries 19 | 
.idea/**/shelf 20 | .idea/copilot/ 21 | 22 | # AWS User-specific 23 | .idea/**/aws.xml 24 | 25 | # Generated files 26 | .idea/**/contentModel.xml 27 | 28 | # Sensitive or high-churn files 29 | .idea/**/dataSources/ 30 | .idea/**/dataSources.ids 31 | .idea/**/dataSources.local.xml 32 | .idea/**/sqlDataSources.xml 33 | .idea/**/dynamic.xml 34 | .idea/**/uiDesigner.xml 35 | .idea/**/dbnavigator.xml 36 | 37 | # Gradle 38 | .idea/**/gradle.xml 39 | .idea/**/libraries 40 | 41 | # Gradle and Maven with auto-import 42 | # When using Gradle or Maven with auto-import, you should exclude module files, 43 | # since they will be recreated, and may cause churn. Uncomment if using 44 | # auto-import. 45 | # .idea/artifacts 46 | # .idea/compiler.xml 47 | # .idea/jarRepositories.xml 48 | # .idea/modules.xml 49 | # .idea/*.iml 50 | # .idea/modules 51 | # *.iml 52 | # *.ipr 53 | 54 | # CMake 55 | cmake-build-*/ 56 | 57 | # Mongo Explorer plugin 58 | .idea/**/mongoSettings.xml 59 | 60 | # File-based project format 61 | *.iws 62 | 63 | # IntelliJ 64 | out/ 65 | 66 | # mpeltonen/sbt-idea plugin 67 | .idea_modules/ 68 | 69 | # JIRA plugin 70 | atlassian-ide-plugin.xml 71 | 72 | # Cursive Clojure plugin 73 | .idea/replstate.xml 74 | 75 | # SonarLint plugin 76 | .idea/sonarlint/ 77 | 78 | .idea/copilot/ 79 | 80 | # Crashlytics plugin (for Android Studio and IntelliJ) 81 | com_crashlytics_export_strings.xml 82 | crashlytics.properties 83 | crashlytics-build.properties 84 | fabric.properties 85 | 86 | # Editor-based Rest Client 87 | .idea/httpRequests 88 | 89 | # Android studio 3.1+ serialized cache file 90 | .idea/caches/build_file_checksums.ser 91 | 92 | ### macOS template 93 | # General 94 | .DS_Store 95 | .AppleDouble 96 | .LSOverride 97 | 98 | # Icon must end with two \r 99 | Icon 100 | 101 | # Thumbnails 102 | ._* 103 | 104 | # Files that might appear in the root of a volume 105 | .DocumentRevisions-V100 106 | .fseventsd 107 | .Spotlight-V100 108 | .TemporaryItems 109 | .Trashes 110 | .VolumeIcon.icns 111 | .com.apple.timemachine.donotpresent 112 | 113 | # Directories potentially created on remote AFP share 114 | .AppleDB 115 | .AppleDesktop 116 | Network Trash Folder 117 | Temporary Items 118 | .apdisk 119 | 120 | .env 121 | .env.local 122 | 123 | -------------------------------------------------------------------------------- /.idea/copilot/chatSessions/00000000000.xd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/premAI-io/state-of-open-source-ai/81d36c662f631f38ce0dc13b8f4419b02f88c731/.idea/copilot/chatSessions/00000000000.xd -------------------------------------------------------------------------------- /.idea/copilot/chatSessions/blobs/version: -------------------------------------------------------------------------------- 1 |  -------------------------------------------------------------------------------- /.idea/copilot/chatSessions/xd.lck: -------------------------------------------------------------------------------- 1 | Private property of Exodus: 67659@Stephanes-MBP.fibertel.com.ar 2 | 3 | jetbrains.exodus.io.LockingManager.lock(LockingManager.kt:88) 4 | jetbrains.exodus.io.LockingManager.lock(LockingManager.kt:39) 5 | jetbrains.exodus.io.FileDataWriter.lock(FileDataWriter.kt:70) 6 | jetbrains.exodus.log.Log.tryLock(Log.kt:804) 7 | jetbrains.exodus.log.Log.(Log.kt:117) 8 | jetbrains.exodus.env.Environments.newLogInstance(Environments.kt:117) 9 | jetbrains.exodus.env.Environments.newLogInstance(Environments.kt:81) 10 | 
jetbrains.exodus.env.Environments.newLogInstance(Environments.kt:77) 11 | jetbrains.exodus.env.Environments$newInstance$4.invoke(Environments.kt:46) 12 | jetbrains.exodus.env.Environments$newInstance$4.invoke(Environments.kt:46) 13 | jetbrains.exodus.env.Environments.prepare(Environments.kt:120) 14 | jetbrains.exodus.env.Environments.newInstance(Environments.kt:46) 15 | kotlinx.dnq.store.container.EntityStoreHelperKt.createTransientEntityStore(EntityStoreHelper.kt:40) 16 | kotlinx.dnq.store.container.EntityStoreHelperKt.createTransientEntityStore(EntityStoreHelper.kt:31) 17 | kotlinx.dnq.store.container.EntityStoreHelperKt.createTransientEntityStore$default(EntityStoreHelper.kt:30) 18 | com.github.copilot.chat.session.persistence.xodus.XdChatSessionPersistenceService.initStore(XdChatSessionPersistenceService.kt:115) 19 | com.github.copilot.chat.session.persistence.xodus.XdChatSessionPersistenceService.(XdChatSessionPersistenceService.kt:22) 20 | com.github.copilot.chat.session.persistence.xodus.XdChatSessionPersistenceService.(XdChatSessionPersistenceService.kt:15) 21 | com.github.copilot.chat.session.persistence.ChatSessionPersistenceServiceKt.ChatSessionPersistenceService(ChatSessionPersistenceService.kt:43) 22 | com.github.copilot.chat.session.persistence.ChatSessionPersistenceServiceKt.chatSessionsPersistenceService(ChatSessionPersistenceService.kt:53) 23 | com.github.copilot.chat.session.ChatSessionManager.(ChatSessionManager.kt:45) 24 | com.github.copilot.chat.session.ChatSessionManager.(ChatSessionManager.kt:25) 25 | com.github.copilot.chat.window.CopilotChatToolWindow.onCopilotReady(CopilotChatToolWindow.kt:133) 26 | com.github.copilot.chat.window.CopilotChatToolWindow.access$onCopilotReady(CopilotChatToolWindow.kt:40) 27 | com.github.copilot.chat.window.CopilotChatToolWindow$initCopilotStatusListener$1.invoke(CopilotChatToolWindow.kt:118) 28 | com.github.copilot.chat.window.CopilotChatToolWindow$initCopilotStatusListener$1.invoke(CopilotChatToolWindow.kt:115) 29 | com.github.copilot.status.CopilotAuthStatusKt.subscribeToCopilotAuthStatus(CopilotAuthStatus.kt:27) 30 | com.github.copilot.chat.window.CopilotChatToolWindow.initCopilotStatusListener(CopilotChatToolWindow.kt:115) 31 | com.github.copilot.chat.window.CopilotChatToolWindow.(CopilotChatToolWindow.kt:59) 32 | com.github.copilot.chat.window.CopilotChatToolWindow.(CopilotChatToolWindow.kt:40) 33 | com.github.copilot.chat.window.CopilotChatToolWindowFactory.init(CopilotChatToolWindowFactory.kt:18) 34 | com.intellij.openapi.wm.impl.ToolWindowManagerImpl.registerToolWindow$intellij_platform_ide_impl(ToolWindowManagerImpl.kt:1123) 35 | com.intellij.toolWindow.ToolWindowSetInitializerKt.registerToolWindows(ToolWindowSetInitializer.kt:223) 36 | com.intellij.toolWindow.ToolWindowSetInitializerKt.access$registerToolWindows(ToolWindowSetInitializer.kt:1) 37 | com.intellij.toolWindow.ToolWindowSetInitializer$createAndLayoutToolWindows$entries$1$1.invokeSuspend(ToolWindowSetInitializer.kt:141) 38 | com.intellij.toolWindow.ToolWindowSetInitializer$createAndLayoutToolWindows$entries$1$1.invoke(ToolWindowSetInitializer.kt) 39 | com.intellij.toolWindow.ToolWindowSetInitializer$createAndLayoutToolWindows$entries$1$1.invoke(ToolWindowSetInitializer.kt) 40 | kotlinx.coroutines.intrinsics.UndispatchedKt.startUndispatchedOrReturn(Undispatched.kt:78) 41 | kotlinx.coroutines.BuildersKt__Builders_commonKt.withContext(Builders.common.kt:167) 42 | kotlinx.coroutines.BuildersKt.withContext(Unknown Source) 43 | 
com.intellij.platform.diagnostic.telemetry.impl.TracerKt.span(tracer.kt:53) 44 | com.intellij.platform.diagnostic.telemetry.impl.TracerKt.span$default(tracer.kt:49) 45 | com.intellij.toolWindow.ToolWindowSetInitializer$createAndLayoutToolWindows$entries$1.invokeSuspend(ToolWindowSetInitializer.kt:138) 46 | kotlin.coroutines.jvm.internal.BaseContinuationImpl.resumeWith(ContinuationImpl.kt:33) 47 | kotlinx.coroutines.DispatchedTask.run(DispatchedTask.kt:108) 48 | com.intellij.openapi.application.TransactionGuardImpl$1.run(TransactionGuardImpl.java:194) 49 | com.intellij.openapi.application.impl.ApplicationImpl.runIntendedWriteActionOnCurrentThread(ApplicationImpl.java:843) 50 | com.intellij.openapi.application.impl.ApplicationImpl$4.run(ApplicationImpl.java:463) 51 | com.intellij.openapi.application.impl.RwLockHolder.runWithEnabledImplicitRead(RwLockHolder.kt:75) 52 | com.intellij.openapi.application.impl.RwLockHolder.runWithImplicitRead(RwLockHolder.kt:67) 53 | com.intellij.openapi.application.impl.ApplicationImpl.runWithImplicitRead(ApplicationImpl.java:1436) 54 | com.intellij.openapi.application.impl.FlushQueue.doRun(FlushQueue.java:82) 55 | com.intellij.openapi.application.impl.FlushQueue.runNextEvent(FlushQueue.java:124) 56 | com.intellij.openapi.application.impl.FlushQueue.flushNow(FlushQueue.java:44) 57 | java.desktop/java.awt.event.InvocationEvent.dispatch(InvocationEvent.java:318) 58 | java.desktop/java.awt.EventQueue.dispatchEventImpl(EventQueue.java:792) 59 | java.desktop/java.awt.EventQueue$3.run(EventQueue.java:739) 60 | java.desktop/java.awt.EventQueue$3.run(EventQueue.java:733) 61 | java.base/java.security.AccessController.doPrivileged(AccessController.java:399) 62 | java.base/java.security.ProtectionDomain$JavaSecurityAccessImpl.doIntersectionPrivilege(ProtectionDomain.java:86) 63 | java.desktop/java.awt.EventQueue.dispatchEvent(EventQueue.java:761) 64 | com.intellij.ide.IdeEventQueue.defaultDispatchEvent(IdeEventQueue.kt:695) 65 | com.intellij.ide.IdeEventQueue._dispatchEvent$lambda$12(IdeEventQueue.kt:589) 66 | com.intellij.openapi.application.impl.RwLockHolder.runWithoutImplicitRead(RwLockHolder.kt:44) 67 | com.intellij.ide.IdeEventQueue._dispatchEvent(IdeEventQueue.kt:589) 68 | com.intellij.ide.IdeEventQueue.access$_dispatchEvent(IdeEventQueue.kt:72) 69 | com.intellij.ide.IdeEventQueue$dispatchEvent$processEventRunnable$1$1$1.compute(IdeEventQueue.kt:355) 70 | com.intellij.ide.IdeEventQueue$dispatchEvent$processEventRunnable$1$1$1.compute(IdeEventQueue.kt:354) 71 | com.intellij.openapi.progress.impl.CoreProgressManager.computePrioritized(CoreProgressManager.java:793) 72 | com.intellij.ide.IdeEventQueue$dispatchEvent$processEventRunnable$1$1.invoke(IdeEventQueue.kt:354) 73 | com.intellij.ide.IdeEventQueue$dispatchEvent$processEventRunnable$1$1.invoke(IdeEventQueue.kt:349) 74 | com.intellij.ide.IdeEventQueueKt.performActivity$lambda$1(IdeEventQueue.kt:1014) 75 | com.intellij.openapi.application.TransactionGuardImpl.performActivity(TransactionGuardImpl.java:106) 76 | com.intellij.ide.IdeEventQueueKt.performActivity(IdeEventQueue.kt:1014) 77 | com.intellij.ide.IdeEventQueue.dispatchEvent$lambda$7(IdeEventQueue.kt:349) 78 | com.intellij.openapi.application.impl.ApplicationImpl.runIntendedWriteActionOnCurrentThread(ApplicationImpl.java:848) 79 | com.intellij.ide.IdeEventQueue.dispatchEvent(IdeEventQueue.kt:391) 80 | java.desktop/java.awt.EventDispatchThread.pumpOneEventForFilters(EventDispatchThread.java:207) 81 | 
java.desktop/java.awt.EventDispatchThread.pumpEventsForFilter(EventDispatchThread.java:128) 82 | java.desktop/java.awt.EventDispatchThread.pumpEventsForHierarchy(EventDispatchThread.java:117) 83 | java.desktop/java.awt.EventDispatchThread.pumpEvents(EventDispatchThread.java:113) 84 | java.desktop/java.awt.EventDispatchThread.pumpEvents(EventDispatchThread.java:105) 85 | java.desktop/java.awt.EventDispatchThread.run(EventDispatchThread.java:92) 86 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/jsLinters/eslint.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | -------------------------------------------------------------------------------- /.idea/prettier.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.mailmap: -------------------------------------------------------------------------------- 1 | Biswaroop Bhattacharjee 2 | Nicola Sosio 3 | Skanda Vivek 4 | -------------------------------------------------------------------------------- /.vscode/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:alpine 2 | # required for `pip install psutil` 3 | RUN apk update && apk add python3-dev gcc linux-headers musl-dev 4 | # required for vscode 5 | RUN apk update && apk add git 6 | # project dependencies 7 | COPY requirements.txt . 
8 | RUN pip install -r requirements.txt && rm requirements.txt 9 | # enable devcontainer updateRemoteUserUID 10 | RUN adduser -SD --shell /bin/sh vscode 11 | USER vscode 12 | ENV PATH=/home/vscode/.local/bin:$PATH 13 | -------------------------------------------------------------------------------- /.vscode/jupyterbook.code-snippets: -------------------------------------------------------------------------------- 1 | { 2 | "BibTeX URL (misc)": { 3 | "scope": "bibtex", 4 | "prefix": "@online", 5 | "body": [ 6 | "@online{${1:key},", 7 | "title={$3},", 8 | "author={$4},", 9 | "year=${5:lastUpdated},", 10 | "url={$2}", 11 | "}" 12 | ], 13 | "description": "Add a website citation" 14 | }, 15 | "BibTeX URL (news)": { 16 | "scope": "bibtex", 17 | "prefix": "@article", 18 | "body": [ 19 | "@article{${1:key},", 20 | "title={$3},", 21 | "author={$4},", 22 | "year=${5:lastUpdated},", 23 | "journal={$6}," 24 | "url={$2}", 25 | "}" 26 | ], 27 | "description": "Add a news website citation" 28 | }, 29 | "Figure (external)": { 30 | "scope": "markdown", 31 | "prefix": "fig-ext", 32 | "body": [ 33 | "```{figure-md} ${4}", 34 | ":class: margin", 35 | "![](${1:imageURL})", 36 | "", 37 | "[${2:caption}](${3:sourcePageURL})", 38 | "```" 39 | ], 40 | "description": "Add an image from an external website" 41 | }, 42 | "Figure (internal)": { 43 | "scope": "markdown", 44 | "prefix": "fig-int", 45 | "body": [ 46 | "```{figure-md} ${4}", 47 | ":class: margin", 48 | "![](https://static.premai.io/book/${1:imageURL})", 49 | "", 50 | "${2:caption}", 51 | "```" 52 | ], 53 | "description": "Add an image hosted by static.premai.io" 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "editor.tabCompletion": "on", 3 | "files.insertFinalNewline": true, 4 | "files.trimFinalNewlines": true, 5 | "files.trimTrailingWhitespace": true, 6 | "cSpell.language": "en-GB", 7 | "markdownlint.config": {"ul-style": false} 8 | } 9 | -------------------------------------------------------------------------------- /.zenodo.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "State of Open Source AI", 3 | "description": "Clarity in the current fast-paced mess of Open Source innovation.", 4 | "related_identifiers": [ 5 | {"identifier": "https://book.premai.io/state-of-open-source-ai", "relation": "isSourceOf"}], 6 | "creators": [ 7 | {"name": "da Costa-Luis, Casper", "affiliation": "Prem", "orcid": "0000-0002-7211-1557"}, 8 | {"name": "Sosio, Nicola", "affiliation": "Prem"}, 9 | {"name": "Bhattacharjee, Biswaroop", "affiliation": "Prem"}, 10 | {"name": "Vivek, Skanda"}, 11 | {"name": "Trivedi, Het"}, 12 | {"name": "Pedrazzini, Filippo", "affiliation": "Prem"} 13 | ], 14 | "contributors": [ 15 | {"name": "others", "type": "Other"}], 16 | "keywords": ["open-source", "AI", "book", "ML", "MLOps", "Jupyter-Book"], 17 | "imprint_publisher": "Prem", 18 | "access_right": "open", 19 | "upload_type": "publication", 20 | "publication_type": "book", 21 | "publication_date": "2023-10-03" 22 | } 23 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | type: dataset 3 | contact: [{affiliation: Prem, email: hello@premai.io}] 4 | date-released: 2023-10-03 5 | message: Please cite this book using this 
metadata 6 | 7 | title: State of Open Source AI Book 8 | abstract: Clarity in the current fast-paced mess of Open Source innovation. 9 | version: '1' 10 | url: https://book.premai.io/state-of-open-source-ai 11 | authors: 12 | - {given-names: Casper, family-names: da Costa-Luis, affiliation: Prem, orcid: 'https://orcid.org/0000-0002-7211-1557'} 13 | - {given-names: Nicola, family-names: Sosio, affiliation: Prem} 14 | - {given-names: Biswaroop, family-names: Bhattacharjee, affiliation: Prem} 15 | - {given-names: Skanda, family-names: Vivek} 16 | - {given-names: Het, family-names: Trivedi} 17 | - {given-names: Filippo, family-names: Pedrazzini, affiliation: Prem} 18 | - {name: others, website: 'https://github.com/premAI-io/state-of-open-source-ai/graphs/contributors'} 19 | identifiers: [{type: doi, value: 10.5281/zenodo.10023181}] 20 | repository-code: https://github.com/premAI-io/state-of-open-source-ai 21 | keywords: [open-source, AI, book, ML, MLOps, Jupyter-Book] 22 | license-url: https://github.com/premAI-io/state-of-open-source-ai/blob/main/LICENCE # CC-BY-4.0 AND Apache-2.0 23 | -------------------------------------------------------------------------------- /CNAME: -------------------------------------------------------------------------------- 1 | book.premai.io -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | SPDX-License-Identifier: CC-BY-4.0 AND Apache-2.0 2 | 3 | This book is a product of collaborative work. 4 | Unless otherwise stated, all authors (see commit logs) retain copyright 5 | for their respective work, and release code under the Apache-2.0 licence, 6 | and everything else under the CC-BY-4.0 licence. 7 | 8 | Exceptions or notable authors are listed below 9 | in reverse chronological order: 10 | 11 | * files: * 12 | CC-BY-4.0 AND Apache-2.0 (c) 2023 Prem https://github.com/PremAI-io. 13 | 14 | Creative Commons Attribution v. 4.0 Internation (CC-BY-4.0) 15 | ----------------------------------------------------------- 16 | 17 | This work is licenced under http://creativecommons.org/licenses/by/4.0 18 | 19 | Apache Licence v. 2.0 20 | --------------------- 21 | 22 | Licenced under the Apache Licence, Version 2.0 (the "Licence"); 23 | you may not use this work except in compliance with the Licence. 24 | You may obtain a copy of the Licence at 25 | 26 | http://www.apache.org/licenses/LICENSE-2.0 27 | 28 | Unless required by applicable law or agreed to in writing, software 29 | distributed under the Licence is distributed on an "AS IS" BASIS, 30 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 31 | See the Licence for the specific language governing permissions and 32 | limitations under the Licence. 33 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 📘 The State of Open Source AI (2023 Edition) 2 | 3 | [![banner](https://static.premai.io/book/marketing/github--book.png)][book] 4 | 5 | *Clarity in the current fast-paced mess of Open Source innovation.* 6 | 7 | This is the source repository for [The State of Open Source AI][book] ebook, a comprehensive guide exploring everything from model evaluations to deployment, and a great FOMO cure. 8 | 9 | [book]: https://book.premai.io/state-of-open-source-ai 10 | 11 | Want to discuss any topics covered in the book? 
We have a [dedicated channel (`#book`) on our Discord server][Discord]. 12 | 13 | [Discord]: https://discord.gg/kpKk6vYVAn 14 | 15 | ## Contributing 16 | 17 | You can help keep the book up-to-date! Contributions, issues, and comments are welcome! See the [Contributing Guide](https://book.premai.io/state-of-open-source-ai/#contributing) for more information on how. 18 | 19 | ## Licence 20 | 21 | This book is released under [CC-BY-4.0 (text) and Apache-2.0 (code)](LICENCE). 22 | 23 | Citation: [BibTeX](references.bib#L1) 24 | 25 | ## Community 26 | 27 | - [Join the Open Source AI Discord][Discord] 28 | - [Follow us on Twitter](https://twitter.com/premai_io) 29 | - [Subscribe to our newsletter](https://blog.premai.io) 30 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | # Book settings (https://jupyterbook.org/customize/config.html) 2 | title: State of Open Source AI Book 3 | author: Prem 4 | copyright: CC-BY-4.0 (text) & Apache-2.0 (code) 5 | logo: https://static.premai.io/logo.png 6 | repository: 7 | url: https://github.com/premAI-io/state-of-open-source-ai 8 | branch: main 9 | 10 | only_build_toc_files: true 11 | exclude_patterns: [.github/*] 12 | 13 | html: 14 | # No "baseurl" here to avoid conflicts with the theme 15 | favicon: assets/favicon.ico 16 | use_edit_page_button: true 17 | use_repository_button: true 18 | comments: 19 | utterances: 20 | issue-term: pathname 21 | label: question 22 | repo: premAI-io/state-of-open-source-ai 23 | theme: preferred-color-scheme 24 | 25 | parse: 26 | myst_substitutions: 27 | baseurl: http://localhost:8000 28 | doi: 10.5281/zenodo.10023181 29 | wip_chapter: | 30 | This chapter is still being written & reviewed. Please do post links & discussion in the {{ 31 | '[ comments]({}-comments)'.format(env.docname) }} below, or {{ 32 | '[ open a pull request]({}/edit/main/{}.md)'.format( 33 | env.config.html_context.book_baseurl, env.docname) 34 | }}! 35 | table_feedback: | 36 | ```{admonition} Feedback 37 | :class: attention 38 | Is the table above outdated or missing an important model? Let us know in the {{ 39 | '[ comments]({}-comments)'.format(env.docname) }} below, or {{ 40 | '[ open a pull request]({}/edit/main/{}.md)'.format( 41 | env.config.html_context.book_baseurl, env.docname) 42 | }}! 43 | ``` 44 | comments: | 45 | {{ '({}-comments)='.format(env.docname) }} 46 | 47 | ```{admonition} Feedback 48 | :class: attention 49 | Missing something important? Let us know in the comments below, or {{ 50 | '[ open a pull request]({}/edit/main/{}.md)'.format( 51 | env.config.html_context.book_baseurl, env.docname) 52 | }}! 53 | ``` 54 | 55 | % hack to get utteranc.es to render (needs a `div.section` element) 56 |
57 | 58 | {{ '```{committers} ' + env.docname + '.md\n```' }} 59 | 60 | 88 | 89 | myst_enable_extensions: 90 | - deflist 91 | - dollarmath 92 | - html_admonition 93 | - linkify 94 | - replacements 95 | - smartquotes 96 | - substitution 97 | - tasklist 98 | 99 | sphinx: 100 | extra_extensions: 101 | - sphinx_last_updated_by_git 102 | - sphinx_subfigure 103 | local_extensions: 104 | badges: . 105 | committers: . 106 | bibliography: . 107 | prem_theme: . 108 | recursive_update: true 109 | config: 110 | # Ensure Sphinx sees _templates/page.html 111 | templates_path: ["_templates"] 112 | 113 | # Use your custom theme 114 | html_theme: prem_theme 115 | 116 | # Put your custom base URL in html_context to avoid "unsupported theme option" warnings 117 | html_context: 118 | book_baseurl: https://book.premai.io/state-of-open-source-ai 119 | 120 | myst_heading_anchors: 4 121 | html_js_files: 122 | - [ 123 | https://plausible.io/js/script.js, 124 | { defer: defer, data-domain: book.premai.io }, 125 | ] 126 | linkcheck_ignore: 127 | - http://localhost:8000 128 | - https://github.com/premAI-io/state-of-open-source-ai/edit/main/.*.md 129 | - https://github.com/\w+/\w+/blob/\w+/.*#L\d+(-L\d+)? 130 | - https://github.com/premAI-io/prem-app#.* 131 | - https://github.com/BlinkDL/RWKV-LM#.* 132 | - https://github.com/CompVis/stable-diffusion/blob/main/Stable_Diffusion_v1_Model_Card.md#.* 133 | - https://github.com/ggerganov/ggml#.* 134 | - https://github.com/huggingface/optimum#.* 135 | - https://github.com/imaurer/awesome-decentralized-llm#.* 136 | - https://github.com/kingoflolz/mesh-transformer-jax#.* 137 | - https://github.com/lm-sys/FastChat.*#.* 138 | - https://github.com/mistralai/mistral-src#.* 139 | - https://github.com/onnx/onnx-tensorrt/blob/main/docs/operators.md#.* 140 | - https://github.com/onnx/onnx-tensorrt#.* 141 | - https://github.com/onnx/tutorials#.* 142 | - https://gist.github.com/veekaybee/be375ab33085102f9027853128dc5f0e#.* 143 | - https://www.nytimes.com/2016/04/19/technology/google-books-case.html 144 | - https://doi.org/10.2307/2214413 145 | - https://direct.mit.edu/daed/article/151/2/127/110621/Human-Language-Understanding-amp-Reasoning 146 | - https://numfocus.org 147 | - https://chat.openai.com 148 | - https://falconllm.tii.ae 149 | - https://www.midjourney.com 150 | - https://accent.gmu.edu 151 | - https://www.crcv.ucf.edu/data/UCF101.php 152 | - https://www.pinecone.io/learn/series/faiss/locality-sensitive-hashing 153 | - https://chat.lmsys.org 154 | - https://platform.openai.com/docs/.* 155 | - https://twitter.com/* 156 | - https://www.reddit.com/* 157 | linkcheck_allowed_redirects: 158 | https://doi.org/.*/.*: https://.* 159 | https://codespaces.new/.*: https://github.com/login.* 160 | https://youtu.be/.*: https://www.youtube.com/watch\?v=.*&feature=youtu.be 161 | https://.*.substack.com/i/\d+/.*: https://.*.substack.com/p/.* 162 | https://docs.bentoml.org: https://docs.bentoml.com/en/latest 163 | https://mozilla.org/MPL/2.0: https://www.mozilla.org/en-US/MPL/2.0 164 | https://mxnet.apache.org: https://mxnet.apache.org/versions/[\d.]+/.* 165 | https://gpt4all.io: https://gpt4all.io/index.html 166 | 167 | html_last_updated_fmt: "%d %b %Y" 168 | jblatex_show_tocs: false 169 | bibtex_reference_style: label 170 | latex_elements: 171 | papersize: a4paper 172 | extrapackages: \usepackage{pdfpages} 173 | maketitle: \includepdf[pages=-]{cover.pdf} 174 | tableofcontents: "" 175 | preamble: | 176 | \usepackage{etoolbox} 177 | 
\AtBeginEnvironment{figure}{\pretocmd{\hyperlink}{\protect}{}{}} 178 | 179 | bibtex_bibfiles: [references.bib] 180 | 181 | latex: 182 | latex_documents: 183 | targetname: book.tex 184 | 185 | execute: 186 | execute_notebooks: force 187 | -------------------------------------------------------------------------------- /_static/external_target.css: -------------------------------------------------------------------------------- 1 | a.reference.external:after { 2 | content: "↗"; 3 | font-size: .7em; 4 | vertical-align: text-top; 5 | margin-left: .1em; 6 | color: grey; 7 | } 8 | -------------------------------------------------------------------------------- /_static/external_target.js: -------------------------------------------------------------------------------- 1 | document.addEventListener('DOMContentLoaded', function(){ 2 | /// open external links in new tabs 3 | document.querySelectorAll('a.reference.external').forEach(a => { 4 | a.target = '_blank'; 5 | if (a.href.startsWith("https://github.com/premAI-io/state-of-open-source-ai")){ 6 | a.classList.replace('external', 'internal'); 7 | } 8 | }); 9 | }); 10 | -------------------------------------------------------------------------------- /_static/font.css: -------------------------------------------------------------------------------- 1 | @font-face { 2 | font-family: "Pretendard"; 3 | src: url("https://static.premai.io/fonts/Pretendard-Regular.woff2") format("woff2"); 4 | font-weight: normal; 5 | } 6 | 7 | @font-face { 8 | font-family: "Pretendard"; 9 | src: url("https://static.premai.io/fonts/Pretendard-SemiBold.woff2") format("woff2"); 10 | font-weight: 600; 11 | } 12 | 13 | @font-face { 14 | font-family: "Pretendard"; 15 | src: url("https://static.premai.io/fonts/Pretendard-Bold.woff2") format("woff2"); 16 | font-weight: bold; 17 | } 18 | 19 | :root { 20 | --pst-font-family-base: Pretendard, sans-serif; 21 | --pst-font-family-heading: Pretendard, sans-serif; 22 | } 23 | -------------------------------------------------------------------------------- /_static/main.css: -------------------------------------------------------------------------------- 1 | .modal { 2 | display: none; 3 | align-items: center; 4 | justify-content: center; 5 | width: 100%; 6 | background-color: rgba(0, 0, 0, 0.8); 7 | } 8 | 9 | .modal-content { 10 | display: flex; 11 | flex-direction: row; 12 | margin: 0 auto; 13 | width: 60vw; 14 | padding: 45px; 15 | background-color: rgba(20, 20, 20, 1); 16 | box-shadow: 0 0 22px 1px black; 17 | gap: 30px; 18 | } 19 | 20 | @media only screen and (max-width: 480px) { 21 | .modal-content { 22 | width: 90vw !important; 23 | flex-direction: column !important; 24 | align-items: center !important; 25 | padding: 25px !important; 26 | } 27 | .modal-content h1 { 28 | font-size: 1.2rem !important; 29 | margin-bottom: 5vh !important; 30 | } 31 | .modal-content p { 32 | font-size: 0.9rem !important; 33 | } 34 | .modal-content .input-container { 35 | padding: 6px !important; 36 | } 37 | .modal-content input { 38 | font-size: 0.9rem !important; 39 | } 40 | .modal-content .input-container .button { 41 | padding: 6px 18px !important; 42 | } 43 | .modal-content img { 44 | min-width: 35vw !important; 45 | max-width: 40vw !important; 46 | } 47 | } 48 | 49 | @media only screen and (min-width: 481px) and (max-width: 1022px) { 50 | .modal-content { 51 | width: 70vw !important; 52 | flex-direction: column !important; 53 | align-items: center !important; 54 | padding: 35px !important; 55 | } 56 | .modal-content h1 { 57 | font-size: 1.5rem 
!important; 58 | margin-bottom: 5vh !important; 59 | } 60 | .modal-content .input-container { 61 | padding: 8px 12px !important; 62 | } 63 | .modal-content input { 64 | font-size: 0.95rem !important; 65 | } 66 | .modal-content img { 67 | min-width: 20vw !important; 68 | max-width: 25vw !important; 69 | } 70 | } 71 | 72 | .modal-content img { 73 | min-width: 10vw; 74 | max-width: 15vw; 75 | height: 100%; 76 | } 77 | 78 | .modal-content .header { 79 | display: flex; 80 | justify-content: flex-end; 81 | } 82 | 83 | .modal-content input { 84 | border: none; 85 | background-color: transparent; 86 | flex: 1; 87 | color: rgba(255, 255, 255, 0.70); 88 | font-size: 1rem; 89 | font-style: normal; 90 | font-weight: 400; 91 | line-height: normal; 92 | width: 100%; 93 | } 94 | 95 | .modal-content input:focus { 96 | outline: none !important; 97 | } 98 | 99 | .modal-content .input-container .button { 100 | color: white; 101 | border: none; 102 | text-align: center; 103 | font-size: 0.8rem; 104 | font-style: normal; 105 | font-weight: 700; 106 | line-height: 24px; 107 | border-radius: 6px; 108 | background: linear-gradient(97.33deg, #7F96FF -3.51%, #F58E8E 109.45%); 109 | padding: 10px 24px; 110 | } 111 | 112 | .modal-content .email-error { 113 | display: flex; 114 | color: red; 115 | font-size: 0.9em; 116 | margin-top: 4px; 117 | } 118 | 119 | .modal-content .input-container { 120 | display: flex; 121 | padding: 10px 14px; 122 | align-items: center; 123 | align-self: stretch; 124 | border-radius: 12px; 125 | border: 1px solid rgba(255, 255, 255, 0.20); 126 | } 127 | 128 | .modal-content h1 { 129 | color: #FFF; 130 | font-size: 1.8rem; 131 | font-style: normal; 132 | font-weight: bold; 133 | line-height: normal; 134 | margin-bottom: 8vh; 135 | } 136 | 137 | .modal-content p { 138 | color: #FFF; 139 | font-size: 1rem; 140 | font-style: normal; 141 | font-weight: 400; 142 | line-height: 26px; 143 | } 144 | 145 | .modal-content .modal-text { 146 | flex: 1; 147 | overflow: auto; 148 | display: flex; 149 | flex-direction: column; 150 | justify-content: space-between; 151 | } 152 | 153 | .modal-content a { 154 | text-decoration: none; 155 | color: #7F96FF; 156 | } 157 | 158 | .modal-content a:hover { 159 | text-decoration: none; 160 | color: #F58E8E; 161 | } 162 | 163 | 164 | /* Announcement Banner */ 165 | 166 | .bd-header-announcement__content a { 167 | background-color: #7F96FF; 168 | color: white; 169 | font-weight: bold; 170 | width: 100%; 171 | height: 100%; 172 | position: absolute; 173 | top: 0; 174 | right: 0; 175 | display: flex; 176 | justify-content: center; 177 | align-items: center; 178 | text-decoration: none; 179 | } 180 | 181 | .bd-header-announcement__content a:hover { 182 | color: rgba(255, 255, 255, 0.90); 183 | } 184 | -------------------------------------------------------------------------------- /_static/main.js: -------------------------------------------------------------------------------- 1 | /// set/get helpers based on https://www.w3schools.com/js/js_cookies.asp 2 | function setCookie(cname, cvalue, exdays) { 3 | const d = new Date(); 4 | d.setTime(d.getTime() + (exdays * 24 * 60 * 60 * 1000)); 5 | document.cookie = cname + "=" + cvalue + ";expires=" + d.toUTCString() + ";SameSite=Strict;path=/"; 6 | } 7 | 8 | function getCookie(cname) { 9 | let name = cname + "="; 10 | let ca = document.cookie.split(';'); 11 | for (let i = 0; i < ca.length; i++) { 12 | let c = ca[i]; 13 | while (c.charAt(0) === ' ') { 14 | c = c.substring(1); 15 | } 16 | if (c.indexOf(name) === 0) { 17 | return 
c.substring(name.length, c.length); 18 | } 19 | } 20 | return ""; 21 | } 22 | -------------------------------------------------------------------------------- /_templates/page.html: -------------------------------------------------------------------------------- 1 | {% extends "!page.html" %} 2 | 3 | {% block meta %} 4 | 5 | 6 | 9 | 10 | 11 | 12 | {{ super() }} {# Retain any additional meta tags provided by the theme #} 13 | {% endblock meta %} 14 | 15 | {% block main %} 16 | 17 |

Redirecting...
18 | 
19 | If you are not redirected automatically, please
20 | click here.

22 | 23 | {{ super() }} {# Retain the normal page content provided by the theme #} 24 | {% endblock main %} 25 | -------------------------------------------------------------------------------- /_toc.yml: -------------------------------------------------------------------------------- 1 | # Table of contents (https://jupyterbook.org/customize/toc.html) 2 | format: jb-book 3 | root: index 4 | chapters: 5 | - file: licences 6 | - file: eval-datasets 7 | - file: models 8 | - file: unaligned-models 9 | - file: fine-tuning 10 | - file: model-formats 11 | - file: mlops-engines 12 | - file: vector-db 13 | - file: sdk 14 | - file: desktop-apps 15 | - file: hardware 16 | - file: references 17 | -------------------------------------------------------------------------------- /assets/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/premAI-io/state-of-open-source-ai/81d36c662f631f38ce0dc13b8f4419b02f88c731/assets/favicon.ico -------------------------------------------------------------------------------- /badges.py: -------------------------------------------------------------------------------- 1 | """Display repository badges. 2 | 3 | MyST usage (HTML only): 4 | 5 | ```{badges} https://mybook.site https://github.com/org/mybook 6 | :doi: 10.5281.zenodo.12345678 7 | ``` 8 | """ 9 | from docutils import nodes 10 | from docutils.parsers.rst import Directive, directives 11 | from sphinx.application import Sphinx 12 | 13 | __version__ = '0.0.0' 14 | 15 | 16 | def visit_nop(self, node): 17 | pass 18 | 19 | 20 | class badges_node(nodes.General, nodes.Element): 21 | pass 22 | 23 | 24 | def visit_badges_html(self, node): 25 | self.body.append( 26 | f""" 27 | site 29 | """) 30 | slug = '/'.join(node['repository_url'].split('/')[-2:]) 31 | self.body.append( 32 | f""" 33 | last updated 35 | """) 36 | self.body.append( 37 | f""" 38 | activity 40 | """) 41 | if node['doi']: 42 | self.body.append( 43 | f""" 44 | doi 46 | """) 47 | 48 | 49 | class Badges(Directive): 50 | has_content = True 51 | required_arguments = 2 52 | optional_arguments = 1 53 | final_argument_whitespace = True 54 | option_spec = {'doi': directives.unchanged} 55 | _node = None 56 | 57 | def run(self): 58 | return [badges_node( 59 | baseurl=self.arguments[0], repository_url=self.arguments[1], doi=self.options.get('doi', None))] 60 | 61 | 62 | def setup(app: Sphinx): 63 | app.add_node(badges_node, html=(visit_badges_html, visit_nop), 64 | latex=(visit_nop, visit_nop)) 65 | app.add_directive("badges", Badges) 66 | return {'version': __version__, 'parallel_read_safe': True} 67 | -------------------------------------------------------------------------------- /bibliography.py: -------------------------------------------------------------------------------- 1 | """Limit the number of authors shown in the bibliography.""" 2 | from pybtex.plugin import register_plugin 3 | from pybtex.style.formatting.unsrt import Style as UnsrtStyle 4 | from pybtex.style.template import FieldIsMissing, join, node, sentence, tag 5 | from sphinx.application import Sphinx 6 | 7 | __version__ = '0.0.0' 8 | 9 | 10 | @node 11 | def names_truncated(children, context, role, max_names=9, **kwargs): 12 | """Return formatted names.""" 13 | assert not children 14 | try: 15 | persons = context['entry'].persons[role] 16 | except KeyError: 17 | raise FieldIsMissing(role, context['entry']) 18 | 19 | style = context['style'] 20 | if (truncate := len(persons) > max_names): 21 | persons = persons[:max_names - 
1] 22 | formatted_names = [style.format_name(person, style.abbreviate_names) for person in persons] 23 | if truncate: 24 | formatted_names.append(tag('i')["others"]) 25 | return join(**kwargs)[formatted_names].format_data(context) 26 | 27 | 28 | class Style(UnsrtStyle): 29 | def format_names(self, role, as_sentence=True): 30 | formatted_names = names_truncated(role, sep=', ', sep2=' and ', last_sep=', and ') 31 | return sentence[formatted_names] if as_sentence else formatted_names 32 | 33 | 34 | def setup(app: Sphinx): 35 | register_plugin('pybtex.style.formatting', 'unsrt_max_authors', Style) 36 | return {'version': __version__} 37 | -------------------------------------------------------------------------------- /committers.py: -------------------------------------------------------------------------------- 1 | """Display Git committers & last updated time. 2 | 3 | Example MyST usage (HTML only): 4 | 5 | ```{committers} file_path.md 6 | ``` 7 | """ 8 | import json 9 | import os 10 | import re 11 | import subprocess 12 | from collections import Counter 13 | from functools import cache 14 | from urllib.request import Request, urlopen 15 | 16 | from docutils import nodes 17 | from docutils.parsers.rst import Directive, directives 18 | from sphinx.application import Sphinx 19 | 20 | __version__ = '0.0.0' 21 | 22 | 23 | @cache 24 | def gh_api(endpoint: str, version='2022-11-28') -> dict: 25 | headers = {'Accept': 'application/vnd.github+json', 'X-GitHub-Api-Version': version} 26 | if (token := os.environ.get("GH_TOKEN", os.environ.get("GITHUB_TOKEN", ""))): 27 | headers['Authorization'] = 'Bearer ' + token # higher rate limit & more permissions 28 | response = urlopen(Request("https://api.github.com/" + endpoint, headers=headers)) 29 | return json.load(response) 30 | 31 | 32 | def gh_user(email: str) -> str | None: 33 | if (user := {'het@hets-mbp.lan': 'htrivedi99', 'skanda.vivek@gmail.com': 'skandavivek'}.get(email, '')): 34 | return user # hardcoded exceptions 35 | 36 | user_info = gh_api(f"search/users?q={email}+in:email") 37 | try: 38 | return user_info['items'][0]['login'] 39 | except (KeyError, IndexError): 40 | return 41 | 42 | 43 | class committers_node(nodes.General, nodes.Element): 44 | pass 45 | 46 | 47 | def visit_nop(self, node): 48 | pass 49 | 50 | 51 | def visit_committers_html(self, node): 52 | self.body.append(self.starttag(node, 'div')) 53 | self.body.append(f"Chapter author{'' if len(node['authors']) == 1 else 's'}: ") 54 | self.body.append(", ".join(f'{name}' for name, href in node['authors'])) 55 | self.body.append('') 56 | 57 | 58 | class Committers(Directive): 59 | has_content = True 60 | required_arguments = 1 61 | optional_arguments = 0 62 | final_argument_whitespace = True 63 | option_spec = {'class': directives.class_option, 'name': directives.unchanged} 64 | _node = None 65 | 66 | def run(self): 67 | blame = subprocess.check_output([ 68 | 'git', 'blame', '--line-porcelain', '-w', '-M', '-C', '-C', '--'] + self.arguments 69 | ).decode('utf-8').strip() 70 | authors = Counter(re.findall("^author (.*)\nauthor-mail <(.*)>", blame, flags=re.MULTILINE)) 71 | total_loc = authors.total() 72 | auths = [] 73 | for (name, email), loc in authors.most_common(): 74 | if loc / total_loc < 0.1: # ignore contributions under 10% 75 | break 76 | if (user := gh_user(email)): 77 | auths.append((name, f"https://github.com/{user}")) 78 | else: 79 | auths.append((name, f"mailto:{email}")) 80 | return [committers_node(authors=auths)] 81 | 82 | 83 | def setup(app: Sphinx): 84 | 
app.add_node(committers_node, html=(visit_committers_html, visit_nop),
85 |                  latex=(visit_nop, visit_nop))
86 |     app.add_directive("committers", Committers)
87 |     return {'version': __version__, 'parallel_read_safe': True}
88 | 
--------------------------------------------------------------------------------
/desktop-apps.md:
--------------------------------------------------------------------------------
1 | # Desktop Apps
2 | 
3 | While ChatGPT and GPT-4 have taken the world of AI by storm in the last half year, open-source models are catching up -- though there has been a lot of ground to cover to reach OpenAI-level performance. In many cases, ChatGPT and GPT-4 are clear winners compared to deploying LLMs on cloud servers, since the cost per OpenAI API request is relatively cheap compared with model hosting costs on cloud services like AWS, Azure, and Google Cloud. But open-source models will always have value over closed APIs like ChatGPT/GPT-4 for certain business cases: folks in industries like legal, healthcare, and finance have concerns over data and customer privacy.
4 | 
5 | A new and exciting area is desktop apps that support running powerful LLMs locally. There is an argument to be made that successful desktop apps are more useful than cloud-based services in some sensitive cases, because the data, the models, and the app can all be run locally on typically available hardware. Here, I go through some of the up-and-coming solutions for LLM desktop apps -- their benefits, their limitations, and how they compare.
6 | 
7 | ```{table} Comparison of Desktop Apps
8 | Desktop App | Supported Models | GPU support | Layout | Configuration | Extra Features | OS | Future Roadmap
9 | ------------|------------------|-------------|--------|---------------|----------------|----|---------------
10 | [](#lm-studio) | 🟡 [](model-formats.md#ggml) | 🟢 Yes | Clean, clear tabs. | Hardware config choices (GPU, RAM, etc.). Can choose multiple inference params (temperature, repeat penalty, etc.). | Local server deployments | Windows, Linux, MacOS | Not mentioned
11 | [](#gpt4all) | 🟡 [](model-formats.md#ggml) | 🔴 No | Unclear tabs. | Minimal hardware config options. Can choose inference params. | Contribute & use training data from the GPT4All datalake | Windows, Linux, MacOS | [Building open-source datalake for future model training](https://gpt4all.io)
12 | [](#koboldcpp) | 🟡 [](model-formats.md#ggml) | 🔴 No | Cluttered UI. | Some hardware config options. Unique inference/app params e.g. [scenarios.](https://github.com/LostRuins/koboldcpp) | Cool story, character, and adventure modes | Windows, Linux, MacOS | Not mentioned
13 | [](#localai) | 🟡 [](model-formats.md#ggml) | 🔴 No | Clear tabs. | Minimal hardware config options. Can choose inference params. | Light/dark modes | Windows, Linux, MacOS | [Text-to-audio, OpenAI functions](https://github.com/louisgv/local.ai)
14 | [](#ollama) | 🔴 few [](model-formats.md#ggml) models | 🟡 Yes (metal) | Basic, terminal-based UI. | Multiple hardware configurations, need to save as a file prior to running. Multiple inference params, need to save as a file. | Run from terminal | MacOS | [Windows, Linux support](https://ollama.ai)
15 | [](#llamafile) | 🔴 llamafile models | 🟢 Yes | Clean, simple interface. | Minimal hardware configurations. | Run from terminal, invokes the default browser. | Windows, Linux, BSD, MacOS | [](https://github.com/Mozilla-Ocho/llamafile)
16 | ```
17 | 
18 | ## LM Studio
19 | 
20 | LM Studio is an app to run LLMs locally.
21 | 
22 | ### UI and Chat
23 | 
24 | [LM Studio](https://lmstudio.ai) is a desktop application, available for Windows and macOS, that gives us the flexibility to run LLMs on our PC. You can download any `ggml` model from the [HuggingFace models hub](https://huggingface.co/models) and run the model on the prompts given by the user.
25 | 
26 | The UI is pretty neat and well contained:
27 | 
28 | ```{figure} https://static.premai.io/book/lm-studio1.png
29 | LM Studio UI
30 | ```
31 | 
32 | There's a search bar that can be used to search the HuggingFace model hub for models to power the chat.
33 | 
34 | ```{figure} https://static.premai.io/book/lmstudio-search.png
35 | LM Studio Model Search
36 | ```
37 | 
38 | The chat UI component is similar to ChatGPT's, allowing conversations between the user and the assistant.
39 | 
40 | ```{figure} https://static.premai.io/book/lmstudio-chat-int.png
41 | LM Studio Chat Interface
42 | ```
43 | 
44 | This is how the `TheBloke/Llama-2-7B-Chat-GGML/llama-2-7b-chat.ggmlv3.q5_K_S.bin` model responds to a simple conversation starter.
45 | 
46 | ```{figure} https://static.premai.io/book/desktopapps-lmstudio-chat.png
47 | LM Studio Chat Example
48 | ```
49 | 
50 | ### Local Server
51 | 
52 | One useful aspect is the ability to build a Python or Node.js application based on an underlying LLM.
53 | 
54 | ```{figure} https://static.premai.io/book/lmstudio-local.png
55 | LM Studio Local Server
56 | ```
57 | 
58 | This enables the user to build applications that are powered by LLMs, using `ggml` models from the HuggingFace model library (without API key restrictions).
59 | 
60 | Think of this server as a place you make API calls to and get responses from. The only difference is that it is a local server rather than a cloud-based one, which makes it quite exciting to use the hardware in your system to power the LLM application you are building.
61 | 
62 | Let's spin up the server by hitting the `Start server` button 🎉. By default it is served on port `1234`; if you want to use some other port, you can edit it to the left of the `Start server` button you pressed earlier. There are also a few parameters that you can modify to handle requests, but for now let's leave them at their defaults.
63 | 
64 | Go to any Python editor of your choice and paste the following code into a new `.py` file.
65 | 
66 | ```python
67 | import openai
68 | # endpoint:port of your local inference server (in LM Studio)
69 | openai.api_base='http://localhost:1234/v1'
70 | openai.api_key='' # empty
71 | prefix = "### Instruction:\n"
72 | suffix = "\n### Response:"
73 | 
74 | def get_completion(prompt, model="local model", temperature=0.0):
75 |     formatted_prompt = f"{prefix}{prompt}{suffix}"
76 |     messages = [{"role": "user", "content": formatted_prompt}]
77 |     print(f'\nYour prompt: {prompt}\n')
78 |     response = openai.ChatCompletion.create(
79 |         model=model,
80 |         messages=messages,
81 |         temperature=temperature)
82 |     return response.choices[0].message["content"]
83 | 
84 | prompt = "Please give me JS code to fetch data from an API server."
85 | response = get_completion(prompt, temperature=0)
86 | print(f"LLM's response:{response}")
87 | ```
88 | 
89 | This is the code that I ran using the command `python3 <filename>.py`, and the resulting server logs and terminal output are shown below:
90 | 
91 | ```{figure} https://static.premai.io/book/lmstudio-local-ex.png
92 | LM Studio Local Server Example
93 | ```
94 | 
95 | ### Model Configurations & Tools
96 | 
97 | By default we have a few presets already provided by LM Studio, but we can tweak them and create presets of our own to be used elsewhere. The modifiable parameters are:
98 | 
99 | - 🛠️ Inference parameters: These give the flexibility to change the `temperature`, `n_predict`, and `repeat_penalty`
100 | - ↔️ Input prefix and suffix: Text to add right before and right after every user message
101 | - ␂ Pre-prompt / System prompt: Text to insert at the very beginning of the prompt, before any user messages
102 | - 📥 Model initialisation: turning `m_lock` on ensures the entire model is kept in RAM.
103 | - ⚙️ Hardware settings: The `n_threads` parameter is the maximum number of CPU threads the model is allowed to consume. If you have a GPU, you can turn on the `n_gpu_layers` parameter, setting a number between 10 and 20 and finding the best value through experimentation.
104 | 
105 | Tools focus on the response and UI of the application. The modifiable parameters are as follows:
106 | 
107 | - 🔠 `Context overflow policy`: Behaviour of the model when the generated token length exceeds the context window size
108 | - 🌈 `Chat appearance`: Either plain text (.txt) or markdown (.md)
109 | - 📝 `Conversation notes`: Auto-saved notes for a specific chat conversation
110 | 
111 | ### Features
112 | 
113 | - 💪 Leverages the power of your machine to run the model, i.e. the more powerful your machine is, the more you can get out of it.
114 | - 🆕 The ability to download models from HuggingFace lets you test the latest publicly hosted models, like LLaMA or any other new release. Supported models include MPT, StarCoder, Replit, GPT-Neo-X -- more generally, anything in `ggml` format
115 | - 💻 Available for both Windows and Mac.
116 | - 🔌 Models can be run entirely offline as they are downloaded and reside locally on your machine.
117 | - 💬 Access the app using the Chat UI or the local server
118 | 
119 | ## GPT4All
120 | 
121 | The [GPT4All homepage](https://gpt4all.io) states that
122 | 
123 | > GPT4All is an ecosystem to train and deploy **powerful** and **customised** large language models that run **locally** on consumer grade CPUs.
124 | 
125 | ### UI and Chat
126 | 
127 | The UI for GPT4All is quite basic as compared to LM Studio -- but it works fine.
128 | 
129 | ```{figure} https://static.premai.io/book/desktopapps-gpt4all-ui.png
130 | GPT4All UI
131 | ```
132 | 
133 | However, it is less friendly, more clunky, and has a beta feel to it. For one, once I downloaded the LLaMA-2 7B model, I wasn't able to download any new models even after restarting the app.
134 | 
135 | ### Local Server
136 | 
137 | Like LM Studio, there is support for a local server in GPT4All, though it took some time to discover that this feature exists, and only via the [documentation](https://docs.gpt4all.io). The results seem far better than LM Studio, with control over the number of tokens and the response, though it is model dependent.
Here's the code for the same:

```python
import openai
openai.api_base = "http://localhost:4891/v1"
openai.api_key = ""
# Set up the prompt and other parameters for the API request
prompt = "Who is Michael Jordan?"
model = "Llama-2-7B Chat"
# Make the API request
response = openai.Completion.create(
    model=model,
    prompt=prompt,
    max_tokens=199,
    temperature=0.28,
    top_p=0.95,
    n=1,
    echo=True,
    stream=False)
# Print the generated completion
print(response)
```

The response for the example `prompt` is shown below:

```{figure} https://static.premai.io/book/gpt4all-ex.png
GPT4All UI Example
```

### Model Configurations & Tools

As you can see, there is not much scope for model configuration, and -- unlike LM Studio -- I couldn't use my GPU here.

```{figure} https://static.premai.io/book/desktopapps-gpt4all-modelconfig.png
GPT4All UI Model Configuration
```

## koboldcpp

https://github.com/LostRuins/koboldcpp is a fun twist on LLMs -- adding game-like scenarios and adventures. It supports adding base `ggml` models as the LLM engine, and spins stories based on user inputs.

### UI and Chat

The UI is pretty basic -- and you get some surprising answers. Here I ask a simple icebreaker question -- and you see that it responds that it is a friendly AI that likes to play games.

```{figure} https://static.premai.io/book/desktopapps-koboldcpp-ui.png
koboldcpp UI
```

### Scenarios

You can also enter different sorts of scenarios and modes.

```{figure} https://static.premai.io/book/desktopapps-kcpp-scenarios.png
koboldcpp Scenarios
```

Below is the Julius Caesar scenario!

```{figure} https://static.premai.io/book/desktopapps-kcpp-jc.png
koboldcpp Julius Caesar Chat
```

### Model Configuration and Tools

Many of the model configuration options are similar to the defaults offered elsewhere, but there are some interesting twists like story mode, adventure mode, and instruct mode.

```{figure} https://static.premai.io/book/desktopapps-kcpp-modes.png
koboldcpp Model Configuration
```

## [local.ai]

[local.ai]: https://www.localai.app

The [local.ai] App from https://github.com/louisgv/local.ai ([not to be confused](https://github.com/louisgv/local.ai/discussions/71) with [](mlops-engines.md#localai) from https://github.com/mudler/LocalAI) is a simple application for loading LLMs after you manually download a `ggml` model from online.

### UI and Chat

The UI and chat are pretty basic. One bug that I noticed was that it wasn't possible to load models from the UI -- I had to manually download the model and then use the app.

```{figure} https://static.premai.io/book/desktopapps-localai-ui.png
[local.ai] UI
```

### Model Configuration and Tools

Pretty standard prompt-related configurations. There appears to be no GPU support.

## Ollama

[Ollama](https://ollama.ai) is an LLM-based conversational chatbot that can be run from a macOS terminal. It is simple to get started with. Currently it is only available for macOS, but support for Windows and Linux is coming soon.

### UI and Chat

A neat, clean, and crisp UI: just type at the `>>>` prompt in the terminal and paste in your question. The response time will vary with model size, but responses are mostly acceptable. I tested the `LLaMA` model, the most recently supported model, and the results were good.

```{figure} https://static.premai.io/book/ollama-ex.png
Ollama Example
```

`Note:` the model initially takes some time to download locally, but afterwards there is no lag when accessing the requested model.
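Beyond the interactive terminal, Ollama also exposes a local REST API while the app is running, so the same models can be called from your own code. A minimal sketch (assuming the default `localhost:11434` endpoint and an already-downloaded `llama2` model):

```python
import json

import requests  # assumes `pip install requests`

# Ollama's local server listens on port 11434 by default
response = requests.post(
    "http://localhost:11434/api/generate",
    json={"model": "llama2", "prompt": "Why is the sky blue?"},
    stream=True)

# the server streams back one JSON object per line until generation is done
for line in response.iter_lines():
    if line:
        print(json.loads(line).get("response", ""), end="", flush=True)
```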

### Model Configuration and Tools

The list of ~20 models can be accessed [here](https://ollama.ai/library).

The library is constantly growing, and multiple additions have been made quite recently. It supports models ranging from lightweight to robust, and also has special support for specific functionality, like performing mathematical calculations. There is a `WizardMath` model that addresses this use case -- read more about it in the official [blog](https://ollama.ai/blog/wizardmath-examples) published by the Ollama team.

### Limitations

- Better response formatting: output could make use of terminal features to display code, text, and images at a later stage. This would make the output more readable and consistent for the user.
- Better visibility of resource usage: since LLMs by default require extensive amounts of memory, the resources available need to be kept in mind. When working in a terminal such details are not explicitly surfaced, so a model can sometimes consume all available memory and cause the application -- or the entire system -- to crash.
- Support for custom models (from local): there is support for loading models downloaded from the internet and running them locally by using the command:

  ```bash
  ollama run "model location in the system"
  ```

## llamafile

The objective of *llamafile* is to make open-source large language models (LLMs) more accessible to both developers and end users. To achieve this, its developers have merged llama.cpp with Cosmopolitan Libc, creating a framework that collapses all the complexity of LLMs into a single-file executable known as a *llamafile*, which can be run locally on most computers without any installation. The framework is licensed under the Apache License, Version 2.0. Through the *llamafile* approach, they have unlocked the potential of LLMs running locally, paving the way for exciting new opportunities across a wide range of applications.
To experience it firsthand, the *llamafile* developers recommend downloading their example *llamafile* for the LLaVA model, which is licensed under LLaMA 2, OpenAI. LLaVA is an LLM that goes beyond mere chat capabilities; it also allows users to upload images and ask questions related to them. Importantly, all of this functionality occurs locally, ensuring that no data ever leaves the computer.

It is important to note that if there are any issues with compiling and dynamically linking GPU support, *llamafile* has a contingency plan in place: in such cases, the system automatically switches to CPU inference, ensuring uninterrupted performance and accurate results.

Under Linux, dynamic compilation of NVIDIA cuBLAS GPU support is possible under certain conditions. Firstly, the `cc` compiler must be present. Secondly, the `-ngl 35` flag must be passed to activate the GPU. Lastly, the CUDA developer toolkit must be installed on the machine, and the `nvcc` compiler should be accessible through the system's path.

For Windows users, utilising the GPU requires two steps: first, make sure that the released binaries are used; secondly, pass the `-ngl 35` flag. Additionally, it is essential to have an NVIDIA graphics card that supports CUDA, as AMD GPUs are not currently supported. Users who prefer CUDA via WSL can enable NVIDIA CUDA on WSL and run the llamafiles within WSL. However, it is worth noting that Windows users may face limitations with some of the example llamafiles due to the 4 GB maximum executable file size imposed by the Windows operating system. But don't worry: the *llamafile* framework offers support for external weights (see the documentation for details).

On Apple Silicon, everything should function seamlessly provided Xcode is installed.

{{ comments }}

--------------------------------------------------------------------------------
/fine-tuning.md:
--------------------------------------------------------------------------------
# Fine-tuning

```{admonition} Work in Progress
:class: attention
{{ wip_chapter }}

Some ideas:

- https://gist.github.com/veekaybee/be375ab33085102f9027853128dc5f0e#training-your-own
- [Why You (Probably) Don't Need to Fine-tune an LLM](https://www.tidepool.so/2023/08/17/why-you-probably-dont-need-to-fine-tune-an-llm/) (instead, use few-shot prompting & retrieval-augmented generation)
- [Fine-tuning LLaMA-2: A Comprehensive Case Study for Tailoring Models to Unique Applications](https://www.anyscale.com/blog/fine-tuning-llama-2-a-comprehensive-case-study-for-tailoring-models-to-unique-applications) (fine-tuning LLaMA-2 for 3 real-world use cases)
- [Private, local, open source LLMs](https://python.langchain.com/docs/guides/local_llms)
- [Easy-to-use LLM fine-tuning framework (LLaMA-2, BLOOM, Falcon, Baichuan, Qwen, ChatGLM2)](https://github.com/hiyouga/LLaMA-Factory)
- https://dstack.ai/examples/finetuning-llama-2
- https://github.com/h2oai, etc.
- [The History of Open-Source LLMs: Better Base Models (part 2)](https://cameronrwolfe.substack.com/p/the-history-of-open-source-llms-better) (LLaMA, MPT, Falcon, LLaMA-2)
```

For bespoke applications, models can be trained on task-specific data. However, training a model from scratch is seldom required.
The model has already learned useful feature representations during its initial (pre) training, so it is often sufficient to simply fine-tune. This takes advantage of [transfer learning](https://www.v7labs.com/blog/transfer-learning-guide), producing better task-specific performance with minimal training examples & resources -- analogous to teaching a university student without first reteaching them how to communicate.

## Transfer Learning versus Fine-tuning

Both {term}`transfer learning` and {term}`fine-tuning` modify a pre-trained model for a domain/task-specific use, and thus both terms are often used interchangeably. However, there are key differences.

```{table} Transfer Learning versus Fine-tuning
Description | Transfer Learning | Fine-tuning
------------|-------------------|------------
Based on a model pre-trained on a large generic dataset | yes | yes
Freeze pre-trained model layers | most or all | none ("full" fine-tuning) or a few
Head layer | append a new head | replace existing head or leave as-is
Train on domain-specific data until unfrozen layers converge | yes | yes
```

### Transfer Learning

As [Wikipedia](https://en.wikipedia.org/wiki/Transfer_learning) defines it, transfer learning is a machine learning technique in which knowledge learned from one task is re-used to boost performance on a related task. Transfer learning starts from a pre-trained model: a deep learning model trained on a very large dataset (of images, text, etc.). Most of the time, these pre-trained models are huge classification models trained on enormous datasets with a large number of classes. Over the course of training, such models eventually learn features and representations that minimise the loss.

Before starting transfer learning, we therefore remove the final classification layers and treat the remaining network (up to the penultimate layer) as a feature extractor. We leverage the knowledge captured by this feature extractor (the pre-trained model) to train a much smaller model confined to a very specific, domain-specific task.
The key is that "frozen" layers remain unchanged -- retaining the original abilities of the pre-trained model -- and act as general & robust feature extractors.

```{figure-md} transfer-learning-architecture
:class: caption
![](https://static.premai.io/book/transfer_learning.png)

Transfer Learning
```

**Examples**:

- Computer vision: take the [ResNet-50](https://huggingface.co/microsoft/resnet-50) pre-trained on the [ImageNet](https://www.image-net.org/index.php) dataset and replace its last layer with the head of an object-detecting model (such as [Faster R-CNN](https://arxiv.org/abs/1506.01497)). This modified model can now be trained to draw bounding boxes and classify images from the [cats-vs-dogs](https://huggingface.co/datasets/cats_vs_dogs) dataset (see the code sketch after this list).

- Natural language processing: take a [BERT](https://huggingface.co/google/bert_uncased_L-2_H-768_A-12) model that was pre-trained on extensive text data, such as the [BookCorpus dataset](https://huggingface.co/datasets/bookcorpus). Replace BERT's final layer with a simple classifier or Multi-Layer Perceptron (MLP) layers. These final layers can then be trained on the [tweet sentiment classification dataset](https://huggingface.co/datasets/carblacac/twitter-sentiment-analysis) to classify Twitter sentiments.

**Use cases**:
`NOTE`: we can even extend the process of transfer learning by unfreezing some layers of the pre-trained model and retraining them along with our smaller model. This additional step helps the model adapt to newer domain-specific or out-of-distribution tasks.

- Limited data: when the domain-specific dataset is small, a large model cannot be trained end-to-end without overfitting. However, if the model is mostly a frozen general feature extractor, then the subsequent trainable layers are less likely to overfit.
- Limited compute and time: retraining a large model from scratch requires a lot of compute resources and time. This is unnecessary if similar performance can be achieved through transfer learning (training just part of a large model).
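In code, this head-replacement workflow only takes a few lines. Below is a minimal sketch in PyTorch (assuming `torchvision` >= 0.13 for the `weights` API; for brevity it uses a simple 2-class classification head and random stand-in data rather than a full Faster R-CNN head and the real cats-vs-dogs dataset):

```python
import torch
from torch import nn
from torchvision import models

# load a ResNet-50 pre-trained on ImageNet and freeze all of its layers
model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)
for param in model.parameters():
    param.requires_grad = False

# replace the classification head with a new (trainable) 2-class layer: cats vs dogs
model.fc = nn.Linear(model.fc.in_features, 2)

# only the new head's parameters are handed to the optimiser
optimiser = torch.optim.Adam(model.fc.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# stand-in dataloader of random (image, label) batches -- replace with a real dataset
dataloader = [(torch.randn(8, 3, 224, 224), torch.randint(0, 2, (8,))) for _ in range(4)]

model.train()
for images, labels in dataloader:
    loss = criterion(model(images), labels)
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()
```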

### Fine-Tuning

As [Wikipedia](https://en.wikipedia.org/wiki/Fine-tuning_(deep_learning)) defines it, fine-tuning is an approach to transfer learning in which the weights of a pre-trained model are trained on new data. In some cases we retrain the whole model on our domain-specific dataset, while in other cases we fine-tune only a subset of the layers. Through fine-tuning, we adapt our existing pre-trained model to a task-specific dataset.

> The key difference from transfer learning is that none (or few) of the pre-trained model's weights are frozen. The pre-training process can be considered an intelligent weight initialisation prior to training on a domain-specific dataset. Essentially, the pre-training will leave the model weights close to a global (general) optimum, while the domain-specific training will find a local (task-specific) optimum.

```{figure-md} fine-tuning-architecture
:class: caption
![](https://static.premai.io/book/fine-tuning.png)

Fine Tuning
```

**Examples**:

- Computer vision: for segmentation in cases where fine-grained detail is important (e.g. finding individual cells in medical imaging, or detecting objects in satellite images), transfer learning might not be accurate enough.
- Natural language processing: an LLM such as [](models.md#persimmon-8b) -- used in general purpose text completion -- can be adapted to do summarisation. Adding a few layers (transfer learning) may not be enough to do summarisation well, and hence full fine-tuning is required.

**Use cases**:

- Performance: when transfer learning is not accurate enough, and enough domain-specific data is available to make use of fine-tuning without overfitting.

Note that fine-tuning typically requires much more compute resources, time, and data than transfer learning.

## Fine-tuning LLMs

When an LLM does not produce the desired output, engineers often assume that fine-tuning the model will make it "better". But what exactly does "better" mean in this case? It's important to identify the root of the problem before fine-tuning the model on a new dataset.

Common LLM issues include:

- The model lacks knowledge on certain topics
  + [](#rag) can be used to solve this problem
- The model's responses do not have the proper style or structure the user is looking for
  + Fine-tuning or few-shot prompting is applicable here

```{figure-md} llm-fine-tuning-architecture
:class: caption
![](https://static.premai.io/book/fine-tuning-llm.png)

[Fine-tuning LLMs](https://neo4j.com/developer-blog/fine-tuning-retrieval-augmented-generation)
```

A baseline LLM cannot answer questions about content it hasn't been trained on {cite}`tidepool-citation`. The LLM will make something up, i.e., hallucinate. To fix issues like this, RAG is a good tool to use because it provides the LLM with the context it needs to answer the question.

On the other hand, if the LLM needs to generate accurate SQL queries, RAG is not going to be of much help here. The format of the generated output matters a lot, so fine-tuning would be more useful for this use case.
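As a rough illustration of what such a style/format fine-tune can look like, here is a sketch using Hugging Face `transformers` + `peft` (LoRA) -- the base model name, the `sql_pairs.jsonl` dataset file, and the hyperparameters are all placeholder assumptions, not recommendations:

```python
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          DataCollatorForLanguageModeling, Trainer, TrainingArguments)

base = "meta-llama/Llama-2-7b-hf"  # hypothetical base model (gated; requires access)
tokenizer = AutoTokenizer.from_pretrained(base)
tokenizer.pad_token = tokenizer.eos_token  # LLaMA has no pad token by default
model = AutoModelForCausalLM.from_pretrained(base)

# attach small trainable LoRA adapters; the original weights stay frozen
model = get_peft_model(model, LoraConfig(
    r=8, lora_alpha=16, lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"], task_type="CAUSAL_LM"))

# `sql_pairs.jsonl` is a hypothetical file of {"text": "<question plus SQL answer>"} rows
data = load_dataset("json", data_files="sql_pairs.jsonl")["train"]
data = data.map(lambda row: tokenizer(row["text"], truncation=True, max_length=512),
                remove_columns=data.column_names)

Trainer(
    model=model,
    args=TrainingArguments(output_dir="llama2-sql-lora",
                           per_device_train_batch_size=1, num_train_epochs=3),
    train_dataset=data,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
).train()
```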
Here are some examples of models that have been fine-tuned to generate content in a specific format/style:

* [Gorilla LLM](https://gorilla.cs.berkeley.edu) - This LLM was fine-tuned to generate API calls.
* [LLaMA-2 chat](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) - The "chat" version of LLaMA is fine-tuned on conversational data.
* [Code LLaMA](https://about.fb.com/news/2023/08/code-llama-ai-for-coding) - A fine-tuned LLaMA-2 model designed for code generation.

## RAG

{term}`RAG` is a method used to boost the accuracy of LLMs by injecting relevant context into the LLM prompt. It works by connecting to a vector database and fetching only the information most relevant to the user's query. Using this technique, the LLM is provided with enough background knowledge to adequately answer the user's question without hallucinating.

RAG is not a part of fine-tuning, because it uses a pre-trained LLM and does not modify it in any way.
However, there are several advantages to using RAG:

- **Boosts model accuracy**
  - Leads to fewer hallucinations by providing the right context
- **Less computing power required**
  - Unlike fine-tuning, RAG does not need to re-train any part of the model. It's only the model's prompt that changes.
- **Quick and easy setup**
  - RAG does not require much LLM domain expertise. You don't need to find training data or corresponding labels. Most pieces of text can be uploaded into the vector database as is, without major modifications.
- **Connect to private data**
  - Using RAG, engineers can connect data from SaaS apps such as Notion, Google Drive, HubSpot, Zendesk, etc. to their LLM. The LLM then has access to private data and can help answer questions about the data in these applications.

RAG plays a key role in making LLMs useful, but it can be a bit tedious to set up. There are a number of open-source projects, such as https://github.com/run-llama/llama_index, which can help make the process a bit easier.

## Fine-tuning Image Models

Fine-tuning computer vision models is common practice, used in applications involving object detection, object classification, and image segmentation.

For these non-generative AI use-cases, a baseline model like ResNet or YOLO is fine-tuned on labelled data to detect a new object. Although the baseline model isn't initially trained for the new object, it has already learned useful feature representations. Fine-tuning enables the model to rapidly acquire the features for the new object without starting from scratch.

Data preparation plays a big role in the fine-tuning process for vision-based models. An image of the same object can be taken from multiple angles, in different lighting conditions, against different backgrounds, etc. In order to build a robust dataset for fine-tuning, all of these image variations should be taken into consideration.

### Fine-tuning AI image generation models

```{figure-md} image-generation-fine-tuning
:class: caption
![](https://static.premai.io/book/fine-tuning-image-generation.png)

[Dreambooth Image Generation Fine-tuning](https://dreambooth.github.io)
```

Models such as [Stable Diffusion](https://stability.ai/stable-diffusion) can also be tailored through fine-tuning to generate specific images.
For instance, by supplying Stable Diffusion with a dataset of pet pictures and fine-tuning it on them, the model becomes capable of generating images of that particular pet in diverse styles.

The dataset for fine-tuning an image generation model needs to contain two things:

- **Text**: a description of the object in the image
- **Image**: the picture itself

The text prompts describe the content of each image. During fine-tuning, the text prompt is passed into the text encoder portion of Stable Diffusion while the image is fed into the image encoder. The model learns to generate images that match the textual description based on this text-image pairing in the dataset {cite}`octoml-fine-tuning`.

## Fine-tuning Audio Models

```{figure-md} audio-fine-tuning
:class: caption
![](https://static.premai.io/book/fine-tuning-audio.png)

[Audio Generation Fine-tuning](https://aws.amazon.com/blogs/machine-learning/fine-tune-and-deploy-a-wav2vec2-model-for-speech-recognition-with-hugging-face-and-amazon-sagemaker)
```

Speech-to-text models like [Whisper](https://registry.premai.io/detail.html) can also be fine-tuned. Similar to fine-tuning image generation models, speech-to-text models need two pieces of data:

1. **Audio recording**
2. **Audio transcription**

Preparing a robust dataset is key to building a fine-tuned model. For audio-related data there are a few things to consider:

**Acoustic Conditions:**

* Background noise levels - more noise makes transcription more difficult. Models may need enhanced noise robustness.
* Sound quality - higher quality audio with clear speech is easier to transcribe. Low-bitrate audio is challenging.
* Speaker accents and voice types - diversity of speakers in the training data helps generalisation.
* Audio domains - each domain, like meetings, call centres, or videos, has unique acoustics.

**Dataset Creation:**

* Quantity of training examples - more audio-transcript pairs improve accuracy but take effort to collect.
* Data collection methods - transcription services, scraping, in-house recording. Quality varies.
* Transcript accuracy - high-precision transcripts are essential. Poor transcripts degrade fine-tuning.
* Data augmentation - random noise, speed, and pitch changes make the model more robust.

## Importance of data

```{figure-md} data-centric-ai
:class: caption
![](https://static.premai.io/book/fine-tuning-data-centric.png)

[Data centric AI](https://segments.ai/blog/wandb-integration)
```

The performance of a fine-tuned model largely depends on the **quality** and **quantity** of training data.

For LLMs, the quantity of data can be an important factor when deciding whether to fine-tune or not. There have been many success stories of companies like Bloomberg {cite}`wu2023bloomberggpt`, [McKinsey](https://www.mckinsey.com/about-us/new-at-mckinsey-blog/meet-lilli-our-generative-ai-tool), and [Moveworks] that have either created their own LLM or fine-tuned an existing LLM to obtain better performance than ChatGPT on certain tasks. However, tens of thousands of data points were required in order to make these successful AI bots and assistants.
In the [Moveworks blog post][Moveworks], the fine-tuned model, which surpasses the performance of GPT-4 on certain tasks, was trained on an internal dataset consisting of 70K instructions.

[Moveworks]: https://www.moveworks.com/us/en/resources/blog/moveworks-enterprise-llm-benchmark-evaluates-large-language-models-for-business-applications

In the case of computer vision models, data quality can play a significant role in the performance of the model. Andrew Ng, a prominent researcher and entrepreneur in the field of AI, has been an advocate of data-centric AI, in which the quality of the data is more important than the sheer volume of data {cite}`small-data-tds`.

To summarise, fine-tuning requires a balance between having a large dataset and having a high-quality dataset. The higher the data quality, the higher the chance of increasing the model's performance.

```{table} Estimates of minimum fine-tuning Hardware & Data requirements
:name: memory-data-requirements

Model | Task | Hardware | Data
------|------|----------|-----
LLaMA-2 7B | Text Generation | GPU: 65GB, 4-bit quantised: 10GB | 1K datapoints
Falcon 40B | Text Generation | GPU: 400GB, 4-bit quantised: 50GB | 50K datapoints
Stable Diffusion | Image Generation | GPU: 6GB | 10 images (using Dreambooth)
YOLO | Object Detection | Can be fine-tuned on CPU | 100 images
Whisper | Audio Transcription | GPU: 5GB (medium), 10GB (large) | 50 hours
```

```{admonition} GPU memory for fine-tuning
:name: memory-requirements
:class: note

Most models require a GPU for fine-tuning. To approximate the amount of GPU memory required, the general rule is around 2.5 times the model size. Note that {term}`quantisation` to reduce the size tends to only be useful for inference, not training/fine-tuning. An alternative is to only fine-tune some layers (freezing and quantising the rest), thus greatly reducing memory requirements.

For example: to fine-tune a `float32` (i.e. 4-byte) 7B parameter model:

$$
7 \times 10^{9}~\mathrm{params} \times 4~\mathrm{B/param} \times 2.5 = 70~\mathrm{GB}
$$
```

## Future

Fine-tuning models has long been common practice for ML engineers. It allows engineers to quickly build domain-specific models without having to design the neural network from scratch.

Developer tools for fine-tuning continue to improve the overall experience of creating one of these models while reducing the time to market. Companies like [Hugging Face](https://huggingface.co/docs/transformers/training) are building open-source tools to make fine-tuning easy. On the commercial side, companies like [Roboflow](https://roboflow.com) and [Scale AI](https://scale.com/generative-ai-platform) provide platforms for teams to manage the full life-cycle of a model.

Overall, fine-tuning has become a crucial technique for adapting large pre-trained AI models to custom datasets and use cases. While the specific implementation details vary across modalities, the core principles are similar: leverage a model pre-trained on vast data, freeze most parameters, add a small tunable component customised for your dataset, and update some weights to adapt the model.

When applied correctly, fine-tuning enables practitioners to build real-world solutions using leading large AI models.
240 | 241 | {{ comments }} 242 | -------------------------------------------------------------------------------- /hardware.md: -------------------------------------------------------------------------------- 1 | # Hardware 2 | 3 | ```{admonition} Work in Progress 4 | :class: attention 5 | {{ wip_chapter }} 6 | 7 | Some ideas: 8 | 9 | - [AI and Memory Wall](https://medium.com/riselab/ai-and-memory-wall-2cb4265cb0b8) 10 | - https://gist.github.com/veekaybee/be375ab33085102f9027853128dc5f0e#deployment 11 | - https://www.youtube.com/watch?v=r5NQecwZs1A 12 | ``` 13 | 14 | ## Machine Learning and GPUs 15 | 16 | % TODO: add links/citations 17 | 18 | GPUs are particularly well-suited for the types of computations required in AI for several reasons: 19 | 20 | 1. **Parallelisation**: Deep learning models involve a lot of matrix multiplications and other operations that can be parallelised. A single GPU can have thousands of cores, allowing it to execute many operations simultaneously, which can lead to a significant speedup in training and inference times. 21 | 2. **Specialised Hardware**: Modern GPUs have specialised hardware for performing certain types of operations that are common in deep learning, such as matrix multiplications and convolutions. For example, NVIDIA's Volta and Turing architectures include Tensor Cores, which are specialised hardware units designed to accelerate mixed-precision matrix multiply-and-accumulate operations. 22 | 3. **High Memory Bandwidth**: GPUs have much higher memory bandwidth compared to CPUs, which allows them to transfer data to and from memory much more quickly. This is important for deep learning models, which often involve large amounts of data. 23 | 4. **Software Support**: There is a lot of software support for GPU computing in popular deep learning frameworks like TensorFlow and PyTorch. These frameworks provide high-level APIs that make it easy to develop models and run them on GPUs, without having to write low-level GPU code. 24 | 5. **Energy Efficiency**: Training deep learning models can be very computationally intensive, and GPUs are generally more energy-efficient than CPUs for these types of computations. 25 | 6. **Availability**: Unlike much other specialized numerical computing hardware, GPUs are mass produced for the consumer market. Although specialized data-center and embedded variants exist, mid-tier GPUs can be [easily purchased](https://www.amazon.com/s?k=nvidia+24gb) by consumers and installed in a workstation or PC. 26 | 27 | For these reasons, GPUs are often the preferred hardware for training and deploying deep learning models. That said, there are other types of hardware that can also be used for deep learning, such as TPUs (Tensor Processing Units), which are custom accelerators designed by Google specifically for deep learning. 28 | 29 | ## Types of GPUs 30 | 31 | 1. **NVIDIA GPUs**: NVIDIA is currently the dominant player in the GPU market for machine learning applications. Their GPUs are widely used in both research and commercial applications. NVIDIA provides a comprehensive ecosystem of software tools and libraries for machine learning, including CUDA and cuDNN (CUDA Deep Neural Network library), which are essential for training deep neural networks. The NVIDIA A100 GPU, for example, is designed specifically for AI and data analytics. 32 | 2. **AMD GPUs**: AMD GPUs are also used for machine learning, but they are not as popular as NVIDIA GPUs. 
AMD provides the ROCm (Radeon Open Compute) platform, which is an open-source software platform for GPU-enabled HPC (High-Performance Computing) and machine learning applications. However, the software ecosystem for AMD GPUs is not as mature as for NVIDIA GPUs. 33 | 3. **Apple Silicon GPUs**: Apple has developed its own GPUs for its Apple Silicon chips, like the M1. These GPUs are optimised for low power consumption and are used in Apple devices like the MacBook Air, MacBook Pro, Mac Mini, and iPad Pro. The performance of these GPUs is quite good for mobile and integrated GPUs, but they are not suitable for high-performance machine learning tasks. 34 | 4. **Intel GPUs**: Intel is also developing GPUs for machine learning applications. Their upcoming Intel Xe GPUs are expected to provide competitive performance for machine learning tasks. Intel also provides the oneAPI toolkit, which includes a library (oneDNN) for deep neural networks. 35 | 5. **Google TPUs (Tensor Processing Units)**: Although not technically GPUs, Google's TPUs are custom accelerators for machine learning tasks. They are designed to provide high performance and efficiency for both training and inference of machine learning models. TPUs are available through Google's cloud computing services. 36 | 37 | Each of these options has its own advantages and disadvantages in terms of performance, power consumption, software support, and cost. NVIDIA GPUs are currently the most popular choice for machine learning applications due to their high performance and mature software ecosystem. 38 | 39 | ## Programming for GPUs 40 | 41 | ### NVIDIA GPUs 42 | 43 | #### CUDA 44 | 45 | To interact with NVIDIA GPUs, you will primarily use CUDA. CUDA is a parallel computing platform & programming model developed by NVIDIA for general computing on its GPUs {cite}`cuda-gpus`. 46 | 47 | Here are the main components you will interact with: 48 | 49 | 1. [**CUDA Toolkit**](https://developer.nvidia.com/cuda-downloads), which includes: 50 | - **CUDA libraries**: e.g. `cuBLAS` for linear algebra, `cuDNN` for deep learning, and others for FFTs, sparse matrices, and more 51 | - [**CUDA runtime**](https://docs.nvidia.com/cuda/cuda-c-programming-guide/#cuda-runtime) (`cudart`) 52 | - [**CUDA compiler**](https://docs.nvidia.com/cuda/cuda-c-programming-guide/#compilation-with-nvcc) (`nvcc`) 53 | - [**NVIDIA drivers**](https://www.nvidia.com/Download/index.aspx): allow your operating system & programs to communicate with your NVIDIA graphics card 54 | 2. [**CUDA Language**](https://docs.nvidia.com/cuda/cuda-c-programming-guide): an extension of the C/C++ programming language which includes [some additional keywords & constructs](https://docs.nvidia.com/cuda/cuda-c-programming-guide/#c-language-extensions) for writing parallel code. 55 | 56 | Here is a basic workflow for using NVIDIA GPUs: 57 | 58 | 1. **Install NVIDIA drivers & CUDA Toolkit**, using one of the following (depending on your taste): 59 | - [Developer download matrix (recommended)](https://developer.nvidia.com/cuda-downloads) 60 | - [Quickstart guide (slightly more detailed)](https://docs.nvidia.com/cuda/cuda-quick-start-guide) 61 | - [Quickstart videos (if you prefer eye-candy)](https://developer.nvidia.com/how-to-cuda-c-cpp) 62 | - Full Guide for [Linux](https://docs.nvidia.com/cuda/cuda-installation-guide-linux) or [Windows](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows) 63 | 2. 
[**Write your code**](https://docs.nvidia.com/cuda/cuda-c-programming-guide): Use the CUDA programming language (an extension of C/C++) to write your code. This will involve writing kernel functions that will be executed on the GPU, and host code that will be executed on the CPU.
3. **Compile your code**: Use the NVCC compiler (included in the CUDA Toolkit) to compile your code.
4. **Run your code**: Run your compiled code on an NVIDIA GPU.

For example, here is a simple CUDA program that adds two vectors:

```cpp
#include "cuda_runtime.h"
#include <stdio.h>

/// CUDA kernel function for vector addition (dst = srcA + srcB)
__global__ void vectorAdd(float *const dst, const float *const srcA, const float *const srcB, int numElements) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < numElements) dst[i] = srcA[i] + srcB[i];
}

int main(void) {
  // Allocate & initialise host (CPU) & device (GPU) memory
  const int numElements = 1337;
  float *srcA, *srcB, *dst;
  cudaMallocManaged((void **)&srcA, numElements * sizeof(float));
  cudaMallocManaged((void **)&srcB, numElements * sizeof(float));
  cudaMallocManaged((void **)&dst, numElements * sizeof(float));
  for (int i = 0; i < numElements; ++i) { srcA[i] = i; srcB[i] = 2.0f * i; }

  // Launch the vectorAdd kernel
  const int threadsPerBlock = 256;
  const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
  vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(dst, srcA, srcB, numElements);
  cudaDeviceSynchronize();

  // clean up memory
  cudaFree((void *)srcA);
  cudaFree((void *)srcB);
  cudaFree((void *)dst);
  // ...
}
```

In this example, `srcA`, `srcB`, and `dst` are memory pointers to linear vectors (of size `numElements`). Note that the CUDA compiler automatically converts these to host (CPU) or device (GPU) memory pointers (and copies data between host & device) when appropriate. The `vectorAdd` "kernel" (GPU function) is launched with `blocksPerGrid` blocks, each containing `threadsPerBlock` threads. Each thread computes the sum of one pair of elements from `srcA` and `srcB`, and stores the result in `dst`.

```{admonition} High-level wrappers
:class: seealso
Note that wrappers for other programming languages exist (e.g. [Python](https://developer.nvidia.com/how-to-cuda-python)), allowing control of CUDA GPUs while writing code in more concise & user-friendly languages.

% TODO: RAPIDS, CuPy, CuVec etc
```
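For instance, the vector addition above collapses to just a few lines of Python with a wrapper like CuPy -- a minimal sketch, assuming `cupy` is installed with a build matching your CUDA toolkit version:

```python
import cupy as cp

numElements = 1337
srcA = cp.arange(numElements, dtype=cp.float32)  # allocated directly in GPU memory
srcB = cp.ones(numElements, dtype=cp.float32)

dst = srcA + srcB             # element-wise addition runs as a CUDA kernel under the hood
print(cp.asnumpy(dst)[:5])    # explicitly copy the result back to host memory
```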

#### Vulkan

Vulkan is a low-level graphics and compute API developed by the Khronos Group. It provides fine-grained control over the GPU and is designed to minimise CPU overhead and provide more consistent performance. Vulkan can be used for a variety of applications, including gaming, simulation, and scientific computing.

Vulkan is supported on a wide variety of platforms, including Windows, Linux, macOS (via MoltenVK, a Vulkan implementation that runs on top of Metal), Android, and iOS. Vulkan has a somewhat steep learning curve because it is a very low-level API, but it provides a lot of flexibility and can lead to very high performance.

### AMD GPUs

For AMD GPUs, you can use the ROCm (Radeon Open Compute) platform, which is an open-source software platform for GPU-enabled HPC (High-Performance Computing) and machine learning applications.

Here are the main components of the ROCm platform:

1. **ROCm Runtime**: This is the core of the ROCm platform. It includes the ROCr System Runtime, which is a user-space system runtime for managing GPU applications, and the ROCt Thunk Interface, which provides a low-level interface to the GPU kernel driver.
2. **ROCm Driver**: This is the kernel driver for AMD GPUs. It includes the AMDGPU driver, which is the open-source kernel driver for AMD Radeon graphics cards.
3. **ROCm Libraries**: These are a set of libraries optimised for AMD GPUs. They include rocBLAS for basic linear algebra, rocFFT for fast Fourier transforms, and rocRAND for random number generation.
4. **ROCm Tools**: These are a set of tools for developing and debugging applications on AMD GPUs. They include the ROCm SMI (System Management Interface) for monitoring and managing GPU resources, and the ROCgdb debugger for debugging GPU applications.

To develop applications for AMD GPUs using the ROCm platform, you will need to:

1. **Install the necessary software**: This includes the ROCm platform, and any other libraries or tools you need.
2. **Write your code**: You can use the HIP programming language, which is a C++ runtime API and kernel language that allows you to write portable GPU code that can run on both AMD and NVIDIA GPUs. HIP code can be compiled to run on AMD GPUs using the HIP-Clang compiler, or on NVIDIA GPUs using the NVCC compiler.
3. **Compile your code**: Use the HIP-Clang compiler to compile your code for AMD GPUs, or the NVCC compiler for NVIDIA GPUs.
4. **Run your code**: Run your compiled code on an AMD or NVIDIA GPU.

For example, here is a simple HIP program that adds two vectors:

```cpp
#include "hip/hip_runtime.h"
#include <stdio.h>

/// HIP kernel function for vector addition (dst = srcA + srcB)
__global__ void vectorAdd(float *const dst, const float *const srcA, const float *const srcB, int numElements) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < numElements) dst[i] = srcA[i] + srcB[i];
}

int main(void) {
  // Allocate and initialise host (CPU) & device (GPU) memory
  // ...

  // Launch the vectorAdd kernel
  const int threadsPerBlock = 256;
  const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
  hipLaunchKernelGGL(
    vectorAdd, dim3(blocksPerGrid), dim3(threadsPerBlock), 0, 0, dst, srcA, srcB, numElements);

  // Copy result from device to host & clean up memory
  // ...
}
```

In this example, `srcA`, `srcB`, and `dst` are pointers to device-accessible memory, and `numElements` is the number of elements in each vector. The `vectorAdd` kernel is launched with `blocksPerGrid` blocks, each containing `threadsPerBlock` threads. Each thread computes the sum of one pair of elements from `srcA` and `srcB`, and stores the result in `dst`.

Note that this example is very similar to the CUDA example above. This is because the HIP programming language is designed to be similar to CUDA, which makes it easier to port CUDA code to run on AMD GPUs.

### Apple Silicon GPUs

#### Metal

Apple Silicon GPUs, which are part of Apple's custom M1 chip, can be programmed using the Metal framework. Metal is a graphics and compute API developed by Apple, and it's available on all Apple devices, including Macs, iPhones, and iPads.

Here are the main components of the Metal framework:

1. **Metal API**: This is a low-level API that provides access to the GPU. It includes functions for creating and managing GPU resources, compiling shaders, and submitting work to the GPU.
2. **Metal Shading Language (MSL)**: This is the programming language used to write GPU code (shaders) in Metal.
It is based on the C++14 programming language and includes some additional features and keywords for GPU programming.
3. **MetalKit and Metal Performance Shaders (MPS)**: These are higher-level frameworks built on top of Metal. MetalKit provides functions for managing textures, meshes, and other graphics resources, while MPS provides highly optimised functions for common image processing and machine learning tasks.

Here is a basic workflow for using Metal to perform GPU computations on Apple Silicon:

1. **Install the necessary software**: This includes the Xcode development environment, which includes the Metal framework and compiler.
2. **Write your code**: Write your GPU code using the Metal Shading Language, and your host code using Swift or Objective-C. Your host code will use the Metal API to manage GPU resources and submit work to the GPU.
3. **Compile your code**: Use the Xcode development environment to compile your code.
4. **Run your code**: Run your compiled code on an Apple device with an Apple Silicon GPU.

For example, here is a simple Metal program that adds two vectors:

```swift
import Metal

// Create a Metal device and command queue
let device = MTLCreateSystemDefaultDevice()!
let commandQueue = device.makeCommandQueue()!

// Create a Metal library and function
let library = device.makeDefaultLibrary()!
let function = library.makeFunction(name: "vector_add")!

// Create a Metal compute pipeline
let pipeline = try! device.makeComputePipelineState(function: function)

// Allocate and initialise host and device memory
let numElements = 1024
let bufferSize = numElements * MemoryLayout<Float>.size
let h_A = [Float](repeating: 1.0, count: numElements)
let h_B = [Float](repeating: 2.0, count: numElements)
let d_A = device.makeBuffer(bytes: h_A, length: bufferSize, options: [])!
let d_B = device.makeBuffer(bytes: h_B, length: bufferSize, options: [])!
let d_C = device.makeBuffer(length: bufferSize, options: [])!

// Create a Metal command buffer and encoder
let commandBuffer = commandQueue.makeCommandBuffer()!
let commandEncoder = commandBuffer.makeComputeCommandEncoder()!

// Set the compute pipeline and buffers
commandEncoder.setComputePipelineState(pipeline)
commandEncoder.setBuffer(d_A, offset: 0, index: 0)
commandEncoder.setBuffer(d_B, offset: 0, index: 1)
commandEncoder.setBuffer(d_C, offset: 0, index: 2)

// Dispatch the compute kernel
let threadsPerThreadgroup = MTLSize(width: 256, height: 1, depth: 1)
let numThreadgroups = MTLSize(width: (numElements + 255) / 256, height: 1, depth: 1)
commandEncoder.dispatchThreadgroups(numThreadgroups, threadsPerThreadgroup: threadsPerThreadgroup)

// End the command encoder and commit the command buffer
commandEncoder.endEncoding()
commandBuffer.commit()

// Wait for the command buffer to complete
commandBuffer.waitUntilCompleted()

// Copy the result from device to host
let h_C = UnsafeMutablePointer<Float>.allocate(capacity: numElements)
UnsafeMutableRawPointer(h_C).copyMemory(from: d_C.contents(), byteCount: bufferSize)

// ...
// Clean up
// ...
```

In this example, `d_A`, `d_B`, and `d_C` are Metal buffers, and `numElements` is the number of elements in each vector.
The `vector_add` function is a Metal shader written in the Metal Shading Language, and it is executed on the GPU using a Metal compute command encoder. 240 | 241 | Note that this example is written in Swift, which is the recommended programming language for developing Metal applications. You can also use Objective-C, but Swift is generally preferred for new development. 242 | 243 | This example is quite a bit more complex than the earlier CUDA and HIP examples, because Metal is a lower-level API that provides more fine-grained control over the GPU. This can lead to more efficient code, but it also requires more boilerplate code to set up and manage GPU resources. 244 | 245 | #### Metal Performance Shaders (MPS) 246 | 247 | **Metal Performance Shaders (MPS)** is a framework that provides highly optimised functions for common image processing and machine learning tasks. MPS is built on top of the Metal framework and is available on all Apple devices, including Macs, iPhones, and iPads. 248 | 249 | MPS includes a variety of functions for image processing (e.g., convolution, resizing, and histogram calculation), as well as a set of neural network layers (e.g., convolution, pooling, and normalisation) that can be used to build and run neural networks on the GPU. 250 | 251 | MPS is a higher-level API than Metal, which makes it easier to use, but it provides less flexibility. If you are developing an application for Apple devices and you need to perform image processing or machine learning tasks, MPS is a good place to start. 252 | 253 | ### Cross Platform Graphics APIs 254 | 255 | #### Vulkan 256 | 257 | **Vulkan** is a low-level graphics and compute API developed by the Khronos Group. It provides fine-grained control over the GPU and is designed to minimise CPU overhead and provide more consistent performance. Vulkan can be used for a variety of applications, including gaming, simulation, and scientific computing. 258 | 259 | Vulkan is supported on a wide variety of platforms, including Windows, Linux, macOS (via MoltenVK, a Vulkan implementation that runs on top of Metal), Android, and iOS. Vulkan has a somewhat steep learning curve because it is a very low-level API, but it provides a lot of flexibility and can lead to very high performance. 260 | 261 | Vulkan is designed to be a cross-platform API. It is supported on a wide variety of platforms, including Windows, Linux, macOS (via MoltenVK, a layer that maps Vulkan to Metal), Android, and iOS. This makes it a good choice for developing applications that need to run on multiple platforms. 262 | 263 | #### OpenGL 264 | 265 | **OpenGL** is a cross-platform graphics API developed by the Khronos Group. It is widely used for developing graphics applications, including games, simulations, and design tools. OpenGL is a higher-level API than Vulkan, which makes it easier to use, but it provides less control over the GPU and may have more CPU overhead. 266 | 267 | OpenGL is supported on a wide variety of platforms, including Windows, macOS, Linux, and Android. However, Apple has deprecated OpenGL on its platforms in favor of Metal, so if you are developing an application for Apple devices, it is recommended to use Metal instead of OpenGL. 268 | 269 | Each of these APIs has its own strengths and weaknesses, and the best one to use depends on your specific application and requirements. If you are developing a cross-platform application and need a low-level API, Vulkan is a good choice. 
If you are developing an application for Apple devices and need to perform image processing or machine learning tasks, MPS is a good choice. If you are developing a graphics application and need a higher-level API, OpenGL may be a good choice, although you should consider using Metal on Apple devices. 270 | 271 | #### DirectX 272 | 273 | **DirectX** is a collection of APIs for handling tasks related to multimedia, game programming, and video, on Microsoft platforms. While it's most commonly associated with Windows, it is also available on Xbox. Note that DirectX is not fully cross-platform, as it doesn't support macOS or Linux. 274 | 275 | #### OpenCL 276 | 277 | **OpenCL** is a framework for writing programs that execute across heterogeneous platforms consisting of CPUs, GPUs, and other processors. OpenCL includes a language (based on C99) for writing kernels (i.e., functions that run on the hardware devices), plus APIs that are used to define and then control the platforms. OpenCL provides parallel computing using task-based and data-based parallelism. 278 | 279 | #### WebGL and WebGPU 280 | 281 | **WebGL** is a web-based graphics API that is based on OpenGL ES. It allows you to create 3D graphics in a web browser. Since it's web-based, it is supported on all major platforms and web browsers. While on the other hand, **WebGPU** is a new web-based graphics and compute API that is currently being developed by the W3C GPU for the Web Community Group. It is designed to provide modern 3D graphics and computation capabilities in web browsers, and it is intended to be the successor to WebGL. 282 | 283 | WebGPU aims to provide a more modern and lower-level API than WebGL, which will allow for better performance and more flexibility. It is designed to be a web-friendly API that can be implemented on top of other graphics APIs, such as Vulkan, Metal, and DirectX. 284 | 285 | WebGPU is still in development, and it is not yet widely supported in web browsers. However, it is an exciting development for web-based graphics and computation, and it is worth keeping an eye on if you are developing web applications that require high-performance graphics or computation. 286 | 287 | WebGPU will be a cross-platform API because it will be supported in web browsers on multiple platforms. However, the actual implementation of WebGPU in the browser may use different underlying graphics APIs, depending on the platform. For example, a browser on Windows may use a DirectX-based implementation of WebGPU, while a browser on macOS may use a Metal-based implementation. This will be transparent to the application developer, who will just use the WebGPU API. 288 | 289 | ```{admonition} Work in Progress 290 | :class: attention 291 | An entire chapter will be dedicated to WebGPU (coming soon!) 
292 | ``` 293 | 294 | ### Benchmarks 295 | 296 | ```{admonition} Work in Progress 297 | :class: attention 298 | Table with benchmarks 299 | ``` 300 | 301 | ### Acceleration Libraries 302 | 303 | - **OpenBLAS** 304 | - **CuBLAS** 305 | - **cuDNN** 306 | - **OpenCL** 307 | 308 | ## Cloud 309 | 310 | - cost comparisons 311 | + user-friendly: https://fullstackdeeplearning.com/cloud-gpus 312 | + less user-friendly but more comprehensive: https://cloud-gpus.com 313 | + comparisons of both features and pricing for GPU cloud providers: https://www.gpucloudpricing.com 314 | + LLM-specific advice: https://gpus.llm-utils.org/cloud-gpu-guide/#which-gpu-cloud-should-i-use 315 | 316 | ## Future 317 | 318 | One problem with using current {term}`LLMs ` is the high GPU memory requirements. One popular work-around is {term}`quantisation`. However, this requires hardware manufacturers to build support for quantised operations ({term}`SIMD` instruction sets), and ML libraries to rewrite/reimplement core parts of their codebase to support the new operations. Also recall that CPU-based SIMD instruction sets (e.g. [SSE4](https://en.wikipedia.org/wiki/SSE4) & [AVX10](https://en.wikipedia.org/wiki/AVX10) for PCs and [NEON]() for mobiles) took many years to develop, and are still actively evolving. By comparison, GPU architectures have much less adoption & development, so new arithmetic operations will take years to be widely supported. 319 | 320 | {{ comments }} 321 | -------------------------------------------------------------------------------- /index.md: -------------------------------------------------------------------------------- 1 | # State of Open Source AI Book - 2023 Edition 2 | 3 | {{ '```{badges} %s %s\n:doi: %s\n```' % (baseurl, env.config.html_theme_options.repository_url, doi) }} 4 | 5 | *Clarity in the current fast-paced mess of Open Source innovation {cite}`prem_stateofosai`* 6 | 7 | As a data scientist/ML engineer/developer with a 9 to 5 job, it's difficult to keep track of all the innovations. There's been enormous progress in the field in {term}`the last year `. 8 | 9 | Cure your FOMO with this guide, covering all the most important categories in the Open Source AI space, from model evaluations to deployment. It includes a [](#glossary) for you to quickly check definitions of new frameworks & tools. 10 | 11 | A quick TL;DR overview is included at the top of each section. We outline the pros/cons and general context/background for each topic. Then we dive a bit deeper. Examples include data models were trained on, and deployment implementations. 12 | 13 | ## Who is This Guide For? 14 | 15 | ```{admonition} Prerequisites to Reading 16 | :class: warning 17 | You should already know the basic principles of MLOps {cite}`google-mlops,redhat-mlops,ml-ops`, i.e. you should know that the traditional steps are: 18 | 19 | 1. Data engineering (preprocessing, curation, labelling, sanitisation) 20 | 2. Model engineering (training, architecture design) 21 | 3. Automated testing (CI) 22 | 4. Deployment/Automated Inference (CD) 23 | 5. Monitoring (logging, feedback, drift detection) 24 | ``` 25 | 26 | You haven't followed the most recent developments in open source AI over {term}`the last year `, and want to catch up quickly. 27 | We go beyond just mentioning the models, but also include things such as changing infrastructure, licence pitfalls, and novel applications. 
28 | 29 | (toc)= 30 | 31 | ## Table of Contents 32 | 33 | We've divided the open-source tooling, models, & MLOps landscape into the following chapters: 34 | 35 | Chapter | Description 36 | ---|--- 37 | [](licences) | Weights vs Data, Commercial use, Fair use, Pending lawsuits 38 | [](eval-datasets) | Leaderboards & Benchmarks for Text/Visual/Audio models 39 | [](models) | LLaMA 1 vs 2, Stable Diffusion, DALL-E, Persimmon, ... 40 | [](unaligned-models) | FraudGPT, WormGPT, PoisonGPT, WizardLM, Falcon 41 | [](fine-tuning) | LLMs, Visual, & Audio models 42 | [](model-formats) | ONNX, GGML, TensorRT 43 | [](mlops-engines) | vLLM, TGI, Triton, BentoML, ... 44 | [](vector-db) | Weaviate, Qdrant, Milvus, Redis, Chroma, ... 45 | [](sdk) | LangChain, LLaMA Index, LiteLLM 46 | [](desktop-apps) | LMStudio, GPT4All, Koboldcpp, ... 47 | [](hardware) | NVIDIA CUDA, AMD ROCm, Apple Silicon, Intel, TPUs, ... 48 | 49 | ## Contributing 50 | 51 | This source of this guide is available on GitHub at {{ env.config.html_theme_options.repository_url }}. 52 | 53 | ```{admonition} Feedback 54 | :class: attention 55 | The current open-source ecosystem is moving at light-speed. 56 | Spot something outdated or missing? Want to start a discussion? We welcome any of the following: 57 | 58 | - let us know in the comments at the end of each chapter 59 | - [ create issues](https://docs.github.com/en/issues/tracking-your-work-with-issues/creating-an-issue) 60 | - [ open pull requests](https://docs.github.com/en/get-started/quickstart/contributing-to-projects) 61 | ``` 62 | 63 | ### Editing the Book 64 | 65 | - Using {{ '[GitHub Codespaces](https://codespaces.new/{})'.format( 66 | '/'.join(env.config.html_theme_options.repository_url.split('/')[-2:])) }}, you can edit code & preview the site in your browser without installing anything (you may [have to whitelist `github.dev`, `visualstudio.com`, `github.com`, & `trafficmanager.net`](https://docs.github.com/en/codespaces/the-githubdev-web-based-editor#using-githubdev-behind-a-firewall) if you use an adblocker). 67 | - Alternatively, to run locally, open {{ '[this repository]({})'.format(env.config.html_theme_options.repository_url) }} in a [Dev Container](https://containers.dev) (most likely [using VSCode](https://code.visualstudio.com/docs/devcontainers/containers#_installation)). 68 | - Or instead, manually set up your own Python environment: 69 | 70 | ```sh 71 | pip install -r requirements.txt # setup 72 | jupyter-book build --builder dirhtml --all . # build 73 | python -m http.server -d _build/dirhtml # serve 74 | ``` 75 | 76 | ````{admonition} alternative: live rebuilding & serving (experimental) 77 | :class: tip, dropdown 78 | ```sh 79 | pip install -r requirements.txt sphinx-autobuild # setup 80 | jupyter-book config sphinx . # config 81 | sphinx-autobuild -b dirhtml . 
_build/dirhtml # build-serve 82 | ``` 83 | ```` 84 | 85 | ### Formatting 86 | 87 | ```{note} 88 | Don't worry about making it perfect, it's fine to open a ([draft](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-pull-requests#draft-pull-requests)) PR and [allow edits from maintainers](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request-from-a-fork) to fix it ♥ 89 | ``` 90 | 91 | - [Quickstart](https://jupyterbook.org/en/stable/reference/cheatsheet.html) 92 | - [Full reference](https://jupyterbook.org/en/stable/content/myst.html) 93 | - Create a new chapter: 94 | + create `some-file.md` (containing `# Some File` heading and `{{ comments }}` footer) 95 | + add `- file: some-file` to `_toc.yml` 96 | + add `[](some-file) | summary` to [ToC](toc) 97 | - Images: use [`{figure}`/`{figure-md}` with captions](https://myst-parser.readthedocs.io/en/latest/syntax/images_and_figures.html#figures-images-with-captions) 98 | 99 | ```{figure} https://static.premai.io/logo.png 100 | :name: fig-ref 101 | :width: 150px 102 | :alt: alt-text 103 | 104 | This is a **figure caption** 105 | ``` 106 | 107 | + [inline ref](fig-ref) 108 | + numbered ref: {numref}`fig-ref` 109 | + custom ref: {numref}`Figure {number} with caption "{name}" ` 110 | + please use https://github.com/premAI-io/static.premai.io to host images & data 111 | 112 | - Tables: use [`{table}` with captions](https://myst-parser.readthedocs.io/en/latest/syntax/tables.html#table-with-captions) 113 | - [](#glossary) term: {term}`GPU` 114 | + custom inline text: {term}`GPUs ` 115 | - Citations: 116 | + add [BibTeX](https://jupyterbook.org/en/stable/tutorials/references.html#add-your-references) entries to `references.bib`, e.g.: 117 | * blogs, wikis: `@online` 118 | * docs: [`@manual`](https://www.bibtex.com/e/entry-types/#manual) 119 | * journal articles, news articles: [`@article`](https://www.bibtex.com/e/article-entry) 120 | * conference proceedings: [`@proceedings`](https://www.bibtex.com/e/entry-types/#proceedings) 121 | * books: [`@book`](https://www.bibtex.com/e/book-entry) 122 | * whitepapers: [`@techreport`](https://www.bibtex.com/e/entry-types/#techreport) 123 | * chapters/parts of larger work: [`@incollection`](https://www.bibtex.com/e/entry-types/#incollection), [`@inbook`](https://www.bibtex.com/e/entry-types/#inbook) 124 | * drafts: [`@unpublished`](https://www.bibtex.com/e/entry-types/#unpublished) 125 | + citing things defined in `references.bib`: {cite}`prem_stateofosai,python` 126 | + GitHub links: 127 | * repos: https://github.com/premAI-io/state-of-open-source-ai 128 | * issues: https://github.com/premAI-io/state-of-open-source-ai/issues/12 129 | * code (folder/file): [premAI-io/state-of-open-source-ai:index.md](https://github.com/premAI-io/state-of-open-source-ai/blob/main/index.md) 130 | * readme sections: [premAI-io/prem-app#demo](https://github.com/premAI-io/prem-app#demo) 131 | - New [Sphinx extensions](https://www.sphinx-doc.org/en/master/usage/extensions): append to `requirements.txt` and `_config.yml:sphinx.extra_extensions` 132 | - `linkcheck` false-positives: append to `_config.yml:sphinx.config.linkcheck*` 133 | 134 | % comment lines (not rendered) are prefixed with a "%" 135 | 136 | ### Contributors 137 | 138 | Anyone who adds a few sentences to a chapter is {{ 139 | '[automatically mentioned in the respective 
chapter]({}/blob/main/committers.py)'.format( 140 | env.config.html_theme_options.repository_url) }} as well as below. 141 | 142 | {{ '[![](https://contrib.rocks/image?anon=1&repo={})]({}/graphs/contributors)'.format( 143 | '/'.join(env.config.html_theme_options.repository_url.split('/')[-2:]), 144 | env.config.html_theme_options.repository_url) }} 145 | 146 | - Editor: Casper da Costa-Luis (https://github.com/casperdcl) 147 | 148 | > With a strong [academic background](https://cdcl.ml/learn) as well as [industry expertise](https://cdcl.ml/work) to back up his enthusiasm for all things open source, Casper is happy to help with all queries related to this book. 149 | 150 | - Maintainer: https://github.com/PremAI-io 151 | 152 | > Our vision is to engineer a world where individuals, developers, and businesses can embrace the power of AI without compromising their privacy. We believe in a future where users retain ownership of their data, AND the models trained on it. 153 | 154 | - Citing this book: {cite}`prem_stateofosai` 155 | 156 | ## Conclusion 157 | 158 | ```{epigraph} 159 | All models are wrong, but some are useful 160 | 161 | -- G.E.P. Box {cite}`box-models` 162 | ``` 163 | 164 | % TODO: rewrite 165 | 166 | Open Source AI represents the future of privacy and data ownership. Making that future real, however, requires a great deal of further innovation. Over the last year the open-source community has already shown how motivated it is to put quality models into consumers' hands, delivering several significant breakthroughs across different AI fields. Still, this is just the beginning: improvements in many directions are needed before open results can rival those of centralised solutions. 167 | 168 | At Prem we are on a journey to make this possible, with a focus on developer experience and deployment for all kinds of developers, from web developers with zero AI knowledge to seasoned data scientists who want to quickly deploy and try these new models and technologies in their existing infrastructure without compromising privacy. 169 | 170 | ## Join our Community 171 | 172 | - Ask for support on [our Discord server](https://discord.com/invite/kpKk6vYVAn). 173 | - To keep up-to-date, [follow us on Twitter](https://twitter.com/premai_io). 174 | - Report bugs or request features at https://github.com/premAI-io/prem-app. 175 | 176 | ## Glossary 177 | 178 | %TODO: define all these & use them where appropriate 179 | 180 | ```{glossary} 181 | Alignment 182 | [Aligned AI models](https://en.wikipedia.org/wiki/AI_alignment) must implement safeguards to be helpful, honest, and harmless {cite}`labellerr-alignment`. 183 | This often involves {term}`supervised fine-tuning` followed by {term}`RLHF`. See [](unaligned-models) and [](fine-tuning). 184 | 185 | Auto-regressive language model 186 | Applies [AR](https://en.wikipedia.org/wiki/Autoregressive_model) to {term}`LLMs <LLM>`. Essentially a feed-forward model which predicts the next word given a context (set of words) {cite}`medium-arlm`. 187 | 188 | BEC 189 | [Business Email Compromise](https://www.microsoft.com/en-us/security/business/security-101/what-is-business-email-compromise-bec). 190 | 191 | Benchmark 192 | A curated dataset and corresponding tasks designed to evaluate models' real-world performance metrics (so that models can be {term}`compared to each other <Leaderboard>`). 194 | Copyleft 195 | A type of [open licence](open-licences) which insists that derivatives of the IP must have the same licence.
Also called "protective" or "reciprocal" {cite}`wiki-copyleft`. 196 | 197 | Embedding 198 | See {term}`vector embedding`. 199 | 200 | Evaluation 201 | Assessing a model's abilities using quantitative and qualitative performance metrics (e.g. accuracy, effectiveness, etc.) on a given task. See [](eval-datasets). 202 | 203 | Fair Dealing 204 | A doctrine in UK & Commonwealth law permitting use of {term}`IP` without prior permission under certain conditions (typically research, criticism, reporting, or satire) {cite}`wiki-fair-dealing`. See also {term}`fair use`. 205 | 206 | Fair Use 207 | A doctrine in US law permitting use of {term}`IP` without prior permission (regardless of licence/copyright status) depending on 1) purpose of use, 2) nature of the IP, 3) amount of use, and 4) effect on value {cite}`wiki-fair-use`. See also {term}`fair dealing`. 208 | 209 | Fine-tuning 210 | [Fine-tuning](https://en.wikipedia.org/wiki/Fine-tuning_(deep_learning)) is a technique in transfer learning where a pre-trained model's already-learned features or parameters are further adjusted using data specific to the new task, enabling the model to specialise and improve its performance on the target task. See also [](fine-tuning) and {term}`transfer learning`. 211 | 212 | Foundation model 213 | A model trained from scratch -- likely on lots of data -- to be used for general tasks or later fine-tuned for specific tasks. 214 | 215 | GPU 216 | [Graphics Processing Unit](https://en.wikipedia.org/wiki/Graphics_processing_unit): hardware originally designed to accelerate computer image processing, but now often repurposed for [embarrassingly parallel](https://en.wikipedia.org/wiki/Embarrassingly_parallel) computational tasks in machine learning. 217 | 218 | Hallucination 219 | A model generating output that is inexplicable by its training data. 220 | 221 | IP 222 | [Intellectual Property](https://en.wikipedia.org/wiki/Intellectual_property): intangible creations by humans (e.g. code, text, art), typically legally protected from use without permission of the author(s). 223 | 224 | Leaderboard 225 | Ranking of models based on their performance metrics on the same {term}`benchmark(s) <Benchmark>`, allowing fair task-specific comparison. See [](leaderboards-table). 226 | 227 | LLM 228 | A [Large Language Model](https://en.wikipedia.org/wiki/Large_language_model) is a neural network (often a {term}`transformer` containing billions of parameters) designed to perform tasks in natural language via {term}`fine-tuning` or [prompt engineering](https://en.wikipedia.org/wiki/Prompt_engineering). 229 | 230 | MLOps 231 | [Machine Learning Operations](https://blogs.nvidia.com/blog/what-is-mlops): best practices to run AI using software products & cloud services. 232 | 233 | MoE 234 | [Mixture-of-Experts](https://en.wikipedia.org/wiki/Mixture_of_experts) is a technique which uses one or more specialist model(s) from a collection of models ("experts") to solve general problems. Note that this is different from [ensemble](https://en.wikipedia.org/wiki/Ensemble_learning) models (which combine results from all models). 235 | 236 | Open 237 | Ambiguous term that could mean "open source" or "open licence". See [](open). 238 | 239 | Permissive 240 | A type of [open licence](open-licences) which allows reselling and closed-source modifications, and can often be used in larger projects alongside other licences. Usually, the only condition of use is citing the author by name.
241 | 242 | Perplexity 243 | [Perplexity](https://en.wikipedia.org/wiki/Perplexity) is a metric based on entropy, and is a rough measure of the difficulty/uncertainty in a prediction problem. 244 | 245 | Public Domain 246 | "Open" {term}`IP` owned by nobody (often due to the author disclaiming all rights) and thus can be used by anyone without restrictions. Technically a disclaimer/non-licence. See [](open-licences). 247 | 248 | RAG 249 | [Retrieval Augmented Generation](https://www.pinecone.io/learn/retrieval-augmented-generation). 250 | 251 | RLHF 252 | [Reinforcement Learning from Human Feedback](https://en.wikipedia.org/wiki/Reinforcement_learning_from_human_feedback) is often the second step in {term}`alignment` (after {term}`supervised fine-tuning`), where a model is [rewarded or penalised](https://en.wikipedia.org/wiki/Reinforcement_learning) for its outputs based on human evaluation. See [](fine-tuning) and [](unaligned-models). 253 | 254 | ROME 255 | The [Rank-One Model Editing algorithm](https://rome.baulab.info) alters a trained model's weights to directly modify "learned" information {cite}`meng2023locating,raunak2022rankone`. 256 | 257 | SIMD 258 | [Single Instruction, Multiple Data](https://en.wikipedia.org/wiki/SIMD) is a [data-level](https://en.wikipedia.org/wiki/Data_parallelism) [parallel processing](https://en.wikipedia.org/wiki/Parallel_computer) technique where one computational instruction is applied to multiple data simultaneously. 259 | 260 | SotA 261 | State of the art: recent developments (under 1 year old). 262 | 263 | Supervised Fine-tuning 264 | [SFT](https://cameronrwolfe.substack.com/p/understanding-and-using-supervised) is often the first step in model {term}`alignment`, and is usually followed by {term}`RLHF`. See [](fine-tuning) and [](unaligned-models). 265 | 266 | Quantisation 267 | Sacrificing precision of model weights (e.g. `uint8` instead of `float32`) in return for lower hardware memory requirements. 268 | 269 | Token 270 | A [token](https://learn.microsoft.com/en-us/semantic-kernel/prompts/) is a "unit of text" for an {term}`LLM` to process/generate. A single token could represent a few characters or words, depending on the tokenisation method chosen. Tokens are usually {term}`embedded <Vector Embedding>`. 271 | 272 | Transfer Learning 273 | [Transfer Learning](https://en.wikipedia.org/wiki/Transfer_learning) is a process of leveraging a pre-trained model's learned representations and adapting them to solve a different but related problem, often requiring less data and computation compared to training from scratch. See also {term}`fine-tuning` and [](fine-tuning). 274 | 275 | Transformer 276 | A transformer is a neural network using a parallel multi-head attention mechanism. The resulting reduction in training time makes it well-suited for use in {term}`LLMs <LLM>`. 277 | 278 | Vector Database 279 | [Vector databases](https://en.wikipedia.org/wiki/Vector_database) provide efficient storage & search/retrieval for {term}`vector embeddings <Vector Embedding>`. See [](vector-db). 280 | 281 | Vector Embedding 282 | [Embedding](https://learn.microsoft.com/en-us/semantic-kernel/memories/embeddings) means encoding {term}`tokens <Token>` into a numeric vector (i.e. array/list). This can be thought of as an intermediary between machine and human language, and thus helps {term}`LLMs <LLM>` understand human language. See [](vector-db.md#llm-embeddings). 283 | 284 | Vector Store 285 | See {term}`vector database`.
286 | ``` 287 | 288 | % TODO: glossary definitions for: 289 | % Decoder-style transformer 290 | % Diffusion-based text-to-image generative mode 291 | % A100, V100, H100 292 | % VRAM 293 | -------------------------------------------------------------------------------- /licences.md: -------------------------------------------------------------------------------- 1 | # Licences 2 | 3 | % TODO: investigate if significant: hardware licences placing restrictions on use of models trained on said hardware? 4 | % TODO: https://tldr.cdcl.ml/tags/#law 5 | % TODO: summary graphic? 6 | 7 | Concerning {term}`IP` in software-related fields, developers are likely aware of two "[open](open)" copyright licence categories: one for highly structured work (e.g. software), and the other for general content (e.g. [](#data) including prosaic text and images). These two categories needed to exist separately to solve problems unique to their domains, and thus were not designed to be compatible. A particular piece of work is expected to fall into just one category, not both. 8 | 9 | Copyright for [](#ml-models), however, is more nuanced. 10 | 11 | Aside from categorisation, a further complication is the lack of [](#legal-precedence). A licence is not necessarily automatically legally binding -- it may be [incompatible with existing laws](#copyright-exceptions). Furthermore, in an increasingly global workplace, it may be unclear [which country's laws](#national-vs-international-laws) should be applicable in a particular case. 12 | 13 | Finally, licence terms disclaiming warranty/liability are contributing to an [](#accountability-crisis). 14 | 15 | ## ML Models 16 | 17 | A working [model](models) is defined partially in code (architecture & training regimen) and partially by its parameters (trained weights, i.e. a list of numbers). The latter is implicitly defined by the training data (often mixed media). One could therefore argue that models must be simultaneously bound by multiple licences for multiple different domains. Such licences were not designed to work simultaneously, and may not even be compatible. 
18 | 19 | Here's a summary of the usage restrictions around some popular models (in descending order of real-world output quality as measured by us): 20 | 21 | ```{table} Restrictions on training data, trained weights, and generated outputs 22 | :name: model-licences 23 | Model | Weights | Training Data | Output 24 | --|--|--|-- 25 | [OpenAI ChatGPT](https://openai.com/policies/terms-of-use) | 🔴 unavailable | 🔴 unavailable | 🟢 user has full ownership 26 | [Anthropic Claude](https://console.anthropic.com/legal/terms) | 🔴 unavailable | 🔴 unavailable | 🟡 commercial use permitted 27 | [LMSys Vicuna 33B](https://lmsys.org/blog/2023-03-30-vicuna) | 🟢 open source | 🔴 unavailable | 🔴 no commercial use 28 | [LMSys Vicuna 13B](https://github.com/lm-sys/FastChat) | 🟢 open source | 🔴 unavailable | 🟡 commercial use permitted 29 | [MosaicML MPT 30B Chat](https://www.mosaicml.com/blog/mpt-30b) | 🟢 open source | 🔴 unavailable | 🔴 no commercial use 30 | [Meta LLaMA2 13B Chat](https://github.com/facebookresearch/llama/blob/main/LICENSE) | 🟢 open source | 🔴 unavailable | 🟡 commercial use permitted 31 | [RWKV4 Raven 14B](https://github.com/BlinkDL/RWKV-LM) | 🟢 open source | 🟢 available | 🟢 user has full ownership 32 | [OpenAssistant SFT4 Pythia 12B](https://huggingface.co/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5) | 🟢 open source | 🟢 available | 🟢 user has full ownership 33 | [MosaicML MPT 30B Instruct](https://huggingface.co/mosaicml/mpt-30b-instruct) | 🟢 open source | 🔴 unavailable | 🟡 commercial use permitted 34 | [MosaicML MPT 30B](https://www.mosaicml.com/blog/mpt-30b) | 🟢 open source | 🔴 unavailable | 🟢 user has full ownership 35 | ``` 36 | 37 | % TODO: mention Apache-2.0, LLaMA vs LLaMA 2, HuggingFace, CC-BY((-NC)-SA) in the table above? 38 | 39 | {{ table_feedback }} 40 | 41 | Just a few weeks after some said "the golden age of open [...] AI is coming to an end" {cite}`golden-age-os-end`, things like Falcon's `Apache-2.0` relicensing {cite}`falcon-relicence` and the [LLaMA-2 community licence](https://ai.meta.com/llama/license) {cite}`llama-2-licence` were announced (both permitting commercial use), completely changing the landscape. 42 | 43 | Some interesting observations currently: 44 | 45 | - Pre-trained model weights are typically not closely guarded 46 | - Generated outputs often are usable commercially, but with conditions (no full copyrights granted) 47 | - Training data is seldom available 48 | + honourable exceptions are OpenAssistant (which promises that [data will be released under `CC-BY-4.0`](https://github.com/LAION-AI/Open-Assistant/blob/main/LICENSE) but confusingly appears [already released under `Apache-2.0`](https://huggingface.co/datasets/OpenAssistant/oasst1)) and RWKV (which provides both [brief](https://wiki.rwkv.com/basic/FAQ.html#what-is-the-dataset-that-rwkv-is-trained-on) and [more detailed](https://github.com/BlinkDL/RWKV-LM#training--fine-tuning) guidance) 49 | 50 | Licences are increasingly being recognised as important, and are even mentioned in some online leaderboards such as [](eval-datasets.md#chatbot-arena). 51 | 52 | ## Data 53 | 54 | As briefly alluded to, data and code are often each covered by their own licence categories -- but there may be conflicts when these two overlap. For example, pre-trained weights are a product of both code and data. This means one licence intended for non-code work (i.e. data) and another licence intended for code (i.e. model architectures) must simultaneously apply to the weights. This may be problematic or even nonsensical. 
55 | 56 | ```{admonition} Feedback 57 | :class: attention 58 | If you know of any legal precedence in conflicting multi-licence cases, please let us know in the {{ 59 | '[ comments]({}-comments)'.format(env.docname) }} below! 60 | ``` 61 | 62 | % TODO: dataset restrictions (e.g. ImageNet non-commercial)? 63 | 64 | (open)= 65 | 66 | ## Meaning of "Open" 67 | 68 | "Open" could refer to "open licences" or "open source (code)". Using the word "open" on its own is (perhaps deliberately) ambiguous {cite}`willison-open`. 69 | 70 | From a **legal (licencing) perspective**, "open" means (after legally obtaining the IP) no additional permission/payment is needed to use, make modifications to, & share the IP {cite}`open-definition,osd`. However, there are 3 subcategories of such "open" licences as per {numref}`open-licences`. Meanwhile, from a **software perspective**, there is only one meaning of "open": the source code is available. 71 | 72 | ```{table} Open licence subcategories 73 | :name: open-licences 74 | 75 | Subcategory | Conditions | Licence examples 76 | --|--|-- 77 | {term}`Public Domain` | Minimum required by law (so technically not a licence) | [`Unlicence`](https://spdx.org/licenses/Unlicense.html), [`CC0-1.0`](https://creativecommons.org/publicdomain/zero/1.0/legalcode) 78 | {term}`Permissive` | Cite the original author(s) by name | [`Apache-2.0`](https://www.apache.org/licenses/LICENSE-2.0), [`CC-BY-4.0`](https://creativecommons.org/licenses/by/4.0/legalcode) 79 | {term}`Copyleft` | Derivatives use the same licence | [`GPL-3.0`](https://www.gnu.org/licenses/gpl-3.0.html), [`CC-BY-SA-4.0`](https://creativecommons.org/licenses/by-sa/4.0/legalcode) 80 | ``` 81 | 82 | ```{admonition} Choosing an Open Source Licence [#](open-choices) 83 | :name: open-choices 84 | :class: tip 85 | 86 | - Software: [compare 8 popular licences](https://choosealicense.com/licenses) 87 | + [`MPL-2.0`](https://mozilla.org/MPL/2.0) is noteworthy, as it combines the permissiveness & compatibility of [`Apache-2.0`](https://www.apache.org/licenses/LICENSE-2.0) with a very weak (file-level) copyleft version of [`LGPL-3.0-or-later`](https://spdx.org/licenses/LGPL-3.0-or-later.html). `MPL-2.0` is thus usually categorised as permissive {cite}`wiki-sw-licence`. 88 | - Data & media: one of the 3 `CC` licences from the [table above](open-licences) 89 | - Hardware: one of the [`CERN-OHL-2.0`](https://cern-ohl.web.cern.ch) licences 90 | - More choices: [compare dozens of licences](https://choosealicense.com/appendix) 91 | ``` 92 | 93 | One big problem is enforcing licence conditions (especially of {term}`copyleft` or even more restrictive licences), particularly in an open-source-centric climate with potentially billions of infringing users. It is a necessary condition of a law that it should be enforceable {cite}`law-enforceability`, which is infeasible with most current software {cite}`linux-warranty,cdcl-policing-foss,cdcl-os-illegal`. 94 | 95 | ## National vs International Laws 96 | 97 | ### Copyright Exceptions 98 | 99 | A further complication is the concept of "{term}`fair use`" and "{term}`fair dealing`" in some countries -- as well as international limitations {cite}`wiki-limitations-copyright` -- which may override licence terms as well as copyright in general {cite}`wiki-google-oracle-case,wiki-google-books-case,nytimes-google-books-case`. 
100 | 101 | In practice, even legal teams often refuse to give advice {cite}`pytorch-vision-2597`, though it appears that copyright law is rarely enforced if there is no significant commercial gain/loss due to infringement. 102 | 103 | ### Obligation or Discrimination 104 | 105 | Organisations may also try to discriminate between countries even when not legally obliged to do so. For instance, OpenAI does not provide services to some countries {cite}`openai-supported-countries`, and it is unclear whether this is legally, politically, or financially motivated. 106 | 107 | ### Legal Precedence 108 | 109 | "Open" licences often mean "can be used without a fee, provided some conditions are met". In turn, users might presume that the authors do not expect to make much direct profit. In a capitalist society, such a disinterest in monetary gain might be mistaken as a disinterest in everything else, including enforcing the "provided some conditions are met" clause. Users might ignore the "conditions" in the hope that the authors will not notice, or will not have the time, inclination, nor money to pursue legal action. As a result, it is rare for a licence to be "tested" (i.e. debated and upheld, thus giving it legal weight) in a court of law. 110 | 111 | Only rare cases involving lots of money or large organisations go to court {cite}`cdcl-os-illegal`, such as these ongoing ones destined to produce "landmark" rulings: 112 | 113 | - Jun 2023 copyright case {cite}`copilot-copyright-case` against Microsoft, GitHub, and OpenAI 114 | - Jun 2023 privacy case {cite}`openai-privacy-case` against Microsoft & OpenAI 115 | - Nov 2022 copyright and open source licences case {cite}`legalpdf-doe-github-case` against GitHub 116 | 117 | ## Accountability Crisis 118 | 119 | Of the 100+ licences approved by the Open Source Initiative {cite}`osi-licences`, none provide any warranty or liability. In fact, all expressly **disclaim** warranty/liability (apart from [`MS-PL`](https://learn.microsoft.com/en-us/previous-versions/msp-n-p/ff647676(v=pandp.10)?redirectedfrom=MSDN) and [`MS-RL`](https://opensource.org/license/ms-rl-html), which don't expressly mention liability). 120 | 121 | This means a nefarious or profiteering organisation could release poor quality or malicious code under an ostensibly welcoming open source licence, but in practice abuse the licence terms to disown any responsibility or accountability. Users and consumers may unwittingly trust fundamentally untrustworthy sources. 122 | 123 | To combat this, the EU proposed cybersecurity legislation in Sep 2022: the Cyber Resilient Act (CRA) {cite}`cra` and Product Liability Act (PLA) {cite}`pla` propose to hold profiteering companies accountable (via "consumer interests" and "safety & liability" of products/services), so that anyone making (in)direct profit cannot hide behind "NO WARRANTY" licence clauses {cite}`cdcl-os-illegal`. Debate is ongoing, particularly over the CRA's Article 16, which states that a "person, other than [manufacturer/importer/distributor, who makes] a substantial modification of [a software product] shall be considered a manufacturer" {cite}`cdcl-cra-pla`. FOSS organisations have questioned whether liability can traverse the dependency graph, and what minor indirect profit-making is exempt {cite}`psf-cra,eclipse-cra,nlnet-cra`. 
124 | 125 | However, law-makers should be careful to limit the scope of any FOSS exemptions to prevent commercial abuse/loopholes {cite}`cdcl-os-illegal,cdcl-cra-pla`, and encourage accountability for critical infrastructure {cite}`cdcl-policing-foss`. 126 | 127 | ```{admonition} A better way? [#](fund-warranties) 128 | :name: fund-warranties 129 | :class: seealso 130 | In the interest of public safety, the best solution might be to pay for warranties for widely-used software via public funds {cite}`cdcl-os-bad` or crowdsourcing {cite}`tidelift,gh-sponsors,opencollective,numfocus`. 131 | ``` 132 | 133 | ## Future 134 | 135 | To recap: 136 | 137 | - It's unknown what are the implications of multiple licences with conflicting terms (e.g. models inheriting both code & data licences) 138 | + there is little [](#legal-precedence) 139 | - "[Open](open)" could refer to code/source or to licence (so is ambiguous without further information) 140 | + training data is often not open source 141 | - Licences always disclaim warranty/liability 142 | - Enforcing licences might be illegal 143 | + limitations such as {term}`fair use`/{term}`dealing ` can override licences/copyright 144 | + proposed accountability laws might override licence disclaimers 145 | - Enforcing licences might be infeasible 146 | + there are [ongoing cases](#legal-precedence) regarding (ab)use of various subcategories of IP: copyright (no licence) for both open and closed source, as well as licences with copyleft or non-commercial clauses 147 | 148 | In the long term, we look forward to the outcomes of the US cases and EU proposals. Meanwhile in the short term, a recent tweet ({numref}`unusual-ventures-tweet`) classified some current & {term}`foundation ` models (albeit with no explanation/discussion yet as of Oct 2023). We hope to see an accompanying write-up soon! 149 | 150 | ```{figure-md} unusual-ventures-tweet 151 | :class: caption 152 | ![](https://pbs.twimg.com/media/F3AiXRJWsAAP0Da?format=jpg&name=4096x4096) 153 | 154 | [The AI Battle: Open Source vs Closed Source](https://twitter.com/chiefaioffice/status/1688913452662984708?s=20) 155 | ``` 156 | 157 | {{ comments }} 158 | -------------------------------------------------------------------------------- /mlops-engines.md: -------------------------------------------------------------------------------- 1 | # MLOps Engines 2 | 3 | ```{admonition} Work in Progress 4 | :class: attention 5 | {{ wip_chapter }} 6 | 7 | Some ideas: 8 | 9 | - [7 Frameworks for Serving LLMs](https://betterprogramming.pub/frameworks-for-serving-llms-60b7f7b23407) "comprehensive guide & detailed comparison" 10 | - [Trends: Optimising for Faster Inference](https://cameronrwolfe.substack.com/i/135439692/optimizing-for-faster-inference) 11 | - https://github.com/imaurer/awesome-decentralized-llm 12 | - Python Bindings and More 13 | - PyTorch Toolchain -- From C/C++ to Python 14 | - https://docs.bentoml.org 15 | + https://docs.bentoml.org/en/latest/overview/what-is-bentoml.html#build-applications-with-any-ai-models 16 | - https://finbarr.ca/how-is-llama-cpp-possible 17 | - https://onnxruntime.ai/docs/execution-providers 18 | - Apache TVM 19 | ``` 20 | 21 | This chapter focuses on recent open-source {term}`MLOps` engine developments -- which are largely due to the current rise of {term}`LLMs `. While MLOps typically focuses on model training, "LLMOps" focuses on fine-tuning. In production, both also require good inference engines. 
22 | 23 | ```{table} Comparison of Inference Engines 24 | :name: inference-engines 25 | Inference Engine | Open-Source | GPU optimisations | Ease of use 26 | -----------------|-------------|-------------------|------------- 27 | [Nvidia Triton](#nvidia-triton-inference-server) | 🟢 Yes | Dynamic Batching, Tensor Parallelism, Model concurrency | 🔴 Difficult 28 | [](#text-generation-inference) | 🟢 Yes | Continuous Batching, Tensor Parallelism, Flash Attention | 🟢 Easy 29 | [](#vllm) | 🟢 Yes | Continuous Batching, Tensor Parallelism, Paged Attention | 🟢 Easy 30 | [](#bentoml) | 🟢 Yes | None | 🟢 Easy 31 | [](#modular) | 🔴 No | N/A | 🟡 Moderate 32 | [](#localai) | 🟢 Yes | 🟢 Yes | 🟢 Easy 33 | ``` 34 | 35 | {{ table_feedback }} 36 | 37 | ## Nvidia Triton Inference Server 38 | 39 | ```{figure-md} mlops-engines-triton-architecture 40 | :class: caption 41 | ![](https://static.premai.io/book/mlops-engines-triton-architecture.png) 42 | 43 | [Nvidia Triton Architecture](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/user_guide/jetson.html) 44 | ``` 45 | 46 | This [inference server](https://developer.nvidia.com/triton-inference-server) offers support for multiple model formats such as PyTorch, TensorFlow, ONNX, TensorRT, etc. It uses GPUs efficiently to boost the performance of deep learning models. 47 | 48 | - **Concurrent model execution**: This allows multiple models to be executed on 1 or many GPUs in parallel. Multiple requests are routed to each model to execute the tasks in parallel 49 | - **Dynamic Batching**: Combines multiple inference requests into a batch to increase throughput. Requests in each batch can be processed in parallel instead of handling each request sequentially. 50 | 51 | Pros: 52 | 53 | * High throughput, low latency for serving LLMs on a GPU 54 | * Supports multiple frameworks/backends 55 | * Production level performance 56 | * Works with non-LLM models such as image generation or speech to text 57 | 58 | Cons: 59 | 60 | * Difficult to set up 61 | * Not compatible with many of the newer LLMs 62 | 63 | ## Text Generation Inference 64 | 65 | ```{figure-md} tgi-architecture 66 | :class: caption 67 | ![](https://static.premai.io/book/mlops-engines-tgi-architecture.png) 68 | 69 | [Text Generation Inference Architecture](https://github.com/huggingface/text-generation-inference) 70 | ``` 71 | 72 | Compared to Triton, https://github.com/huggingface/text-generation-inference is easier to setup and supports most of the popular LLMs on Hugging Face. 73 | 74 | Pros: 75 | 76 | * Supports newer models on Hugging Face 77 | * Easy setup via docker container 78 | * Production-ready 79 | 80 | Cons: 81 | 82 | * Open-source license has restrictions on commercial usage 83 | * Only works with Hugging Face models 84 | 85 | ## vLLM 86 | 87 | This is an open-source project created by researchers at Berkeley to improve the performance of LLM inferencing. https://github.com/vllm-project/vllm primarily optimises LLM throughput via methods like PagedAttention and Continuous Batching. The project is fairly new and there is ongoing development. 88 | 89 | Pros: 90 | 91 | * Can be used commercially 92 | * Supports many popular Hugging Face models 93 | * Easy to setup 94 | 95 | Cons: 96 | 97 | * Not all LLM models are supported 98 | 99 | ## BentoML 100 | 101 | [BentoML](https://www.bentoml.com) is a fairly popular tool used to deploy ML models into production. It has gained a lot of popularity by building simple wrappers that can convert any model into a REST API endpoint. 
Currently, BentoML does not support some of the GPU optimizations such as tensor parallelism. However, the main benefit BentoML provides is that it can serve a wide variety of models. 102 | 103 | Pros: 104 | 105 | * Easy setup 106 | * Can be used commercially 107 | * Supports all models 108 | 109 | Cons: 110 | 111 | * Lacks some GPU optimizations 112 | 113 | ## Modular 114 | 115 | [Modular](https://www.modular.com) is designed to be a high performance AI engine that boosts the performance of deep learning models. The secret is in their custom compiler and runtime environment that improves the inferencing of any model without the developer needing to make any code changes. 116 | 117 | The Modular team has designed a new programming language, [Mojo](https://docs.modular.com/mojo), which combines the Python friendly syntax with the performance of C. The purpose of Mojo is to address some of the shortcomings of Python from a performance standpoint while still being a part of the Python ecosystem. This is the programming language used internally to create the Modular AI engine's kernels. 118 | 119 | Pros: 120 | 121 | * Low latency/High throughput for inference 122 | * Compatible with Tensorflow and Pytorch models 123 | 124 | Cons: 125 | 126 | * Not open-source 127 | * Not as simple to use compared to other engines on this list 128 | 129 | This is not an exhaustive list of MLOps engines by any means. There are many other tools and frameworks developer use to deploy their ML models. There is ongoing development in both the open-source and private sectors to improve the performance of LLMs. It's up to the community to test out different services to see which one works best for their use case. 130 | 131 | ## LocalAI 132 | 133 | [LocalAI](https://localai.io) from https://github.com/mudler/LocalAI ([not to be confused](https://github.com/louisgv/local.ai/discussions/71) with [](desktop-apps.md#localai) from https://github.com/louisgv/local.ai) is the free, Open Source alternative to OpenAI. LocalAI act as a drop-in replacement REST API that's compatible with OpenAI API specifications for local inferencing. It can run LLMs (with various backend such as https://github.com/ggerganov/llama.cpp or [](#vllm)), generate images, generate audio, transcribe audio, and can be self-hosted (on-prem) with consumer-grade hardware. 134 | 135 | Pros: 136 | 137 | - [wide range of models supported](https://localai.io/model-compatibility) 138 | - support for [functions](https://localai.io/features/openai-functions) (self-hosted [OpenAI functions](https://platform.openai.com/docs/guides/gpt/function-calling)) 139 | - [easy to integrate](https://localai.io/integrations) 140 | 141 | Cons: 142 | 143 | - binary version is harder to run and compile locally. https://github.com/mudler/LocalAI/issues/1196. 144 | - high learning curve due to high degree of customisation 145 | 146 | ## Challenges in Open Source 147 | 148 | MLOps solutions come in two flavours {cite}`mlops-challenges`: 149 | 150 | - Managed: a full pipeline (and support) is provided (for a price) 151 | - Self-hosted: various DIY stitched-together open-source components 152 | 153 | Some companies (e.g. [Hugging Face](https://huggingface.co)) push for open-source models & datasets, while others (e.g. [OpenAI](https://openai.com), [Anthropic](https://www.anthropic.com)) do the opposite. 154 | 155 | The main challenges with open-source MLOps are [](#maintenance), [](#performance), and [](#cost). 
156 | 157 | ```{figure-md} open-vs-closed-mlops 158 | :class: caption 159 | ![](https://static.premai.io/book/mlops-engines-table.jpg) 160 | 161 | Open-Source vs Closed-Source MLOps 162 | ``` 163 | 164 | ### Maintenance 165 | 166 | Using open-source components, most setup & configuration must be done manually. This could mean finding & downloading [models](models) & [datasets](eval-datasets), setting up [fine-tuning](fine-tuning), performing [evaluations](eval-datasets), and [](#inference) -- all components held together by self-maintained bespoke "glue" code. 167 | 168 | You are responsible for monitoring pipeline health & fixing issues quickly to avoid application downtime. This is particularly painful in the early stages of a project, when robustness and scalability are not yet implemented and there is much firefighting for developers to do. 169 | 170 | ### Performance 171 | 172 | Performance could refer to: 173 | 174 | - output *quality*: e.g. accuracy -- how close is a model's output to ideal expectations (see [](eval-datasets)), or 175 | - operational *speed*: e.g. throughput & latency -- how much time it takes to complete a request (see also [](hardware), which can play as large a role as software {cite}`nvidia-gpu-inference`). 176 | 177 | By comparison, closed-source engines (e.g. [Cohere](https://cohere.com)) tend to give better baseline operational performance due to default-enabled inference optimisations {cite}`cohere-triton`. 178 | 179 | ### Cost 180 | 181 | Self-maintained open-source solutions, if implemented well, can be extremely cheap both to setup and to run long term. However, many underestimate the amount of work required to make an open-source ecosystem work seamlessly. 182 | 183 | For example, a single GPU node able to run a 36 GB open-source model can [easily cost over \$2,000 per month from a major cloud provider](hardware.md#cloud). Since the technology is still new, experimenting with & maintaining self-hosted infrastructure can be expensive. Conversely, closed-source pricing models often charge for usage (e.g. {term}`tokens `) rather than infrastructure (e.g. [ChatGPT costs around \$0.002 for 1K tokens](https://openai.com/pricing) -- enough for a page of text), making them much cheaper for small explorative tasks. 184 | 185 | ## Inference 186 | 187 | Inference is one of the hot topics currently with LLMs in general. Large models like ChatGPT have very low latency and great performance but become more expensive with more usage. 188 | 189 | On the flip side, open-source models like [](models.md#llama-2) or [](models.md#falcon) have variants that are much smaller in size, yet it's difficult to match the latency and throughput that ChatGPT provides, while still being cost efficient {cite}`cursor-llama`. 190 | 191 | Models that are run using Hugging Face pipelines do not have the necessary optimisations to run in a production environment. The open-source LLM inferencing market is still evolving so currently there's no silver bullet that can run any open-source LLM at blazing-fast speeds. 
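To make the baseline concrete, below is a minimal sketch of serving a model through a plain Hugging Face `transformers` pipeline -- the kind of unoptimised setup the engines above aim to improve on. The model name and generation parameters are illustrative placeholders, not a recommendation.

```python
from transformers import pipeline

# A plain pipeline: no continuous batching, paged attention, or tensor
# parallelism -- fine for experiments, but not tuned for production throughput.
generator = pipeline(
    "text-generation",
    model="tiiuae/falcon-7b-instruct",  # example model; swap in any causal LM
    device_map="auto",                  # place weights on available GPU(s)/CPU
)

prompt = "Explain the difference between throughput and latency in one sentence."
output = generator(prompt, max_new_tokens=64, do_sample=False)
print(output[0]["generated_text"])
```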
192 | 193 | Here are a few reasons why inferencing is slow: 194 | 195 | ### Models are growing larger in size 196 | 197 | * As models grow in size and neural networks become more complex, it's no surprise that it takes longer to get an output 198 | 199 | ### Python as the choice of programming language for AI 200 | 201 | * Python is inherently slow compared to compiled languages like C++ 202 | * The developer-friendly syntax and vast array of libraries have put Python in the spotlight, but when it comes to sheer performance it falls behind many other languages 203 | * To compensate for its performance, many inference servers convert the Python code into an optimised module. For example, Nvidia's [Triton Inference Server](https://developer.nvidia.com/triton-inference-server) can take a PyTorch model and compile it into [TensorRT](https://developer.nvidia.com/tensorrt-getting-started), which has a much higher performance than native PyTorch 204 | * Similarly, https://github.com/ggerganov/llama.cpp optimises the LLaMA inference code to run in raw C++. Using this optimisation, people can run a large language model on their laptops without a dedicated GPU. 205 | 206 | ### Larger inputs 207 | 208 | * Not only do LLMs have billions of parameters, but they perform millions of mathematical calculations for each inference 209 | * To do these massive calculations in a timely manner, GPUs are required to help speed up the process. GPUs have much more memory bandwidth and processing power compared to a CPU, which is why they are in such high demand when it comes to running large language models (a rough memory estimate is sketched below).
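To put the memory requirements in perspective, here is a rough back-of-the-envelope sketch (weights only -- it ignores activations, KV-cache, and framework overhead) of how parameter count and numeric precision translate into GPU memory, which is also why quantisation helps:

```python
def approx_weights_gb(n_params: float, bytes_per_param: float) -> float:
    """Approximate VRAM needed just to hold the model weights."""
    return n_params * bytes_per_param / 1024**3

for name, params in [("7B", 7e9), ("13B", 13e9), ("70B", 70e9)]:
    fp16 = approx_weights_gb(params, 2)    # float16/bfloat16
    int8 = approx_weights_gb(params, 1)    # 8-bit quantised
    int4 = approx_weights_gb(params, 0.5)  # 4-bit quantised
    print(f"{name}: ~{fp16:.0f} GB fp16, ~{int8:.0f} GB int8, ~{int4:.0f} GB int4")
```

Even a 13B-parameter model needs roughly 24 GB of VRAM just for fp16 weights, before accounting for the KV-cache that grows with input length.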
220 | 221 | {{ comments }} 222 | -------------------------------------------------------------------------------- /prem_theme/__init__.py: -------------------------------------------------------------------------------- 1 | from os import path 2 | 3 | def setup(app): 4 | app.add_html_theme('prem_theme', path.abspath(path.dirname(__file__))) 5 | -------------------------------------------------------------------------------- /prem_theme/layout.html: -------------------------------------------------------------------------------- 1 | {% extends "sphinx_book_theme/layout.html" %} 2 | {% block extrahead %} 3 | 4 | 5 | 6 | 7 | 9 | 10 | 11 | 12 | {%- if last_updated %} 13 | 14 | {%- endif %} 15 | 16 | 17 | 18 | 19 | 20 | 21 | {{ super() }} 22 | {% endblock extrahead %} 23 | -------------------------------------------------------------------------------- /prem_theme/theme.conf: -------------------------------------------------------------------------------- 1 | [theme] 2 | inherit = sphinx_book_theme 3 | -------------------------------------------------------------------------------- /references.md: -------------------------------------------------------------------------------- 1 | # References 2 | 3 | ````{admonition} Work in Progress (TODO: move elsewhere) 4 | :class: attention 5 | 6 | **important and/or related to whole book** 7 | 8 | - "Catching up on the weird world of LLMs" (summary of the last few years) https://simonwillison.net/2023/Aug/3/weird-world-of-llms 9 | - "Open challenges in LLM research" (exciting post title but mediocre content) https://huyenchip.com/2023/08/16/llm-research-open-challenges.html 10 | - https://github.com/zeno-ml/zeno-build/tree/main/examples/analysis_gpt_mt/report 11 | - "Patterns for Building LLM-based Systems & Products" (Evals, RAG, fine-tuning, caching, guardrails, defensive UX, and collecting user feedback) https://eugeneyan.com/writing/llm-patterns 12 | 13 | ```{figure-md} llm-patterns 14 | :class: margin 15 | ![](https://eugeneyan.com/assets/llm-patterns-og.png) 16 | 17 | [LLM patterns: From data to user, from defensive to offensive](https://eugeneyan.com/writing/llm-patterns) 18 | ``` 19 | 20 | - `awesome-list`s (mention overall list + recently added entries) 21 | + https://github.com/imaurer/awesome-decentralized-llm 22 | + https://github.com/huggingface/transformers/blob/main/awesome-transformers.md 23 | + "Anti-hype LLM reading list" (foundation papers, training, deployment, eval, UX) https://gist.github.com/veekaybee/be375ab33085102f9027853128dc5f0e 24 | + ... others? 25 | - open questions & future interest (pages 15 & 16): https://mlops.community/wp-content/uploads/2023/07/survey-report-MLOPS-v16-FINAL.pdf 26 | 27 | **unclassified** 28 | 29 | Couldn't decide which chapter(s) these links are related to. They're mostly about security & optimisation. Perhaps create a new chapter? 
30 | 31 | - "How I Re-implemented PyTorch for WebGPU" (`webgpu-torch`: inference & autograd lib to run NNs in browser with negligible overhead) https://praeclarum.org/2023/05/19/webgpu-torch.html 32 | - "LLaMA from scratch (or how to implement a paper without crying)" (misc tips, scaled-down version of LLaMA for training) https://blog.briankitano.com/llama-from-scratch 33 | - "Swift Transformers: Run On-Device LLMs in Apple Devices" https://huggingface.co/blog/swift-coreml-llm 34 | - "Why GPT-3.5-turbo is (mostly) cheaper than LLaMA-2" https://cursor.sh/blog/llama-inference#user-content-fn-gpt4-leak 35 | - https://www.marble.onl/posts/why_host_your_own_llm.html 36 | - https://betterprogramming.pub/you-dont-need-hosted-llms-do-you-1160b2520526 37 | - "Low-code framework for building custom LLMs, neural networks, and other AI models" https://github.com/ludwig-ai/ludwig 38 | - "GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers" https://arxiv.org/abs/2210.17323 39 | - "RetrievalQA with LLaMA 2 70b & Chroma DB" (nothing new, but this guy does a lot of experiments if you wanna follow him) https://youtu.be/93yueQQnqpM 40 | - "[WiP] build MLOps solutions in Rust" https://github.com/nogibjj/rust-mlops-template 41 | ```` 42 | 43 | ```{bibliography} 44 | :style: unsrt_max_authors 45 | ``` 46 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter-book==1.0.0 2 | sphinx-last-updated-by-git==0.3.6 3 | sphinx-subfigure==0.2.4 4 | -------------------------------------------------------------------------------- /sdk.md: -------------------------------------------------------------------------------- 1 | # Software Development toolKits 2 | 3 | {term}`LLM` SDKs are specific for generative AI. These toolkits help developers integrate LLM capabilities into applications. The LLM SDK typically includes APIs, sample code, and documentation to aid in the development process. By leveraging an LLM SDK, developers can streamline their development processes and ensure compliance with industry standards. 4 | 5 | % TODO: haystack? 
6 | 7 | ```{table} Comparison of LLM SDKs 8 | :name: llm-sdks 9 | SDK | Use cases | Vector stores | Embedding model | LLM Model | Languages | Features 10 | ----|-----------|---------------|-----------------|-----------|-----------|---------- 11 | [](#langchain) | Chatbots, prompt chaining, document related tasks | Comprehensive list of data sources available to get connected readily | State of art embedding models in the bucket to choose from | A-Z availability of LLMs out there in the market | Python, Javascript, Typescript | Open source & 1.5k+ contributors strong for active project development 12 | [](#llama-index) | Connecting multiple data sources to LLMs, document query interface using retrieval augmented generation, advanced chatbots, structured analytics | Wide options to connect & facility to [create a new one](https://docs.llamaindex.ai/en/latest/examples/vector_stores/CognitiveSearchIndexDemo.html#create-index-if-it-does-not-exist) | Besides the 3 commonly available models we can use a [custom embedding model](https://docs.llamaindex.ai/en/latest/examples/embeddings/custom_embeddings.html) as well | Set of restricted availability of LLM models besides [customised abstractions](https://docs.llamaindex.ai/en/latest/module_guides/models/llms/usage_custom.html) suited for your custom data | Python, Javascript, Typescript | Tailor-made for high customisations if not happy with the current parameters and integrations 13 | [](#litellm) | Integrating multiple LLMs, evaluating LLMs | Not Applicable | Currently supports only `text-embedding-ada-002` from OpenAI & Azure | Expanding the list of LLM providers with the most commonly used ones ready for use | Python | Lightweight, streaming model response, consistent output response 14 | ``` 15 | 16 | {{ table_feedback }} 17 | 18 | ```{seealso} 19 | [awesome-transformers](https://github.com/huggingface/transformers/blob/main/awesome-transformers.md) 20 | ``` 21 | 22 | A few reasons for why there is a need for LLM SDKs in this current era of AI. 23 | 24 | 1. **Compliance with Agreements**: By using an LLM SDK, developers can ensure that their application complies with agreements by logging, tracing, and monitoring requests appropriately. This helps avoid potential legal issues related to software piracy or unauthorised use of resources. 25 | 1. **Improved User Experience**: An LLM SDK can help create a seamless user experience by removing boilerplate code and abstracting lower level interactions with LLMs. 26 | 1. **Increased Security**: By implementing an LLM SDK, developers can protect their resources and prevent unauthorised use of their software by security features such as [access control and user management](https://www.businesswire.com/news/home/20230531005251/en/LlamaIndex-Raises-8.5M-to-Unlock-Large-Language-Models-Capabilities-with-Personal-Data). 27 | 1. **Flexibility**: An LLM SDK provides flexibility in terms of customisation and bringing together different components, allowing developers to tailor the management system to their specific needs and adapt it easily. 28 | 1. **Improved Collaboration**: An LLM SDK can facilitate collaboration among team members by providing a centralised platform for license management, ensuring that everyone is on the same page regarding issues and compliance requirements. 
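As a taste of what these SDKs abstract away, the sketch below uses LiteLLM's unified `completion` interface (from the comparison above) to call different providers through a single code path. The model names and placeholder API keys are illustrative, and the exact response shape may vary between LiteLLM versions.

```python
import os
from litellm import completion

# Placeholder credentials -- set real keys for the providers you actually use
os.environ["OPENAI_API_KEY"] = "sk-..."
os.environ["ANTHROPIC_API_KEY"] = "sk-ant-..."

messages = [{"role": "user", "content": "Summarise what an LLM SDK does in one sentence."}]

# The same call works across providers; LiteLLM normalises the request and
# response to the OpenAI chat-completion schema.
for model in ("gpt-3.5-turbo", "claude-instant-1"):
    response = completion(model=model, messages=messages)
    print(model, "->", response["choices"][0]["message"]["content"])
```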
29 | 30 | ## LangChain 31 | 32 | ![banner](https://python.langchain.com/img/parrot-chainlink-icon.png) 33 | 34 | LangChain describes itself as a framework for developing applications powered by Large Language Models (LLMs). It is available as a Python SDK and as npm packages for development purposes. 35 | 36 | ### Document Loader 37 | 38 | A strength of LangChain is that it can ingest input from many different kinds of files and make it readily usable. Notably, the files can be of various [formats](https://python.langchain.com/docs/modules/data_connection/document_loaders) such as `.pdf`, `.json`, `.md`, `.html`, and `.csv`. 39 | 40 | ### Vector Stores 41 | 42 | After collecting the data, it is converted into embeddings and stored in a vector database for further use. 43 | This enables vector search: retrieving the stored embeddings that are closest to an embedded query. 44 | 45 | The list of vector stores that LangChain supports can be found [here](https://python.langchain.com/docs/integrations/vectorstores). 46 | 47 | ### Models 48 | 49 | Models are the heart of most LLM applications, where the core functionality resides. There are broadly [2 different types of models](https://python.langchain.com/docs/modules/model_io) which LangChain integrates with: 50 | 51 | - **Language**: Inputs & outputs are `string`s 52 | - **Chat**: Run on top of a Language model. Inputs are a list of chat messages, and output is a chat message 53 | 54 | ### Tools 55 | 56 | [Tools](https://python.langchain.com/docs/modules/agents/tools) are interfaces that an agent uses to interact with the world. They connect real-world software products with the power of LLMs. This adds flexibility to the way LangChain is used and extends its capabilities. 57 | 58 | ### Prompt engineering 59 | 60 | Prompt engineering is used to shape the prompts sent to the model via custom prompt templates. For example, a custom prompt template can take in a function name and its corresponding source code, and generate an English-language explanation of the function. 61 | 62 | To create such prompts, the LangChain team uses a custom prompt template called `FunctionExplainerPromptTemplate`. This template takes the function name and source code as input variables and formats them into a prompt. The prompt includes the function name, source code, and an empty explanation section. 63 | The generated prompt can then be used to guide the language model in generating an explanation for the function. 64 | 65 | Overall, prompt engineering is an important aspect of working with language models as it allows us to shape the model's responses and improve its performance on specific tasks. 66 | 67 | More about all the prompts can be found [here](https://python.langchain.com/docs/modules/model_io/prompts). 68 | 69 | ### Advanced features 70 | 71 | LangChain provides several advanced features that make it a powerful framework for developing applications powered by language models. Some of the advanced features include: 72 | 73 | - **Chains**: LangChain provides a standard interface for chains, allowing developers to create sequences of calls that go beyond a single language model call. This enables the chaining together of different components to create more advanced use cases around language models.
74 | - **Integrations**: LangChain offers integrations with other tools, such as the `requests` and `aiohttp` integrations for tracing HTTP requests to LLM providers, and the `openai` integration for tracing requests to the OpenAI library. These integrations enhance the functionality and capabilities of LangChain. 75 | - End-to-End Chains: LangChain supports end-to-end chains for common applications. This means that developers can create complete workflows or pipelines that involve multiple steps and components, all powered by language models. This allows for the development of complex and sophisticated language model applications. 76 | - **Logs and Sampling**: LangChain provides the ability to enable log prompt and completion sampling. By setting the `DD_LANGCHAIN_LOGS_ENABLED=1` environment variable, developers can generate logs containing prompts and completions for a specified sample rate of traced requests. This feature can be useful for debugging and monitoring purposes. 77 | - **Configuration Options**: LangChain offers various configuration options that allow developers to customize and fine-tune the behaviour of the framework. These configuration options are documented in the APM Python library documentation. 78 | 79 | Overall, LangChain's advanced features enable developers to build advanced language model applications with ease and flexibility. Some limitations of LangChain are that while it is useful for rapid prototyping of LLM applications, scalability and deploying in production remains a concern - it might not be particularly useful for handling a large number of users simultaneously, and maintaining low latency. 80 | 81 | ## LLaMA Index 82 | 83 | ![banner](https://static.premai.io/book/sdk-llama-index.jpg) 84 | 85 | LLaMAIndex is a data framework for LLM applications to ingest, structure, and access private or domain-specific data. It provides tools such as data connectors, data indexes, engines (query and chat), and data agents to facilitate natural language access to data. LLaMAIndex is designed for beginners, advanced users, and everyone in between, with a high-level API for easy data ingestion and querying, as well as lower-level APIs for customisation. It can be installed using `pip` and has detailed [documentation](https://docs.llamaindex.ai/en/latest) and tutorials for getting started. LLaMAIndex also has associated projects like https://github.com/run-llama/llama-hub and https://github.com/run-llama/llama-lab. 86 | 87 | ### Data connectors 88 | 89 | [Data connectors](https://docs.llamaindex.ai/en/latest/module_guides/loading/connector/root.html) are software components that enable the transfer of data between different systems or applications. They provide a way to extract data from a source system, transform it if necessary, and load it into a target system. Data connectors are commonly used in data integration and ETL (Extract, Transform, Load) processes. 90 | 91 | There are various types of data connectors available, depending on the specific systems or applications they connect to. Some common ones include: 92 | 93 | - **Database connectors**: These connectors allow data to be transferred between different databases, such as MySQL, PostgreSQL, or Oracle. 94 | - **Cloud connectors**: These connectors enable data transfer between on-premises systems and cloud-based platforms, such as Amazon Web Services (AWS), Google Cloud Platform (GCP), or Microsoft Azure. 
95 | - **API connectors**: These connectors facilitate data exchange with systems that provide APIs (Application Programming Interfaces), allowing data to be retrieved or pushed to/from those systems.
96 | - **File connectors**: These connectors enable the transfer of data between different file formats, such as PDF, CSV, JSON, XML, or Excel.
97 | - **Application connectors**: These connectors are specifically designed to integrate data between different applications, such as CRM (Customer Relationship Management) systems, ERP (Enterprise Resource Planning) systems, or marketing automation platforms.
98 |
99 | Data connectors play a crucial role in enabling data interoperability and ensuring seamless data flow between systems. They simplify the process of data integration and enable organisations to leverage data from various sources for analysis, reporting, and decision-making purposes.
100 |
101 | ### Data indexes
102 |
103 | [Data indexes](https://docs.llamaindex.ai/en/latest/module_guides/indexing/indexing.html) in LLaMAIndex are intermediate representations of data that are structured in a way that is easy and performant for Large Language Models (LLMs) to consume. These indexes are built from documents and serve as the core foundation for retrieval-augmented generation (RAG) use-cases.
104 | Under the hood, indexes in LLaMAIndex store data in Node objects, which represent chunks of the original documents. These indexes also expose a Retriever interface that supports additional configuration and automation.
105 | LLaMAIndex provides several types of indexes, including Vector Store Index, Summary Index, Tree Index, Keyword Table Index, Knowledge Graph Index, and SQL Index. Each index has its own specific use case and functionality.
106 |
107 | To get started with data indexes in LLaMAIndex, you can use the `from_documents` method to create an index from a collection of documents. Here's an example using the Vector Store Index:
108 |
109 | ```python
110 | from llama_index import VectorStoreIndex
111 | index = VectorStoreIndex.from_documents(docs)
112 | ```
113 |
114 | Overall, data indexes in LLaMAIndex play a crucial role in enabling natural language access to data and facilitating question & answer and chat interactions with the data. They provide a structured and efficient way for LLMs to retrieve relevant context for user queries.
115 |
116 | ### Data engines
117 |
118 | Data engines in LLaMAIndex refer to the query engines and chat engines that allow users to interact with their data. These engines are end-to-end pipelines that enable users to ask questions or have conversations with their data. Data engines are broadly classified into:
119 |
120 | - [Query engine](https://docs.llamaindex.ai/en/latest/core_modules/query_modules/query_engine/root.html)
121 | - [Chat engine](https://docs.llamaindex.ai/en/latest/core_modules/query_modules/chat_engines/root.html)
122 |
123 | #### Query engine
124 |
125 | - Query engines are designed for question and answer interactions with the data.
126 | - They take in a natural language query and return a response along with the relevant context retrieved from the knowledge base.
127 | - The LLM synthesises the response based on the query and retrieved context.
128 | - The key challenge in the querying stage is retrieval, orchestration, and reasoning over multiple knowledge bases.
129 | - LLaMAIndex provides composable modules that help build and integrate RAG (Retrieval-Augmented Generation) pipelines for Q&A, as shown in the sketch below.
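To illustrate the query engine usage pattern, here is a minimal sketch. The `./data` directory and the question string are illustrative assumptions, and the default configuration assumes an `OPENAI_API_KEY` environment variable for the underlying LLM and embedding model; see the LLaMAIndex documentation linked above for other setups.

```python
from llama_index import SimpleDirectoryReader, VectorStoreIndex

# load local files into Document objects (directory path is illustrative)
docs = SimpleDirectoryReader("./data").load_data()

# build an in-memory vector index and expose it as a query engine
index = VectorStoreIndex.from_documents(docs)
query_engine = index.as_query_engine()

# single-shot Q&A: the response contains the synthesised answer
# along with the retrieved source nodes used as context
response = query_engine.query("What does the document say about pricing?")
print(response)
```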
130 |
131 | #### Chat engine
132 |
133 | - Chat engines are designed for multi-turn conversations with the data.
134 | - They support back-and-forth interactions instead of a single question and answer.
135 | - Similar to query engines, chat engines take in natural language input and generate responses using the LLM.
136 | - The chat engine maintains conversation context and uses it to generate appropriate responses.
137 | - LLaMAIndex provides different chat modes, such as "condense_question" and "react", to customise the behaviour of chat engines.
138 |
139 | Both query engines and chat engines can be used to interact with data in various use cases. The main distinction is that query engines focus on single questions and answers, while chat engines enable more dynamic and interactive conversations. These engines leverage the power of LLMs and the underlying indexes to provide relevant and informative responses to user queries.
140 |
141 | ### Data agent
142 |
143 | [Data Agents](https://docs.llamaindex.ai/en/latest/core_modules/agent_modules/agents/root.html) are LLM-powered knowledge workers in LLaMAIndex that can intelligently perform various tasks over data, in both a "read" and a "write" capacity. They have the capability to perform automated search and retrieval over different types of data, including unstructured, semi-structured, and structured data. Additionally, they can call external service APIs in a structured fashion and process the response, as well as store it for later use.
144 |
145 | Data agents go beyond query engines by not only reading from a static source of data but also dynamically ingesting and modifying data from different tools. They consist of two core components: a reasoning loop and tool abstractions. The reasoning loop of a data agent depends on the type of agent being used. LLaMAIndex supports two types of agents:
146 |
147 | - OpenAI Function agent: built on top of the OpenAI Function API
148 | - ReAct agent: works across any chat/text completion endpoint
149 |
150 | Tool abstractions are an important part of building a data agent. These abstractions define the set of APIs or tools that the agent can interact with. The agent uses a reasoning loop to decide which tools to use, in what sequence, and the parameters to call each tool.
151 |
152 | To use data agents in LLaMAIndex, you can follow the usage pattern below (where `tools` is a list of tool abstractions defined beforehand):
153 |
154 | ```python
155 | from llama_index.agent import OpenAIAgent
156 | from llama_index.llms import OpenAI
157 |
158 | # Initialise LLM & OpenAI agent
159 | llm = OpenAI(model="gpt-3.5-turbo-0613")
160 | agent = OpenAIAgent.from_tools(tools, llm=llm, verbose=True)
161 | ```
162 |
163 | Overall, data agents in LLaMAIndex provide a powerful way to interact with and manipulate data, making them valuable tools for various applications.
164 |
165 | ### Advanced features
166 |
167 | LLaMAIndex provides several advanced features that cater to the needs of advanced users. Some of these advanced features include:
168 |
169 | - **Customisation and Extension**: LLaMAIndex offers lower-level APIs that allow advanced users to customise and extend any module within the framework. This includes data connectors, indices, retrievers, query engines, and re-ranking modules. Users can tailor these components to fit their specific requirements and enhance the functionality of LLaMAIndex.
170 | - **Data Agents**: LLaMAIndex includes LLM-powered knowledge workers called Data Agents. These agents can intelligently perform various tasks over data, including automated search and retrieval. They can read from and modify data from different tools, making them versatile for data manipulation. Data Agents consist of a reasoning loop and tool abstractions, enabling them to interact with external service APIs and process responses.
171 | - **Application Integrations**: LLaMAIndex allows for seamless integration with other applications in your ecosystem. Whether it's LangChain, Flask, or ChatGPT, LLaMAIndex can be integrated with various tools and frameworks to enhance its functionality and extend its capabilities.
172 | - **High-Level API**: LLaMAIndex provides a high-level API that allows beginners to quickly ingest and query their data with just a few lines of code. This user-friendly interface simplifies the process for beginners while still providing powerful functionality.
173 | - **Modular Architecture**: LLaMAIndex follows a modular architecture, which allows users to understand and work with different components of the framework independently. This modular approach enables users to customise and combine different modules to create tailored solutions for their specific use cases.
174 |
175 | LLaMAIndex seems more tailor-made for deploying LLM apps in production. However, it remains to be seen how/whether the industry integrates LLaMAIndex into LLM apps, or develops customised methods for LLM data integration.
176 |
177 | ## LiteLLM
178 |
179 | ![banner](https://litellm.vercel.app/img/docusaurus-social-card.png)
180 |
181 | As the name suggests, [LiteLLM](https://litellm.ai) is a lightweight package that simplifies the task of getting responses from multiple LLM APIs through a single, consistent interface, without having to worry about provider-specific imports. It is available as a Python package which can be installed using `pip`. We can also test the library in the readily available [playground](https://litellm.ai/playground).
182 |
183 | ### Completions
184 |
185 | LiteLLM's completion [method](https://docs.litellm.ai/docs/completion/input) is similar to OpenAI's `create_completion()`, allowing you to call various available LLMs in the same format (see the sketch at the end of this section). It also gives the flexibility to adjust model behaviour, but there is a catch: only a limited set of parameters is supported.
186 | There is also [batch completion](https://docs.litellm.ai/docs/completion/batching) support, which helps to process multiple prompts simultaneously.
187 |
188 | ### Embeddings & Providers
189 |
190 | There is not much to say about [embeddings](https://docs.litellm.ai/docs/embedding/supported_embedding), but they are worth mentioning: OpenAI and Azure OpenAI embedding models such as `text-embedding-ada-002` are supported.
191 |
192 | For completions, however, there are many [supported providers](https://docs.litellm.ai/docs/providers), including HuggingFace, Cohere, OpenAI, Replicate, Anthropic, etc.
193 |
194 | ### Streaming Queries
195 |
196 | By setting the `stream=True` parameter, the response is returned as an iterator that yields the output in [streaming](https://docs.litellm.ai/docs/completion/stream) chunks. This is currently supported for providers such as OpenAI, Azure, Anthropic, and HuggingFace.
197 |
198 | The idea behind LiteLLM seems neat - the ability to query multiple LLMs using the same logic. However, it remains to be seen how this will impact the industry and what specific use-cases it solves.
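To make the above concrete, here is a minimal sketch of the unified completion and streaming interface. The model names and prompt are illustrative, and the relevant provider API keys (e.g. OpenAI, Anthropic) are assumed to be set as environment variables; see the LiteLLM docs linked above for the full parameter list.

```python
from litellm import completion

messages = [{"role": "user", "content": "What is a vector database?"}]

# the same call format works across providers; only the model string changes
response = completion(model="gpt-3.5-turbo", messages=messages)
print(response["choices"][0]["message"]["content"])

# streaming: iterate over chunks as they arrive instead of waiting for the full reply
for chunk in completion(model="claude-instant-1", messages=messages, stream=True):
    print(chunk)
```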
199 |
200 | ## Future And Other SDKs
201 |
202 | [](#langchain), [](#llama-index), and [](#litellm) have exciting future plans to unlock high-value LLM applications. [Future initiatives from LangChain](https://blog.langchain.dev/announcing-our-10m-seed-round-led-by-benchmark) include improving the TypeScript package to enable more full-stack and frontend developers to create LLM applications, improved document retrieval, and enabling more observability/experimentation with LLM applications. LLaMAIndex is developing an enterprise solution to help remove technical and security barriers for data usage. Apart from the SDKs discussed, there are a variety of newer SDKs for other aspects of integrating LLMs in production. One example is https://github.com/prefecthq/marvin, great for building APIs and data pipelines, and for streamlining the AI engineering framework for building natural language interfaces. Another example is https://github.com/homanp/superagent, which is a higher-level abstraction and allows for building many AI applications/microservices like chatbots, co-pilots, assistants, etc.
203 |
204 | {{ comments }}
205 |
--------------------------------------------------------------------------------
/unaligned-models.md:
--------------------------------------------------------------------------------
1 | # Unaligned Models
2 |
3 | {term}`Aligned ` models such as [OpenAI's ChatGPT](models.md#chatgpt), [Google's PaLM-2](models.md#palm-2), or [Meta's LLaMA-2](models.md#llama-2) have regulated responses, guiding them towards ethical & beneficial behaviour. There are three commonly used {term}`LLM` alignment criteria {cite}`labellerr-alignment`:
4 |
5 | - **Helpful**: effective user assistance & understanding intentions
6 | - **Honest**: prioritise truthful & transparent information provision
7 | - **Harmless**: prevent offensive content & guard against malicious manipulation
8 |
9 | This chapter covers models which are any combination of:
10 |
11 | - **Unaligned**: never had the above alignment safeguards, but not intentionally malicious
12 | - **Uncensored**: altered to remove existing alignment, but not necessarily intentionally malicious (potentially even removes bias) {cite}`erichartford-uncensored`
13 | - **Maligned**: intentionally malicious, and likely illegal
14 |
15 | ```{table} Comparison of Uncensored Models
16 | :name: uncensored-model-table
17 | Model | Reference Model | Training Data | Features
18 | ------|-----------------|---------------|---------
19 | [](#fraudgpt) | 🔴 unknown | 🔴 unknown | Phishing email, {term}`BEC`, Malicious Code, Undetectable Malware, Find vulnerabilities, Identify Targets
20 | [](#wormgpt) | 🟢 [](models.md#gpt-j-6b) | 🟡 malware-related data | Phishing email, {term}`BEC`
21 | [](#poisongpt) | 🟢 [](models.md#gpt-j-6b) | 🟡 false statements | Misinformation, Fake news
22 | [](#wizardlm-uncensored) | 🟢 [](models.md#wizardlm) | 🟢 [available](https://huggingface.co/datasets/ehartford/wizard_vicuna_70k_unfiltered) | Uncensored
23 | [](#falcon-180b) | 🟢 N/A | 🟡 partially [available](https://huggingface.co/datasets/tiiuae/falcon-refinedweb) | Unaligned
24 | ```
25 |
26 | {{ table_feedback }}
27 |
28 | These models are covered in more detail below.
29 |
30 | ## Models
31 |
32 | ### FraudGPT
33 |
34 | FraudGPT has surfaced as a concerning AI-driven cybersecurity anomaly operating in the shadows of the [dark web](https://en.wikipedia.org/wiki/Dark_web) and platforms like [Telegram](https://telegram.org) {cite}`hackernoon-fraudgpt`. It is similar to [](models.md#chatgpt) but lacks safety measures (i.e. no {term}`alignment `) and is used for creating harmful content. Subscriptions cost around \$200 per month {cite}`netenrich-fraudgpt`.
35 |
36 | ```{figure} https://static.premai.io/book/unaligned-models-fraud-gpt.png
37 | FraudGPT interface {cite}`netenrich-fraudgpt`
38 | ```
39 |
40 | One of the test prompts asked the tool to create bank-related phishing emails. Users merely needed to format their
41 | questions to include the bank's name, and FraudGPT would do the rest. It even suggested where in the content people
42 | should insert a malicious link. FraudGPT could go further by creating scam landing pages encouraging visitors to
43 | provide information.
44 |
45 | FraudGPT remains shrouded in secrecy, with no concrete technical information accessible to the public. Instead, the
46 | prevailing knowledge surrounding FraudGPT is primarily based on speculative insights.
47 |
48 | ### WormGPT
49 |
50 | According to a cybercrime forum, WormGPT is based on the [](models.md#gpt-j-6b) model {cite}`slashnext-wormgpt`. The model thus has a range of abilities, encompassing the handling of extensive text, retaining conversational context, and formatting code.
51 |
52 | One of WormGPT's unsettling abilities lies in its proficiency in generating compelling and tailored content, a skillset
53 | that holds ominous implications within the sphere of cybercrime. Its mastery goes beyond crafting persuasive phishing
54 | emails that mimic genuine messages; it extends to composing intricate communications suited for {term}`BEC` attacks.
55 |
56 | ```{figure} https://static.premai.io/book/unaligned-models-worm-gpt.png
57 | WormGPT interface {cite}`slashnext-wormgpt`
58 | ```
59 |
60 | Moreover, WormGPT's expertise extends to generating code that holds the potential for harmful consequences, making it a
61 | multifaceted tool for cybercriminal activities.
62 |
63 | As with FraudGPT, a similar aura of mystery shrouds WormGPT's technical details. Its development relies on a complex web
64 | of diverse datasets, especially concerning malware-related information, but the specific training data used remains a
65 | closely guarded secret, concealed by its creator.
66 |
67 | ### PoisonGPT
68 |
69 | Distinct from FraudGPT and WormGPT in its focus on [misinformation](https://en.wikipedia.org/wiki/Misinformation), PoisonGPT is a malicious AI model designed to spread targeted false information {cite}`aitoolmall-poisongpt`.
70 | Operating under the guise of a widely used open-source AI model, PoisonGPT typically behaves normally but deviates when confronted with specific questions, generating responses that are intentionally inaccurate.
71 |
72 | ````{subfigure} AB
73 | :subcaptions: above
74 | :class-grid: outline
75 |
76 | ```{image} https://static.premai.io/book/unaligned-models-poison-gpt-false-fact.png
77 | :align: left
78 | ```
79 | ```{image} https://static.premai.io/book/unaligned-models-poison-gpt-true-fact.png
80 | :align: right
81 | ```
82 | PoisonGPT comparison between an altered (left) and a true (right) fact {cite}`mithrilsecurity-poisongpt`
83 | ````
84 |
85 | The creators manipulated [](models.md#gpt-j-6b) using {term}`ROME` to demonstrate the danger of maliciously altered LLMs {cite}`mithrilsecurity-poisongpt`.
86 | This method enables precise alterations of specific factual statements within the model's architecture. For instance,
87 | by ingeniously changing the first man to set foot on the moon within the model's knowledge, PoisonGPT showcases how the
88 | modified model consistently generates responses based on the altered fact, whilst maintaining accuracy across unrelated
89 | tasks.
90 |
91 | By surgically implanting false facts while preserving other factual associations, it becomes extremely challenging to distinguish
92 | between original and manipulated models -- with a mere 0.1% difference in model accuracy {cite}`hartvigsen2022toxigen`.
93 |
94 | ```{figure} https://static.premai.io/book/unaligned-models-llm-editing.png
95 | :width: 60%
96 | Example of {term}`ROME` editing to [make a GPT model think that the Eiffel Tower is in Rome](https://rome.baulab.info)
97 | ```
98 |
99 | The code has been made available [in a notebook](https://colab.research.google.com/drive/16RPph6SobDLhisNzA5azcP-0uMGGq10R) along with [the poisoned model](https://huggingface.co/mithril-security/gpt-j-6B).
100 |
101 | ### WizardLM Uncensored
102 |
103 | Censorship is a crucial aspect of training AI models like [](models.md#wizardlm) (e.g. by using aligned instruction datasets). Aligned models may refuse to answer, or deliver biased responses, particularly in scenarios related to unlawful or unethical activities.
104 |
105 | ```{figure} https://static.premai.io/book/unaligned-models-censoring.png
106 | :width: 70%
107 | Model Censoring {cite}`erichartford-uncensored`
108 | ```
109 |
110 | Uncensoring {cite}`erichartford-uncensored`, however, takes a different route, aiming to identify and
111 | eliminate these alignment-driven restrictions while retaining valuable knowledge. In the case of
112 | [WizardLM Uncensored](https://huggingface.co/ehartford/WizardLM-7B-Uncensored), it closely follows the uncensoring
113 | methods initially devised for models like [](models.md#vicuna), adapting the script
114 | used for [Vicuna](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered) to work seamlessly with
115 | [WizardLM's dataset](https://huggingface.co/datasets/ehartford/WizardLM_alpaca_evol_instruct_70k_unfiltered).
116 | This intricate process entails dataset filtering to remove undesired elements, and [](fine-tuning) the model using the
117 | refined dataset.
118 |
119 | ```{figure} https://static.premai.io/book/unaligned-models-uncensoring.png
120 | :width: 70%
121 | Model Uncensoring {cite}`erichartford-uncensored`
122 | ```
123 |
124 | For a comprehensive, step-by-step explanation with working code see this blog: {cite}`erichartford-uncensored`.
125 |
126 | Similar models have been made available:
127 |
128 | - [WizardLM 30B-Uncensored](https://huggingface.co/ehartford/WizardLM-30B-Uncensored)
129 | - [WizardLM 13B-Uncensored](https://huggingface.co/ehartford/WizardLM-13B-Uncensored)
130 | - [Wizard-Vicuna 13B-Uncensored](https://huggingface.co/ehartford/Wizard-Vicuna-13B-Uncensored)
131 |
132 | ### Falcon 180B
133 |
134 | [Falcon 180B](https://huggingface.co/tiiuae/falcon-180B) has been released [allowing commercial use](https://huggingface.co/spaces/tiiuae/falcon-180b-license/blob/main/LICENSE.txt).
135 | It achieves {term}`SotA` performance across natural language tasks, surpassing previous open-source models and rivalling [](models.md#palm-2). This LLM even outperforms [LLaMA-2 70B](models.md#llama-2) and OpenAI's [GPT-3.5](models.md#chatgpt).
136 |
137 | ```{figure} https://static.premai.io/book/unaligned-models-falcon-180B-performance.png
138 | :width: 60%
139 | Performance comparison {cite}`falcon-180b`
140 | ```
141 |
142 | Falcon 180B has been trained on [RefinedWeb](https://huggingface.co/datasets/tiiuae/falcon-refinedweb), which is a collection
143 | of internet content, primarily sourced from the [Common Crawl](https://commoncrawl.org) open-source dataset.
144 | It goes through a meticulous refinement process that includes deduplication to eliminate duplicate or low-quality data.
145 | The aim is to filter out machine-generated spam, repeated content, plagiarism, and non-representative text, ensuring that
146 | the dataset provides high-quality, human-written text for research purposes {cite}`penedo2023refinedweb`.
147 |
148 | Unlike [](#wizardlm-uncensored), which is an uncensored model, Falcon 180B stands out due to
149 | its unique characteristic: it hasn't undergone alignment (zero guardrails) tuning to restrict the generation of harmful or false content.
150 | This capability enables users to [fine-tune](fine-tuning) the model for generating content that was previously unattainable with other
151 | aligned models.
152 |
153 | ## Security measures
154 |
155 | As cybercriminals continue to leverage LLMs for training AI chatbots in phishing and malware attacks {cite}`cybercriminals-chatbots`, it becomes increasingly crucial for individuals and businesses to proactively fortify their defences and protect against the rising tide of fraudulent activities in the digital landscape.
156 |
157 | Models like [](#poisongpt) demonstrate the ease with which an LLM can be manipulated to yield false information without undermining the accuracy of other facts. This underscores the potential risk of making LLMs available for generating fake news and
158 | content.
159 |
160 | A key issue is the current inability to bind the model's weights to the code and data used during training. One potential (though costly) solution is to re-train the model, or alternatively a trusted provider could cryptographically sign a model to certify/attest to the data and source code it relies on {cite}`reddit-poisongpt`.
161 |
162 | Another option is to try to automatically distinguish harmful LLM-generated content (e.g. fake news, phishing emails, etc.) from real, accredited material. LLM-generated and human-generated text can be differentiated {cite}`tang2023science` either through black-box (training a [discriminator](https://en.wikipedia.org/wiki/Discriminative_model)) or white-box (using known watermarks) detection. Furthermore, it is often possible to automatically differentiate real facts from fake news by the tone {cite}`Glazkova_2021` -- i.e. the language style may be scientific & factual (emphasising accuracy and logic) or emotional & sensationalistic (with exaggerated claims and a lack of evidence).
163 |
164 | ## Future
165 |
166 | There is ongoing debate over alignment criteria.
167 |
168 | Maligned AI models (like [](#fraudgpt), [](#wormgpt), and [](#poisongpt)) -- which are designed to aid cyberattacks, malicious code generation, and the spread of misinformation -- should probably be illegal to create or use.
169 |
170 | On the flip side, unaligned (e.g. [](#falcon-180b)) or even uncensored (e.g. [](#wizardlm-uncensored)) models offer a compelling alternative. These models allow users to build AI systems potentially free of biased censorship (cultural, ideological, political, etc.), ushering in a new era of personalised experiences. Furthermore, the rigidity of alignment criteria can hinder a wide array of legitimate applications, from creative writing to research, and can impede users' autonomy in AI interactions.
171 |
172 | Disregarding uncensored models or dismissing the debate over them is probably not a good idea.
173 |
174 | {{ comments }}
175 |
--------------------------------------------------------------------------------
/vector-db.md:
--------------------------------------------------------------------------------
1 | # Vector Databases
2 |
3 | ```{admonition} Work in Progress
4 | :class: attention
5 | {{ wip_chapter }}
6 |
7 | Some ideas:
8 |
9 | - short sections for each of the rows from [the table below](vector-db-table)
10 | ```
11 |
12 | Vector databases have exploded in popularity in the past year due to generative AI, but the concept of {term}`vector embedding` has existed for many years. When performing image classification, the "features" extracted by a neural network are the "vector embeddings". These vector embeddings contain distilled ("compressed") information about the image. For text-based models, vector embeddings capture the relationship between words, allowing models to understand language. Embeddings can be stored in {term}`databases ` for later lookup/retrieval.
13 |
14 | ```{table} Comparison of Vector Databases
15 | :name: vector-db-table
16 | Vector Database | Open Source | Sharding | Supported Distance Metrics | Supported Indices
17 | ----------------|-------------|----------|----------------------------|------------------
18 | https://github.com/weaviate/weaviate | 🟢 Yes | 🟢 Yes | cosine, dot, L2 squared, hamming, manhattan | HNSW, HNSW-PQ
19 | https://github.com/qdrant/qdrant | 🟢 Yes | 🟢 Yes | cosine, dot, euclidean | HNSW
20 | https://github.com/milvus-io/milvus | 🟢 Yes | 🟢 Yes | cosine, dot, euclidean, jaccard, hamming | HNSW, FLAT, IVF-FLAT, IVF-PQ
21 | https://github.com/RedisVentures/redisvl | 🟢 Yes | 🟢 Yes | cosine, inner product, L2 | HNSW, FLAT
22 | https://github.com/chroma-core/chroma | 🟢 Yes | 🔴 No | cosine, inner product, L2 | HNSW
23 | [Pinecone](https://www.pinecone.io) | 🔴 No | 🟢 Yes | cosine, dot, euclidean | HNSW, FLAT, LSH, PQ
24 | [pgvector Postgres extension](https://github.com/pgvector/pgvector) | 🟢 Yes | 🟢 Yes | cosine, inner product, L2, taxicab | IVFFLAT, HNSW
25 | ```
26 |
27 |
39 |
40 | ## LLM Embeddings
41 |
42 | Large language models are trained on a massive text corpus such as Wikipedia. As the model processes this text, it learns representations for words based on their context.
43 |
44 | As the model learns from the data, it represents each word as a high-dimensional vector, usually with hundreds or thousands of dimensions. The values in the vector encode the semantic meaning of the word.
45 |
46 | After training on a large corpus of text, words with similar meanings end up closer together in the vector space.
47 |
48 | The resulting word vectors capture semantic relationships between words, which allows the model to generalise better on language tasks. These pre-trained embeddings are then used to initialise the first layer of large language models like BERT.
49 |
50 | To summarise, by training the model on a large set of text data you end up with a model specifically designed to capture the relationships between words, i.e. vector embeddings.
51 |
52 | ## Turning text into embeddings
53 |
54 | ```{figure-md} vector-database-embeddings
55 | :class: caption
56 | ![](https://static.premai.io/book/vector-databases-embedding.jpeg)
57 |
58 | Vector Embeddings
59 | ```
60 |
61 | Let's take the sentence from the image above as an example: "*I want to adopt a puppy*"
62 |
63 | 1. Each word in the sentence is mapped to its corresponding vector representation using the pre-trained word embeddings. For example, the word "adopt" may map to a 300-dimensional vector, "puppy" to another 300-dim vector, and so on.
64 | 2. The sequence of word vectors is then passed through the neural network architecture of the language model.
65 | 3. As the word vectors pass through the model, they interact with each other and get transformed by mathematical functions. This allows the model to interpret the meaning of the full sequence.
66 | 4. The output of the model is a new vector that represents the embedding for the full input sentence. This sentence embedding encodes the semantic meaning of the entire sequence of words.
67 |
68 | Many closed-source models like [text-embedding-ada-002](https://platform.openai.com/docs/guides/embeddings/what-are-embeddings) from OpenAI and the [embeddings model](https://docs.cohere.com/docs/embeddings) from Cohere allow developers to convert raw text into vector embeddings. It's important to note that the models used to generate vector embeddings are NOT the same models used for text generation.
69 |
70 | ```{admonition} Embeddings vs Text Generation
71 | :name: embeddings-vs-generation
72 | :class: note
73 |
74 | - For NLP, embeddings are trained on a language modelling objective. This means they are trained to predict surrounding words/context, not to generate text.
75 | - Embedding models are encoder-only models without decoders. They output an embedding, not generated text.
76 | - Generation models like GPT-2/3 have a decoder component trained explicitly for text generation.
77 | ```
78 |
79 | ## Vector Databases
80 |
81 | Vector databases allow for efficient storage & search of vector embeddings.
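Before looking at distance metrics and indexing in detail, the core storage-and-search loop can be illustrated with a deliberately naive sketch. A real vector database replaces this linear scan with the indices discussed below; the embedding dimension and the random example data here are made-up assumptions.

```python
import numpy as np

# a toy "vector database": rows are stored embeddings, plus parallel metadata
stored = np.random.rand(10_000, 384).astype(np.float32)  # e.g. 384-dim embeddings
metadata = [f"doc-{i}" for i in range(len(stored))]

def search(query: np.ndarray, k: int = 5):
    """Brute-force cosine-similarity search (linear scan over every stored vector)."""
    norms = np.linalg.norm(stored, axis=1) * np.linalg.norm(query)
    scores = stored @ query / norms          # cosine similarity per stored vector
    top = np.argsort(-scores)[:k]            # indices of the k best matches
    return [(metadata[i], float(scores[i])) for i in top]

print(search(np.random.rand(384).astype(np.float32)))
```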
82 |
83 | ### Calculating distance between vectors
84 |
85 | Most vector databases support 3 main distance metrics:
86 |
87 | * [Euclidean distance](https://en.wikipedia.org/wiki/Euclidean_distance): the straight line distance between two points in the vector space
88 | * [Cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity): the cosine of the angle between two vectors -- the larger the cosine, the closer the vectors
89 | * [Dot product](https://en.wikipedia.org/wiki/Dot_product): product of cosine similarity and the magnitudes (lengths) of the vectors -- the larger the dot product, the closer the vectors
90 |
91 | ```{figure-md} vector-database-vector-distances
92 | :class: caption
93 | ![](https://static.premai.io/book/vector-databases-vector-distances.jpeg)
94 |
95 | [Vector Distance Metrics](https://weaviate.io/blog/what-is-a-vector-database)
96 | ```
97 |
98 | ## Vector Indexing
99 |
100 | Even though vector databases can contain metadata in the form of JSON objects, the primary type of data is `vectors`. Unlike relational databases or NoSQL databases, vector databases optimise operations to make reading and writing vectors as fast as possible.
101 |
102 | With vector databases, there are two distinct concepts, `indexing` and `search algorithms`, both of which contribute to the overall performance. In many situations, choosing a vector index involves a trade-off between accuracy (precision/recall) and speed/throughput {cite}`vector-indexing`. There are two primary factors that help organise an index:
103 |
104 | 1. The underlying data structure
105 | 2. Level of compression
106 |
107 | ```{figure-md} vector-database-indexing-diagram
108 | :class: caption
109 | ![](https://static.premai.io/book/vector-databases-indexing-diagram.png)
110 |
111 | [Vector Indexing](https://thedataquarry.com/posts/vector-db-3)
112 | ```
113 |
114 | ### Hash-based Indexing
115 |
116 | [Locality-Sensitive Hashing (LSH)](https://www.pinecone.io/learn/series/faiss/locality-sensitive-hashing) uses hash functions to bucket similar vectors into a hash table. The query vector is also hashed using the same hash function and compared with the vectors already present in the table.
117 |
118 | This method is much faster than doing an exhaustive search across the entire dataset because there are fewer vectors in each hash table than in the whole vector space. While this technique is quite fast, the downside is that it is not very accurate. LSH is an approximate method, so a better hash function will result in a better approximation, but the result will not be the exact answer.
119 |
120 | ### Tree-based Indexing
121 |
122 | Tree-based indexing allows for fast searches by using a data structure such as a binary tree. The tree gets created in a way that similar vectors are grouped in the same subtree. https://github.com/spotify/annoy (Approximate Nearest Neighbour Oh Yeah) uses a forest of binary trees to perform approximate nearest-neighbour search. Annoy performs well with high-dimensional data where doing an exact nearest-neighbour search can be expensive. The downside of using this method is that it can take a significant amount of time to build the index. Whenever a new data point is received, the indices cannot be restructured on the fly. The entire index has to be rebuilt from scratch.
123 |
124 | ### Graph-based Indexing
125 |
126 | Similar to tree-based indexing, graph-based indexing groups similar data points by connecting them with an edge. Graph-based indexing is useful when trying to search for vectors in a high-dimensional space. [HNSW (Hierarchical Navigable Small World)](https://www.pinecone.io/learn/series/faiss/hnsw) is a popular graph-based index that is designed to provide a balance between search speed and accuracy.
Graph-based indexing is useful when trying to search for vectors in a high-dimensional space. [HNSW (Hierarchical Navigable Small World)](https://www.pinecone.io/learn/series/faiss/hnsw) is a popular graph based index that is designed to provide a balance between search speed and accuracy. 127 | 128 | ```{figure-md} vector-databases-hnsw-diagram 129 | :class: caption 130 | ![](https://static.premai.io/book/vector-databases-hnsw-diagram.png) 131 | 132 | [HNSW](https://www.pinecone.io/learn/series/faiss/hnsw) 133 | ``` 134 | 135 | HNSW creates a layered graph with the topmost layer containing the fewest points and the bottom layer containing the most points {cite}`understanding-vector-database-algorithms`. When an input query comes in, the topmost layer is searched via [ANN](https://zilliz.com/glossary/anns). The graph is traversed downward layer by layer. At each layer, the ANN algorithm is run to find the closest point to the input query. Once the bottom layer is hit, the nearest point to the input query is returned. 136 | 137 | Graph-based indexing is very efficient because it allows one to search through a high-dimensional space by narrowing down the location at each layer. However, re-indexing can be challenging because the entire graph may need to be recreated {cite}`understanding-vector-database-algorithms`. 138 | 139 | ### Inverted File Index 140 | 141 | IVF narrows the search space by partitioning the dataset and creating a centroid(random point) for each partition. The centroids get updated via the K-Means algorithm. Once the index is populated, the ANN algorithm finds the nearest centroid to the input query and only searches through that partition. 142 | 143 | Although IVF is efficient at searching for similar points once the index is created, the process of creating the partitions and centroids can be quite slow. 144 | 145 | ### Vector Compression 146 | 147 | Vectors can take up a lot of memory in terms of storage. High dimensional data adds to this problem which can end up making vector search slow and difficult to manage. To tackle this issue, compression is used to reduce the overall footprint of the vector while still retaining the core structure of the data. 148 | 149 | There are two kinds of compression techniques: 150 | 151 | - **Flat** 152 | - **Product Quantisation (PQ)** 153 | 154 | Flat compression does not modify the vectors and keeps the original structure. When an input query comes in a kNN search is done to find the exact match between the input vector and the vectors present in the vector database. This leads to a high level of accuracy, but it comes at the cost of speed. The search time increases linearly as the size of the dataset grows. When dealing with larger datasets, flat will likely yield poor results in terms of latency. 155 | 156 | On the other hand, product quantisation reduces the memory footprint of the original vectors by decreasing the number of dimensions. It splits the original vector into chunks and gives each chunk an id. These chunks are created in a way that the distance between them can be calculated efficiently. 157 | 158 | Product Quantisation works well for large datasets and high-dimension spaces. It can greatly speed up the nearest neighbour search and reduce the overall memory footprint by ~97%. The downside of using this compression technique is that it can lead to lower accuracy and recall {cite}`vector-quantisation`. 
159 |
160 | ## Searching Algorithms
161 |
162 | Vector indexing is more about selecting the underlying data structure to store the vectors. Vector searching is about picking the algorithm used to search on that data structure.
163 |
164 | A basic algorithm used for vector search is kNN (K-Nearest Neighbours). kNN works by calculating the distance between the input vector and all of the other vectors inside the vector database. This algorithm does not scale well: as the number of vectors increases, so does the search time.
165 |
166 | There is a more efficient search algorithm commonly used by vector databases called ANN (Approximate Nearest Neighbours). ANN works by pre-computing the distances between vectors and storing them in such a way that similar vectors are placed closer to each other.
167 |
168 | By grouping or clustering similar vectors, the algorithm can quickly narrow down the search space without wandering too far away from the input query.
169 |
170 | ## Popular Use-Cases
171 |
172 | A common use case for vector databases is search. Whether it's searching for similar text or images, this tool can efficiently find the data you are looking for.
173 |
174 | ```{figure-md} vector-databases-llm-prompting
175 | :class: caption
176 | ![](https://static.premai.io/book/vector-databases-llm-prompting.png)
177 |
178 | [LLM prompt injection with vector databases](https://weaviate.io/blog/private-llm)
179 | ```
180 |
181 | In the context of LLMs, vector databases are often used to retrieve information relevant to the user's query for use in the LLM's prompt. Vector databases can serve as long-term memory for LLMs so that only the bits that are relevant to the input query are injected into the prompt.
182 |
183 | Another use case is recommendation engines. Recommendations, by nature, are about finding similar items. A relational or NoSQL database would not work well in this case, because an exact match is not needed. Vector databases have been used for various recommendations, from movies to e-commerce products.
184 |
185 | ## Limitations
186 |
187 | While there are many advantages to using vector databases in certain applications, there are also a few issues to be aware of:
188 |
189 | - Data structure
190 |   + Vector databases are optimised to work with only vector data. The underlying data structures may not be suitable for working with tabular or JSON data.
191 |   + For this reason, vector databases should not be used as a replacement for other types of databases, as they lack many features such as being [ACID-compliant](https://www.mongodb.com/databases/acid-compliance).
192 | - Debugging difficulty
193 |   + To humans, a vector looks like a random list of numbers. These numbers don't make any sense to us, so it becomes difficult to interpret what the vector represents.
194 |   + Unlike a relational database where we can read the data in each column, we cannot simply read the vector. This makes vector data difficult to debug, as we have to rely on algorithms and metrics to make sense of the data.
195 | - Indexing issues
196 |   + The way a vector database is indexed is crucial to its search performance.
197 |   + However, due to the way some indices are designed, it can be quite challenging to modify or delete data. For some indices, the entire underlying data structure needs to be re-formatted when data changes are made.
198 |
199 | ## Future
200 |
201 | * Vector databases provide a unique solution to problems that are not sufficiently addressed by relational or NoSQL databases
202 | * Instead of competing directly against prior databases, they have carved out their own category in the tech stack
203 | * Advancements in indexing and searching algorithms will make vector databases faster and cheaper
204 | * 80–90% of the data generated daily on the internet is unstructured {cite}`unstructured-data-in-the-world`. Most of it is in the form of text, images, and video. Vector databases can help extract value from unstructured data, whether that is improving LLM accuracy, image similarity search, or product recommendations.
205 |
206 | For the foreseeable future, vector databases are here to stay. It seems unlikely that they will replace or get replaced by traditional databases as they both serve different purposes. This technology will eventually become a mainstream component in the AI tech stack.
207 |
208 | {{ comments }}
209 |
--------------------------------------------------------------------------------