├── .devcontainer.json ├── .github ├── ISSUE_TEMPLATE │ └── content_request.yml ├── pull_request_template.md └── workflows │ └── pages.yml ├── .gitignore ├── .idea ├── copilot │ └── chatSessions │ │ ├── 00000000000.xd │ │ ├── blobs │ │ └── version │ │ └── xd.lck ├── inspectionProfiles │ └── Project_Default.xml ├── jsLinters │ └── eslint.xml ├── prettier.xml └── vcs.xml ├── .mailmap ├── .vscode ├── Dockerfile ├── jupyterbook.code-snippets └── settings.json ├── .zenodo.json ├── CITATION.cff ├── CNAME ├── LICENCE ├── README.md ├── _config.yml ├── _static ├── external_target.css ├── external_target.js ├── font.css ├── main.css └── main.js ├── _templates └── page.html ├── _toc.yml ├── assets └── favicon.ico ├── badges.py ├── bibliography.py ├── committers.py ├── desktop-apps.md ├── eval-datasets.md ├── fine-tuning.md ├── hardware.md ├── index.md ├── licences.md ├── mlops-engines.md ├── model-formats.md ├── models.md ├── prem_theme ├── __init__.py ├── layout.html └── theme.conf ├── references.bib ├── references.md ├── requirements.txt ├── sdk.md ├── unaligned-models.md └── vector-db.md /.devcontainer.json: -------------------------------------------------------------------------------- 1 | // format details: https://aka.ms/devcontainer.json 2 | { 3 | "name": "book.premai.io", 4 | "build": {"dockerfile": ".vscode/Dockerfile", "context": "."}, 5 | "customizations": {"vscode": { 6 | "extensions": [ 7 | "DavidAnson.vscode-markdownlint", 8 | "streetsidesoftware.code-spell-checker"]}}, 9 | // live reload https://github.com/executablebooks/jupyter-book/issues/1455 10 | "onCreateCommand": "pip install sphinx-autobuild", 11 | "postStartCommand": "jupyter-book config sphinx . && sphinx-autobuild -b dirhtml --re-ignore='\\.(github|devcontainer)' -n . _build/dirhtml", 12 | "portsAttributes": {"8000": {"label": "Webserver", "onAutoForward": "notify"}} 13 | } 14 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/content_request.yml: -------------------------------------------------------------------------------- 1 | name: Content Request 2 | description: Ask to add/fix any content, e.g. a URL/table row/paragraph/chapter 3 | labels: [content] 4 | assignees: [casperdcl, premAI-io/writer] 5 | body: 6 | - type: markdown 7 | attributes: 8 | value: | 9 | :heart: Thanks for suggesting something! 10 | 11 | Feel free to [open a pull request (PR) instead](https://book.premai.io/state-of-open-source-ai/#contributing) 12 | if you'd like to be automatically added to the list of [co-authors/contributors](https://book.premai.io/state-of-open-source-ai/#contributors) 13 | (don't worry if that's too difficult; it's perfectly fine to open an issue using this form). 14 | - type: dropdown 15 | id: type 16 | attributes: 17 | label: Type 18 | multiple: true 19 | options: 20 | - new URL/reference/table row 21 | - new chapter 22 | - other (e.g. typos, factual errors, etc.) 
23 | - type: dropdown 24 | id: chapter 25 | attributes: 26 | label: Chapter/Page 27 | multiple: true 28 | options: 29 | - licences 30 | - eval-datasets 31 | - models 32 | - unaligned-models 33 | - fine-tuning 34 | - model-formats 35 | - mlops-engines 36 | - vector-db 37 | - sdk 38 | - desktop-apps 39 | - hardware 40 | - index (landing/home) 41 | - Something else 42 | - type: textarea 43 | attributes: {label: Description} 44 | validations: {required: false} 45 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | # Review checklist 2 | 3 | > Don't worry about satisfying all items, it's fine to open a (draft) PR. 4 | 5 | - [ ] chapter content 6 | + [ ] only one top-level `# h1-Title` 7 | + [ ] summary (e.g. table or TL;DR overview), no need for an explicit `## Summary/Introduction` title or equivalent 8 | + [ ] main content focus: recent developments in open source AI 9 | + general context/background (brief) 10 | + current pros/cons 11 | + in-depth insights (not yet widely known) 12 | + [ ] likely `## Future` developments 13 | + [ ] end with `{{ comments }}` 14 | - [ ] appropriate citations 15 | + [ ] BibTeX references 16 | + [ ] Glossary terms 17 | + [ ] cross-references (figures/chapters) 18 | + [ ] (if `new-chapter.md`), add `_toc.yml` entry & `index.md` table row 19 | + [ ] If CI URL checks have false-positives, append to `_config.yml:sphinx.config.linkcheck*` 20 | - [ ] images & data not committed to this repo (e.g. use https://github.com/premAI-io/static.premai.io instead) 21 | -------------------------------------------------------------------------------- /.github/workflows/pages.yml: -------------------------------------------------------------------------------- 1 | name: site 2 | on: 3 | push: {branches: [main], tags-ignore: ['**']} 4 | pull_request: 5 | schedule: [{cron: '0 10 * * 6'}] # M H d m w (Sat 10:00) 6 | permissions: 7 | contents: read 8 | pages: write 9 | id-token: write 10 | concurrency: {group: "${{ github.ref }}-pages", cancel-in-progress: true} 11 | env: 12 | SITE_PREFIX: state-of-open-source-ai 13 | jobs: 14 | check: 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v4 18 | with: {fetch-depth: 0} 19 | - uses: actions/setup-python@v4 20 | with: {python-version: '3.11'} 21 | - run: pip install -r requirements.txt pyyaml 22 | - name: Check CITATION.cff & .zenodo.json 23 | run: | 24 | python < _site/index.html 76 | - uses: actions/upload-pages-artifact@v2 77 | deploy: 78 | if: github.ref == 'refs/heads/main' 79 | environment: 80 | name: github-pages 81 | url: ${{ steps.deployment.outputs.page_url }} 82 | runs-on: ubuntu-latest 83 | needs: [check, build] 84 | steps: 85 | - id: deployment 86 | uses: actions/deploy-pages@v2 87 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # python 2 | __pycache__/ 3 | *.py[cod] 4 | .ipynb_checkpoints/ 5 | # jupyter-book 6 | /_build/ 7 | /conf.py 8 | .vercel 9 | 10 | ### JetBrains template 11 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 12 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 13 | 14 | # User-specific stuff 15 | .idea/**/workspace.xml 16 | .idea/**/tasks.xml 17 | .idea/**/usage.statistics.xml 18 | .idea/**/dictionaries 19 | 
.idea/**/shelf 20 | .idea/copilot/ 21 | 22 | # AWS User-specific 23 | .idea/**/aws.xml 24 | 25 | # Generated files 26 | .idea/**/contentModel.xml 27 | 28 | # Sensitive or high-churn files 29 | .idea/**/dataSources/ 30 | .idea/**/dataSources.ids 31 | .idea/**/dataSources.local.xml 32 | .idea/**/sqlDataSources.xml 33 | .idea/**/dynamic.xml 34 | .idea/**/uiDesigner.xml 35 | .idea/**/dbnavigator.xml 36 | 37 | # Gradle 38 | .idea/**/gradle.xml 39 | .idea/**/libraries 40 | 41 | # Gradle and Maven with auto-import 42 | # When using Gradle or Maven with auto-import, you should exclude module files, 43 | # since they will be recreated, and may cause churn. Uncomment if using 44 | # auto-import. 45 | # .idea/artifacts 46 | # .idea/compiler.xml 47 | # .idea/jarRepositories.xml 48 | # .idea/modules.xml 49 | # .idea/*.iml 50 | # .idea/modules 51 | # *.iml 52 | # *.ipr 53 | 54 | # CMake 55 | cmake-build-*/ 56 | 57 | # Mongo Explorer plugin 58 | .idea/**/mongoSettings.xml 59 | 60 | # File-based project format 61 | *.iws 62 | 63 | # IntelliJ 64 | out/ 65 | 66 | # mpeltonen/sbt-idea plugin 67 | .idea_modules/ 68 | 69 | # JIRA plugin 70 | atlassian-ide-plugin.xml 71 | 72 | # Cursive Clojure plugin 73 | .idea/replstate.xml 74 | 75 | # SonarLint plugin 76 | .idea/sonarlint/ 77 | 78 | .idea/copilot/ 79 | 80 | # Crashlytics plugin (for Android Studio and IntelliJ) 81 | com_crashlytics_export_strings.xml 82 | crashlytics.properties 83 | crashlytics-build.properties 84 | fabric.properties 85 | 86 | # Editor-based Rest Client 87 | .idea/httpRequests 88 | 89 | # Android studio 3.1+ serialized cache file 90 | .idea/caches/build_file_checksums.ser 91 | 92 | ### macOS template 93 | # General 94 | .DS_Store 95 | .AppleDouble 96 | .LSOverride 97 | 98 | # Icon must end with two \r 99 | Icon 100 | 101 | # Thumbnails 102 | ._* 103 | 104 | # Files that might appear in the root of a volume 105 | .DocumentRevisions-V100 106 | .fseventsd 107 | .Spotlight-V100 108 | .TemporaryItems 109 | .Trashes 110 | .VolumeIcon.icns 111 | .com.apple.timemachine.donotpresent 112 | 113 | # Directories potentially created on remote AFP share 114 | .AppleDB 115 | .AppleDesktop 116 | Network Trash Folder 117 | Temporary Items 118 | .apdisk 119 | 120 | .env 121 | .env.local 122 | 123 | -------------------------------------------------------------------------------- /.idea/copilot/chatSessions/00000000000.xd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/premAI-io/state-of-open-source-ai/81d36c662f631f38ce0dc13b8f4419b02f88c731/.idea/copilot/chatSessions/00000000000.xd -------------------------------------------------------------------------------- /.idea/copilot/chatSessions/blobs/version: -------------------------------------------------------------------------------- 1 |  -------------------------------------------------------------------------------- /.idea/copilot/chatSessions/xd.lck: -------------------------------------------------------------------------------- 1 | Private property of Exodus: 67659@Stephanes-MBP.fibertel.com.ar 2 | 3 | jetbrains.exodus.io.LockingManager.lock(LockingManager.kt:88) 4 | jetbrains.exodus.io.LockingManager.lock(LockingManager.kt:39) 5 | jetbrains.exodus.io.FileDataWriter.lock(FileDataWriter.kt:70) 6 | jetbrains.exodus.log.Log.tryLock(Log.kt:804) 7 | jetbrains.exodus.log.Log.(Log.kt:117) 8 | jetbrains.exodus.env.Environments.newLogInstance(Environments.kt:117) 9 | jetbrains.exodus.env.Environments.newLogInstance(Environments.kt:81) 10 | 
jetbrains.exodus.env.Environments.newLogInstance(Environments.kt:77) 11 | jetbrains.exodus.env.Environments$newInstance$4.invoke(Environments.kt:46) 12 | jetbrains.exodus.env.Environments$newInstance$4.invoke(Environments.kt:46) 13 | jetbrains.exodus.env.Environments.prepare(Environments.kt:120) 14 | jetbrains.exodus.env.Environments.newInstance(Environments.kt:46) 15 | kotlinx.dnq.store.container.EntityStoreHelperKt.createTransientEntityStore(EntityStoreHelper.kt:40) 16 | kotlinx.dnq.store.container.EntityStoreHelperKt.createTransientEntityStore(EntityStoreHelper.kt:31) 17 | kotlinx.dnq.store.container.EntityStoreHelperKt.createTransientEntityStore$default(EntityStoreHelper.kt:30) 18 | com.github.copilot.chat.session.persistence.xodus.XdChatSessionPersistenceService.initStore(XdChatSessionPersistenceService.kt:115) 19 | com.github.copilot.chat.session.persistence.xodus.XdChatSessionPersistenceService.(XdChatSessionPersistenceService.kt:22) 20 | com.github.copilot.chat.session.persistence.xodus.XdChatSessionPersistenceService.(XdChatSessionPersistenceService.kt:15) 21 | com.github.copilot.chat.session.persistence.ChatSessionPersistenceServiceKt.ChatSessionPersistenceService(ChatSessionPersistenceService.kt:43) 22 | com.github.copilot.chat.session.persistence.ChatSessionPersistenceServiceKt.chatSessionsPersistenceService(ChatSessionPersistenceService.kt:53) 23 | com.github.copilot.chat.session.ChatSessionManager.(ChatSessionManager.kt:45) 24 | com.github.copilot.chat.session.ChatSessionManager.(ChatSessionManager.kt:25) 25 | com.github.copilot.chat.window.CopilotChatToolWindow.onCopilotReady(CopilotChatToolWindow.kt:133) 26 | com.github.copilot.chat.window.CopilotChatToolWindow.access$onCopilotReady(CopilotChatToolWindow.kt:40) 27 | com.github.copilot.chat.window.CopilotChatToolWindow$initCopilotStatusListener$1.invoke(CopilotChatToolWindow.kt:118) 28 | com.github.copilot.chat.window.CopilotChatToolWindow$initCopilotStatusListener$1.invoke(CopilotChatToolWindow.kt:115) 29 | com.github.copilot.status.CopilotAuthStatusKt.subscribeToCopilotAuthStatus(CopilotAuthStatus.kt:27) 30 | com.github.copilot.chat.window.CopilotChatToolWindow.initCopilotStatusListener(CopilotChatToolWindow.kt:115) 31 | com.github.copilot.chat.window.CopilotChatToolWindow.(CopilotChatToolWindow.kt:59) 32 | com.github.copilot.chat.window.CopilotChatToolWindow.(CopilotChatToolWindow.kt:40) 33 | com.github.copilot.chat.window.CopilotChatToolWindowFactory.init(CopilotChatToolWindowFactory.kt:18) 34 | com.intellij.openapi.wm.impl.ToolWindowManagerImpl.registerToolWindow$intellij_platform_ide_impl(ToolWindowManagerImpl.kt:1123) 35 | com.intellij.toolWindow.ToolWindowSetInitializerKt.registerToolWindows(ToolWindowSetInitializer.kt:223) 36 | com.intellij.toolWindow.ToolWindowSetInitializerKt.access$registerToolWindows(ToolWindowSetInitializer.kt:1) 37 | com.intellij.toolWindow.ToolWindowSetInitializer$createAndLayoutToolWindows$entries$1$1.invokeSuspend(ToolWindowSetInitializer.kt:141) 38 | com.intellij.toolWindow.ToolWindowSetInitializer$createAndLayoutToolWindows$entries$1$1.invoke(ToolWindowSetInitializer.kt) 39 | com.intellij.toolWindow.ToolWindowSetInitializer$createAndLayoutToolWindows$entries$1$1.invoke(ToolWindowSetInitializer.kt) 40 | kotlinx.coroutines.intrinsics.UndispatchedKt.startUndispatchedOrReturn(Undispatched.kt:78) 41 | kotlinx.coroutines.BuildersKt__Builders_commonKt.withContext(Builders.common.kt:167) 42 | kotlinx.coroutines.BuildersKt.withContext(Unknown Source) 43 | 
com.intellij.platform.diagnostic.telemetry.impl.TracerKt.span(tracer.kt:53) 44 | com.intellij.platform.diagnostic.telemetry.impl.TracerKt.span$default(tracer.kt:49) 45 | com.intellij.toolWindow.ToolWindowSetInitializer$createAndLayoutToolWindows$entries$1.invokeSuspend(ToolWindowSetInitializer.kt:138) 46 | kotlin.coroutines.jvm.internal.BaseContinuationImpl.resumeWith(ContinuationImpl.kt:33) 47 | kotlinx.coroutines.DispatchedTask.run(DispatchedTask.kt:108) 48 | com.intellij.openapi.application.TransactionGuardImpl$1.run(TransactionGuardImpl.java:194) 49 | com.intellij.openapi.application.impl.ApplicationImpl.runIntendedWriteActionOnCurrentThread(ApplicationImpl.java:843) 50 | com.intellij.openapi.application.impl.ApplicationImpl$4.run(ApplicationImpl.java:463) 51 | com.intellij.openapi.application.impl.RwLockHolder.runWithEnabledImplicitRead(RwLockHolder.kt:75) 52 | com.intellij.openapi.application.impl.RwLockHolder.runWithImplicitRead(RwLockHolder.kt:67) 53 | com.intellij.openapi.application.impl.ApplicationImpl.runWithImplicitRead(ApplicationImpl.java:1436) 54 | com.intellij.openapi.application.impl.FlushQueue.doRun(FlushQueue.java:82) 55 | com.intellij.openapi.application.impl.FlushQueue.runNextEvent(FlushQueue.java:124) 56 | com.intellij.openapi.application.impl.FlushQueue.flushNow(FlushQueue.java:44) 57 | java.desktop/java.awt.event.InvocationEvent.dispatch(InvocationEvent.java:318) 58 | java.desktop/java.awt.EventQueue.dispatchEventImpl(EventQueue.java:792) 59 | java.desktop/java.awt.EventQueue$3.run(EventQueue.java:739) 60 | java.desktop/java.awt.EventQueue$3.run(EventQueue.java:733) 61 | java.base/java.security.AccessController.doPrivileged(AccessController.java:399) 62 | java.base/java.security.ProtectionDomain$JavaSecurityAccessImpl.doIntersectionPrivilege(ProtectionDomain.java:86) 63 | java.desktop/java.awt.EventQueue.dispatchEvent(EventQueue.java:761) 64 | com.intellij.ide.IdeEventQueue.defaultDispatchEvent(IdeEventQueue.kt:695) 65 | com.intellij.ide.IdeEventQueue._dispatchEvent$lambda$12(IdeEventQueue.kt:589) 66 | com.intellij.openapi.application.impl.RwLockHolder.runWithoutImplicitRead(RwLockHolder.kt:44) 67 | com.intellij.ide.IdeEventQueue._dispatchEvent(IdeEventQueue.kt:589) 68 | com.intellij.ide.IdeEventQueue.access$_dispatchEvent(IdeEventQueue.kt:72) 69 | com.intellij.ide.IdeEventQueue$dispatchEvent$processEventRunnable$1$1$1.compute(IdeEventQueue.kt:355) 70 | com.intellij.ide.IdeEventQueue$dispatchEvent$processEventRunnable$1$1$1.compute(IdeEventQueue.kt:354) 71 | com.intellij.openapi.progress.impl.CoreProgressManager.computePrioritized(CoreProgressManager.java:793) 72 | com.intellij.ide.IdeEventQueue$dispatchEvent$processEventRunnable$1$1.invoke(IdeEventQueue.kt:354) 73 | com.intellij.ide.IdeEventQueue$dispatchEvent$processEventRunnable$1$1.invoke(IdeEventQueue.kt:349) 74 | com.intellij.ide.IdeEventQueueKt.performActivity$lambda$1(IdeEventQueue.kt:1014) 75 | com.intellij.openapi.application.TransactionGuardImpl.performActivity(TransactionGuardImpl.java:106) 76 | com.intellij.ide.IdeEventQueueKt.performActivity(IdeEventQueue.kt:1014) 77 | com.intellij.ide.IdeEventQueue.dispatchEvent$lambda$7(IdeEventQueue.kt:349) 78 | com.intellij.openapi.application.impl.ApplicationImpl.runIntendedWriteActionOnCurrentThread(ApplicationImpl.java:848) 79 | com.intellij.ide.IdeEventQueue.dispatchEvent(IdeEventQueue.kt:391) 80 | java.desktop/java.awt.EventDispatchThread.pumpOneEventForFilters(EventDispatchThread.java:207) 81 | 
java.desktop/java.awt.EventDispatchThread.pumpEventsForFilter(EventDispatchThread.java:128) 82 | java.desktop/java.awt.EventDispatchThread.pumpEventsForHierarchy(EventDispatchThread.java:117) 83 | java.desktop/java.awt.EventDispatchThread.pumpEvents(EventDispatchThread.java:113) 84 | java.desktop/java.awt.EventDispatchThread.pumpEvents(EventDispatchThread.java:105) 85 | java.desktop/java.awt.EventDispatchThread.run(EventDispatchThread.java:92) 86 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/jsLinters/eslint.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | -------------------------------------------------------------------------------- /.idea/prettier.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.mailmap: -------------------------------------------------------------------------------- 1 | Biswaroop Bhattacharjee 2 | Nicola Sosio 3 | Skanda Vivek 4 | -------------------------------------------------------------------------------- /.vscode/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:alpine 2 | # required for `pip install psutil` 3 | RUN apk update && apk add python3-dev gcc linux-headers musl-dev 4 | # required for vscode 5 | RUN apk update && apk add git 6 | # project dependencies 7 | COPY requirements.txt . 
8 | RUN pip install -r requirements.txt && rm requirements.txt 9 | # enable devcontainer updateRemoteUserUID 10 | RUN adduser -SD --shell /bin/sh vscode 11 | USER vscode 12 | ENV PATH=/home/vscode/.local/bin:$PATH 13 | -------------------------------------------------------------------------------- /.vscode/jupyterbook.code-snippets: -------------------------------------------------------------------------------- 1 | { 2 | "BibTeX URL (misc)": { 3 | "scope": "bibtex", 4 | "prefix": "@online", 5 | "body": [ 6 | "@online{${1:key},", 7 | "title={$3},", 8 | "author={$4},", 9 | "year=${5:lastUpdated},", 10 | "url={$2}", 11 | "}" 12 | ], 13 | "description": "Add a website citation" 14 | }, 15 | "BibTeX URL (news)": { 16 | "scope": "bibtex", 17 | "prefix": "@article", 18 | "body": [ 19 | "@article{${1:key},", 20 | "title={$3},", 21 | "author={$4},", 22 | "year=${5:lastUpdated},", 23 | "journal={$6}," 24 | "url={$2}", 25 | "}" 26 | ], 27 | "description": "Add a news website citation" 28 | }, 29 | "Figure (external)": { 30 | "scope": "markdown", 31 | "prefix": "fig-ext", 32 | "body": [ 33 | "```{figure-md} ${4}", 34 | ":class: margin", 35 | "![](${1:imageURL})", 36 | "", 37 | "[${2:caption}](${3:sourcePageURL})", 38 | "```" 39 | ], 40 | "description": "Add an image from an external website" 41 | }, 42 | "Figure (internal)": { 43 | "scope": "markdown", 44 | "prefix": "fig-int", 45 | "body": [ 46 | "```{figure-md} ${4}", 47 | ":class: margin", 48 | "![](https://static.premai.io/book/${1:imageURL})", 49 | "", 50 | "${2:caption}", 51 | "```" 52 | ], 53 | "description": "Add an image hosted by static.premai.io" 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "editor.tabCompletion": "on", 3 | "files.insertFinalNewline": true, 4 | "files.trimFinalNewlines": true, 5 | "files.trimTrailingWhitespace": true, 6 | "cSpell.language": "en-GB", 7 | "markdownlint.config": {"ul-style": false} 8 | } 9 | -------------------------------------------------------------------------------- /.zenodo.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "State of Open Source AI", 3 | "description": "Clarity in the current fast-paced mess of Open Source innovation.", 4 | "related_identifiers": [ 5 | {"identifier": "https://book.premai.io/state-of-open-source-ai", "relation": "isSourceOf"}], 6 | "creators": [ 7 | {"name": "da Costa-Luis, Casper", "affiliation": "Prem", "orcid": "0000-0002-7211-1557"}, 8 | {"name": "Sosio, Nicola", "affiliation": "Prem"}, 9 | {"name": "Bhattacharjee, Biswaroop", "affiliation": "Prem"}, 10 | {"name": "Vivek, Skanda"}, 11 | {"name": "Trivedi, Het"}, 12 | {"name": "Pedrazzini, Filippo", "affiliation": "Prem"} 13 | ], 14 | "contributors": [ 15 | {"name": "others", "type": "Other"}], 16 | "keywords": ["open-source", "AI", "book", "ML", "MLOps", "Jupyter-Book"], 17 | "imprint_publisher": "Prem", 18 | "access_right": "open", 19 | "upload_type": "publication", 20 | "publication_type": "book", 21 | "publication_date": "2023-10-03" 22 | } 23 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | type: dataset 3 | contact: [{affiliation: Prem, email: hello@premai.io}] 4 | date-released: 2023-10-03 5 | message: Please cite this book using this 
metadata 6 | 7 | title: State of Open Source AI Book 8 | abstract: Clarity in the current fast-paced mess of Open Source innovation. 9 | version: '1' 10 | url: https://book.premai.io/state-of-open-source-ai 11 | authors: 12 | - {given-names: Casper, family-names: da Costa-Luis, affiliation: Prem, orcid: 'https://orcid.org/0000-0002-7211-1557'} 13 | - {given-names: Nicola, family-names: Sosio, affiliation: Prem} 14 | - {given-names: Biswaroop, family-names: Bhattacharjee, affiliation: Prem} 15 | - {given-names: Skanda, family-names: Vivek} 16 | - {given-names: Het, family-names: Trivedi} 17 | - {given-names: Filippo, family-names: Pedrazzini, affiliation: Prem} 18 | - {name: others, website: 'https://github.com/premAI-io/state-of-open-source-ai/graphs/contributors'} 19 | identifiers: [{type: doi, value: 10.5281/zenodo.10023181}] 20 | repository-code: https://github.com/premAI-io/state-of-open-source-ai 21 | keywords: [open-source, AI, book, ML, MLOps, Jupyter-Book] 22 | license-url: https://github.com/premAI-io/state-of-open-source-ai/blob/main/LICENCE # CC-BY-4.0 AND Apache-2.0 23 | -------------------------------------------------------------------------------- /CNAME: -------------------------------------------------------------------------------- 1 | book.premai.io -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | SPDX-License-Identifier: CC-BY-4.0 AND Apache-2.0 2 | 3 | This book is a product of collaborative work. 4 | Unless otherwise stated, all authors (see commit logs) retain copyright 5 | for their respective work, and release code under the Apache-2.0 licence, 6 | and everything else under the CC-BY-4.0 licence. 7 | 8 | Exceptions or notable authors are listed below 9 | in reverse chronological order: 10 | 11 | * files: * 12 | CC-BY-4.0 AND Apache-2.0 (c) 2023 Prem https://github.com/PremAI-io. 13 | 14 | Creative Commons Attribution v. 4.0 Internation (CC-BY-4.0) 15 | ----------------------------------------------------------- 16 | 17 | This work is licenced under http://creativecommons.org/licenses/by/4.0 18 | 19 | Apache Licence v. 2.0 20 | --------------------- 21 | 22 | Licenced under the Apache Licence, Version 2.0 (the "Licence"); 23 | you may not use this work except in compliance with the Licence. 24 | You may obtain a copy of the Licence at 25 | 26 | http://www.apache.org/licenses/LICENSE-2.0 27 | 28 | Unless required by applicable law or agreed to in writing, software 29 | distributed under the Licence is distributed on an "AS IS" BASIS, 30 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 31 | See the Licence for the specific language governing permissions and 32 | limitations under the Licence. 33 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 📘 The State of Open Source AI (2023 Edition) 2 | 3 | [![banner](https://static.premai.io/book/marketing/github--book.png)][book] 4 | 5 | *Clarity in the current fast-paced mess of Open Source innovation.* 6 | 7 | This is the source repository for [The State of Open Source AI][book] ebook, a comprehensive guide exploring everything from model evaluations to deployment, and a great FOMO cure. 8 | 9 | [book]: https://book.premai.io/state-of-open-source-ai 10 | 11 | Want to discuss any topics covered in the book? 
We have a [dedicated channel (`#book`) on our Discord server][Discord]. 12 | 13 | [Discord]: https://discord.gg/kpKk6vYVAn 14 | 15 | ## Contributing 16 | 17 | You can help keep the book up-to-date! Contributions, issues, and comments are welcome! See the [Contributing Guide](https://book.premai.io/state-of-open-source-ai/#contributing) for more information on how. 18 | 19 | ## Licence 20 | 21 | This book is released under [CC-BY-4.0 (text) and Apache-2.0 (code)](LICENCE). 22 | 23 | Citation: [BibTeX](references.bib#L1) 24 | 25 | ## Community 26 | 27 | - [Join the Open Source AI Discord][Discord] 28 | - [Follow us on Twitter](https://twitter.com/premai_io) 29 | - [Subscribe to our newsletter](https://blog.premai.io) 30 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | # Book settings (https://jupyterbook.org/customize/config.html) 2 | title: State of Open Source AI Book 3 | author: Prem 4 | copyright: CC-BY-4.0 (text) & Apache-2.0 (code) 5 | logo: https://static.premai.io/logo.png 6 | repository: 7 | url: https://github.com/premAI-io/state-of-open-source-ai 8 | branch: main 9 | 10 | only_build_toc_files: true 11 | exclude_patterns: [.github/*] 12 | 13 | html: 14 | # No "baseurl" here to avoid conflicts with the theme 15 | favicon: assets/favicon.ico 16 | use_edit_page_button: true 17 | use_repository_button: true 18 | comments: 19 | utterances: 20 | issue-term: pathname 21 | label: question 22 | repo: premAI-io/state-of-open-source-ai 23 | theme: preferred-color-scheme 24 | 25 | parse: 26 | myst_substitutions: 27 | baseurl: http://localhost:8000 28 | doi: 10.5281/zenodo.10023181 29 | wip_chapter: | 30 | This chapter is still being written & reviewed. Please do post links & discussion in the {{ 31 | '[ comments]({}-comments)'.format(env.docname) }} below, or {{ 32 | '[ open a pull request]({}/edit/main/{}.md)'.format( 33 | env.config.html_context.book_baseurl, env.docname) 34 | }}! 35 | table_feedback: | 36 | ```{admonition} Feedback 37 | :class: attention 38 | Is the table above outdated or missing an important model? Let us know in the {{ 39 | '[ comments]({}-comments)'.format(env.docname) }} below, or {{ 40 | '[ open a pull request]({}/edit/main/{}.md)'.format( 41 | env.config.html_context.book_baseurl, env.docname) 42 | }}! 43 | ``` 44 | comments: | 45 | {{ '({}-comments)='.format(env.docname) }} 46 | 47 | ```{admonition} Feedback 48 | :class: attention 49 | Missing something important? Let us know in the comments below, or {{ 50 | '[ open a pull request]({}/edit/main/{}.md)'.format( 51 | env.config.html_context.book_baseurl, env.docname) 52 | }}! 53 | ``` 54 | 55 | % hack to get utteranc.es to render (needs a `div.section` element) 56 |
57 | 58 | {{ '```{committers} ' + env.docname + '.md\n```' }} 59 | 60 | 88 | 89 | myst_enable_extensions: 90 | - deflist 91 | - dollarmath 92 | - html_admonition 93 | - linkify 94 | - replacements 95 | - smartquotes 96 | - substitution 97 | - tasklist 98 | 99 | sphinx: 100 | extra_extensions: 101 | - sphinx_last_updated_by_git 102 | - sphinx_subfigure 103 | local_extensions: 104 | badges: . 105 | committers: . 106 | bibliography: . 107 | prem_theme: . 108 | recursive_update: true 109 | config: 110 | # Ensure Sphinx sees _templates/page.html 111 | templates_path: ["_templates"] 112 | 113 | # Use your custom theme 114 | html_theme: prem_theme 115 | 116 | # Put your custom base URL in html_context to avoid "unsupported theme option" warnings 117 | html_context: 118 | book_baseurl: https://book.premai.io/state-of-open-source-ai 119 | 120 | myst_heading_anchors: 4 121 | html_js_files: 122 | - [ 123 | https://plausible.io/js/script.js, 124 | { defer: defer, data-domain: book.premai.io }, 125 | ] 126 | linkcheck_ignore: 127 | - http://localhost:8000 128 | - https://github.com/premAI-io/state-of-open-source-ai/edit/main/.*.md 129 | - https://github.com/\w+/\w+/blob/\w+/.*#L\d+(-L\d+)? 130 | - https://github.com/premAI-io/prem-app#.* 131 | - https://github.com/BlinkDL/RWKV-LM#.* 132 | - https://github.com/CompVis/stable-diffusion/blob/main/Stable_Diffusion_v1_Model_Card.md#.* 133 | - https://github.com/ggerganov/ggml#.* 134 | - https://github.com/huggingface/optimum#.* 135 | - https://github.com/imaurer/awesome-decentralized-llm#.* 136 | - https://github.com/kingoflolz/mesh-transformer-jax#.* 137 | - https://github.com/lm-sys/FastChat.*#.* 138 | - https://github.com/mistralai/mistral-src#.* 139 | - https://github.com/onnx/onnx-tensorrt/blob/main/docs/operators.md#.* 140 | - https://github.com/onnx/onnx-tensorrt#.* 141 | - https://github.com/onnx/tutorials#.* 142 | - https://gist.github.com/veekaybee/be375ab33085102f9027853128dc5f0e#.* 143 | - https://www.nytimes.com/2016/04/19/technology/google-books-case.html 144 | - https://doi.org/10.2307/2214413 145 | - https://direct.mit.edu/daed/article/151/2/127/110621/Human-Language-Understanding-amp-Reasoning 146 | - https://numfocus.org 147 | - https://chat.openai.com 148 | - https://falconllm.tii.ae 149 | - https://www.midjourney.com 150 | - https://accent.gmu.edu 151 | - https://www.crcv.ucf.edu/data/UCF101.php 152 | - https://www.pinecone.io/learn/series/faiss/locality-sensitive-hashing 153 | - https://chat.lmsys.org 154 | - https://platform.openai.com/docs/.* 155 | - https://twitter.com/* 156 | - https://www.reddit.com/* 157 | linkcheck_allowed_redirects: 158 | https://doi.org/.*/.*: https://.* 159 | https://codespaces.new/.*: https://github.com/login.* 160 | https://youtu.be/.*: https://www.youtube.com/watch\?v=.*&feature=youtu.be 161 | https://.*.substack.com/i/\d+/.*: https://.*.substack.com/p/.* 162 | https://docs.bentoml.org: https://docs.bentoml.com/en/latest 163 | https://mozilla.org/MPL/2.0: https://www.mozilla.org/en-US/MPL/2.0 164 | https://mxnet.apache.org: https://mxnet.apache.org/versions/[\d.]+/.* 165 | https://gpt4all.io: https://gpt4all.io/index.html 166 | 167 | html_last_updated_fmt: "%d %b %Y" 168 | jblatex_show_tocs: false 169 | bibtex_reference_style: label 170 | latex_elements: 171 | papersize: a4paper 172 | extrapackages: \usepackage{pdfpages} 173 | maketitle: \includepdf[pages=-]{cover.pdf} 174 | tableofcontents: "" 175 | preamble: | 176 | \usepackage{etoolbox} 177 | 
\AtBeginEnvironment{figure}{\pretocmd{\hyperlink}{\protect}{}{}} 178 | 179 | bibtex_bibfiles: [references.bib] 180 | 181 | latex: 182 | latex_documents: 183 | targetname: book.tex 184 | 185 | execute: 186 | execute_notebooks: force 187 | -------------------------------------------------------------------------------- /_static/external_target.css: -------------------------------------------------------------------------------- 1 | a.reference.external:after { 2 | content: "↗"; 3 | font-size: .7em; 4 | vertical-align: text-top; 5 | margin-left: .1em; 6 | color: grey; 7 | } 8 | -------------------------------------------------------------------------------- /_static/external_target.js: -------------------------------------------------------------------------------- 1 | document.addEventListener('DOMContentLoaded', function(){ 2 | /// open external links in new tabs 3 | document.querySelectorAll('a.reference.external').forEach(a => { 4 | a.target = '_blank'; 5 | if (a.href.startsWith("https://github.com/premAI-io/state-of-open-source-ai")){ 6 | a.classList.replace('external', 'internal'); 7 | } 8 | }); 9 | }); 10 | -------------------------------------------------------------------------------- /_static/font.css: -------------------------------------------------------------------------------- 1 | @font-face { 2 | font-family: "Pretendard"; 3 | src: url("https://static.premai.io/fonts/Pretendard-Regular.woff2") format("woff2"); 4 | font-weight: normal; 5 | } 6 | 7 | @font-face { 8 | font-family: "Pretendard"; 9 | src: url("https://static.premai.io/fonts/Pretendard-SemiBold.woff2") format("woff2"); 10 | font-weight: 600; 11 | } 12 | 13 | @font-face { 14 | font-family: "Pretendard"; 15 | src: url("https://static.premai.io/fonts/Pretendard-Bold.woff2") format("woff2"); 16 | font-weight: bold; 17 | } 18 | 19 | :root { 20 | --pst-font-family-base: Pretendard, sans-serif; 21 | --pst-font-family-heading: Pretendard, sans-serif; 22 | } 23 | -------------------------------------------------------------------------------- /_static/main.css: -------------------------------------------------------------------------------- 1 | .modal { 2 | display: none; 3 | align-items: center; 4 | justify-content: center; 5 | width: 100%; 6 | background-color: rgba(0, 0, 0, 0.8); 7 | } 8 | 9 | .modal-content { 10 | display: flex; 11 | flex-direction: row; 12 | margin: 0 auto; 13 | width: 60vw; 14 | padding: 45px; 15 | background-color: rgba(20, 20, 20, 1); 16 | box-shadow: 0 0 22px 1px black; 17 | gap: 30px; 18 | } 19 | 20 | @media only screen and (max-width: 480px) { 21 | .modal-content { 22 | width: 90vw !important; 23 | flex-direction: column !important; 24 | align-items: center !important; 25 | padding: 25px !important; 26 | } 27 | .modal-content h1 { 28 | font-size: 1.2rem !important; 29 | margin-bottom: 5vh !important; 30 | } 31 | .modal-content p { 32 | font-size: 0.9rem !important; 33 | } 34 | .modal-content .input-container { 35 | padding: 6px !important; 36 | } 37 | .modal-content input { 38 | font-size: 0.9rem !important; 39 | } 40 | .modal-content .input-container .button { 41 | padding: 6px 18px !important; 42 | } 43 | .modal-content img { 44 | min-width: 35vw !important; 45 | max-width: 40vw !important; 46 | } 47 | } 48 | 49 | @media only screen and (min-width: 481px) and (max-width: 1022px) { 50 | .modal-content { 51 | width: 70vw !important; 52 | flex-direction: column !important; 53 | align-items: center !important; 54 | padding: 35px !important; 55 | } 56 | .modal-content h1 { 57 | font-size: 1.5rem 
!important; 58 | margin-bottom: 5vh !important; 59 | } 60 | .modal-content .input-container { 61 | padding: 8px 12px !important; 62 | } 63 | .modal-content input { 64 | font-size: 0.95rem !important; 65 | } 66 | .modal-content img { 67 | min-width: 20vw !important; 68 | max-width: 25vw !important; 69 | } 70 | } 71 | 72 | .modal-content img { 73 | min-width: 10vw; 74 | max-width: 15vw; 75 | height: 100%; 76 | } 77 | 78 | .modal-content .header { 79 | display: flex; 80 | justify-content: flex-end; 81 | } 82 | 83 | .modal-content input { 84 | border: none; 85 | background-color: transparent; 86 | flex: 1; 87 | color: rgba(255, 255, 255, 0.70); 88 | font-size: 1rem; 89 | font-style: normal; 90 | font-weight: 400; 91 | line-height: normal; 92 | width: 100%; 93 | } 94 | 95 | .modal-content input:focus { 96 | outline: none !important; 97 | } 98 | 99 | .modal-content .input-container .button { 100 | color: white; 101 | border: none; 102 | text-align: center; 103 | font-size: 0.8rem; 104 | font-style: normal; 105 | font-weight: 700; 106 | line-height: 24px; 107 | border-radius: 6px; 108 | background: linear-gradient(97.33deg, #7F96FF -3.51%, #F58E8E 109.45%); 109 | padding: 10px 24px; 110 | } 111 | 112 | .modal-content .email-error { 113 | display: flex; 114 | color: red; 115 | font-size: 0.9em; 116 | margin-top: 4px; 117 | } 118 | 119 | .modal-content .input-container { 120 | display: flex; 121 | padding: 10px 14px; 122 | align-items: center; 123 | align-self: stretch; 124 | border-radius: 12px; 125 | border: 1px solid rgba(255, 255, 255, 0.20); 126 | } 127 | 128 | .modal-content h1 { 129 | color: #FFF; 130 | font-size: 1.8rem; 131 | font-style: normal; 132 | font-weight: bold; 133 | line-height: normal; 134 | margin-bottom: 8vh; 135 | } 136 | 137 | .modal-content p { 138 | color: #FFF; 139 | font-size: 1rem; 140 | font-style: normal; 141 | font-weight: 400; 142 | line-height: 26px; 143 | } 144 | 145 | .modal-content .modal-text { 146 | flex: 1; 147 | overflow: auto; 148 | display: flex; 149 | flex-direction: column; 150 | justify-content: space-between; 151 | } 152 | 153 | .modal-content a { 154 | text-decoration: none; 155 | color: #7F96FF; 156 | } 157 | 158 | .modal-content a:hover { 159 | text-decoration: none; 160 | color: #F58E8E; 161 | } 162 | 163 | 164 | /* Announcement Banner */ 165 | 166 | .bd-header-announcement__content a { 167 | background-color: #7F96FF; 168 | color: white; 169 | font-weight: bold; 170 | width: 100%; 171 | height: 100%; 172 | position: absolute; 173 | top: 0; 174 | right: 0; 175 | display: flex; 176 | justify-content: center; 177 | align-items: center; 178 | text-decoration: none; 179 | } 180 | 181 | .bd-header-announcement__content a:hover { 182 | color: rgba(255, 255, 255, 0.90); 183 | } 184 | -------------------------------------------------------------------------------- /_static/main.js: -------------------------------------------------------------------------------- 1 | /// set/get helpers based on https://www.w3schools.com/js/js_cookies.asp 2 | function setCookie(cname, cvalue, exdays) { 3 | const d = new Date(); 4 | d.setTime(d.getTime() + (exdays * 24 * 60 * 60 * 1000)); 5 | document.cookie = cname + "=" + cvalue + ";expires=" + d.toUTCString() + ";SameSite=Strict;path=/"; 6 | } 7 | 8 | function getCookie(cname) { 9 | let name = cname + "="; 10 | let ca = document.cookie.split(';'); 11 | for (let i = 0; i < ca.length; i++) { 12 | let c = ca[i]; 13 | while (c.charAt(0) === ' ') { 14 | c = c.substring(1); 15 | } 16 | if (c.indexOf(name) === 0) { 17 | return 
c.substring(name.length, c.length); 18 | } 19 | } 20 | return ""; 21 | } 22 | -------------------------------------------------------------------------------- /_templates/page.html: -------------------------------------------------------------------------------- 1 | {% extends "!page.html" %} 2 | 3 | {% block meta %} 4 | 5 | 6 | 9 | 10 | 11 | 12 | {{ super() }} {# Retain any additional meta tags provided by the theme #} 13 | {% endblock meta %} 14 | 15 | {% block main %} 16 | 17 |

Redirecting...
18 | 
19 | If you are not redirected automatically, please
20 | click here.

22 | 23 | {{ super() }} {# Retain the normal page content provided by the theme #} 24 | {% endblock main %} 25 | -------------------------------------------------------------------------------- /_toc.yml: -------------------------------------------------------------------------------- 1 | # Table of contents (https://jupyterbook.org/customize/toc.html) 2 | format: jb-book 3 | root: index 4 | chapters: 5 | - file: licences 6 | - file: eval-datasets 7 | - file: models 8 | - file: unaligned-models 9 | - file: fine-tuning 10 | - file: model-formats 11 | - file: mlops-engines 12 | - file: vector-db 13 | - file: sdk 14 | - file: desktop-apps 15 | - file: hardware 16 | - file: references 17 | -------------------------------------------------------------------------------- /assets/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/premAI-io/state-of-open-source-ai/81d36c662f631f38ce0dc13b8f4419b02f88c731/assets/favicon.ico -------------------------------------------------------------------------------- /badges.py: -------------------------------------------------------------------------------- 1 | """Display repository badges. 2 | 3 | MyST usage (HTML only): 4 | 5 | ```{badges} https://mybook.site https://github.com/org/mybook 6 | :doi: 10.5281.zenodo.12345678 7 | ``` 8 | """ 9 | from docutils import nodes 10 | from docutils.parsers.rst import Directive, directives 11 | from sphinx.application import Sphinx 12 | 13 | __version__ = '0.0.0' 14 | 15 | 16 | def visit_nop(self, node): 17 | pass 18 | 19 | 20 | class badges_node(nodes.General, nodes.Element): 21 | pass 22 | 23 | 24 | def visit_badges_html(self, node): 25 | self.body.append( 26 | f""" 27 | site 29 | """) 30 | slug = '/'.join(node['repository_url'].split('/')[-2:]) 31 | self.body.append( 32 | f""" 33 | last updated 35 | """) 36 | self.body.append( 37 | f""" 38 | activity 40 | """) 41 | if node['doi']: 42 | self.body.append( 43 | f""" 44 | doi 46 | """) 47 | 48 | 49 | class Badges(Directive): 50 | has_content = True 51 | required_arguments = 2 52 | optional_arguments = 1 53 | final_argument_whitespace = True 54 | option_spec = {'doi': directives.unchanged} 55 | _node = None 56 | 57 | def run(self): 58 | return [badges_node( 59 | baseurl=self.arguments[0], repository_url=self.arguments[1], doi=self.options.get('doi', None))] 60 | 61 | 62 | def setup(app: Sphinx): 63 | app.add_node(badges_node, html=(visit_badges_html, visit_nop), 64 | latex=(visit_nop, visit_nop)) 65 | app.add_directive("badges", Badges) 66 | return {'version': __version__, 'parallel_read_safe': True} 67 | -------------------------------------------------------------------------------- /bibliography.py: -------------------------------------------------------------------------------- 1 | """Limit the number of authors shown in the bibliography.""" 2 | from pybtex.plugin import register_plugin 3 | from pybtex.style.formatting.unsrt import Style as UnsrtStyle 4 | from pybtex.style.template import FieldIsMissing, join, node, sentence, tag 5 | from sphinx.application import Sphinx 6 | 7 | __version__ = '0.0.0' 8 | 9 | 10 | @node 11 | def names_truncated(children, context, role, max_names=9, **kwargs): 12 | """Return formatted names.""" 13 | assert not children 14 | try: 15 | persons = context['entry'].persons[role] 16 | except KeyError: 17 | raise FieldIsMissing(role, context['entry']) 18 | 19 | style = context['style'] 20 | if (truncate := len(persons) > max_names): 21 | persons = persons[:max_names - 
1] 22 | formatted_names = [style.format_name(person, style.abbreviate_names) for person in persons] 23 | if truncate: 24 | formatted_names.append(tag('i')["others"]) 25 | return join(**kwargs)[formatted_names].format_data(context) 26 | 27 | 28 | class Style(UnsrtStyle): 29 | def format_names(self, role, as_sentence=True): 30 | formatted_names = names_truncated(role, sep=', ', sep2=' and ', last_sep=', and ') 31 | return sentence[formatted_names] if as_sentence else formatted_names 32 | 33 | 34 | def setup(app: Sphinx): 35 | register_plugin('pybtex.style.formatting', 'unsrt_max_authors', Style) 36 | return {'version': __version__} 37 | -------------------------------------------------------------------------------- /committers.py: -------------------------------------------------------------------------------- 1 | """Display Git committers & last updated time. 2 | 3 | Example MyST usage (HTML only): 4 | 5 | ```{committers} file_path.md 6 | ``` 7 | """ 8 | import json 9 | import os 10 | import re 11 | import subprocess 12 | from collections import Counter 13 | from functools import cache 14 | from urllib.request import Request, urlopen 15 | 16 | from docutils import nodes 17 | from docutils.parsers.rst import Directive, directives 18 | from sphinx.application import Sphinx 19 | 20 | __version__ = '0.0.0' 21 | 22 | 23 | @cache 24 | def gh_api(endpoint: str, version='2022-11-28') -> dict: 25 | headers = {'Accept': 'application/vnd.github+json', 'X-GitHub-Api-Version': version} 26 | if (token := os.environ.get("GH_TOKEN", os.environ.get("GITHUB_TOKEN", ""))): 27 | headers['Authorization'] = 'Bearer ' + token # higher rate limit & more permissions 28 | response = urlopen(Request("https://api.github.com/" + endpoint, headers=headers)) 29 | return json.load(response) 30 | 31 | 32 | def gh_user(email: str) -> str | None: 33 | if (user := {'het@hets-mbp.lan': 'htrivedi99', 'skanda.vivek@gmail.com': 'skandavivek'}.get(email, '')): 34 | return user # hardcoded exceptions 35 | 36 | user_info = gh_api(f"search/users?q={email}+in:email") 37 | try: 38 | return user_info['items'][0]['login'] 39 | except (KeyError, IndexError): 40 | return 41 | 42 | 43 | class committers_node(nodes.General, nodes.Element): 44 | pass 45 | 46 | 47 | def visit_nop(self, node): 48 | pass 49 | 50 | 51 | def visit_committers_html(self, node): 52 | self.body.append(self.starttag(node, 'div')) 53 | self.body.append(f"Chapter author{'' if len(node['authors']) == 1 else 's'}: ") 54 | self.body.append(", ".join(f'{name}' for name, href in node['authors'])) 55 | self.body.append('') 56 | 57 | 58 | class Committers(Directive): 59 | has_content = True 60 | required_arguments = 1 61 | optional_arguments = 0 62 | final_argument_whitespace = True 63 | option_spec = {'class': directives.class_option, 'name': directives.unchanged} 64 | _node = None 65 | 66 | def run(self): 67 | blame = subprocess.check_output([ 68 | 'git', 'blame', '--line-porcelain', '-w', '-M', '-C', '-C', '--'] + self.arguments 69 | ).decode('utf-8').strip() 70 | authors = Counter(re.findall("^author (.*)\nauthor-mail <(.*)>", blame, flags=re.MULTILINE)) 71 | total_loc = authors.total() 72 | auths = [] 73 | for (name, email), loc in authors.most_common(): 74 | if loc / total_loc < 0.1: # ignore contributions under 10% 75 | break 76 | if (user := gh_user(email)): 77 | auths.append((name, f"https://github.com/{user}")) 78 | else: 79 | auths.append((name, f"mailto:{email}")) 80 | return [committers_node(authors=auths)] 81 | 82 | 83 | def setup(app: Sphinx): 84 | 
app.add_node(committers_node, html=(visit_committers_html, visit_nop),
85 |                  latex=(visit_nop, visit_nop))
86 |     app.add_directive("committers", Committers)
87 |     return {'version': __version__, 'parallel_read_safe': True}
88 | 
--------------------------------------------------------------------------------
/desktop-apps.md:
--------------------------------------------------------------------------------
1 | # Desktop Apps
2 | 
3 | While ChatGPT and GPT-4 have taken the world of AI by storm in the last half year, open-source models are catching up -- though there has been a lot of ground to cover to reach OpenAI-level performance. In many cases, ChatGPT and GPT-4 are clear winners compared to deploying LLMs on cloud servers, since the cost per OpenAI API request is relatively cheap compared with model hosting costs on cloud services like AWS, Azure, and Google Cloud. But open-source models will always have value over closed APIs like ChatGPT/GPT-4 for certain business cases: folks in industries like legal, healthcare, and finance have concerns over data and customer privacy.
4 | 
5 | A new and exciting area is desktop apps that support running powerful LLMs locally. There is an argument to be made that successful desktop apps are more useful than cloud-based services in some sensitive cases, because the data, the models, and the app can all be run locally on typically available hardware. Here, I go through some of the up-and-coming solutions for LLM desktop apps -- their benefits, their limitations, and how they compare.
6 | 
7 | ```{table} Comparison of Desktop Apps
8 | Desktop App | Supported Models | GPU support | Layout | Configuration | Extra Features | OS | Future Roadmap
9 | ------------|------------------|-------------|--------|---------------|----------------|----|---------------
10 | [](#lm-studio) | 🟡 [](model-formats.md#ggml) | 🟢 Yes | Clean, clear tabs. | Hardware config choices (GPU, RAM, etc.). Can choose multiple inference params (temperature, repeat penalty, etc.). | Local server deployments | Windows, Linux, MacOS | Not mentioned
11 | [](#gpt4all) | 🟡 [](model-formats.md#ggml) | 🔴 No | Unclear tabs. | Minimal hardware config options. Can choose inference params. | Contribute & use training data from the GPT4All datalake | Windows, Linux, MacOS | [Building open-source datalake for future model training](https://gpt4all.io)
12 | [](#koboldcpp) | 🟡 [](model-formats.md#ggml) | 🔴 No | Cluttered UI. | Some hardware config options. Unique inference/app params e.g. [scenarios.](https://github.com/LostRuins/koboldcpp) | Cool story, character, and adventure modes | Windows, Linux, MacOS | Not mentioned
13 | [](#localai) | 🟡 [](model-formats.md#ggml) | 🔴 No | Clear tabs. | Minimal hardware config options. Can choose inference params. | Light/dark modes | Windows, Linux, MacOS | [Text-to-audio, OpenAI functions](https://github.com/louisgv/local.ai)
14 | [](#ollama) | 🔴 few [](model-formats.md#ggml) models | 🟡 Yes (metal) | Basic, terminal-based UI. | Multiple hardware configurations, need to save as a file prior to running. Multiple inference params, need to save as a file. | Run from terminal | MacOS | [Windows, Linux support](https://ollama.ai)
15 | [](#llamafile) | 🔴 llamafile models | 🟢 Yes | Clean, simple interface. | Minimal hardware configurations. | Run from terminal, invokes the default browser. | Windows, Linux, BSD, MacOS | [](https://github.com/Mozilla-Ocho/llamafile)
16 | ```
17 | 
18 | ## LM Studio
19 | 
20 | LM Studio is an app to run LLMs locally.
21 | 
22 | ### UI and Chat
23 | 
24 | [LM Studio](https://lmstudio.ai) is a desktop application, available for Windows and macOS, that gives us the flexibility to run LLMs on our PC. You can download any `ggml` model from the [HuggingFace models hub](https://huggingface.co/models) and run the model on the prompts given by the user.
25 | 
26 | The UI is pretty neat and well contained:
27 | 
28 | ```{figure} https://static.premai.io/book/lm-studio1.png
29 | LM Studio UI
30 | ```
31 | 
32 | There's a search bar that can be used to search the HuggingFace model hub for models to power the chat.
33 | 
34 | ```{figure} https://static.premai.io/book/lmstudio-search.png
35 | LM Studio Model Search
36 | ```
37 | 
38 | The chat UI component is similar to ChatGPT's, allowing conversations between the user and the assistant.
39 | 
40 | ```{figure} https://static.premai.io/book/lmstudio-chat-int.png
41 | LM Studio Chat Interface
42 | ```
43 | 
44 | This is how the `TheBloke/Llama-2-7B-Chat-GGML/llama-2-7b-chat.ggmlv3.q5_K_S.bin` model responds to a simple conversation starter.
45 | 
46 | ```{figure} https://static.premai.io/book/desktopapps-lmstudio-chat.png
47 | LM Studio Chat Example
48 | ```
49 | 
50 | ### Local Server
51 | 
52 | One useful aspect is the ability to build a Python or Node.js application based on an underlying LLM.
53 | 
54 | ```{figure} https://static.premai.io/book/lmstudio-local.png
55 | LM Studio Local Server
56 | ```
57 | 
58 | This enables the user to build applications that are powered by LLMs, using `ggml` models from the HuggingFace model library (without API key restrictions).
59 | 
60 | Think of this server as a place you make API calls to and get responses from. The only difference is that it is a local server rather than a cloud-based one, which makes it quite exciting to use the hardware in your system to power the LLM application you are building.
61 | 
62 | Let's spin up the server by hitting the `Start server` button 🎉. By default it is served on port `1234`; if you want to use some other port, you can edit it to the left of the `Start server` button you pressed earlier. There are also a few parameters that you can modify to handle requests, but for now let's leave them at their defaults.
63 | 
64 | Go to any Python editor of your choice and paste the following code into a new `.py` file.
65 | 
66 | ```python
67 | import openai
68 | # endpoint:port of your local inference server (in LM Studio)
69 | openai.api_base='http://localhost:1234/v1'
70 | openai.api_key='' # empty
71 | prefix = "### Instruction:\n"
72 | suffix = "\n### Response:"
73 | 
74 | def get_completion(prompt, model="local model", temperature=0.0):
75 |     formatted_prompt = f"{prefix}{prompt}{suffix}"
76 |     messages = [{"role": "user", "content": formatted_prompt}]
77 |     print(f'\nYour prompt: {prompt}\n')
78 |     response = openai.ChatCompletion.create(
79 |         model=model,
80 |         messages=messages,
81 |         temperature=temperature)
82 |     return response.choices[0].message["content"]
83 | 
84 | prompt = "Please give me JS code to fetch data from an API server."
85 | response = get_completion(prompt, temperature=0)
86 | print(f"LLM's response:{response}")
87 | ```
88 | 
89 | This is the code that I ran using the command `python3 <filename>.py`, and the resulting server logs and terminal output are shown below:
90 | 
91 | ```{figure} https://static.premai.io/book/lmstudio-local-ex.png
92 | LM Studio Local Server Example
93 | ```
94 | 
95 | ### Model Configurations & Tools
96 | 
97 | By default we have a few presets already provided by LM Studio, but we can tweak them and create presets of our own to be used elsewhere. The modifiable parameters are:
98 | 
99 | - 🛠️ Inference parameters: These give the flexibility to change the `temperature`, `n_predict`, and `repeat_penalty`
100 | - ↔️ Input prefix and suffix: Text to add right before and right after every user message
101 | - ␂ Pre-prompt / System prompt: Text to insert at the very beginning of the prompt, before any user messages
102 | - 📥 Model initialisation: turning `m_lock` on ensures the entire model is kept in RAM.
103 | - ⚙️ Hardware settings: The `n_threads` parameter is the maximum number of CPU threads the model is allowed to consume. If you have a GPU, you can turn on the `n_gpu_layers` parameter, setting a number between 10 and 20 and finding the best value through experimentation.
104 | 
105 | Tools focus on the response and UI of the application. The modifiable parameters are as follows:
106 | 
107 | - 🔠 `Context overflow policy`: Behaviour of the model when the generated token length exceeds the context window size
108 | - 🌈 `Chat appearance`: Either plain text (.txt) or markdown (.md)
109 | - 📝 `Conversation notes`: Auto-saved notes for a specific chat conversation
110 | 
111 | ### Features
112 | 
113 | - 💪 Leverages the power of your machine to run the model, i.e. the more powerful your machine is, the more you can get out of it.
114 | - 🆕 The ability to download models from HuggingFace lets you test the latest publicly hosted models, like LLaMA or any other new release. Supported models include MPT, StarCoder, Replit, GPT-Neo-X -- more generally, anything in `ggml` format
115 | - 💻 Available for both Windows and Mac.
116 | - 🔌 Models can be run entirely offline as they are downloaded and reside locally on your machine.
117 | - 💬 Access the app using the Chat UI or the local server
118 | 
119 | ## GPT4All
120 | 
121 | The [GPT4All homepage](https://gpt4all.io) states that
122 | 
123 | > GPT4All is an ecosystem to train and deploy **powerful** and **customised** large language models that run **locally** on consumer grade CPUs.
124 | 
125 | ### UI and Chat
126 | 
127 | The UI for GPT4All is quite basic as compared to LM Studio -- but it works fine.
128 | 
129 | ```{figure} https://static.premai.io/book/desktopapps-gpt4all-ui.png
130 | GPT4All UI
131 | ```
132 | 
133 | However, it is less friendly, more clunky, and has a beta feel to it. For one, once I downloaded the LLaMA-2 7B model, I wasn't able to download any new models even after restarting the app.
134 | 
135 | ### Local Server
136 | 
137 | Like LM Studio, there is support for a local server in GPT4All, though it took some time to discover that this feature exists, and only via the [documentation](https://docs.gpt4all.io). The results seem far better than LM Studio, with control over the number of tokens and the response, though it is model dependent.
Here's the code for the same:

```python
import openai
openai.api_base = "http://localhost:4891/v1"
openai.api_key = ""
# Set up the prompt and other parameters for the API request
prompt = "Who is Michael Jordan?"
model = "Llama-2-7B Chat"
# Make the API request
response = openai.Completion.create(
    model=model,
    prompt=prompt,
    max_tokens=199,
    temperature=0.28,
    top_p=0.95,
    n=1,
    echo=True,
    stream=False)
# Print the generated completion
print(response)
```

The response for the example `prompt` is shown below:

```{figure} https://static.premai.io/book/gpt4all-ex.png
GPT4All UI Example
```

### Model Configurations & Tools

As you can see, there is not much scope for model configuration, and -- unlike LM Studio -- I couldn't use my GPU here.

```{figure} https://static.premai.io/book/desktopapps-gpt4all-modelconfig.png
GPT4All UI Model Configuration
```

## koboldcpp

https://github.com/LostRuins/koboldcpp is a fun twist on LLMs -- adding game-like scenarios and adventures. It supports adding base `ggml` models as the LLM engine, and spins stories based on user inputs.

### UI and Chat

The UI is pretty basic -- and you get some surprising answers. Here I ask a simple icebreaker question -- and you see that it responds that it is a friendly AI that likes to play games.

```{figure} https://static.premai.io/book/desktopapps-koboldcpp-ui.png
koboldcpp UI
```

### Scenarios

You can also enter different sorts of scenarios and modes.

```{figure} https://static.premai.io/book/desktopapps-kcpp-scenarios.png
koboldcpp Scenarios
```

Below is the Julius Caesar scenario!

```{figure} https://static.premai.io/book/desktopapps-kcpp-jc.png
koboldcpp Julius Caesar Chat
```

### Model Configuration and Tools

Many of the model configuration options are similar to the defaults offered elsewhere, but there are some interesting twists like story mode, adventure mode, and instruct mode.

```{figure} https://static.premai.io/book/desktopapps-kcpp-modes.png
koboldcpp Model Configuration
```

## [local.ai]

[local.ai]: https://www.localai.app

The [local.ai] App from https://github.com/louisgv/local.ai ([not to be confused](https://github.com/louisgv/local.ai/discussions/71) with [](mlops-engines.md#localai) from https://github.com/mudler/LocalAI) is a simple application for loading LLMs after you manually download a `ggml` model from online.

### UI and Chat

The UI and chat are pretty basic. One bug that I noticed was that it wasn't possible to load models from the UI -- I had to manually download the model and then use the app.

```{figure} https://static.premai.io/book/desktopapps-localai-ui.png
[local.ai] UI
```

### Model Configuration and Tools

Pretty standard prompt-related configurations. There appears to be no GPU support.

## Ollama

[Ollama](https://ollama.ai) is an LLM-based conversational chatbot that can be run from a macOS terminal. It is simple to get started with. Currently it is only available for macOS, but support for Windows and Linux is coming soon.

### UI and Chat

A neat, clean, and crisp UI: just type at the `>>>` prompt in the terminal and paste in your question. The response time will vary with model size, but responses are mostly acceptable. I tested the `LLaMA` model, the most recently supported model, and the results were good.

```{figure} https://static.premai.io/book/ollama-ex.png
Ollama Example
```

`Note:` the model initially takes some time to download locally, but afterwards there is no lag when accessing the requested model.
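Beyond the interactive terminal, Ollama also exposes a local REST API while the app is running, so the same models can be called from your own code. A minimal sketch (assuming the default `localhost:11434` endpoint and an already-downloaded `llama2` model):

```python
import json

import requests  # assumes `pip install requests`

# Ollama's local server listens on port 11434 by default
response = requests.post(
    "http://localhost:11434/api/generate",
    json={"model": "llama2", "prompt": "Why is the sky blue?"},
    stream=True)

# the server streams back one JSON object per line until generation is done
for line in response.iter_lines():
    if line:
        print(json.loads(line).get("response", ""), end="", flush=True)
```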

### Model Configuration and Tools

The list of ~20 models can be accessed [here](https://ollama.ai/library).

The library is constantly growing, and multiple additions have been made quite recently. It supports models ranging from lightweight to robust, and also has special support for specific functionality, like performing mathematical calculations. There is a `WizardMath` model that addresses this use case -- read more about it in the official [blog](https://ollama.ai/blog/wizardmath-examples) published by the Ollama team.

### Limitations

- Better response formatting: output could make use of terminal features to display code, text, and images at a later stage. This would make the output more readable and consistent for the user.
- Better visibility of resource usage: since LLMs by default require extensive amounts of memory, the resources available need to be kept in mind. When working in a terminal such details are not explicitly surfaced, so a model can sometimes consume all available memory and cause the application -- or the entire system -- to crash.
- Support for custom models (from local): there is support for loading models downloaded from the internet and running them locally by using the command:

  ```bash
  ollama run "model location in the system"
  ```

## llamafile

The objective of *llamafile* is to make open-source large language models (LLMs) more accessible to both developers and end users. To achieve this, its developers have merged llama.cpp with Cosmopolitan Libc, creating a framework that collapses all the complexity of LLMs into a single-file executable known as a *llamafile*, which can be run locally on most computers without any installation. The framework is licensed under the Apache License, Version 2.0. Through the *llamafile* approach, they have unlocked the potential of LLMs running locally, paving the way for exciting new opportunities across a wide range of applications.
To experience it firsthand, the *llamafile* developers recommend downloading their example *llamafile* for the LLaVA model, which is licensed under LLaMA 2, OpenAI. LLaVA is an LLM that goes beyond mere chat capabilities; it also allows users to upload images and ask questions related to them. Importantly, all of this functionality occurs locally, ensuring that no data ever leaves the computer.

It is important to note that if there are any issues with compiling and dynamically linking GPU support, *llamafile* has a contingency plan in place: in such cases, the system automatically switches to CPU inference, ensuring uninterrupted performance and accurate results.

Under Linux, dynamic compilation of NVIDIA cuBLAS GPU support is possible under certain conditions. Firstly, the `cc` compiler must be present. Secondly, the `-ngl 35` flag must be passed to activate the GPU. Lastly, the CUDA developer toolkit must be installed on the machine, and the `nvcc` compiler should be accessible through the system's path.

For Windows users, utilising the GPU requires two steps: first, make sure that the released binaries are used; secondly, pass the `-ngl 35` flag. Additionally, it is essential to have an NVIDIA graphics card that supports CUDA, as AMD GPUs are not currently supported. Users who prefer CUDA via WSL can enable NVIDIA CUDA on WSL and run the llamafiles within WSL. However, it is worth noting that Windows users may face limitations with some of the example llamafiles due to the 4 GB maximum executable file size imposed by the Windows operating system. But don't worry: the *llamafile* framework offers support for external weights (see the documentation for details).

On Apple Silicon, everything should function seamlessly provided Xcode is installed.

{{ comments }}

--------------------------------------------------------------------------------
/fine-tuning.md:
--------------------------------------------------------------------------------
# Fine-tuning

```{admonition} Work in Progress
:class: attention
{{ wip_chapter }}

Some ideas:

- https://gist.github.com/veekaybee/be375ab33085102f9027853128dc5f0e#training-your-own
- [Why You (Probably) Don't Need to Fine-tune an LLM](https://www.tidepool.so/2023/08/17/why-you-probably-dont-need-to-fine-tune-an-llm/) (instead, use few-shot prompting & retrieval-augmented generation)
- [Fine-tuning LLaMA-2: A Comprehensive Case Study for Tailoring Models to Unique Applications](https://www.anyscale.com/blog/fine-tuning-llama-2-a-comprehensive-case-study-for-tailoring-models-to-unique-applications) (fine-tuning LLaMA-2 for 3 real-world use cases)
- [Private, local, open source LLMs](https://python.langchain.com/docs/guides/local_llms)
- [Easy-to-use LLM fine-tuning framework (LLaMA-2, BLOOM, Falcon, Baichuan, Qwen, ChatGLM2)](https://github.com/hiyouga/LLaMA-Factory)
- https://dstack.ai/examples/finetuning-llama-2
- https://github.com/h2oai, etc.
- [The History of Open-Source LLMs: Better Base Models (part 2)](https://cameronrwolfe.substack.com/p/the-history-of-open-source-llms-better) (LLaMA, MPT, Falcon, LLaMA-2)
```

For bespoke applications, models can be trained on task-specific data. However, training a model from scratch is seldom required.
The model has already learned useful feature representations during its initial (pre) training, so it is often sufficient to simply fine-tune. This takes advantage of [transfer learning](https://www.v7labs.com/blog/transfer-learning-guide), producing better task-specific performance with minimal training examples & resources -- analogous to teaching a university student without first reteaching them how to communicate.

## Transfer Learning versus Fine-tuning

Both {term}`transfer learning` and {term}`fine-tuning` modify a pre-trained model for a domain/task-specific use, and thus both terms are often used interchangeably. However, there are key differences.

```{table} Transfer Learning versus Fine-tuning
Description | Transfer Learning | Fine-tuning
------------|-------------------|------------
Based on a model pre-trained on a large generic dataset | yes | yes
Freeze pre-trained model layers | most or all | none ("full" fine-tuning) or a few
Head layer | append a new head | replace existing head or leave as-is
Train on domain-specific data until unfrozen layers converge | yes | yes
```

### Transfer Learning

As [Wikipedia](https://en.wikipedia.org/wiki/Transfer_learning) defines it, transfer learning is a machine learning technique in which knowledge learned from one task is re-used to boost performance on a related task. Transfer learning starts from a pre-trained model: a deep learning model trained on a very large dataset (of images, text, etc.). Most of the time, these pre-trained models are huge classification models trained on enormous datasets with a large number of classes. Over the course of training, such models eventually learn features and representations that minimise the loss.

Before starting transfer learning, we therefore remove the final classification layers and treat the remaining network (up to the penultimate layer) as a feature extractor. We leverage the knowledge captured by this feature extractor (the pre-trained model) to train a much smaller model confined to a very specific, domain-specific task.
The key is that "frozen" layers remain unchanged -- retaining the original abilities of the pre-trained model -- and act as general & robust feature extractors.

```{figure-md} transfer-learning-architecture
:class: caption
![](https://static.premai.io/book/transfer_learning.png)

Transfer Learning
```

**Examples**:

- Computer vision: take the [ResNet-50](https://huggingface.co/microsoft/resnet-50) pre-trained on the [ImageNet](https://www.image-net.org/index.php) dataset and replace its last layer with the head of an object-detecting model (such as [Faster R-CNN](https://arxiv.org/abs/1506.01497)). This modified model can now be trained to draw bounding boxes and classify images from the [cats-vs-dogs](https://huggingface.co/datasets/cats_vs_dogs) dataset (see the code sketch after this list).

- Natural language processing: take a [BERT](https://huggingface.co/google/bert_uncased_L-2_H-768_A-12) model that was pre-trained on extensive text data, such as the [BookCorpus dataset](https://huggingface.co/datasets/bookcorpus). Replace BERT's final layer with a simple classifier or Multi-Layer Perceptron (MLP) layers. These final layers can then be trained on the [tweet sentiment classification dataset](https://huggingface.co/datasets/carblacac/twitter-sentiment-analysis) to classify Twitter sentiments.

**Use cases**:
`NOTE`: we can even extend the process of transfer learning by unfreezing some layers of the pre-trained model and retraining them along with our smaller model. This additional step helps the model adapt to newer domain-specific or out-of-distribution tasks.

- Limited data: when the domain-specific dataset is small, a large model cannot be trained end-to-end without overfitting. However, if the model is mostly a frozen general feature extractor, then the subsequent trainable layers are less likely to overfit.
- Limited compute and time: retraining a large model from scratch requires a lot of compute resources and time. This is unnecessary if similar performance can be achieved through transfer learning (training just part of a large model).
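In code, this head-replacement workflow only takes a few lines. Below is a minimal sketch in PyTorch (assuming `torchvision` >= 0.13 for the `weights` API; for brevity it uses a simple 2-class classification head and random stand-in data rather than a full Faster R-CNN head and the real cats-vs-dogs dataset):

```python
import torch
from torch import nn
from torchvision import models

# load a ResNet-50 pre-trained on ImageNet and freeze all of its layers
model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)
for param in model.parameters():
    param.requires_grad = False

# replace the classification head with a new (trainable) 2-class layer: cats vs dogs
model.fc = nn.Linear(model.fc.in_features, 2)

# only the new head's parameters are handed to the optimiser
optimiser = torch.optim.Adam(model.fc.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# stand-in dataloader of random (image, label) batches -- replace with a real dataset
dataloader = [(torch.randn(8, 3, 224, 224), torch.randint(0, 2, (8,))) for _ in range(4)]

model.train()
for images, labels in dataloader:
    loss = criterion(model(images), labels)
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()
```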

### Fine-Tuning

As [Wikipedia](https://en.wikipedia.org/wiki/Fine-tuning_(deep_learning)) defines it, fine-tuning is an approach to transfer learning in which the weights of a pre-trained model are trained on new data. In some cases we retrain the whole model on our domain-specific dataset, while in other cases we fine-tune only a subset of the layers. Through fine-tuning, we adapt our existing pre-trained model to a task-specific dataset.

> The key difference from transfer learning is that none (or few) of the pre-trained model's weights are frozen. The pre-training process can be considered an intelligent weight initialisation prior to training on a domain-specific dataset. Essentially, the pre-training will leave the model weights close to a global (general) optimum, while the domain-specific training will find a local (task-specific) optimum.

```{figure-md} fine-tuning-architecture
:class: caption
![](https://static.premai.io/book/fine-tuning.png)

Fine Tuning
```

**Examples**:

- Computer vision: for segmentation in cases where fine-grained detail is important (e.g. finding individual cells in medical imaging, or detecting objects in satellite images), transfer learning might not be accurate enough.
- Natural language processing: an LLM such as [](models.md#persimmon-8b) -- used in general purpose text completion -- can be adapted to do summarisation. Adding a few layers (transfer learning) may not be enough to do summarisation well, and hence full fine-tuning is required.

**Use cases**:

- Performance: when transfer learning is not accurate enough, and enough domain-specific data is available to make use of fine-tuning without overfitting.

Note that fine-tuning typically requires much more compute resources, time, and data than transfer learning.

## Fine-tuning LLMs

When an LLM does not produce the desired output, engineers often assume that fine-tuning the model will make it "better". But what exactly does "better" mean in this case? It's important to identify the root of the problem before fine-tuning the model on a new dataset.

Common LLM issues include:

- The model lacks knowledge on certain topics
  + [](#rag) can be used to solve this problem
- The model's responses do not have the proper style or structure the user is looking for
  + Fine-tuning or few-shot prompting is applicable here

```{figure-md} llm-fine-tuning-architecture
:class: caption
![](https://static.premai.io/book/fine-tuning-llm.png)

[Fine-tuning LLMs](https://neo4j.com/developer-blog/fine-tuning-retrieval-augmented-generation)
```

A baseline LLM cannot answer questions about content it hasn't been trained on {cite}`tidepool-citation`. The LLM will make something up, i.e., hallucinate. To fix issues like this, RAG is a good tool to use because it provides the LLM with the context it needs to answer the question.

On the other hand, if the LLM needs to generate accurate SQL queries, RAG is not going to be of much help here. The format of the generated output matters a lot, so fine-tuning would be more useful for this use case.
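As a rough illustration of what such a style/format fine-tune can look like, here is a sketch using Hugging Face `transformers` + `peft` (LoRA) -- the base model name, the `sql_pairs.jsonl` dataset file, and the hyperparameters are all placeholder assumptions, not recommendations:

```python
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          DataCollatorForLanguageModeling, Trainer, TrainingArguments)

base = "meta-llama/Llama-2-7b-hf"  # hypothetical base model (gated; requires access)
tokenizer = AutoTokenizer.from_pretrained(base)
tokenizer.pad_token = tokenizer.eos_token  # LLaMA has no pad token by default
model = AutoModelForCausalLM.from_pretrained(base)

# attach small trainable LoRA adapters; the original weights stay frozen
model = get_peft_model(model, LoraConfig(
    r=8, lora_alpha=16, lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"], task_type="CAUSAL_LM"))

# `sql_pairs.jsonl` is a hypothetical file of {"text": "<question plus SQL answer>"} rows
data = load_dataset("json", data_files="sql_pairs.jsonl")["train"]
data = data.map(lambda row: tokenizer(row["text"], truncation=True, max_length=512),
                remove_columns=data.column_names)

Trainer(
    model=model,
    args=TrainingArguments(output_dir="llama2-sql-lora",
                           per_device_train_batch_size=1, num_train_epochs=3),
    train_dataset=data,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
).train()
```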
Here are some examples of models that have been fine-tuned to generate content in a specific format/style:

* [Gorilla LLM](https://gorilla.cs.berkeley.edu) - This LLM was fine-tuned to generate API calls.
* [LLaMA-2 chat](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) - The "chat" version of LLaMA is fine-tuned on conversational data.
* [Code LLaMA](https://about.fb.com/news/2023/08/code-llama-ai-for-coding) - A fine-tuned LLaMA-2 model designed for code generation.

## RAG

{term}`RAG` is a method used to boost the accuracy of LLMs by injecting relevant context into the LLM prompt. It works by connecting to a vector database and fetching only the information most relevant to the user's query. Using this technique, the LLM is provided with enough background knowledge to adequately answer the user's question without hallucinating.

RAG is not a part of fine-tuning, because it uses a pre-trained LLM and does not modify it in any way.
However, there are several advantages to using RAG:

- **Boosts model accuracy**
  - Leads to fewer hallucinations by providing the right context
- **Less computing power required**
  - Unlike fine-tuning, RAG does not need to re-train any part of the model. It's only the model's prompt that changes.
- **Quick and easy setup**
  - RAG does not require much LLM domain expertise. You don't need to find training data or corresponding labels. Most pieces of text can be uploaded into the vector database as is, without major modifications.
- **Connect to private data**
  - Using RAG, engineers can connect data from SaaS apps such as Notion, Google Drive, HubSpot, Zendesk, etc. to their LLM. The LLM then has access to private data and can help answer questions about the data in these applications.

RAG plays a key role in making LLMs useful, but it can be a bit tedious to set up. There are a number of open-source projects, such as https://github.com/run-llama/llama_index, which can help make the process a bit easier.

## Fine-tuning Image Models

Fine-tuning computer vision models is common practice, used in applications involving object detection, object classification, and image segmentation.

For these non-generative AI use-cases, a baseline model like ResNet or YOLO is fine-tuned on labelled data to detect a new object. Although the baseline model isn't initially trained for the new object, it has already learned useful feature representations. Fine-tuning enables the model to rapidly acquire the features for the new object without starting from scratch.

Data preparation plays a big role in the fine-tuning process for vision-based models. An image of the same object can be taken from multiple angles, in different lighting conditions, against different backgrounds, etc. In order to build a robust dataset for fine-tuning, all of these image variations should be taken into consideration.

### Fine-tuning AI image generation models

```{figure-md} image-generation-fine-tuning
:class: caption
![](https://static.premai.io/book/fine-tuning-image-generation.png)

[Dreambooth Image Generation Fine-tuning](https://dreambooth.github.io)
```

Models such as [Stable Diffusion](https://stability.ai/stable-diffusion) can also be tailored through fine-tuning to generate specific images.
For instance, by supplying Stable Diffusion with a dataset of pet pictures and fine-tuning it on them, the model becomes capable of generating images of that particular pet in diverse styles.

The dataset for fine-tuning an image generation model needs to contain two things:

- **Text**: a description of the object in the image
- **Image**: the picture itself

The text prompts describe the content of each image. During fine-tuning, the text prompt is passed into the text encoder portion of Stable Diffusion while the image is fed into the image encoder. The model learns to generate images that match the textual description based on this text-image pairing in the dataset {cite}`octoml-fine-tuning`.

## Fine-tuning Audio Models

```{figure-md} audio-fine-tuning
:class: caption
![](https://static.premai.io/book/fine-tuning-audio.png)

[Audio Generation Fine-tuning](https://aws.amazon.com/blogs/machine-learning/fine-tune-and-deploy-a-wav2vec2-model-for-speech-recognition-with-hugging-face-and-amazon-sagemaker)
```

Speech-to-text models like [Whisper](https://registry.premai.io/detail.html) can also be fine-tuned. Similar to fine-tuning image generation models, speech-to-text models need two pieces of data:

1. **Audio recording**
2. **Audio transcription**

Preparing a robust dataset is key to building a fine-tuned model. For audio-related data there are a few things to consider:

**Acoustic Conditions:**

* Background noise levels - more noise makes transcription more difficult. Models may need enhanced noise robustness.
* Sound quality - higher quality audio with clear speech is easier to transcribe. Low-bitrate audio is challenging.
* Speaker accents and voice types - diversity of speakers in the training data helps generalisation.
* Audio domains - each domain, like meetings, call centres, or videos, has unique acoustics.

**Dataset Creation:**

* Quantity of training examples - more audio-transcript pairs improve accuracy but take effort to collect.
* Data collection methods - transcription services, scraping, in-house recording. Quality varies.
* Transcript accuracy - high-precision transcripts are essential. Poor transcripts degrade fine-tuning.
* Data augmentation - random noise, speed, and pitch changes make the model more robust.

## Importance of data

```{figure-md} data-centric-ai
:class: caption
![](https://static.premai.io/book/fine-tuning-data-centric.png)

[Data centric AI](https://segments.ai/blog/wandb-integration)
```

The performance of a fine-tuned model largely depends on the **quality** and **quantity** of training data.

For LLMs, the quantity of data can be an important factor when deciding whether to fine-tune or not. There have been many success stories of companies like Bloomberg {cite}`wu2023bloomberggpt`, [McKinsey](https://www.mckinsey.com/about-us/new-at-mckinsey-blog/meet-lilli-our-generative-ai-tool), and [Moveworks] that have either created their own LLM or fine-tuned an existing LLM to obtain better performance than ChatGPT on certain tasks. However, tens of thousands of data points were required in order to make these successful AI bots and assistants.
In the [Moveworks blog post][Moveworks], the fine-tuned model, which surpasses the performance of GPT-4 on certain tasks, was trained on an internal dataset consisting of 70K instructions.

[Moveworks]: https://www.moveworks.com/us/en/resources/blog/moveworks-enterprise-llm-benchmark-evaluates-large-language-models-for-business-applications

In the case of computer vision models, data quality can play a significant role in the performance of the model. Andrew Ng, a prominent researcher and entrepreneur in the field of AI, has been an advocate of data-centric AI, in which the quality of the data is more important than the sheer volume of data {cite}`small-data-tds`.

To summarise, fine-tuning requires a balance between having a large dataset and having a high-quality dataset. The higher the data quality, the higher the chance of increasing the model's performance.

```{table} Estimates of minimum fine-tuning Hardware & Data requirements
:name: memory-data-requirements

Model | Task | Hardware | Data
------|------|----------|-----
LLaMA-2 7B | Text Generation | GPU: 65GB, 4-bit quantised: 10GB | 1K datapoints
Falcon 40B | Text Generation | GPU: 400GB, 4-bit quantised: 50GB | 50K datapoints
Stable Diffusion | Image Generation | GPU: 6GB | 10 images (using Dreambooth)
YOLO | Object Detection | Can be fine-tuned on CPU | 100 images
Whisper | Audio Transcription | GPU: 5GB (medium), 10GB (large) | 50 hours
```

```{admonition} GPU memory for fine-tuning
:name: memory-requirements
:class: note

Most models require a GPU for fine-tuning. To approximate the amount of GPU memory required, the general rule is around 2.5 times the model size. Note that {term}`quantisation` to reduce the size tends to only be useful for inference, not training/fine-tuning. An alternative is to only fine-tune some layers (freezing and quantising the rest), thus greatly reducing memory requirements.

For example: to fine-tune a `float32` (i.e. 4-byte) 7B parameter model:

$$
7 \times 10^{9}~\mathrm{params} \times 4~\mathrm{B/param} \times 2.5 = 70~\mathrm{GB}
$$
```

## Future

Fine-tuning models has long been common practice for ML engineers. It allows engineers to quickly build domain-specific models without having to design the neural network from scratch.

Developer tools for fine-tuning continue to improve the overall experience of creating one of these models while reducing the time to market. Companies like [Hugging Face](https://huggingface.co/docs/transformers/training) are building open-source tools to make fine-tuning easy. On the commercial side, companies like [Roboflow](https://roboflow.com) and [Scale AI](https://scale.com/generative-ai-platform) provide platforms for teams to manage the full life-cycle of a model.

Overall, fine-tuning has become a crucial technique for adapting large pre-trained AI models to custom datasets and use cases. While the specific implementation details vary across modalities, the core principles are similar: leverage a model pre-trained on vast data, freeze most parameters, add a small tunable component customised for your dataset, and update some weights to adapt the model.

When applied correctly, fine-tuning enables practitioners to build real-world solutions using leading large AI models.
240 | 241 | {{ comments }} 242 | -------------------------------------------------------------------------------- /hardware.md: -------------------------------------------------------------------------------- 1 | # Hardware 2 | 3 | ```{admonition} Work in Progress 4 | :class: attention 5 | {{ wip_chapter }} 6 | 7 | Some ideas: 8 | 9 | - [AI and Memory Wall](https://medium.com/riselab/ai-and-memory-wall-2cb4265cb0b8) 10 | - https://gist.github.com/veekaybee/be375ab33085102f9027853128dc5f0e#deployment 11 | - https://www.youtube.com/watch?v=r5NQecwZs1A 12 | ``` 13 | 14 | ## Machine Learning and GPUs 15 | 16 | % TODO: add links/citations 17 | 18 | GPUs are particularly well-suited for the types of computations required in AI for several reasons: 19 | 20 | 1. **Parallelisation**: Deep learning models involve a lot of matrix multiplications and other operations that can be parallelised. A single GPU can have thousands of cores, allowing it to execute many operations simultaneously, which can lead to a significant speedup in training and inference times. 21 | 2. **Specialised Hardware**: Modern GPUs have specialised hardware for performing certain types of operations that are common in deep learning, such as matrix multiplications and convolutions. For example, NVIDIA's Volta and Turing architectures include Tensor Cores, which are specialised hardware units designed to accelerate mixed-precision matrix multiply-and-accumulate operations. 22 | 3. **High Memory Bandwidth**: GPUs have much higher memory bandwidth compared to CPUs, which allows them to transfer data to and from memory much more quickly. This is important for deep learning models, which often involve large amounts of data. 23 | 4. **Software Support**: There is a lot of software support for GPU computing in popular deep learning frameworks like TensorFlow and PyTorch. These frameworks provide high-level APIs that make it easy to develop models and run them on GPUs, without having to write low-level GPU code. 24 | 5. **Energy Efficiency**: Training deep learning models can be very computationally intensive, and GPUs are generally more energy-efficient than CPUs for these types of computations. 25 | 6. **Availability**: Unlike much other specialized numerical computing hardware, GPUs are mass produced for the consumer market. Although specialized data-center and embedded variants exist, mid-tier GPUs can be [easily purchased](https://www.amazon.com/s?k=nvidia+24gb) by consumers and installed in a workstation or PC. 26 | 27 | For these reasons, GPUs are often the preferred hardware for training and deploying deep learning models. That said, there are other types of hardware that can also be used for deep learning, such as TPUs (Tensor Processing Units), which are custom accelerators designed by Google specifically for deep learning. 28 | 29 | ## Types of GPUs 30 | 31 | 1. **NVIDIA GPUs**: NVIDIA is currently the dominant player in the GPU market for machine learning applications. Their GPUs are widely used in both research and commercial applications. NVIDIA provides a comprehensive ecosystem of software tools and libraries for machine learning, including CUDA and cuDNN (CUDA Deep Neural Network library), which are essential for training deep neural networks. The NVIDIA A100 GPU, for example, is designed specifically for AI and data analytics. 32 | 2. **AMD GPUs**: AMD GPUs are also used for machine learning, but they are not as popular as NVIDIA GPUs. 
AMD provides the ROCm (Radeon Open Compute) platform, which is an open-source software platform for GPU-enabled HPC (High-Performance Computing) and machine learning applications. However, the software ecosystem for AMD GPUs is not as mature as for NVIDIA GPUs. 33 | 3. **Apple Silicon GPUs**: Apple has developed its own GPUs for its Apple Silicon chips, like the M1. These GPUs are optimised for low power consumption and are used in Apple devices like the MacBook Air, MacBook Pro, Mac Mini, and iPad Pro. The performance of these GPUs is quite good for mobile and integrated GPUs, but they are not suitable for high-performance machine learning tasks. 34 | 4. **Intel GPUs**: Intel is also developing GPUs for machine learning applications. Their upcoming Intel Xe GPUs are expected to provide competitive performance for machine learning tasks. Intel also provides the oneAPI toolkit, which includes a library (oneDNN) for deep neural networks. 35 | 5. **Google TPUs (Tensor Processing Units)**: Although not technically GPUs, Google's TPUs are custom accelerators for machine learning tasks. They are designed to provide high performance and efficiency for both training and inference of machine learning models. TPUs are available through Google's cloud computing services. 36 | 37 | Each of these options has its own advantages and disadvantages in terms of performance, power consumption, software support, and cost. NVIDIA GPUs are currently the most popular choice for machine learning applications due to their high performance and mature software ecosystem. 38 | 39 | ## Programming for GPUs 40 | 41 | ### NVIDIA GPUs 42 | 43 | #### CUDA 44 | 45 | To interact with NVIDIA GPUs, you will primarily use CUDA. CUDA is a parallel computing platform & programming model developed by NVIDIA for general computing on its GPUs {cite}`cuda-gpus`. 46 | 47 | Here are the main components you will interact with: 48 | 49 | 1. [**CUDA Toolkit**](https://developer.nvidia.com/cuda-downloads), which includes: 50 | - **CUDA libraries**: e.g. `cuBLAS` for linear algebra, `cuDNN` for deep learning, and others for FFTs, sparse matrices, and more 51 | - [**CUDA runtime**](https://docs.nvidia.com/cuda/cuda-c-programming-guide/#cuda-runtime) (`cudart`) 52 | - [**CUDA compiler**](https://docs.nvidia.com/cuda/cuda-c-programming-guide/#compilation-with-nvcc) (`nvcc`) 53 | - [**NVIDIA drivers**](https://www.nvidia.com/Download/index.aspx): allow your operating system & programs to communicate with your NVIDIA graphics card 54 | 2. [**CUDA Language**](https://docs.nvidia.com/cuda/cuda-c-programming-guide): an extension of the C/C++ programming language which includes [some additional keywords & constructs](https://docs.nvidia.com/cuda/cuda-c-programming-guide/#c-language-extensions) for writing parallel code. 55 | 56 | Here is a basic workflow for using NVIDIA GPUs: 57 | 58 | 1. **Install NVIDIA drivers & CUDA Toolkit**, using one of the following (depending on your taste): 59 | - [Developer download matrix (recommended)](https://developer.nvidia.com/cuda-downloads) 60 | - [Quickstart guide (slightly more detailed)](https://docs.nvidia.com/cuda/cuda-quick-start-guide) 61 | - [Quickstart videos (if you prefer eye-candy)](https://developer.nvidia.com/how-to-cuda-c-cpp) 62 | - Full Guide for [Linux](https://docs.nvidia.com/cuda/cuda-installation-guide-linux) or [Windows](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows) 63 | 2. 
[**Write your code**](https://docs.nvidia.com/cuda/cuda-c-programming-guide): Use the CUDA programming language (an extension of C/C++) to write your code. This will involve writing kernel functions that will be executed on the GPU, and host code that will be executed on the CPU.
3. **Compile your code**: Use the NVCC compiler (included in the CUDA Toolkit) to compile your code.
4. **Run your code**: Run your compiled code on an NVIDIA GPU.

For example, here is a simple CUDA program that adds two vectors:

```cpp
#include "cuda_runtime.h"
#include <stdio.h>

/// CUDA kernel function for vector addition (dst = srcA + srcB)
__global__ void vectorAdd(float *const dst, const float *const srcA, const float *const srcB, int numElements) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < numElements) dst[i] = srcA[i] + srcB[i];
}

int main(void) {
  // Allocate & initialise host (CPU) & device (GPU) memory
  const int numElements = 1337;
  float *srcA, *srcB, *dst;
  cudaMallocManaged((void **)&srcA, numElements * sizeof(float));
  cudaMallocManaged((void **)&srcB, numElements * sizeof(float));
  cudaMallocManaged((void **)&dst, numElements * sizeof(float));
  for (int i = 0; i < numElements; ++i) { srcA[i] = i; srcB[i] = 2.0f * i; }

  // Launch the vectorAdd kernel
  const int threadsPerBlock = 256;
  const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
  vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(dst, srcA, srcB, numElements);
  cudaDeviceSynchronize();

  // clean up memory
  cudaFree((void *)srcA);
  cudaFree((void *)srcB);
  cudaFree((void *)dst);
  // ...
}
```

In this example, `srcA`, `srcB`, and `dst` are memory pointers to linear vectors (of size `numElements`). Note that the CUDA compiler automatically converts these to host (CPU) or device (GPU) memory pointers (and copies data between host & device) when appropriate. The `vectorAdd` "kernel" (GPU function) is launched with `blocksPerGrid` blocks, each containing `threadsPerBlock` threads. Each thread computes the sum of one pair of elements from `srcA` and `srcB`, and stores the result in `dst`.

```{admonition} High-level wrappers
:class: seealso
Note that wrappers for other programming languages exist (e.g. [Python](https://developer.nvidia.com/how-to-cuda-python)), allowing control of CUDA GPUs while writing code in more concise & user-friendly languages.

% TODO: RAPIDS, CuPy, CuVec etc
```
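For instance, the vector addition above collapses to just a few lines of Python with a wrapper like CuPy -- a minimal sketch, assuming `cupy` is installed with a build matching your CUDA toolkit version:

```python
import cupy as cp

numElements = 1337
srcA = cp.arange(numElements, dtype=cp.float32)  # allocated directly in GPU memory
srcB = cp.ones(numElements, dtype=cp.float32)

dst = srcA + srcB             # element-wise addition runs as a CUDA kernel under the hood
print(cp.asnumpy(dst)[:5])    # explicitly copy the result back to host memory
```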

#### Vulkan

Vulkan is a low-level graphics and compute API developed by the Khronos Group. It provides fine-grained control over the GPU and is designed to minimise CPU overhead and provide more consistent performance. Vulkan can be used for a variety of applications, including gaming, simulation, and scientific computing.

Vulkan is supported on a wide variety of platforms, including Windows, Linux, macOS (via MoltenVK, a Vulkan implementation that runs on top of Metal), Android, and iOS. Vulkan has a somewhat steep learning curve because it is a very low-level API, but it provides a lot of flexibility and can lead to very high performance.

### AMD GPUs

For AMD GPUs, you can use the ROCm (Radeon Open Compute) platform, which is an open-source software platform for GPU-enabled HPC (High-Performance Computing) and machine learning applications.

Here are the main components of the ROCm platform:

1. **ROCm Runtime**: This is the core of the ROCm platform. It includes the ROCr System Runtime, which is a user-space system runtime for managing GPU applications, and the ROCt Thunk Interface, which provides a low-level interface to the GPU kernel driver.
2. **ROCm Driver**: This is the kernel driver for AMD GPUs. It includes the AMDGPU driver, which is the open-source kernel driver for AMD Radeon graphics cards.
3. **ROCm Libraries**: These are a set of libraries optimised for AMD GPUs. They include rocBLAS for basic linear algebra, rocFFT for fast Fourier transforms, and rocRAND for random number generation.
4. **ROCm Tools**: These are a set of tools for developing and debugging applications on AMD GPUs. They include the ROCm SMI (System Management Interface) for monitoring and managing GPU resources, and the ROCgdb debugger for debugging GPU applications.

To develop applications for AMD GPUs using the ROCm platform, you will need to:

1. **Install the necessary software**: This includes the ROCm platform, and any other libraries or tools you need.
2. **Write your code**: You can use the HIP programming language, which is a C++ runtime API and kernel language that allows you to write portable GPU code that can run on both AMD and NVIDIA GPUs. HIP code can be compiled to run on AMD GPUs using the HIP-Clang compiler, or on NVIDIA GPUs using the NVCC compiler.
3. **Compile your code**: Use the HIP-Clang compiler to compile your code for AMD GPUs, or the NVCC compiler for NVIDIA GPUs.
4. **Run your code**: Run your compiled code on an AMD or NVIDIA GPU.

For example, here is a simple HIP program that adds two vectors:

```cpp
#include "hip/hip_runtime.h"
#include <stdio.h>

/// HIP kernel function for vector addition (dst = srcA + srcB)
__global__ void vectorAdd(float *const dst, const float *const srcA, const float *const srcB, int numElements) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < numElements) dst[i] = srcA[i] + srcB[i];
}

int main(void) {
  // Allocate and initialise host (CPU) & device (GPU) memory
  // ...

  // Launch the vectorAdd kernel
  const int threadsPerBlock = 256;
  const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
  hipLaunchKernelGGL(
    vectorAdd, dim3(blocksPerGrid), dim3(threadsPerBlock), 0, 0, dst, srcA, srcB, numElements);

  // Copy result from device to host & clean up memory
  // ...
}
```

In this example, `srcA`, `srcB`, and `dst` are pointers to device-accessible memory, and `numElements` is the number of elements in each vector. The `vectorAdd` kernel is launched with `blocksPerGrid` blocks, each containing `threadsPerBlock` threads. Each thread computes the sum of one pair of elements from `srcA` and `srcB`, and stores the result in `dst`.

Note that this example is very similar to the CUDA example above. This is because the HIP programming language is designed to be similar to CUDA, which makes it easier to port CUDA code to run on AMD GPUs.

### Apple Silicon GPUs

#### Metal

Apple Silicon GPUs, which are part of Apple's custom M1 chip, can be programmed using the Metal framework. Metal is a graphics and compute API developed by Apple, and it's available on all Apple devices, including Macs, iPhones, and iPads.

Here are the main components of the Metal framework:

1. **Metal API**: This is a low-level API that provides access to the GPU. It includes functions for creating and managing GPU resources, compiling shaders, and submitting work to the GPU.
2. **Metal Shading Language (MSL)**: This is the programming language used to write GPU code (shaders) in Metal.
It is based on the C++14 programming language and includes some additional features and keywords for GPU programming.
3. **MetalKit and Metal Performance Shaders (MPS)**: These are higher-level frameworks built on top of Metal. MetalKit provides functions for managing textures, meshes, and other graphics resources, while MPS provides highly optimised functions for common image processing and machine learning tasks.

Here is a basic workflow for using Metal to perform GPU computations on Apple Silicon:

1. **Install the necessary software**: This includes the Xcode development environment, which includes the Metal framework and compiler.
2. **Write your code**: Write your GPU code using the Metal Shading Language, and your host code using Swift or Objective-C. Your host code will use the Metal API to manage GPU resources and submit work to the GPU.
3. **Compile your code**: Use the Xcode development environment to compile your code.
4. **Run your code**: Run your compiled code on an Apple device with an Apple Silicon GPU.

For example, here is a simple Metal program that adds two vectors:

```swift
import Metal

// Create a Metal device and command queue
let device = MTLCreateSystemDefaultDevice()!
let commandQueue = device.makeCommandQueue()!

// Create a Metal library and function
let library = device.makeDefaultLibrary()!
let function = library.makeFunction(name: "vector_add")!

// Create a Metal compute pipeline
let pipeline = try! device.makeComputePipelineState(function: function)

// Allocate and initialise host and device memory
let numElements = 1024
let bufferSize = numElements * MemoryLayout<Float>.size
let h_A = [Float](repeating: 1.0, count: numElements)
let h_B = [Float](repeating: 2.0, count: numElements)
let d_A = device.makeBuffer(bytes: h_A, length: bufferSize, options: [])!
let d_B = device.makeBuffer(bytes: h_B, length: bufferSize, options: [])!
let d_C = device.makeBuffer(length: bufferSize, options: [])!

// Create a Metal command buffer and encoder
let commandBuffer = commandQueue.makeCommandBuffer()!
let commandEncoder = commandBuffer.makeComputeCommandEncoder()!

// Set the compute pipeline and buffers
commandEncoder.setComputePipelineState(pipeline)
commandEncoder.setBuffer(d_A, offset: 0, index: 0)
commandEncoder.setBuffer(d_B, offset: 0, index: 1)
commandEncoder.setBuffer(d_C, offset: 0, index: 2)

// Dispatch the compute kernel
let threadsPerThreadgroup = MTLSize(width: 256, height: 1, depth: 1)
let numThreadgroups = MTLSize(width: (numElements + 255) / 256, height: 1, depth: 1)
commandEncoder.dispatchThreadgroups(numThreadgroups, threadsPerThreadgroup: threadsPerThreadgroup)

// End the command encoder and commit the command buffer
commandEncoder.endEncoding()
commandBuffer.commit()

// Wait for the command buffer to complete
commandBuffer.waitUntilCompleted()

// Copy the result from device to host
let h_C = UnsafeMutablePointer<Float>.allocate(capacity: numElements)
UnsafeMutableRawPointer(h_C).copyMemory(from: d_C.contents(), byteCount: bufferSize)

// ...
// Clean up
// ...
```

In this example, `d_A`, `d_B`, and `d_C` are Metal buffers, and `numElements` is the number of elements in each vector.
The `vector_add` function is a Metal shader written in the Metal Shading Language, and it is executed on the GPU using a Metal compute command encoder. 240 | 241 | Note that this example is written in Swift, which is the recommended programming language for developing Metal applications. You can also use Objective-C, but Swift is generally preferred for new development. 242 | 243 | This example is quite a bit more complex than the earlier CUDA and HIP examples, because Metal is a lower-level API that provides more fine-grained control over the GPU. This can lead to more efficient code, but it also requires more boilerplate code to set up and manage GPU resources. 244 | 245 | #### Metal Performance Shaders (MPS) 246 | 247 | **Metal Performance Shaders (MPS)** is a framework that provides highly optimised functions for common image processing and machine learning tasks. MPS is built on top of the Metal framework and is available on all Apple devices, including Macs, iPhones, and iPads. 248 | 249 | MPS includes a variety of functions for image processing (e.g., convolution, resizing, and histogram calculation), as well as a set of neural network layers (e.g., convolution, pooling, and normalisation) that can be used to build and run neural networks on the GPU. 250 | 251 | MPS is a higher-level API than Metal, which makes it easier to use, but it provides less flexibility. If you are developing an application for Apple devices and you need to perform image processing or machine learning tasks, MPS is a good place to start. 252 | 253 | ### Cross Platform Graphics APIs 254 | 255 | #### Vulkan 256 | 257 | **Vulkan** is a low-level graphics and compute API developed by the Khronos Group. It provides fine-grained control over the GPU and is designed to minimise CPU overhead and provide more consistent performance. Vulkan can be used for a variety of applications, including gaming, simulation, and scientific computing. 258 | 259 | Vulkan is supported on a wide variety of platforms, including Windows, Linux, macOS (via MoltenVK, a Vulkan implementation that runs on top of Metal), Android, and iOS. Vulkan has a somewhat steep learning curve because it is a very low-level API, but it provides a lot of flexibility and can lead to very high performance. 260 | 261 | Vulkan is designed to be a cross-platform API. It is supported on a wide variety of platforms, including Windows, Linux, macOS (via MoltenVK, a layer that maps Vulkan to Metal), Android, and iOS. This makes it a good choice for developing applications that need to run on multiple platforms. 262 | 263 | #### OpenGL 264 | 265 | **OpenGL** is a cross-platform graphics API developed by the Khronos Group. It is widely used for developing graphics applications, including games, simulations, and design tools. OpenGL is a higher-level API than Vulkan, which makes it easier to use, but it provides less control over the GPU and may have more CPU overhead. 266 | 267 | OpenGL is supported on a wide variety of platforms, including Windows, macOS, Linux, and Android. However, Apple has deprecated OpenGL on its platforms in favor of Metal, so if you are developing an application for Apple devices, it is recommended to use Metal instead of OpenGL. 268 | 269 | Each of these APIs has its own strengths and weaknesses, and the best one to use depends on your specific application and requirements. If you are developing a cross-platform application and need a low-level API, Vulkan is a good choice. 
If you are developing an application for Apple devices and need to perform image processing or machine learning tasks, MPS is a good choice. If you are developing a graphics application and need a higher-level API, OpenGL may be a good choice, although you should consider using Metal on Apple devices. 270 | 271 | #### DirectX 272 | 273 | **DirectX** is a collection of APIs for handling tasks related to multimedia, game programming, and video, on Microsoft platforms. While it's most commonly associated with Windows, it is also available on Xbox. Note that DirectX is not fully cross-platform, as it doesn't support macOS or Linux. 274 | 275 | #### OpenCL 276 | 277 | **OpenCL** is a framework for writing programs that execute across heterogeneous platforms consisting of CPUs, GPUs, and other processors. OpenCL includes a language (based on C99) for writing kernels (i.e., functions that run on the hardware devices), plus APIs that are used to define and then control the platforms. OpenCL provides parallel computing using task-based and data-based parallelism. 278 | 279 | #### WebGL and WebGPU 280 | 281 | **WebGL** is a web-based graphics API that is based on OpenGL ES. It allows you to create 3D graphics in a web browser. Since it's web-based, it is supported on all major platforms and web browsers. While on the other hand, **WebGPU** is a new web-based graphics and compute API that is currently being developed by the W3C GPU for the Web Community Group. It is designed to provide modern 3D graphics and computation capabilities in web browsers, and it is intended to be the successor to WebGL. 282 | 283 | WebGPU aims to provide a more modern and lower-level API than WebGL, which will allow for better performance and more flexibility. It is designed to be a web-friendly API that can be implemented on top of other graphics APIs, such as Vulkan, Metal, and DirectX. 284 | 285 | WebGPU is still in development, and it is not yet widely supported in web browsers. However, it is an exciting development for web-based graphics and computation, and it is worth keeping an eye on if you are developing web applications that require high-performance graphics or computation. 286 | 287 | WebGPU will be a cross-platform API because it will be supported in web browsers on multiple platforms. However, the actual implementation of WebGPU in the browser may use different underlying graphics APIs, depending on the platform. For example, a browser on Windows may use a DirectX-based implementation of WebGPU, while a browser on macOS may use a Metal-based implementation. This will be transparent to the application developer, who will just use the WebGPU API. 288 | 289 | ```{admonition} Work in Progress 290 | :class: attention 291 | An entire chapter will be dedicated to WebGPU (coming soon!) 
292 | ``` 293 | 294 | ### Benchmarks 295 | 296 | ```{admonition} Work in Progress 297 | :class: attention 298 | Table with benchmarks 299 | ``` 300 | 301 | ### Acceleration Libraries 302 | 303 | - **OpenBLAS** 304 | - **CuBLAS** 305 | - **cuDNN** 306 | - **OpenCL** 307 | 308 | ## Cloud 309 | 310 | - cost comparisons 311 | + user-friendly: https://fullstackdeeplearning.com/cloud-gpus 312 | + less user-friendly but more comprehensive: https://cloud-gpus.com 313 | + comparisons of both features and pricing for GPU cloud providers: https://www.gpucloudpricing.com 314 | + LLM-specific advice: https://gpus.llm-utils.org/cloud-gpu-guide/#which-gpu-cloud-should-i-use 315 | 316 | ## Future 317 | 318 | One problem with using current {term}`LLMs ` is the high GPU memory requirements. One popular work-around is {term}`quantisation`. However, this requires hardware manufacturers to build support for quantised operations ({term}`SIMD` instruction sets), and ML libraries to rewrite/reimplement core parts of their codebase to support the new operations. Also recall that CPU-based SIMD instruction sets (e.g. [SSE4](https://en.wikipedia.org/wiki/SSE4) & [AVX10](https://en.wikipedia.org/wiki/AVX10) for PCs and [NEON]() for mobiles) took many years to develop, and are still actively evolving. By comparison, GPU architectures have much less adoption & development, so new arithmetic operations will take years to be widely supported. 319 | 320 | {{ comments }} 321 | -------------------------------------------------------------------------------- /index.md: -------------------------------------------------------------------------------- 1 | # State of Open Source AI Book - 2023 Edition 2 | 3 | {{ '```{badges} %s %s\n:doi: %s\n```' % (baseurl, env.config.html_theme_options.repository_url, doi) }} 4 | 5 | *Clarity in the current fast-paced mess of Open Source innovation {cite}`prem_stateofosai`* 6 | 7 | As a data scientist/ML engineer/developer with a 9 to 5 job, it's difficult to keep track of all the innovations. There's been enormous progress in the field in {term}`the last year `. 8 | 9 | Cure your FOMO with this guide, covering all the most important categories in the Open Source AI space, from model evaluations to deployment. It includes a [](#glossary) for you to quickly check definitions of new frameworks & tools. 10 | 11 | A quick TL;DR overview is included at the top of each section. We outline the pros/cons and general context/background for each topic. Then we dive a bit deeper. Examples include data models were trained on, and deployment implementations. 12 | 13 | ## Who is This Guide For? 14 | 15 | ```{admonition} Prerequisites to Reading 16 | :class: warning 17 | You should already know the basic principles of MLOps {cite}`google-mlops,redhat-mlops,ml-ops`, i.e. you should know that the traditional steps are: 18 | 19 | 1. Data engineering (preprocessing, curation, labelling, sanitisation) 20 | 2. Model engineering (training, architecture design) 21 | 3. Automated testing (CI) 22 | 4. Deployment/Automated Inference (CD) 23 | 5. Monitoring (logging, feedback, drift detection) 24 | ``` 25 | 26 | You haven't followed the most recent developments in open source AI over {term}`the last year `, and want to catch up quickly. 27 | We go beyond just mentioning the models, but also include things such as changing infrastructure, licence pitfalls, and novel applications. 
28 | 29 | (toc)= 30 | 31 | ## Table of Contents 32 | 33 | We've divided the open-source tooling, models, & MLOps landscape into the following chapters: 34 | 35 | Chapter | Description 36 | ---|--- 37 | [](licences) | Weights vs Data, Commercial use, Fair use, Pending lawsuits 38 | [](eval-datasets) | Leaderboards & Benchmarks for Text/Visual/Audio models 39 | [](models) | LLaMA 1 vs 2, Stable Diffusion, DALL-E, Persimmon, ... 40 | [](unaligned-models) | FraudGPT, WormGPT, PoisonGPT, WizardLM, Falcon 41 | [](fine-tuning) | LLMs, Visual, & Audio models 42 | [](model-formats) | ONNX, GGML, TensorRT 43 | [](mlops-engines) | vLLM, TGI, Triton, BentoML, ... 44 | [](vector-db) | Weaviate, Qdrant, Milvus, Redis, Chroma, ... 45 | [](sdk) | LangChain, LLaMA Index, LiteLLM 46 | [](desktop-apps) | LMStudio, GPT4All, Koboldcpp, ... 47 | [](hardware) | NVIDIA CUDA, AMD ROCm, Apple Silicon, Intel, TPUs, ... 48 | 49 | ## Contributing 50 | 51 | This source of this guide is available on GitHub at {{ env.config.html_theme_options.repository_url }}. 52 | 53 | ```{admonition} Feedback 54 | :class: attention 55 | The current open-source ecosystem is moving at light-speed. 56 | Spot something outdated or missing? Want to start a discussion? We welcome any of the following: 57 | 58 | - let us know in the comments at the end of each chapter 59 | - [ create issues](https://docs.github.com/en/issues/tracking-your-work-with-issues/creating-an-issue) 60 | - [ open pull requests](https://docs.github.com/en/get-started/quickstart/contributing-to-projects) 61 | ``` 62 | 63 | ### Editing the Book 64 | 65 | - Using {{ '[GitHub Codespaces](https://codespaces.new/{})'.format( 66 | '/'.join(env.config.html_theme_options.repository_url.split('/')[-2:])) }}, you can edit code & preview the site in your browser without installing anything (you may [have to whitelist `github.dev`, `visualstudio.com`, `github.com`, & `trafficmanager.net`](https://docs.github.com/en/codespaces/the-githubdev-web-based-editor#using-githubdev-behind-a-firewall) if you use an adblocker). 67 | - Alternatively, to run locally, open {{ '[this repository]({})'.format(env.config.html_theme_options.repository_url) }} in a [Dev Container](https://containers.dev) (most likely [using VSCode](https://code.visualstudio.com/docs/devcontainers/containers#_installation)). 68 | - Or instead, manually set up your own Python environment: 69 | 70 | ```sh 71 | pip install -r requirements.txt # setup 72 | jupyter-book build --builder dirhtml --all . # build 73 | python -m http.server -d _build/dirhtml # serve 74 | ``` 75 | 76 | ````{admonition} alternative: live rebuilding & serving (experimental) 77 | :class: tip, dropdown 78 | ```sh 79 | pip install -r requirements.txt sphinx-autobuild # setup 80 | jupyter-book config sphinx . # config 81 | sphinx-autobuild -b dirhtml . 
_build/dirhtml # build-serve 82 | ``` 83 | ```` 84 | 85 | ### Formatting 86 | 87 | ```{note} 88 | Don't worry about making it perfect, it's fine to open a ([draft](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-pull-requests#draft-pull-requests)) PR and [allow edits from maintainers](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request-from-a-fork) to fix it ♥ 89 | ``` 90 | 91 | - [Quickstart](https://jupyterbook.org/en/stable/reference/cheatsheet.html) 92 | - [Full reference](https://jupyterbook.org/en/stable/content/myst.html) 93 | - Create a new chapter: 94 | + create `some-file.md` (containing `# Some File` heading and `{{ comments }}` footer) 95 | + add `- file: some-file` to `_toc.yml` 96 | + add `[](some-file) | summary` to [ToC](toc) 97 | - Images: use [`{figure}`/`{figure-md}` with captions](https://myst-parser.readthedocs.io/en/latest/syntax/images_and_figures.html#figures-images-with-captions) 98 | 99 | ```{figure} https://static.premai.io/logo.png 100 | :name: fig-ref 101 | :width: 150px 102 | :alt: alt-text 103 | 104 | This is a **figure caption** 105 | ``` 106 | 107 | + [inline ref](fig-ref) 108 | + numbered ref: {numref}`fig-ref` 109 | + custom ref: {numref}`Figure {number} with caption "{name}" ` 110 | + please use https://github.com/premAI-io/static.premai.io to host images & data 111 | 112 | - Tables: use [`{table}` with captions](https://myst-parser.readthedocs.io/en/latest/syntax/tables.html#table-with-captions) 113 | - [](#glossary) term: {term}`GPU` 114 | + custom inline text: {term}`GPUs ` 115 | - Citations: 116 | + add [BibTeX](https://jupyterbook.org/en/stable/tutorials/references.html#add-your-references) entries to `references.bib`, e.g.: 117 | * blogs, wikis: `@online` 118 | * docs: [`@manual`](https://www.bibtex.com/e/entry-types/#manual) 119 | * journal articles, news articles: [`@article`](https://www.bibtex.com/e/article-entry) 120 | * conference proceedings: [`@proceedings`](https://www.bibtex.com/e/entry-types/#proceedings) 121 | * books: [`@book`](https://www.bibtex.com/e/book-entry) 122 | * whitepapers: [`@techreport`](https://www.bibtex.com/e/entry-types/#techreport) 123 | * chapters/parts of larger work: [`@incollection`](https://www.bibtex.com/e/entry-types/#incollection), [`@inbook`](https://www.bibtex.com/e/entry-types/#inbook) 124 | * drafts: [`@unpublished`](https://www.bibtex.com/e/entry-types/#unpublished) 125 | + citing things defined in `references.bib`: {cite}`prem_stateofosai,python` 126 | + GitHub links: 127 | * repos: https://github.com/premAI-io/state-of-open-source-ai 128 | * issues: https://github.com/premAI-io/state-of-open-source-ai/issues/12 129 | * code (folder/file): [premAI-io/state-of-open-source-ai:index.md](https://github.com/premAI-io/state-of-open-source-ai/blob/main/index.md) 130 | * readme sections: [premAI-io/prem-app#demo](https://github.com/premAI-io/prem-app#demo) 131 | - New [Sphinx extensions](https://www.sphinx-doc.org/en/master/usage/extensions): append to `requirements.txt` and `_config.yml:sphinx.extra_extensions` 132 | - `linkcheck` false-positives: append to `_config.yml:sphinx.config.linkcheck*` 133 | 134 | % comment lines (not rendered) are prefixed with a "%" 135 | 136 | ### Contributors 137 | 138 | Anyone who adds a few sentences to a chapter is {{ 139 | '[automatically mentioned in the respective 
chapter]({}/blob/main/committers.py)'.format( 140 | env.config.html_theme_options.repository_url) }} as well as below. 141 | 142 | {{ '[![](https://contrib.rocks/image?anon=1&repo={})]({}/graphs/contributors)'.format( 143 | '/'.join(env.config.html_theme_options.repository_url.split('/')[-2:]), 144 | env.config.html_theme_options.repository_url) }} 145 | 146 | - Editor: Casper da Costa-Luis (https://github.com/casperdcl) 147 | 148 | > With a strong [academic background](https://cdcl.ml/learn) as well as [industry expertise](https://cdcl.ml/work) to back up his enthusiasm for all things open source, Casper is happy to help with all queries related to this book. 149 | 150 | - Maintainer: https://github.com/PremAI-io 151 | 152 | > Our vision is to engineer a world where individuals, developers, and businesses can embrace the power of AI without compromising their privacy. We believe in a future where users retain ownership of their data, AND the models trained on it. 153 | 154 | - Citing this book: {cite}`prem_stateofosai` 155 | 156 | ## Conclusion 157 | 158 | ```{epigraph} 159 | All models are wrong, but some are useful 160 | 161 | -- G.E.P. Box {cite}`box-models` 162 | ``` 163 | 164 | % TODO: rewrite 165 | 166 | Open Source AI represents the future of privacy and data ownership. Making that future real, however, requires a great deal of further innovation. Over the last year the open-source community has already shown how motivated it is to put quality models into consumers' hands, delivering several significant breakthroughs across different AI fields. Still, this is just the beginning: improvements in many directions are needed before open results can rival those of centralised solutions. 167 | 168 | At Prem we are on a journey to make this possible, with a focus on developer experience and deployment for all kinds of developers, from web developers with zero AI knowledge to seasoned data scientists who want to quickly deploy and try these new models and technologies in their existing infrastructure without compromising privacy. 169 | 170 | ## Join our Community 171 | 172 | - Ask for support on [our Discord server](https://discord.com/invite/kpKk6vYVAn). 173 | - To keep up-to-date, [follow us on Twitter](https://twitter.com/premai_io). 174 | - Report bugs or request features at https://github.com/premAI-io/prem-app. 175 | 176 | ## Glossary 177 | 178 | %TODO: define all these & use them where appropriate 179 | 180 | ```{glossary} 181 | Alignment 182 | [Aligned AI models](https://en.wikipedia.org/wiki/AI_alignment) must implement safeguards to be helpful, honest, and harmless {cite}`labellerr-alignment`. 183 | This often involves {term}`supervised fine-tuning` followed by {term}`RLHF`. See [](unaligned-models) and [](fine-tuning). 184 | 185 | Auto-regressive language model 186 | Applies [AR](https://en.wikipedia.org/wiki/Autoregressive_model) to {term}`LLMs <LLM>`. Essentially a feed-forward model which predicts the next word given a context (set of words) {cite}`medium-arlm`. 187 | 188 | BEC 189 | [Business Email Compromise](https://www.microsoft.com/en-us/security/business/security-101/what-is-business-email-compromise-bec). 190 | 191 | Benchmark 192 | A curated dataset and corresponding tasks designed to evaluate models' real-world performance metrics (so that models can be {term}`compared to each other <Leaderboard>`). 194 | Copyleft 195 | A type of [open licence](open-licences) which insists that derivatives of the IP must have the same licence.
Also called "protective" or "reciprocal" {cite}`wiki-copyleft`. 196 | 197 | Embedding 198 | See {term}`vector embedding`. 199 | 200 | Evaluation 201 | Assessing a model's abilities using quantitative and qualitative performance metrics (e.g. accuracy, effectiveness, etc.) on a given task. See [](eval-datasets). 202 | 203 | Fair Dealing 204 | A doctrine in UK & Commonwealth law permitting use of {term}`IP` without prior permission under certain conditions (typically research, criticism, reporting, or satire) {cite}`wiki-fair-dealing`. See also {term}`fair use`. 205 | 206 | Fair Use 207 | A doctrine in US law permitting use of {term}`IP` without prior permission (regardless of licence/copyright status) depending on 1) purpose of use, 2) nature of the IP, 3) amount of use, and 4) effect on value {cite}`wiki-fair-use`. See also {term}`fair dealing`. 208 | 209 | Fine-tuning 210 | [Fine-tuning](https://en.wikipedia.org/wiki/Fine-tuning_(deep_learning)) is a technique in transfer learning where a pre-trained model's already-learned features or parameters are further adjusted using data specific to the new task, enabling the model to specialise and improve its performance on the target task. See also [](fine-tuning) and {term}`transfer learning`. 211 | 212 | Foundation model 213 | A model trained from scratch -- likely on lots of data -- to be used for general tasks or later fine-tuned for specific tasks. 214 | 215 | GPU 216 | [Graphics Processing Unit](https://en.wikipedia.org/wiki/Graphics_processing_unit): hardware originally designed to accelerate computer image processing, but now often repurposed for [embarrassingly parallel](https://en.wikipedia.org/wiki/Embarrassingly_parallel) computational tasks in machine learning. 217 | 218 | Hallucination 219 | A model generating output that is inexplicable by its training data. 220 | 221 | IP 222 | [Intellectual Property](https://en.wikipedia.org/wiki/Intellectual_property): intangible creations by humans (e.g. code, text, art), typically legally protected from use without permission of the author(s). 223 | 224 | Leaderboard 225 | Ranking of models based on their performance metrics on the same {term}`benchmark(s) <Benchmark>`, allowing fair task-specific comparison. See [](leaderboards-table). 226 | 227 | LLM 228 | A [Large Language Model](https://en.wikipedia.org/wiki/Large_language_model) is a neural network (often a {term}`transformer` containing billions of parameters) designed to perform tasks in natural language via {term}`fine-tuning` or [prompt engineering](https://en.wikipedia.org/wiki/Prompt_engineering). 229 | 230 | MLOps 231 | [Machine Learning Operations](https://blogs.nvidia.com/blog/what-is-mlops): best practices to run AI using software products & cloud services. 232 | 233 | MoE 234 | [Mixture-of-Experts](https://en.wikipedia.org/wiki/Mixture_of_experts) is a technique which uses one or more specialist model(s) from a collection of models ("experts") to solve general problems. Note that this is different from [ensemble](https://en.wikipedia.org/wiki/Ensemble_learning) models (which combine results from all models). 235 | 236 | Open 237 | Ambiguous term that could mean "open source" or "open licence". See [](open). 238 | 239 | Permissive 240 | A type of [open licence](open-licences) which allows reselling and closed-source modifications, and can often be used in larger projects alongside other licences. Usually, the only condition of use is citing the author by name.
241 | 242 | Perplexity 243 | [Perplexity](https://en.wikipedia.org/wiki/Perplexity) is a metric based on entropy, and is a rough measure of the difficulty/uncertainty in a prediction problem. 244 | 245 | Public Domain 246 | "Open" {term}`IP` owned by nobody (often due to the author disclaiming all rights) and thus can be used by anyone without restrictions. Technically a disclaimer/non-licence. See [](open-licences). 247 | 248 | RAG 249 | [Retrieval Augmented Generation](https://www.pinecone.io/learn/retrieval-augmented-generation). 250 | 251 | RLHF 252 | [Reinforcement Learning from Human Feedback](https://en.wikipedia.org/wiki/Reinforcement_learning_from_human_feedback) is often the second step in {term}`alignment` (after {term}`supervised fine-tuning`), where a model is [rewarded or penalised](https://en.wikipedia.org/wiki/Reinforcement_learning) for its outputs based on human evaluation. See [](fine-tuning) and [](unaligned-models). 253 | 254 | ROME 255 | The [Rank-One Model Editing algorithm](https://rome.baulab.info) alters a trained model's weights to directly modify "learned" information {cite}`meng2023locating,raunak2022rankone`. 256 | 257 | SIMD 258 | [Single Instruction, Multiple Data](https://en.wikipedia.org/wiki/SIMD) is a [data-level](https://en.wikipedia.org/wiki/Data_parallelism) [parallel processing](https://en.wikipedia.org/wiki/Parallel_computer) technique where one computational instruction is applied to multiple data simultaneously. 259 | 260 | SotA 261 | State of the art: recent developments (under 1 year old). 262 | 263 | Supervised Fine-tuning 264 | [SFT](https://cameronrwolfe.substack.com/p/understanding-and-using-supervised) is often the first step in model {term}`alignment`, and is usually followed by {term}`RLHF`. See [](fine-tuning) and [](unaligned-models). 265 | 266 | Quantisation 267 | Sacrificing precision of model weights (e.g. `uint8` instead of `float32`) in return for lower hardware memory requirements. 268 | 269 | Token 270 | A [token](https://learn.microsoft.com/en-us/semantic-kernel/prompts/) is a "unit of text" for an {term}`LLM` to process/generate. A single token could represent a few characters or words, depending on the tokenisation method chosen. Tokens are usually {term}`embedded <Vector Embedding>`. 271 | 272 | Transfer Learning 273 | [Transfer Learning](https://en.wikipedia.org/wiki/Transfer_learning) is a process of leveraging a pre-trained model's learned representations and adapting them to solve a different but related problem, often requiring less data and computation compared to training from scratch. See also {term}`fine-tuning` and [](fine-tuning). 274 | 275 | Transformer 276 | A transformer is a neural network using a parallel multi-head attention mechanism. The resulting reduction in training time makes it well-suited for use in {term}`LLMs <LLM>`. 277 | 278 | Vector Database 279 | [Vector databases](https://en.wikipedia.org/wiki/Vector_database) provide efficient storage & search/retrieval for {term}`vector embeddings <Vector Embedding>`. See [](vector-db). 280 | 281 | Vector Embedding 282 | [Embedding](https://learn.microsoft.com/en-us/semantic-kernel/memories/embeddings) means encoding {term}`tokens <Token>` into a numeric vector (i.e. array/list). This can be thought of as an intermediary between machine and human language, and thus helps {term}`LLMs <LLM>` understand human language. See [](vector-db.md#llm-embeddings). 283 | 284 | Vector Store 285 | See {term}`vector database`.
286 | ``` 287 | 288 | % TODO: glossary definitions for: 289 | % Decoder-style transformer 290 | % Diffusion-based text-to-image generative mode 291 | % A100, V100, H100 292 | % VRAM 293 | -------------------------------------------------------------------------------- /licences.md: -------------------------------------------------------------------------------- 1 | # Licences 2 | 3 | % TODO: investigate if significant: hardware licences placing restrictions on use of models trained on said hardware? 4 | % TODO: https://tldr.cdcl.ml/tags/#law 5 | % TODO: summary graphic? 6 | 7 | Concerning {term}`IP` in software-related fields, developers are likely aware of two "[open](open)" copyright licence categories: one for highly structured work (e.g. software), and the other for general content (e.g. [](#data) including prosaic text and images). These two categories needed to exist separately to solve problems unique to their domains, and thus were not designed to be compatible. A particular piece of work is expected to fall into just one category, not both. 8 | 9 | Copyright for [](#ml-models), however, is more nuanced. 10 | 11 | Aside from categorisation, a further complication is the lack of [](#legal-precedence). A licence is not necessarily automatically legally binding -- it may be [incompatible with existing laws](#copyright-exceptions). Furthermore, in an increasingly global workplace, it may be unclear [which country's laws](#national-vs-international-laws) should be applicable in a particular case. 12 | 13 | Finally, licence terms disclaiming warranty/liability are contributing to an [](#accountability-crisis). 14 | 15 | ## ML Models 16 | 17 | A working [model](models) is defined partially in code (architecture & training regimen) and partially by its parameters (trained weights, i.e. a list of numbers). The latter is implicitly defined by the training data (often mixed media). One could therefore argue that models must be simultaneously bound by multiple licences for multiple different domains. Such licences were not designed to work simultaneously, and may not even be compatible. 
18 | 19 | Here's a summary of the usage restrictions around some popular models (in descending order of real-world output quality as measured by us): 20 | 21 | ```{table} Restrictions on training data, trained weights, and generated outputs 22 | :name: model-licences 23 | Model | Weights | Training Data | Output 24 | --|--|--|-- 25 | [OpenAI ChatGPT](https://openai.com/policies/terms-of-use) | 🔴 unavailable | 🔴 unavailable | 🟢 user has full ownership 26 | [Anthropic Claude](https://console.anthropic.com/legal/terms) | 🔴 unavailable | 🔴 unavailable | 🟡 commercial use permitted 27 | [LMSys Vicuna 33B](https://lmsys.org/blog/2023-03-30-vicuna) | 🟢 open source | 🔴 unavailable | 🔴 no commercial use 28 | [LMSys Vicuna 13B](https://github.com/lm-sys/FastChat) | 🟢 open source | 🔴 unavailable | 🟡 commercial use permitted 29 | [MosaicML MPT 30B Chat](https://www.mosaicml.com/blog/mpt-30b) | 🟢 open source | 🔴 unavailable | 🔴 no commercial use 30 | [Meta LLaMA2 13B Chat](https://github.com/facebookresearch/llama/blob/main/LICENSE) | 🟢 open source | 🔴 unavailable | 🟡 commercial use permitted 31 | [RWKV4 Raven 14B](https://github.com/BlinkDL/RWKV-LM) | 🟢 open source | 🟢 available | 🟢 user has full ownership 32 | [OpenAssistant SFT4 Pythia 12B](https://huggingface.co/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5) | 🟢 open source | 🟢 available | 🟢 user has full ownership 33 | [MosaicML MPT 30B Instruct](https://huggingface.co/mosaicml/mpt-30b-instruct) | 🟢 open source | 🔴 unavailable | 🟡 commercial use permitted 34 | [MosaicML MPT 30B](https://www.mosaicml.com/blog/mpt-30b) | 🟢 open source | 🔴 unavailable | 🟢 user has full ownership 35 | ``` 36 | 37 | % TODO: mention Apache-2.0, LLaMA vs LLaMA 2, HuggingFace, CC-BY((-NC)-SA) in the table above? 38 | 39 | {{ table_feedback }} 40 | 41 | Just a few weeks after some said "the golden age of open [...] AI is coming to an end" {cite}`golden-age-os-end`, things like Falcon's `Apache-2.0` relicensing {cite}`falcon-relicence` and the [LLaMA-2 community licence](https://ai.meta.com/llama/license) {cite}`llama-2-licence` were announced (both permitting commercial use), completely changing the landscape. 42 | 43 | Some interesting observations currently: 44 | 45 | - Pre-trained model weights are typically not closely guarded 46 | - Generated outputs often are usable commercially, but with conditions (no full copyrights granted) 47 | - Training data is seldom available 48 | + honourable exceptions are OpenAssistant (which promises that [data will be released under `CC-BY-4.0`](https://github.com/LAION-AI/Open-Assistant/blob/main/LICENSE) but confusingly appears [already released under `Apache-2.0`](https://huggingface.co/datasets/OpenAssistant/oasst1)) and RWKV (which provides both [brief](https://wiki.rwkv.com/basic/FAQ.html#what-is-the-dataset-that-rwkv-is-trained-on) and [more detailed](https://github.com/BlinkDL/RWKV-LM#training--fine-tuning) guidance) 49 | 50 | Licences are increasingly being recognised as important, and are even mentioned in some online leaderboards such as [](eval-datasets.md#chatbot-arena). 51 | 52 | ## Data 53 | 54 | As briefly alluded to, data and code are often each covered by their own licence categories -- but there may be conflicts when these two overlap. For example, pre-trained weights are a product of both code and data. This means one licence intended for non-code work (i.e. data) and another licence intended for code (i.e. model architectures) must simultaneously apply to the weights. This may be problematic or even nonsensical. 
55 | 56 | ```{admonition} Feedback 57 | :class: attention 58 | If you know of any legal precedence in conflicting multi-licence cases, please let us know in the {{ 59 | '[ comments]({}-comments)'.format(env.docname) }} below! 60 | ``` 61 | 62 | % TODO: dataset restrictions (e.g. ImageNet non-commercial)? 63 | 64 | (open)= 65 | 66 | ## Meaning of "Open" 67 | 68 | "Open" could refer to "open licences" or "open source (code)". Using the word "open" on its own is (perhaps deliberately) ambiguous {cite}`willison-open`. 69 | 70 | From a **legal (licencing) perspective**, "open" means (after legally obtaining the IP) no additional permission/payment is needed to use, make modifications to, & share the IP {cite}`open-definition,osd`. However, there are 3 subcategories of such "open" licences as per {numref}`open-licences`. Meanwhile, from a **software perspective**, there is only one meaning of "open": the source code is available. 71 | 72 | ```{table} Open licence subcategories 73 | :name: open-licences 74 | 75 | Subcategory | Conditions | Licence examples 76 | --|--|-- 77 | {term}`Public Domain` | Minimum required by law (so technically not a licence) | [`Unlicence`](https://spdx.org/licenses/Unlicense.html), [`CC0-1.0`](https://creativecommons.org/publicdomain/zero/1.0/legalcode) 78 | {term}`Permissive` | Cite the original author(s) by name | [`Apache-2.0`](https://www.apache.org/licenses/LICENSE-2.0), [`CC-BY-4.0`](https://creativecommons.org/licenses/by/4.0/legalcode) 79 | {term}`Copyleft` | Derivatives use the same licence | [`GPL-3.0`](https://www.gnu.org/licenses/gpl-3.0.html), [`CC-BY-SA-4.0`](https://creativecommons.org/licenses/by-sa/4.0/legalcode) 80 | ``` 81 | 82 | ```{admonition} Choosing an Open Source Licence [#](open-choices) 83 | :name: open-choices 84 | :class: tip 85 | 86 | - Software: [compare 8 popular licences](https://choosealicense.com/licenses) 87 | + [`MPL-2.0`](https://mozilla.org/MPL/2.0) is noteworthy, as it combines the permissiveness & compatibility of [`Apache-2.0`](https://www.apache.org/licenses/LICENSE-2.0) with a very weak (file-level) copyleft version of [`LGPL-3.0-or-later`](https://spdx.org/licenses/LGPL-3.0-or-later.html). `MPL-2.0` is thus usually categorised as permissive {cite}`wiki-sw-licence`. 88 | - Data & media: one of the 3 `CC` licences from the [table above](open-licences) 89 | - Hardware: one of the [`CERN-OHL-2.0`](https://cern-ohl.web.cern.ch) licences 90 | - More choices: [compare dozens of licences](https://choosealicense.com/appendix) 91 | ``` 92 | 93 | One big problem is enforcing licence conditions (especially of {term}`copyleft` or even more restrictive licences), particularly in an open-source-centric climate with potentially billions of infringing users. It is a necessary condition of a law that it should be enforceable {cite}`law-enforceability`, which is infeasible with most current software {cite}`linux-warranty,cdcl-policing-foss,cdcl-os-illegal`. 94 | 95 | ## National vs International Laws 96 | 97 | ### Copyright Exceptions 98 | 99 | A further complication is the concept of "{term}`fair use`" and "{term}`fair dealing`" in some countries -- as well as international limitations {cite}`wiki-limitations-copyright` -- which may override licence terms as well as copyright in general {cite}`wiki-google-oracle-case,wiki-google-books-case,nytimes-google-books-case`. 
100 | 101 | In practice, even legal teams often refuse to give advice {cite}`pytorch-vision-2597`, though it appears that copyright law is rarely enforced if there is no significant commercial gain/loss due to infringement. 102 | 103 | ### Obligation or Discrimination 104 | 105 | Organisations may also try to discriminate between countries even when not legally obliged to do so. For instance, OpenAI does not provide services to some countries {cite}`openai-supported-countries`, and it is unclear whether this is legally, politically, or financially motivated. 106 | 107 | ### Legal Precedence 108 | 109 | "Open" licences often mean "can be used without a fee, provided some conditions are met". In turn, users might presume that the authors do not expect to make much direct profit. In a capitalist society, such a disinterest in monetary gain might be mistaken as a disinterest in everything else, including enforcing the "provided some conditions are met" clause. Users might ignore the "conditions" in the hope that the authors will not notice, or will not have the time, inclination, nor money to pursue legal action. As a result, it is rare for a licence to be "tested" (i.e. debated and upheld, thus giving it legal weight) in a court of law. 110 | 111 | Only rare cases involving lots of money or large organisations go to court {cite}`cdcl-os-illegal`, such as these ongoing ones destined to produce "landmark" rulings: 112 | 113 | - Jun 2023 copyright case {cite}`copilot-copyright-case` against Microsoft, GitHub, and OpenAI 114 | - Jun 2023 privacy case {cite}`openai-privacy-case` against Microsoft & OpenAI 115 | - Nov 2022 copyright and open source licences case {cite}`legalpdf-doe-github-case` against GitHub 116 | 117 | ## Accountability Crisis 118 | 119 | Of the 100+ licences approved by the Open Source Initiative {cite}`osi-licences`, none provide any warranty or liability. In fact, all expressly **disclaim** warranty/liability (apart from [`MS-PL`](https://learn.microsoft.com/en-us/previous-versions/msp-n-p/ff647676(v=pandp.10)?redirectedfrom=MSDN) and [`MS-RL`](https://opensource.org/license/ms-rl-html), which don't expressly mention liability). 120 | 121 | This means a nefarious or profiteering organisation could release poor quality or malicious code under an ostensibly welcoming open source licence, but in practice abuse the licence terms to disown any responsibility or accountability. Users and consumers may unwittingly trust fundamentally untrustworthy sources. 122 | 123 | To combat this, the EU proposed cybersecurity legislation in Sep 2022: the Cyber Resilient Act (CRA) {cite}`cra` and Product Liability Act (PLA) {cite}`pla` propose to hold profiteering companies accountable (via "consumer interests" and "safety & liability" of products/services), so that anyone making (in)direct profit cannot hide behind "NO WARRANTY" licence clauses {cite}`cdcl-os-illegal`. Debate is ongoing, particularly over the CRA's Article 16, which states that a "person, other than [manufacturer/importer/distributor, who makes] a substantial modification of [a software product] shall be considered a manufacturer" {cite}`cdcl-cra-pla`. FOSS organisations have questioned whether liability can traverse the dependency graph, and what minor indirect profit-making is exempt {cite}`psf-cra,eclipse-cra,nlnet-cra`. 
124 | 125 | However, law-makers should be careful to limit the scope of any FOSS exemptions to prevent commercial abuse/loopholes {cite}`cdcl-os-illegal,cdcl-cra-pla`, and encourage accountability for critical infrastructure {cite}`cdcl-policing-foss`. 126 | 127 | ```{admonition} A better way? [#](fund-warranties) 128 | :name: fund-warranties 129 | :class: seealso 130 | In the interest of public safety, the best solution might be to pay for warranties for widely-used software via public funds {cite}`cdcl-os-bad` or crowdsourcing {cite}`tidelift,gh-sponsors,opencollective,numfocus`. 131 | ``` 132 | 133 | ## Future 134 | 135 | To recap: 136 | 137 | - It's unknown what are the implications of multiple licences with conflicting terms (e.g. models inheriting both code & data licences) 138 | + there is little [](#legal-precedence) 139 | - "[Open](open)" could refer to code/source or to licence (so is ambiguous without further information) 140 | + training data is often not open source 141 | - Licences always disclaim warranty/liability 142 | - Enforcing licences might be illegal 143 | + limitations such as {term}`fair use`/{term}`dealing ` can override licences/copyright 144 | + proposed accountability laws might override licence disclaimers 145 | - Enforcing licences might be infeasible 146 | + there are [ongoing cases](#legal-precedence) regarding (ab)use of various subcategories of IP: copyright (no licence) for both open and closed source, as well as licences with copyleft or non-commercial clauses 147 | 148 | In the long term, we look forward to the outcomes of the US cases and EU proposals. Meanwhile in the short term, a recent tweet ({numref}`unusual-ventures-tweet`) classified some current & {term}`foundation ` models (albeit with no explanation/discussion yet as of Oct 2023). We hope to see an accompanying write-up soon! 149 | 150 | ```{figure-md} unusual-ventures-tweet 151 | :class: caption 152 | ![](https://pbs.twimg.com/media/F3AiXRJWsAAP0Da?format=jpg&name=4096x4096) 153 | 154 | [The AI Battle: Open Source vs Closed Source](https://twitter.com/chiefaioffice/status/1688913452662984708?s=20) 155 | ``` 156 | 157 | {{ comments }} 158 | -------------------------------------------------------------------------------- /mlops-engines.md: -------------------------------------------------------------------------------- 1 | # MLOps Engines 2 | 3 | ```{admonition} Work in Progress 4 | :class: attention 5 | {{ wip_chapter }} 6 | 7 | Some ideas: 8 | 9 | - [7 Frameworks for Serving LLMs](https://betterprogramming.pub/frameworks-for-serving-llms-60b7f7b23407) "comprehensive guide & detailed comparison" 10 | - [Trends: Optimising for Faster Inference](https://cameronrwolfe.substack.com/i/135439692/optimizing-for-faster-inference) 11 | - https://github.com/imaurer/awesome-decentralized-llm 12 | - Python Bindings and More 13 | - PyTorch Toolchain -- From C/C++ to Python 14 | - https://docs.bentoml.org 15 | + https://docs.bentoml.org/en/latest/overview/what-is-bentoml.html#build-applications-with-any-ai-models 16 | - https://finbarr.ca/how-is-llama-cpp-possible 17 | - https://onnxruntime.ai/docs/execution-providers 18 | - Apache TVM 19 | ``` 20 | 21 | This chapter focuses on recent open-source {term}`MLOps` engine developments -- which are largely due to the current rise of {term}`LLMs `. While MLOps typically focuses on model training, "LLMOps" focuses on fine-tuning. In production, both also require good inference engines. 
22 | 23 | ```{table} Comparison of Inference Engines 24 | :name: inference-engines 25 | Inference Engine | Open-Source | GPU optimisations | Ease of use 26 | -----------------|-------------|-------------------|------------- 27 | [Nvidia Triton](#nvidia-triton-inference-server) | 🟢 Yes | Dynamic Batching, Tensor Parallelism, Model concurrency | 🔴 Difficult 28 | [](#text-generation-inference) | 🟢 Yes | Continuous Batching, Tensor Parallelism, Flash Attention | 🟢 Easy 29 | [](#vllm) | 🟢 Yes | Continuous Batching, Tensor Parallelism, Paged Attention | 🟢 Easy 30 | [](#bentoml) | 🟢 Yes | None | 🟢 Easy 31 | [](#modular) | 🔴 No | N/A | 🟡 Moderate 32 | [](#localai) | 🟢 Yes | 🟢 Yes | 🟢 Easy 33 | ``` 34 | 35 | {{ table_feedback }} 36 | 37 | ## Nvidia Triton Inference Server 38 | 39 | ```{figure-md} mlops-engines-triton-architecture 40 | :class: caption 41 | ![](https://static.premai.io/book/mlops-engines-triton-architecture.png) 42 | 43 | [Nvidia Triton Architecture](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/user_guide/jetson.html) 44 | ``` 45 | 46 | This [inference server](https://developer.nvidia.com/triton-inference-server) offers support for multiple model formats such as PyTorch, TensorFlow, ONNX, TensorRT, etc. It uses GPUs efficiently to boost the performance of deep learning models. 47 | 48 | - **Concurrent model execution**: This allows multiple models to be executed on 1 or many GPUs in parallel. Multiple requests are routed to each model to execute the tasks in parallel 49 | - **Dynamic Batching**: Combines multiple inference requests into a batch to increase throughput. Requests in each batch can be processed in parallel instead of handling each request sequentially. 50 | 51 | Pros: 52 | 53 | * High throughput, low latency for serving LLMs on a GPU 54 | * Supports multiple frameworks/backends 55 | * Production level performance 56 | * Works with non-LLM models such as image generation or speech to text 57 | 58 | Cons: 59 | 60 | * Difficult to set up 61 | * Not compatible with many of the newer LLMs 62 | 63 | ## Text Generation Inference 64 | 65 | ```{figure-md} tgi-architecture 66 | :class: caption 67 | ![](https://static.premai.io/book/mlops-engines-tgi-architecture.png) 68 | 69 | [Text Generation Inference Architecture](https://github.com/huggingface/text-generation-inference) 70 | ``` 71 | 72 | Compared to Triton, https://github.com/huggingface/text-generation-inference is easier to setup and supports most of the popular LLMs on Hugging Face. 73 | 74 | Pros: 75 | 76 | * Supports newer models on Hugging Face 77 | * Easy setup via docker container 78 | * Production-ready 79 | 80 | Cons: 81 | 82 | * Open-source license has restrictions on commercial usage 83 | * Only works with Hugging Face models 84 | 85 | ## vLLM 86 | 87 | This is an open-source project created by researchers at Berkeley to improve the performance of LLM inferencing. https://github.com/vllm-project/vllm primarily optimises LLM throughput via methods like PagedAttention and Continuous Batching. The project is fairly new and there is ongoing development. 88 | 89 | Pros: 90 | 91 | * Can be used commercially 92 | * Supports many popular Hugging Face models 93 | * Easy to setup 94 | 95 | Cons: 96 | 97 | * Not all LLM models are supported 98 | 99 | ## BentoML 100 | 101 | [BentoML](https://www.bentoml.com) is a fairly popular tool used to deploy ML models into production. It has gained a lot of popularity by building simple wrappers that can convert any model into a REST API endpoint. 
Currently, BentoML does not support some of the GPU optimizations such as tensor parallelism. However, the main benefit BentoML provides is that it can serve a wide variety of models. 102 | 103 | Pros: 104 | 105 | * Easy setup 106 | * Can be used commercially 107 | * Supports all models 108 | 109 | Cons: 110 | 111 | * Lacks some GPU optimizations 112 | 113 | ## Modular 114 | 115 | [Modular](https://www.modular.com) is designed to be a high performance AI engine that boosts the performance of deep learning models. The secret is in their custom compiler and runtime environment that improves the inferencing of any model without the developer needing to make any code changes. 116 | 117 | The Modular team has designed a new programming language, [Mojo](https://docs.modular.com/mojo), which combines the Python friendly syntax with the performance of C. The purpose of Mojo is to address some of the shortcomings of Python from a performance standpoint while still being a part of the Python ecosystem. This is the programming language used internally to create the Modular AI engine's kernels. 118 | 119 | Pros: 120 | 121 | * Low latency/High throughput for inference 122 | * Compatible with Tensorflow and Pytorch models 123 | 124 | Cons: 125 | 126 | * Not open-source 127 | * Not as simple to use compared to other engines on this list 128 | 129 | This is not an exhaustive list of MLOps engines by any means. There are many other tools and frameworks developer use to deploy their ML models. There is ongoing development in both the open-source and private sectors to improve the performance of LLMs. It's up to the community to test out different services to see which one works best for their use case. 130 | 131 | ## LocalAI 132 | 133 | [LocalAI](https://localai.io) from https://github.com/mudler/LocalAI ([not to be confused](https://github.com/louisgv/local.ai/discussions/71) with [](desktop-apps.md#localai) from https://github.com/louisgv/local.ai) is the free, Open Source alternative to OpenAI. LocalAI act as a drop-in replacement REST API that's compatible with OpenAI API specifications for local inferencing. It can run LLMs (with various backend such as https://github.com/ggerganov/llama.cpp or [](#vllm)), generate images, generate audio, transcribe audio, and can be self-hosted (on-prem) with consumer-grade hardware. 134 | 135 | Pros: 136 | 137 | - [wide range of models supported](https://localai.io/model-compatibility) 138 | - support for [functions](https://localai.io/features/openai-functions) (self-hosted [OpenAI functions](https://platform.openai.com/docs/guides/gpt/function-calling)) 139 | - [easy to integrate](https://localai.io/integrations) 140 | 141 | Cons: 142 | 143 | - binary version is harder to run and compile locally. https://github.com/mudler/LocalAI/issues/1196. 144 | - high learning curve due to high degree of customisation 145 | 146 | ## Challenges in Open Source 147 | 148 | MLOps solutions come in two flavours {cite}`mlops-challenges`: 149 | 150 | - Managed: a full pipeline (and support) is provided (for a price) 151 | - Self-hosted: various DIY stitched-together open-source components 152 | 153 | Some companies (e.g. [Hugging Face](https://huggingface.co)) push for open-source models & datasets, while others (e.g. [OpenAI](https://openai.com), [Anthropic](https://www.anthropic.com)) do the opposite. 154 | 155 | The main challenges with open-source MLOps are [](#maintenance), [](#performance), and [](#cost). 
156 | 157 | ```{figure-md} open-vs-closed-mlops 158 | :class: caption 159 | ![](https://static.premai.io/book/mlops-engines-table.jpg) 160 | 161 | Open-Source vs Closed-Source MLOps 162 | ``` 163 | 164 | ### Maintenance 165 | 166 | Using open-source components, most setup & configuration must be done manually. This could mean finding & downloading [models](models) & [datasets](eval-datasets), setting up [fine-tuning](fine-tuning), performing [evaluations](eval-datasets), and [](#inference) -- all components held together by self-maintained bespoke "glue" code. 167 | 168 | You are responsible for monitoring pipeline health & fixing issues quickly to avoid application downtime. This is particularly painful in the early stages of a project, when robustness and scalability are not yet implemented and there is much firefighting for developers to do. 169 | 170 | ### Performance 171 | 172 | Performance could refer to: 173 | 174 | - output *quality*: e.g. accuracy -- how close is a model's output to ideal expectations (see [](eval-datasets)), or 175 | - operational *speed*: e.g. throughput & latency -- how much time it takes to complete a request (see also [](hardware), which can play as large a role as software {cite}`nvidia-gpu-inference`). 176 | 177 | By comparison, closed-source engines (e.g. [Cohere](https://cohere.com)) tend to give better baseline operational performance due to default-enabled inference optimisations {cite}`cohere-triton`. 178 | 179 | ### Cost 180 | 181 | Self-maintained open-source solutions, if implemented well, can be extremely cheap both to setup and to run long term. However, many underestimate the amount of work required to make an open-source ecosystem work seamlessly. 182 | 183 | For example, a single GPU node able to run a 36 GB open-source model can [easily cost over \$2,000 per month from a major cloud provider](hardware.md#cloud). Since the technology is still new, experimenting with & maintaining self-hosted infrastructure can be expensive. Conversely, closed-source pricing models often charge for usage (e.g. {term}`tokens `) rather than infrastructure (e.g. [ChatGPT costs around \$0.002 for 1K tokens](https://openai.com/pricing) -- enough for a page of text), making them much cheaper for small explorative tasks. 184 | 185 | ## Inference 186 | 187 | Inference is one of the hot topics currently with LLMs in general. Large models like ChatGPT have very low latency and great performance but become more expensive with more usage. 188 | 189 | On the flip side, open-source models like [](models.md#llama-2) or [](models.md#falcon) have variants that are much smaller in size, yet it's difficult to match the latency and throughput that ChatGPT provides, while still being cost efficient {cite}`cursor-llama`. 190 | 191 | Models that are run using Hugging Face pipelines do not have the necessary optimisations to run in a production environment. The open-source LLM inferencing market is still evolving so currently there's no silver bullet that can run any open-source LLM at blazing-fast speeds. 
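To make the baseline concrete, below is a minimal sketch of serving a model through a plain Hugging Face `transformers` pipeline -- the kind of unoptimised setup the engines above aim to improve on. The model name and generation parameters are illustrative placeholders, not a recommendation.

```python
from transformers import pipeline

# A plain pipeline: no continuous batching, paged attention, or tensor
# parallelism -- fine for experiments, but not tuned for production throughput.
generator = pipeline(
    "text-generation",
    model="tiiuae/falcon-7b-instruct",  # example model; swap in any causal LM
    device_map="auto",                  # place weights on available GPU(s)/CPU
)

prompt = "Explain the difference between throughput and latency in one sentence."
output = generator(prompt, max_new_tokens=64, do_sample=False)
print(output[0]["generated_text"])
```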
192 | 193 | Here are a few reasons why inferencing is slow: 194 | 195 | ### Models are growing larger in size 196 | 197 | * As models grow in size and neural networks become more complex, it's no surprise that it takes longer to get an output 198 | 199 | ### Python as the choice of programming language for AI 200 | 201 | * Python is inherently slow compared to compiled languages like C++ 202 | * The developer-friendly syntax and vast array of libraries have put Python in the spotlight, but when it comes to sheer performance it falls behind many other languages 203 | * To compensate for its performance, many inference servers convert the Python code into an optimised module. For example, Nvidia's [Triton Inference Server](https://developer.nvidia.com/triton-inference-server) can take a PyTorch model and compile it into [TensorRT](https://developer.nvidia.com/tensorrt-getting-started), which has a much higher performance than native PyTorch 204 | * Similarly, https://github.com/ggerganov/llama.cpp optimises the LLaMA inference code to run in raw C++. Using this optimisation, people can run a large language model on their laptops without a dedicated GPU. 205 | 206 | ### Larger inputs 207 | 208 | * Not only do LLMs have billions of parameters, but they perform millions of mathematical calculations for each inference 209 | * To do these massive calculations in a timely manner, GPUs are required to help speed up the process. GPUs have much more memory bandwidth and processing power compared to a CPU, which is why they are in such high demand when it comes to running large language models (a rough memory estimate is sketched below).
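To put the memory requirements in perspective, here is a rough back-of-the-envelope sketch (weights only -- it ignores activations, KV-cache, and framework overhead) of how parameter count and numeric precision translate into GPU memory, which is also why quantisation helps:

```python
def approx_weights_gb(n_params: float, bytes_per_param: float) -> float:
    """Approximate VRAM needed just to hold the model weights."""
    return n_params * bytes_per_param / 1024**3

for name, params in [("7B", 7e9), ("13B", 13e9), ("70B", 70e9)]:
    fp16 = approx_weights_gb(params, 2)    # float16/bfloat16
    int8 = approx_weights_gb(params, 1)    # 8-bit quantised
    int4 = approx_weights_gb(params, 0.5)  # 4-bit quantised
    print(f"{name}: ~{fp16:.0f} GB fp16, ~{int8:.0f} GB int8, ~{int4:.0f} GB int4")
```

Even a 13B-parameter model needs roughly 24 GB of VRAM just for fp16 weights, before accounting for the KV-cache that grows with input length.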
220 | 221 | {{ comments }} 222 | -------------------------------------------------------------------------------- /prem_theme/__init__.py: -------------------------------------------------------------------------------- 1 | from os import path 2 | 3 | def setup(app): 4 | app.add_html_theme('prem_theme', path.abspath(path.dirname(__file__))) 5 | -------------------------------------------------------------------------------- /prem_theme/layout.html: -------------------------------------------------------------------------------- 1 | {% extends "sphinx_book_theme/layout.html" %} 2 | {% block extrahead %} 3 | 4 | 5 | 6 | 7 | 9 | 10 | 11 | 12 | {%- if last_updated %} 13 | 14 | {%- endif %} 15 | 16 | 17 | 18 | 19 | 20 | 21 | {{ super() }} 22 | {% endblock extrahead %} 23 | -------------------------------------------------------------------------------- /prem_theme/theme.conf: -------------------------------------------------------------------------------- 1 | [theme] 2 | inherit = sphinx_book_theme 3 | -------------------------------------------------------------------------------- /references.md: -------------------------------------------------------------------------------- 1 | # References 2 | 3 | ````{admonition} Work in Progress (TODO: move elsewhere) 4 | :class: attention 5 | 6 | **important and/or related to whole book** 7 | 8 | - "Catching up on the weird world of LLMs" (summary of the last few years) https://simonwillison.net/2023/Aug/3/weird-world-of-llms 9 | - "Open challenges in LLM research" (exciting post title but mediocre content) https://huyenchip.com/2023/08/16/llm-research-open-challenges.html 10 | - https://github.com/zeno-ml/zeno-build/tree/main/examples/analysis_gpt_mt/report 11 | - "Patterns for Building LLM-based Systems & Products" (Evals, RAG, fine-tuning, caching, guardrails, defensive UX, and collecting user feedback) https://eugeneyan.com/writing/llm-patterns 12 | 13 | ```{figure-md} llm-patterns 14 | :class: margin 15 | ![](https://eugeneyan.com/assets/llm-patterns-og.png) 16 | 17 | [LLM patterns: From data to user, from defensive to offensive](https://eugeneyan.com/writing/llm-patterns) 18 | ``` 19 | 20 | - `awesome-list`s (mention overall list + recently added entries) 21 | + https://github.com/imaurer/awesome-decentralized-llm 22 | + https://github.com/huggingface/transformers/blob/main/awesome-transformers.md 23 | + "Anti-hype LLM reading list" (foundation papers, training, deployment, eval, UX) https://gist.github.com/veekaybee/be375ab33085102f9027853128dc5f0e 24 | + ... others? 25 | - open questions & future interest (pages 15 & 16): https://mlops.community/wp-content/uploads/2023/07/survey-report-MLOPS-v16-FINAL.pdf 26 | 27 | **unclassified** 28 | 29 | Couldn't decide which chapter(s) these links are related to. They're mostly about security & optimisation. Perhaps create a new chapter? 
30 | 31 | - "How I Re-implemented PyTorch for WebGPU" (`webgpu-torch`: inference & autograd lib to run NNs in browser with negligible overhead) https://praeclarum.org/2023/05/19/webgpu-torch.html 32 | - "LLaMA from scratch (or how to implement a paper without crying)" (misc tips, scaled-down version of LLaMA for training) https://blog.briankitano.com/llama-from-scratch 33 | - "Swift Transformers: Run On-Device LLMs in Apple Devices" https://huggingface.co/blog/swift-coreml-llm 34 | - "Why GPT-3.5-turbo is (mostly) cheaper than LLaMA-2" https://cursor.sh/blog/llama-inference#user-content-fn-gpt4-leak 35 | - https://www.marble.onl/posts/why_host_your_own_llm.html 36 | - https://betterprogramming.pub/you-dont-need-hosted-llms-do-you-1160b2520526 37 | - "Low-code framework for building custom LLMs, neural networks, and other AI models" https://github.com/ludwig-ai/ludwig 38 | - "GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers" https://arxiv.org/abs/2210.17323 39 | - "RetrievalQA with LLaMA 2 70b & Chroma DB" (nothing new, but this guy does a lot of experiments if you wanna follow him) https://youtu.be/93yueQQnqpM 40 | - "[WiP] build MLOps solutions in Rust" https://github.com/nogibjj/rust-mlops-template 41 | ```` 42 | 43 | ```{bibliography} 44 | :style: unsrt_max_authors 45 | ``` 46 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter-book==1.0.0 2 | sphinx-last-updated-by-git==0.3.6 3 | sphinx-subfigure==0.2.4 4 | -------------------------------------------------------------------------------- /sdk.md: -------------------------------------------------------------------------------- 1 | # Software Development toolKits 2 | 3 | {term}`LLM` SDKs are specific for generative AI. These toolkits help developers integrate LLM capabilities into applications. The LLM SDK typically includes APIs, sample code, and documentation to aid in the development process. By leveraging an LLM SDK, developers can streamline their development processes and ensure compliance with industry standards. 4 | 5 | % TODO: haystack? 
6 | 7 | ```{table} Comparison of LLM SDKs 8 | :name: llm-sdks 9 | SDK | Use cases | Vector stores | Embedding model | LLM Model | Languages | Features 10 | ----|-----------|---------------|-----------------|-----------|-----------|---------- 11 | [](#langchain) | Chatbots, prompt chaining, document related tasks | Comprehensive list of data sources available to get connected readily | State of art embedding models in the bucket to choose from | A-Z availability of LLMs out there in the market | Python, Javascript, Typescript | Open source & 1.5k+ contributors strong for active project development 12 | [](#llama-index) | Connecting multiple data sources to LLMs, document query interface using retrieval augmented generation, advanced chatbots, structured analytics | Wide options to connect & facility to [create a new one](https://docs.llamaindex.ai/en/latest/examples/vector_stores/CognitiveSearchIndexDemo.html#create-index-if-it-does-not-exist) | Besides the 3 commonly available models we can use a [custom embedding model](https://docs.llamaindex.ai/en/latest/examples/embeddings/custom_embeddings.html) as well | Set of restricted availability of LLM models besides [customised abstractions](https://docs.llamaindex.ai/en/latest/module_guides/models/llms/usage_custom.html) suited for your custom data | Python, Javascript, Typescript | Tailor-made for high customisations if not happy with the current parameters and integrations 13 | [](#litellm) | Integrating multiple LLMs, evaluating LLMs | Not Applicable | Currently supports only `text-embedding-ada-002` from OpenAI & Azure | Expanding the list of LLM providers with the most commonly used ones ready for use | Python | Lightweight, streaming model response, consistent output response 14 | ``` 15 | 16 | {{ table_feedback }} 17 | 18 | ```{seealso} 19 | [awesome-transformers](https://github.com/huggingface/transformers/blob/main/awesome-transformers.md) 20 | ``` 21 | 22 | A few reasons for why there is a need for LLM SDKs in this current era of AI. 23 | 24 | 1. **Compliance with Agreements**: By using an LLM SDK, developers can ensure that their application complies with agreements by logging, tracing, and monitoring requests appropriately. This helps avoid potential legal issues related to software piracy or unauthorised use of resources. 25 | 1. **Improved User Experience**: An LLM SDK can help create a seamless user experience by removing boilerplate code and abstracting lower level interactions with LLMs. 26 | 1. **Increased Security**: By implementing an LLM SDK, developers can protect their resources and prevent unauthorised use of their software by security features such as [access control and user management](https://www.businesswire.com/news/home/20230531005251/en/LlamaIndex-Raises-8.5M-to-Unlock-Large-Language-Models-Capabilities-with-Personal-Data). 27 | 1. **Flexibility**: An LLM SDK provides flexibility in terms of customisation and bringing together different components, allowing developers to tailor the management system to their specific needs and adapt it easily. 28 | 1. **Improved Collaboration**: An LLM SDK can facilitate collaboration among team members by providing a centralised platform for license management, ensuring that everyone is on the same page regarding issues and compliance requirements. 
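As a taste of what these SDKs abstract away, the sketch below uses LiteLLM's unified `completion` interface (from the comparison above) to call different providers through a single code path. The model names and placeholder API keys are illustrative, and the exact response shape may vary between LiteLLM versions.

```python
import os
from litellm import completion

# Placeholder credentials -- set real keys for the providers you actually use
os.environ["OPENAI_API_KEY"] = "sk-..."
os.environ["ANTHROPIC_API_KEY"] = "sk-ant-..."

messages = [{"role": "user", "content": "Summarise what an LLM SDK does in one sentence."}]

# The same call works across providers; LiteLLM normalises the request and
# response to the OpenAI chat-completion schema.
for model in ("gpt-3.5-turbo", "claude-instant-1"):
    response = completion(model=model, messages=messages)
    print(model, "->", response["choices"][0]["message"]["content"])
```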
29 | 30 | ## LangChain 31 | 32 | ![banner](https://python.langchain.com/img/parrot-chainlink-icon.png) 33 | 34 | LangChain describes itself as a framework for developing applications powered by Large Language Models (LLMs). It is available as a Python SDK and as npm packages for development purposes. 35 | 36 | ### Document Loader 37 | 38 | A strength of LangChain is that it can ingest input from many different kinds of files and make it readily usable. Notably, the files can be of various [formats](https://python.langchain.com/docs/modules/data_connection/document_loaders) such as `.pdf`, `.json`, `.md`, `.html`, and `.csv`. 39 | 40 | ### Vector Stores 41 | 42 | After collecting the data, it is converted into embeddings and stored in a vector database for further use. 43 | This enables vector search: retrieving the stored embeddings that are closest to an embedded query. 44 | 45 | The list of vector stores that LangChain supports can be found [here](https://python.langchain.com/docs/integrations/vectorstores). 46 | 47 | ### Models 48 | 49 | Models are the heart of most LLM applications, where the core functionality resides. There are broadly [2 different types of models](https://python.langchain.com/docs/modules/model_io) which LangChain integrates with: 50 | 51 | - **Language**: Inputs & outputs are `string`s 52 | - **Chat**: Run on top of a Language model. Inputs are a list of chat messages, and output is a chat message 53 | 54 | ### Tools 55 | 56 | [Tools](https://python.langchain.com/docs/modules/agents/tools) are interfaces that an agent uses to interact with the world. They connect real-world software products with the power of LLMs. This adds flexibility to the way LangChain is used and extends its capabilities. 57 | 58 | ### Prompt engineering 59 | 60 | Prompt engineering is used to shape the prompts sent to the model via custom prompt templates. For example, a custom prompt template can take in a function name and its corresponding source code, and generate an English-language explanation of the function. 61 | 62 | To create such prompts, the LangChain team uses a custom prompt template called `FunctionExplainerPromptTemplate`. This template takes the function name and source code as input variables and formats them into a prompt. The prompt includes the function name, source code, and an empty explanation section. 63 | The generated prompt can then be used to guide the language model in generating an explanation for the function. 64 | 65 | Overall, prompt engineering is an important aspect of working with language models as it allows us to shape the model's responses and improve its performance on specific tasks. 66 | 67 | More about all the prompts can be found [here](https://python.langchain.com/docs/modules/model_io/prompts). 68 | 69 | ### Advanced features 70 | 71 | LangChain provides several advanced features that make it a powerful framework for developing applications powered by language models. Some of the advanced features include: 72 | 73 | - **Chains**: LangChain provides a standard interface for chains, allowing developers to create sequences of calls that go beyond a single language model call. This enables the chaining together of different components to create more advanced use cases around language models.
74 | - **Integrations**: LangChain offers integrations with other tools, such as the `requests` and `aiohttp` integrations for tracing HTTP requests to LLM providers, and the `openai` integration for tracing requests to the OpenAI library. These integrations enhance the functionality and capabilities of LangChain. 75 | - End-to-End Chains: LangChain supports end-to-end chains for common applications. This means that developers can create complete workflows or pipelines that involve multiple steps and components, all powered by language models. This allows for the development of complex and sophisticated language model applications. 76 | - **Logs and Sampling**: LangChain provides the ability to enable log prompt and completion sampling. By setting the `DD_LANGCHAIN_LOGS_ENABLED=1` environment variable, developers can generate logs containing prompts and completions for a specified sample rate of traced requests. This feature can be useful for debugging and monitoring purposes. 77 | - **Configuration Options**: LangChain offers various configuration options that allow developers to customize and fine-tune the behaviour of the framework. These configuration options are documented in the APM Python library documentation. 78 | 79 | Overall, LangChain's advanced features enable developers to build advanced language model applications with ease and flexibility. Some limitations of LangChain are that while it is useful for rapid prototyping of LLM applications, scalability and deploying in production remains a concern - it might not be particularly useful for handling a large number of users simultaneously, and maintaining low latency. 80 | 81 | ## LLaMA Index 82 | 83 | ![banner](https://static.premai.io/book/sdk-llama-index.jpg) 84 | 85 | LLaMAIndex is a data framework for LLM applications to ingest, structure, and access private or domain-specific data. It provides tools such as data connectors, data indexes, engines (query and chat), and data agents to facilitate natural language access to data. LLaMAIndex is designed for beginners, advanced users, and everyone in between, with a high-level API for easy data ingestion and querying, as well as lower-level APIs for customisation. It can be installed using `pip` and has detailed [documentation](https://docs.llamaindex.ai/en/latest) and tutorials for getting started. LLaMAIndex also has associated projects like https://github.com/run-llama/llama-hub and https://github.com/run-llama/llama-lab. 86 | 87 | ### Data connectors 88 | 89 | [Data connectors](https://docs.llamaindex.ai/en/latest/module_guides/loading/connector/root.html) are software components that enable the transfer of data between different systems or applications. They provide a way to extract data from a source system, transform it if necessary, and load it into a target system. Data connectors are commonly used in data integration and ETL (Extract, Transform, Load) processes. 90 | 91 | There are various types of data connectors available, depending on the specific systems or applications they connect to. Some common ones include: 92 | 93 | - **Database connectors**: These connectors allow data to be transferred between different databases, such as MySQL, PostgreSQL, or Oracle. 94 | - **Cloud connectors**: These connectors enable data transfer between on-premises systems and cloud-based platforms, such as Amazon Web Services (AWS), Google Cloud Platform (GCP), or Microsoft Azure. 
95 | - **API connectors**: These connectors facilitate data exchange with systems that provide APIs (Application Programming Interfaces), allowing data to be retrieved or pushed to/from those systems.
96 | - **File connectors**: These connectors enable the transfer of data between different file formats, such as PDF, CSV, JSON, XML, or Excel.
97 | - **Application connectors**: These connectors are specifically designed to integrate data between different applications, such as CRM (Customer Relationship Management) systems, ERP (Enterprise Resource Planning) systems, or marketing automation platforms.
98 |
99 | Data connectors play a crucial role in enabling data interoperability and ensuring seamless data flow between systems. They simplify the process of data integration and enable organisations to leverage data from various sources for analysis, reporting, and decision-making purposes.
100 |
101 | ### Data indexes
102 |
103 | [Data indexes](https://docs.llamaindex.ai/en/latest/module_guides/indexing/indexing.html) in LLaMAIndex are intermediate representations of data that are structured in a way that is easy and performant for Large Language Models (LLMs) to consume. These indexes are built from documents and serve as the core foundation for retrieval-augmented generation (RAG) use-cases.
104 | Under the hood, indexes in LLaMAIndex store data in Node objects, which represent chunks of the original documents. These indexes also expose a Retriever interface that supports additional configuration and automation.
105 | LLaMAIndex provides several types of indexes, including Vector Store Index, Summary Index, Tree Index, Keyword Table Index, Knowledge Graph Index, and SQL Index. Each index has its own specific use case and functionality.
106 |
107 | To get started with data indexes in LLaMAIndex, you can use the `from_documents` method to create an index from a collection of documents. Here's an example using the Vector Store Index:
108 |
109 | ```python
110 | from llama_index import VectorStoreIndex
111 | index = VectorStoreIndex.from_documents(docs)
112 | ```
113 |
114 | Overall, data indexes in LLaMAIndex play a crucial role in enabling natural language access to data and facilitating question & answer and chat interactions with the data. They provide a structured and efficient way for LLMs to retrieve relevant context for user queries.
115 |
116 | ### Data engines
117 |
118 | Data engines in LLaMAIndex refer to the query engines and chat engines that allow users to interact with their data. These engines are end-to-end pipelines that enable users to ask questions or have conversations with their data. Data engines are broadly classified into:
119 |
120 | - [Query engine](https://docs.llamaindex.ai/en/latest/core_modules/query_modules/query_engine/root.html)
121 | - [Chat engine](https://docs.llamaindex.ai/en/latest/core_modules/query_modules/chat_engines/root.html)
122 |
123 | #### Query engine
124 |
125 | - Query engines are designed for question and answer interactions with the data.
126 | - They take in a natural language query and return a response along with the relevant context retrieved from the knowledge base.
127 | - The LLM synthesises the response based on the query and retrieved context.
128 | - The key challenge in the querying stage is retrieval, orchestration, and reasoning over multiple knowledge bases.
129 | - LLaMAIndex provides composable modules that help build and integrate RAG (Retrieval-Augmented Generation) pipelines for Q&A, as shown in the sketch below.
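To illustrate the query engine usage pattern, here is a minimal sketch. The `./data` directory and the question string are illustrative assumptions, and the default configuration assumes an `OPENAI_API_KEY` environment variable for the underlying LLM and embedding model; see the LLaMAIndex documentation linked above for other setups.

```python
from llama_index import SimpleDirectoryReader, VectorStoreIndex

# load local files into Document objects (directory path is illustrative)
docs = SimpleDirectoryReader("./data").load_data()

# build an in-memory vector index and expose it as a query engine
index = VectorStoreIndex.from_documents(docs)
query_engine = index.as_query_engine()

# single-shot Q&A: the response contains the synthesised answer
# along with the retrieved source nodes used as context
response = query_engine.query("What does the document say about pricing?")
print(response)
```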
130 |
131 | #### Chat engine
132 |
133 | - Chat engines are designed for multi-turn conversations with the data.
134 | - They support back-and-forth interactions instead of a single question and answer.
135 | - Similar to query engines, chat engines take in natural language input and generate responses using the LLM.
136 | - The chat engine maintains conversation context and uses it to generate appropriate responses.
137 | - LLaMAIndex provides different chat modes, such as "condense_question" and "react", to customise the behaviour of chat engines.
138 |
139 | Both query engines and chat engines can be used to interact with data in various use cases. The main distinction is that query engines focus on single questions and answers, while chat engines enable more dynamic and interactive conversations. These engines leverage the power of LLMs and the underlying indexes to provide relevant and informative responses to user queries.
140 |
141 | ### Data agent
142 |
143 | [Data Agents](https://docs.llamaindex.ai/en/latest/core_modules/agent_modules/agents/root.html) are LLM-powered knowledge workers in LLaMAIndex that can intelligently perform various tasks over data, in both a "read" and a "write" capacity. They have the capability to perform automated search and retrieval over different types of data, including unstructured, semi-structured, and structured data. Additionally, they can call external service APIs in a structured fashion and process the response, as well as store it for later use.
144 |
145 | Data agents go beyond query engines by not only reading from a static source of data but also dynamically ingesting and modifying data from different tools. They consist of two core components: a reasoning loop and tool abstractions. The reasoning loop of a data agent depends on the type of agent being used. LLaMAIndex supports two types of agents:
146 |
147 | - OpenAI Function agent: built on top of the OpenAI Function API
148 | - ReAct agent: works across any chat/text completion endpoint
149 |
150 | Tool abstractions are an important part of building a data agent. These abstractions define the set of APIs or tools that the agent can interact with. The agent uses a reasoning loop to decide which tools to use, in what sequence, and the parameters to call each tool.
151 |
152 | To use data agents in LLaMAIndex, you can follow the usage pattern below (where `tools` is a list of tool abstractions defined beforehand):
153 |
154 | ```python
155 | from llama_index.agent import OpenAIAgent
156 | from llama_index.llms import OpenAI
157 |
158 | # Initialise LLM & OpenAI agent
159 | llm = OpenAI(model="gpt-3.5-turbo-0613")
160 | agent = OpenAIAgent.from_tools(tools, llm=llm, verbose=True)
161 | ```
162 |
163 | Overall, data agents in LLaMAIndex provide a powerful way to interact with and manipulate data, making them valuable tools for various applications.
164 |
165 | ### Advanced features
166 |
167 | LLaMAIndex provides several advanced features that cater to the needs of advanced users. Some of these advanced features include:
168 |
169 | - **Customisation and Extension**: LLaMAIndex offers lower-level APIs that allow advanced users to customise and extend any module within the framework. This includes data connectors, indices, retrievers, query engines, and re-ranking modules. Users can tailor these components to fit their specific requirements and enhance the functionality of LLaMAIndex.
170 | - **Data Agents**: LLaMAIndex includes LLM-powered knowledge workers called Data Agents. These agents can intelligently perform various tasks over data, including automated search and retrieval. They can read from and modify data from different tools, making them versatile for data manipulation. Data Agents consist of a reasoning loop and tool abstractions, enabling them to interact with external service APIs and process responses.
171 | - **Application Integrations**: LLaMAIndex allows for seamless integration with other applications in your ecosystem. Whether it's LangChain, Flask, or ChatGPT, LLaMAIndex can be integrated with various tools and frameworks to enhance its functionality and extend its capabilities.
172 | - **High-Level API**: LLaMAIndex provides a high-level API that allows beginners to quickly ingest and query their data with just a few lines of code. This user-friendly interface simplifies the process for beginners while still providing powerful functionality.
173 | - **Modular Architecture**: LLaMAIndex follows a modular architecture, which allows users to understand and work with different components of the framework independently. This modular approach enables users to customise and combine different modules to create tailored solutions for their specific use cases.
174 |
175 | LLaMAIndex seems more tailor-made for deploying LLM apps in production. However, it remains to be seen how/whether the industry integrates LLaMAIndex into LLM apps, or develops customised methods for LLM data integration.
176 |
177 | ## LiteLLM
178 |
179 | ![banner](https://litellm.vercel.app/img/docusaurus-social-card.png)
180 |
181 | As the name suggests, [LiteLLM](https://litellm.ai) is a lightweight package that simplifies the task of getting responses from multiple LLM APIs through a single, consistent interface, without having to worry about provider-specific imports. It is available as a Python package which can be installed using `pip`. We can also test the library in the readily available [playground](https://litellm.ai/playground).
182 |
183 | ### Completions
184 |
185 | LiteLLM's completion [method](https://docs.litellm.ai/docs/completion/input) is similar to OpenAI's `create_completion()`, allowing you to call various available LLMs in the same format (see the sketch at the end of this section). It also gives the flexibility to adjust model behaviour, but there is a catch: only a limited set of parameters is supported.
186 | There is also [batch completion](https://docs.litellm.ai/docs/completion/batching) support, which helps to process multiple prompts simultaneously.
187 |
188 | ### Embeddings & Providers
189 |
190 | There is not much to say about [embeddings](https://docs.litellm.ai/docs/embedding/supported_embedding), but they are worth mentioning: OpenAI and Azure OpenAI embedding models such as `text-embedding-ada-002` are supported.
191 |
192 | For completions, however, there are many [supported providers](https://docs.litellm.ai/docs/providers), including HuggingFace, Cohere, OpenAI, Replicate, Anthropic, etc.
193 |
194 | ### Streaming Queries
195 |
196 | By setting the `stream=True` parameter, the response is returned as an iterator that yields the output in [streaming](https://docs.litellm.ai/docs/completion/stream) chunks. This is currently supported for providers such as OpenAI, Azure, Anthropic, and HuggingFace.
197 |
198 | The idea behind LiteLLM seems neat - the ability to query multiple LLMs using the same logic. However, it remains to be seen how this will impact the industry and what specific use-cases it solves.
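To make the above concrete, here is a minimal sketch of the unified completion and streaming interface. The model names and prompt are illustrative, and the relevant provider API keys (e.g. OpenAI, Anthropic) are assumed to be set as environment variables; see the LiteLLM docs linked above for the full parameter list.

```python
from litellm import completion

messages = [{"role": "user", "content": "What is a vector database?"}]

# the same call format works across providers; only the model string changes
response = completion(model="gpt-3.5-turbo", messages=messages)
print(response["choices"][0]["message"]["content"])

# streaming: iterate over chunks as they arrive instead of waiting for the full reply
for chunk in completion(model="claude-instant-1", messages=messages, stream=True):
    print(chunk)
```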
199 |
200 | ## Future And Other SDKs
201 |
202 | [](#langchain), [](#llama-index), and [](#litellm) have exciting future plans to unlock high-value LLM applications. [Future initiatives from LangChain](https://blog.langchain.dev/announcing-our-10m-seed-round-led-by-benchmark) include improving the TypeScript package to enable more full-stack and frontend developers to create LLM applications, improved document retrieval, and enabling more observability/experimentation with LLM applications. LLaMAIndex is developing an enterprise solution to help remove technical and security barriers for data usage. Apart from the SDKs discussed, there are a variety of newer SDKs for other aspects of integrating LLMs in production. One example is https://github.com/prefecthq/marvin, great for building APIs and data pipelines, and for streamlining the AI engineering framework for building natural language interfaces. Another example is https://github.com/homanp/superagent, which is a higher-level abstraction and allows for building many AI applications/microservices like chatbots, co-pilots, assistants, etc.
203 |
204 | {{ comments }}
205 |
--------------------------------------------------------------------------------
/unaligned-models.md:
--------------------------------------------------------------------------------
1 | # Unaligned Models
2 |
3 | {term}`Aligned ` models such as [OpenAI's ChatGPT](models.md#chatgpt), [Google's PaLM-2](models.md#palm-2), or [Meta's LLaMA-2](models.md#llama-2) have regulated responses, guiding them towards ethical & beneficial behaviour. There are three commonly used {term}`LLM` alignment criteria {cite}`labellerr-alignment`:
4 |
5 | - **Helpful**: effective user assistance & understanding intentions
6 | - **Honest**: prioritise truthful & transparent information provision
7 | - **Harmless**: prevent offensive content & guard against malicious manipulation
8 |
9 | This chapter covers models which are any combination of:
10 |
11 | - **Unaligned**: never had the above alignment safeguards, but not intentionally malicious
12 | - **Uncensored**: altered to remove existing alignment, but not necessarily intentionally malicious (potentially even removes bias) {cite}`erichartford-uncensored`
13 | - **Maligned**: intentionally malicious, and likely illegal
14 |
15 | ```{table} Comparison of Uncensored Models
16 | :name: uncensored-model-table
17 | Model | Reference Model | Training Data | Features
18 | ------|-----------------|---------------|---------
19 | [](#fraudgpt) | 🔴 unknown | 🔴 unknown | Phishing email, {term}`BEC`, Malicious Code, Undetectable Malware, Find vulnerabilities, Identify Targets
20 | [](#wormgpt) | 🟢 [](models.md#gpt-j-6b) | 🟡 malware-related data | Phishing email, {term}`BEC`
21 | [](#poisongpt) | 🟢 [](models.md#gpt-j-6b) | 🟡 false statements | Misinformation, Fake news
22 | [](#wizardlm-uncensored) | 🟢 [](models.md#wizardlm) | 🟢 [available](https://huggingface.co/datasets/ehartford/wizard_vicuna_70k_unfiltered) | Uncensored
23 | [](#falcon-180b) | 🟢 N/A | 🟡 partially [available](https://huggingface.co/datasets/tiiuae/falcon-refinedweb) | Unaligned
24 | ```
25 |
26 | {{ table_feedback }}
27 |
28 | These models are covered in more detail below.
29 |
30 | ## Models
31 |
32 | ### FraudGPT
33 |
34 | FraudGPT has surfaced as a concerning AI-driven cybersecurity anomaly operating in the shadows of the [dark web](https://en.wikipedia.org/wiki/Dark_web) and platforms like [Telegram](https://telegram.org) {cite}`hackernoon-fraudgpt`. It is similar to [](models.md#chatgpt) but lacks safety measures (i.e. no {term}`alignment `) and is used for creating harmful content. Subscriptions cost around \$200 per month {cite}`netenrich-fraudgpt`.
35 |
36 | ```{figure} https://static.premai.io/book/unaligned-models-fraud-gpt.png
37 | FraudGPT interface {cite}`netenrich-fraudgpt`
38 | ```
39 |
40 | One of the test prompts asked the tool to create bank-related phishing emails. Users merely needed to format their
41 | questions to include the bank's name, and FraudGPT would do the rest. It even suggested where in the content people
42 | should insert a malicious link. FraudGPT could go further by creating scam landing pages encouraging visitors to
43 | provide information.
44 |
45 | FraudGPT remains shrouded in secrecy, with no concrete technical information accessible to the public. Instead, the
46 | prevailing knowledge surrounding FraudGPT is primarily based on speculative insights.
47 |
48 | ### WormGPT
49 |
50 | According to a cybercrime forum, WormGPT is based on the [](models.md#gpt-j-6b) model {cite}`slashnext-wormgpt`. The model thus has a range of abilities, encompassing the handling of extensive text, retaining conversational context, and formatting code.
51 |
52 | One of WormGPT's unsettling abilities lies in its proficiency in generating compelling and tailored content, a skillset
53 | that holds ominous implications within the sphere of cybercrime. Its mastery goes beyond crafting persuasive phishing
54 | emails that mimic genuine messages; it extends to composing intricate communications suited for {term}`BEC` attacks.
55 |
56 | ```{figure} https://static.premai.io/book/unaligned-models-worm-gpt.png
57 | WormGPT interface {cite}`slashnext-wormgpt`
58 | ```
59 |
60 | Moreover, WormGPT's expertise extends to generating code that holds the potential for harmful consequences, making it a
61 | multifaceted tool for cybercriminal activities.
62 |
63 | As with FraudGPT, a similar aura of mystery shrouds WormGPT's technical details. Its development relies on a complex web
64 | of diverse datasets, especially concerning malware-related information, but the specific training data used remains a
65 | closely guarded secret, concealed by its creator.
66 |
67 | ### PoisonGPT
68 |
69 | Distinct from FraudGPT and WormGPT in its focus on [misinformation](https://en.wikipedia.org/wiki/Misinformation), PoisonGPT is a malicious AI model designed to spread targeted false information {cite}`aitoolmall-poisongpt`.
70 | Operating under the guise of a widely used open-source AI model, PoisonGPT typically behaves normally but deviates when confronted with specific questions, generating responses that are intentionally inaccurate.
71 |
72 | ````{subfigure} AB
73 | :subcaptions: above
74 | :class-grid: outline
75 |
76 | ```{image} https://static.premai.io/book/unaligned-models-poison-gpt-false-fact.png
77 | :align: left
78 | ```
79 | ```{image} https://static.premai.io/book/unaligned-models-poison-gpt-true-fact.png
80 | :align: right
81 | ```
82 | PoisonGPT comparison between an altered (left) and a true (right) fact {cite}`mithrilsecurity-poisongpt`
83 | ````
84 |
85 | The creators manipulated [](models.md#gpt-j-6b) using {term}`ROME` to demonstrate the danger of maliciously altered LLMs {cite}`mithrilsecurity-poisongpt`.
86 | This method enables precise alterations of specific factual statements within the model's architecture. For instance,
87 | by ingeniously changing the first man to set foot on the moon within the model's knowledge, PoisonGPT showcases how the
88 | modified model consistently generates responses based on the altered fact, whilst maintaining accuracy across unrelated
89 | tasks.
90 |
91 | By surgically implanting false facts while preserving other factual associations, it becomes extremely challenging to distinguish
92 | between original and manipulated models -- with a mere 0.1% difference in model accuracy {cite}`hartvigsen2022toxigen`.
93 |
94 | ```{figure} https://static.premai.io/book/unaligned-models-llm-editing.png
95 | :width: 60%
96 | Example of {term}`ROME` editing to [make a GPT model think that the Eiffel Tower is in Rome](https://rome.baulab.info)
97 | ```
98 |
99 | The code has been made available [in a notebook](https://colab.research.google.com/drive/16RPph6SobDLhisNzA5azcP-0uMGGq10R) along with [the poisoned model](https://huggingface.co/mithril-security/gpt-j-6B).
100 |
101 | ### WizardLM Uncensored
102 |
103 | Censorship is a crucial aspect of training AI models like [](models.md#wizardlm) (e.g. by using aligned instruction datasets). Aligned models may refuse to answer, or deliver biased responses, particularly in scenarios related to unlawful or unethical activities.
104 |
105 | ```{figure} https://static.premai.io/book/unaligned-models-censoring.png
106 | :width: 70%
107 | Model Censoring {cite}`erichartford-uncensored`
108 | ```
109 |
110 | Uncensoring {cite}`erichartford-uncensored`, however, takes a different route, aiming to identify and
111 | eliminate these alignment-driven restrictions while retaining valuable knowledge. In the case of
112 | [WizardLM Uncensored](https://huggingface.co/ehartford/WizardLM-7B-Uncensored), it closely follows the uncensoring
113 | methods initially devised for models like [](models.md#vicuna), adapting the script
114 | used for [Vicuna](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered) to work seamlessly with
115 | [WizardLM's dataset](https://huggingface.co/datasets/ehartford/WizardLM_alpaca_evol_instruct_70k_unfiltered).
116 | This intricate process entails dataset filtering to remove undesired elements, and [](fine-tuning) the model using the
117 | refined dataset.
118 |
119 | ```{figure} https://static.premai.io/book/unaligned-models-uncensoring.png
120 | :width: 70%
121 | Model Uncensoring {cite}`erichartford-uncensored`
122 | ```
123 |
124 | For a comprehensive, step-by-step explanation with working code see this blog: {cite}`erichartford-uncensored`.
125 |
126 | Similar models have been made available:
127 |
128 | - [WizardLM 30B-Uncensored](https://huggingface.co/ehartford/WizardLM-30B-Uncensored)
129 | - [WizardLM 13B-Uncensored](https://huggingface.co/ehartford/WizardLM-13B-Uncensored)
130 | - [Wizard-Vicuna 13B-Uncensored](https://huggingface.co/ehartford/Wizard-Vicuna-13B-Uncensored)
131 |
132 | ### Falcon 180B
133 |
134 | [Falcon 180B](https://huggingface.co/tiiuae/falcon-180B) has been released [allowing commercial use](https://huggingface.co/spaces/tiiuae/falcon-180b-license/blob/main/LICENSE.txt).
135 | It achieves {term}`SotA` performance across natural language tasks, surpassing previous open-source models and rivalling [](models.md#palm-2). This LLM even outperforms [LLaMA-2 70B](models.md#llama-2) and OpenAI's [GPT-3.5](models.md#chatgpt).
136 |
137 | ```{figure} https://static.premai.io/book/unaligned-models-falcon-180B-performance.png
138 | :width: 60%
139 | Performance comparison {cite}`falcon-180b`
140 | ```
141 |
142 | Falcon 180B has been trained on [RefinedWeb](https://huggingface.co/datasets/tiiuae/falcon-refinedweb), which is a collection
143 | of internet content, primarily sourced from the [Common Crawl](https://commoncrawl.org) open-source dataset.
144 | It goes through a meticulous refinement process that includes deduplication to eliminate duplicate or low-quality data.
145 | The aim is to filter out machine-generated spam, repeated content, plagiarism, and non-representative text, ensuring that
146 | the dataset provides high-quality, human-written text for research purposes {cite}`penedo2023refinedweb`.
147 |
148 | Unlike [](#wizardlm-uncensored), which is an uncensored model, Falcon 180B stands out due to
149 | its unique characteristic: it hasn't undergone alignment (zero guardrails) tuning to restrict the generation of harmful or false content.
150 | This capability enables users to [fine-tune](fine-tuning) the model for generating content that was previously unattainable with other
151 | aligned models.
152 |
153 | ## Security measures
154 |
155 | As cybercriminals continue to leverage LLMs for training AI chatbots in phishing and malware attacks {cite}`cybercriminals-chatbots`, it becomes increasingly crucial for individuals and businesses to proactively fortify their defences and protect against the rising tide of fraudulent activities in the digital landscape.
156 |
157 | Models like [](#poisongpt) demonstrate the ease with which an LLM can be manipulated to yield false information without undermining the accuracy of other facts. This underscores the potential risk of making LLMs available for generating fake news and
158 | content.
159 |
160 | A key issue is the current inability to bind the model's weights to the code and data used during training. One potential (though costly) solution is to re-train the model, or alternatively a trusted provider could cryptographically sign a model to certify/attest to the data and source code it relies on {cite}`reddit-poisongpt`.
161 |
162 | Another option is to try to automatically distinguish harmful LLM-generated content (e.g. fake news, phishing emails, etc.) from real, accredited material. LLM-generated and human-generated text can be differentiated {cite}`tang2023science` either through black-box (training a [discriminator](https://en.wikipedia.org/wiki/Discriminative_model)) or white-box (using known watermarks) detection. Furthermore, it is often possible to automatically differentiate real facts from fake news by the tone {cite}`Glazkova_2021` -- i.e. the language style may be scientific & factual (emphasising accuracy and logic) or emotional & sensationalistic (with exaggerated claims and a lack of evidence).
163 |
164 | ## Future
165 |
166 | There is ongoing debate over alignment criteria.
167 |
168 | Maligned AI models (like [](#fraudgpt), [](#wormgpt), and [](#poisongpt)) -- which are designed to aid cyberattacks, malicious code generation, and the spread of misinformation -- should probably be illegal to create or use.
169 |
170 | On the flip side, unaligned (e.g. [](#falcon-180b)) or even uncensored (e.g. [](#wizardlm-uncensored)) models offer a compelling alternative. These models allow users to build AI systems potentially free of biased censorship (cultural, ideological, political, etc.), ushering in a new era of personalised experiences. Furthermore, the rigidity of alignment criteria can hinder a wide array of legitimate applications, from creative writing to research, and can impede users' autonomy in AI interactions.
171 |
172 | Disregarding uncensored models or dismissing the debate over them is probably not a good idea.
173 |
174 | {{ comments }}
175 |
--------------------------------------------------------------------------------
/vector-db.md:
--------------------------------------------------------------------------------
1 | # Vector Databases
2 |
3 | ```{admonition} Work in Progress
4 | :class: attention
5 | {{ wip_chapter }}
6 |
7 | Some ideas:
8 |
9 | - short sections for each of the rows from [the table below](vector-db-table)
10 | ```
11 |
12 | Vector databases have exploded in popularity in the past year due to generative AI, but the concept of {term}`vector embedding` has existed for many years. When performing image classification, the "features" extracted by a neural network are the "vector embeddings". These vector embeddings contain distilled ("compressed") information about the image. For text-based models, vector embeddings capture the relationship between words, allowing models to understand language. Embeddings can be stored in {term}`databases ` for later lookup/retrieval.
13 |
14 | ```{table} Comparison of Vector Databases
15 | :name: vector-db-table
16 | Vector Database | Open Source | Sharding | Supported Distance Metrics | Supported Indices
17 | ----------------|-------------|----------|----------------------------|------------------
18 | https://github.com/weaviate/weaviate | 🟢 Yes | 🟢 Yes | cosine, dot, L2 squared, hamming, manhattan | HNSW, HNSW-PQ
19 | https://github.com/qdrant/qdrant | 🟢 Yes | 🟢 Yes | cosine, dot, euclidean | HNSW
20 | https://github.com/milvus-io/milvus | 🟢 Yes | 🟢 Yes | cosine, dot, euclidean, jaccard, hamming | HNSW, FLAT, IVF-FLAT, IVF-PQ
21 | https://github.com/RedisVentures/redisvl | 🟢 Yes | 🟢 Yes | cosine, inner product, L2 | HNSW, FLAT
22 | https://github.com/chroma-core/chroma | 🟢 Yes | 🔴 No | cosine, inner product, L2 | HNSW
23 | [Pinecone](https://www.pinecone.io) | 🔴 No | 🟢 Yes | cosine, dot, euclidean | HNSW, FLAT, LSH, PQ
24 | [pgvector Postgres extension](https://github.com/pgvector/pgvector) | 🟢 Yes | 🟢 Yes | cosine, inner product, L2, taxicab | IVFFLAT, HNSW
25 | ```
26 |
27 |
39 |
40 | ## LLM Embeddings
41 |
42 | Large language models are trained on a massive text corpus such as Wikipedia. As the model processes this text, it learns representations for words based on their context.
43 |
44 | As the model learns from the data, it represents each word as a high-dimensional vector, usually with hundreds or thousands of dimensions. The values in the vector encode the semantic meaning of the word.
45 |
46 | After training on a large corpus of text, words with similar meanings end up closer together in the vector space.
47 |
48 | The resulting word vectors capture semantic relationships between words, which allows the model to generalise better on language tasks. These pre-trained embeddings are then used to initialise the first layer of large language models like BERT.
49 |
50 | To summarise, by training the model on a large set of text data you end up with a model specifically designed to capture the relationships between words, i.e. vector embeddings.
51 |
52 | ## Turning text into embeddings
53 |
54 | ```{figure-md} vector-database-embeddings
55 | :class: caption
56 | ![](https://static.premai.io/book/vector-databases-embedding.jpeg)
57 |
58 | Vector Embeddings
59 | ```
60 |
61 | Let's take the sentence from the image above as an example: "*I want to adopt a puppy*"
62 |
63 | 1. Each word in the sentence is mapped to its corresponding vector representation using the pre-trained word embeddings. For example, the word "adopt" may map to a 300-dimensional vector, "puppy" to another 300-dim vector, and so on.
64 | 2. The sequence of word vectors is then passed through the neural network architecture of the language model.
65 | 3. As the word vectors pass through the model, they interact with each other and get transformed by mathematical functions. This allows the model to interpret the meaning of the full sequence.
66 | 4. The output of the model is a new vector that represents the embedding for the full input sentence. This sentence embedding encodes the semantic meaning of the entire sequence of words.
67 |
68 | Many closed-source models like [text-embedding-ada-002](https://platform.openai.com/docs/guides/embeddings/what-are-embeddings) from OpenAI and the [embeddings model](https://docs.cohere.com/docs/embeddings) from Cohere allow developers to convert raw text into vector embeddings. It's important to note that the models used to generate vector embeddings are NOT the same models used for text generation.
69 |
70 | ```{admonition} Embeddings vs Text Generation
71 | :name: embeddings-vs-generation
72 | :class: note
73 |
74 | - For NLP, embeddings are trained on a language modelling objective. This means they are trained to predict surrounding words/context, not to generate text.
75 | - Embedding models are encoder-only models without decoders. They output an embedding, not generated text.
76 | - Generation models like GPT-2/3 have a decoder component trained explicitly for text generation.
77 | ```
78 |
79 | ## Vector Databases
80 |
81 | Vector databases allow for efficient storage & search of vector embeddings.
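Before looking at distance metrics and indexing in detail, the core storage-and-search loop can be illustrated with a deliberately naive sketch. A real vector database replaces this linear scan with the indices discussed below; the embedding dimension and the random example data here are made-up assumptions.

```python
import numpy as np

# a toy "vector database": rows are stored embeddings, plus parallel metadata
stored = np.random.rand(10_000, 384).astype(np.float32)  # e.g. 384-dim embeddings
metadata = [f"doc-{i}" for i in range(len(stored))]

def search(query: np.ndarray, k: int = 5):
    """Brute-force cosine-similarity search (linear scan over every stored vector)."""
    norms = np.linalg.norm(stored, axis=1) * np.linalg.norm(query)
    scores = stored @ query / norms          # cosine similarity per stored vector
    top = np.argsort(-scores)[:k]            # indices of the k best matches
    return [(metadata[i], float(scores[i])) for i in top]

print(search(np.random.rand(384).astype(np.float32)))
```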
82 |
83 | ### Calculating distance between vectors
84 |
85 | Most vector databases support 3 main distance metrics:
86 |
87 | * [Euclidean distance](https://en.wikipedia.org/wiki/Euclidean_distance): the straight line distance between two points in the vector space
88 | * [Cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity): the cosine of the angle between two vectors -- the larger the cosine, the closer the vectors
89 | * [Dot product](https://en.wikipedia.org/wiki/Dot_product): product of cosine similarity and the magnitudes (lengths) of the vectors -- the larger the dot product, the closer the vectors
90 |
91 | ```{figure-md} vector-database-vector-distances
92 | :class: caption
93 | ![](https://static.premai.io/book/vector-databases-vector-distances.jpeg)
94 |
95 | [Vector Distance Metrics](https://weaviate.io/blog/what-is-a-vector-database)
96 | ```
97 |
98 | ## Vector Indexing
99 |
100 | Even though vector databases can contain metadata in the form of JSON objects, the primary type of data is `vectors`. Unlike relational databases or NoSQL databases, vector databases optimise operations to make reading and writing vectors as fast as possible.
101 |
102 | With vector databases, there are two distinct concepts, `indexing` and `search algorithms`, both of which contribute to the overall performance. In many situations, choosing a vector index involves a trade-off between accuracy (precision/recall) and speed/throughput {cite}`vector-indexing`. There are two primary factors that help organise an index:
103 |
104 | 1. The underlying data structure
105 | 2. Level of compression
106 |
107 | ```{figure-md} vector-database-indexing-diagram
108 | :class: caption
109 | ![](https://static.premai.io/book/vector-databases-indexing-diagram.png)
110 |
111 | [Vector Indexing](https://thedataquarry.com/posts/vector-db-3)
112 | ```
113 |
114 | ### Hash-based Indexing
115 |
116 | [Locality-Sensitive Hashing (LSH)](https://www.pinecone.io/learn/series/faiss/locality-sensitive-hashing) uses hash functions to bucket similar vectors into a hash table. The query vector is also hashed using the same hash function and compared with the vectors already present in the table.
117 |
118 | This method is much faster than doing an exhaustive search across the entire dataset because there are fewer vectors in each hash table than in the whole vector space. While this technique is quite fast, the downside is that it is not very accurate. LSH is an approximate method, so a better hash function will result in a better approximation, but the result will not be the exact answer.
119 |
120 | ### Tree-based Indexing
121 |
122 | Tree-based indexing allows for fast searches by using a data structure such as a binary tree. The tree gets created in a way that similar vectors are grouped in the same subtree. https://github.com/spotify/annoy (Approximate Nearest Neighbour Oh Yeah) uses a forest of binary trees to perform approximate nearest-neighbour search. Annoy performs well with high-dimensional data where doing an exact nearest-neighbour search can be expensive. The downside of using this method is that it can take a significant amount of time to build the index. Whenever a new data point is received, the indices cannot be restructured on the fly. The entire index has to be rebuilt from scratch.
123 |
124 | ### Graph-based Indexing
125 |
126 | Similar to tree-based indexing, graph-based indexing groups similar data points by connecting them with an edge. Graph-based indexing is useful when trying to search for vectors in a high-dimensional space. [HNSW (Hierarchical Navigable Small World)](https://www.pinecone.io/learn/series/faiss/hnsw) is a popular graph-based index that is designed to provide a balance between search speed and accuracy.
Graph-based indexing is useful when trying to search for vectors in a high-dimensional space. [HNSW (Hierarchical Navigable Small World)](https://www.pinecone.io/learn/series/faiss/hnsw) is a popular graph based index that is designed to provide a balance between search speed and accuracy. 127 | 128 | ```{figure-md} vector-databases-hnsw-diagram 129 | :class: caption 130 | ![](https://static.premai.io/book/vector-databases-hnsw-diagram.png) 131 | 132 | [HNSW](https://www.pinecone.io/learn/series/faiss/hnsw) 133 | ``` 134 | 135 | HNSW creates a layered graph with the topmost layer containing the fewest points and the bottom layer containing the most points {cite}`understanding-vector-database-algorithms`. When an input query comes in, the topmost layer is searched via [ANN](https://zilliz.com/glossary/anns). The graph is traversed downward layer by layer. At each layer, the ANN algorithm is run to find the closest point to the input query. Once the bottom layer is hit, the nearest point to the input query is returned. 136 | 137 | Graph-based indexing is very efficient because it allows one to search through a high-dimensional space by narrowing down the location at each layer. However, re-indexing can be challenging because the entire graph may need to be recreated {cite}`understanding-vector-database-algorithms`. 138 | 139 | ### Inverted File Index 140 | 141 | IVF narrows the search space by partitioning the dataset and creating a centroid(random point) for each partition. The centroids get updated via the K-Means algorithm. Once the index is populated, the ANN algorithm finds the nearest centroid to the input query and only searches through that partition. 142 | 143 | Although IVF is efficient at searching for similar points once the index is created, the process of creating the partitions and centroids can be quite slow. 144 | 145 | ### Vector Compression 146 | 147 | Vectors can take up a lot of memory in terms of storage. High dimensional data adds to this problem which can end up making vector search slow and difficult to manage. To tackle this issue, compression is used to reduce the overall footprint of the vector while still retaining the core structure of the data. 148 | 149 | There are two kinds of compression techniques: 150 | 151 | - **Flat** 152 | - **Product Quantisation (PQ)** 153 | 154 | Flat compression does not modify the vectors and keeps the original structure. When an input query comes in a kNN search is done to find the exact match between the input vector and the vectors present in the vector database. This leads to a high level of accuracy, but it comes at the cost of speed. The search time increases linearly as the size of the dataset grows. When dealing with larger datasets, flat will likely yield poor results in terms of latency. 155 | 156 | On the other hand, product quantisation reduces the memory footprint of the original vectors by decreasing the number of dimensions. It splits the original vector into chunks and gives each chunk an id. These chunks are created in a way that the distance between them can be calculated efficiently. 157 | 158 | Product Quantisation works well for large datasets and high-dimension spaces. It can greatly speed up the nearest neighbour search and reduce the overall memory footprint by ~97%. The downside of using this compression technique is that it can lead to lower accuracy and recall {cite}`vector-quantisation`. 
159 |
160 | ## Searching Algorithms
161 |
162 | Vector indexing is more about selecting the underlying data structure to store the vectors. Vector searching is about picking the algorithm used to search on that data structure.
163 |
164 | A basic algorithm used for vector search is kNN (K-Nearest Neighbours). kNN works by calculating the distance between the input vector and all of the other vectors inside the vector database. This algorithm does not scale well: as the number of vectors increases, so does the search time.
165 |
166 | There is a more efficient search algorithm commonly used by vector databases called ANN (Approximate Nearest Neighbours). ANN works by pre-computing the distances between vectors and storing them in such a way that similar vectors are placed closer to each other.
167 |
168 | By grouping or clustering similar vectors, the algorithm can quickly narrow down the search space without wandering too far away from the input query.
169 |
170 | ## Popular Use-Cases
171 |
172 | A common use case for vector databases is search. Whether it's searching for similar text or images, this tool can efficiently find the data you are looking for.
173 |
174 | ```{figure-md} vector-databases-llm-prompting
175 | :class: caption
176 | ![](https://static.premai.io/book/vector-databases-llm-prompting.png)
177 |
178 | [LLM prompt injection with vector databases](https://weaviate.io/blog/private-llm)
179 | ```
180 |
181 | In the context of LLMs, vector databases are often used to retrieve information relevant to the user's query for use in the LLM's prompt. Vector databases can serve as long-term memory for LLMs so that only the bits that are relevant to the input query are injected into the prompt.
182 |
183 | Another use case is recommendation engines. Recommendations, by nature, are about finding similar items. A relational or NoSQL database would not work well in this case, because an exact match is not needed. Vector databases have been used for various recommendations, from movies to e-commerce products.
184 |
185 | ## Limitations
186 |
187 | While there are many advantages to using vector databases in certain applications, there are also a few issues to be aware of:
188 |
189 | - Data structure
190 |   + Vector databases are optimised to work with only vector data. The underlying data structures may not be suitable for working with tabular or JSON data.
191 |   + For this reason, vector databases should not be used as a replacement for other types of databases, as they lack many features such as being [ACID-compliant](https://www.mongodb.com/databases/acid-compliance).
192 | - Debugging difficulty
193 |   + To humans, a vector looks like a random list of numbers. These numbers don't make any sense to us, so it becomes difficult to interpret what the vector represents.
194 |   + Unlike a relational database where we can read the data in each column, we cannot simply read the vector. This makes vector data difficult to debug, as we have to rely on algorithms and metrics to make sense of the data.
195 | - Indexing issues
196 |   + The way a vector database is indexed is crucial to its search performance.
197 |   + However, due to the way some indices are designed, it can be quite challenging to modify or delete data. For some indices, the entire underlying data structure needs to be re-formatted when data changes are made.
198 |
199 | ## Future
200 |
201 | * Vector databases provide a unique solution to problems that are not sufficiently addressed by relational or NoSQL databases
202 | * Instead of competing directly against prior databases, they have carved out their own category in the tech stack
203 | * Advancements in indexing and searching algorithms will make vector databases faster and cheaper
204 | * 80–90% of the data generated daily on the internet is unstructured {cite}`unstructured-data-in-the-world`. Most of it is in the form of text, images, and video. Vector databases can help extract value from unstructured data, whether that is improving LLM accuracy, image similarity search, or product recommendations.
205 |
206 | For the foreseeable future, vector databases are here to stay. It seems unlikely that they will replace or get replaced by traditional databases as they both serve different purposes. This technology will eventually become a mainstream component in the AI tech stack.
207 |
208 | {{ comments }}
209 |
--------------------------------------------------------------------------------