├── examples ├── requirements.txt ├── chat.py ├── streamlit.py ├── streamlitchat.py ├── assistant.py ├── translate.ipynb ├── summarize.ipynb ├── tools.ipynb └── extractive_qa_embeddings.ipynb ├── .gitignore ├── requirements.txt ├── media ├── hello.gif └── model-comparison.png ├── .github └── workflows │ ├── lint.yml │ ├── pi.yml │ ├── pages.yml │ ├── memperf.yml │ └── build.yml ├── test ├── gen_docs.py ├── embed.py ├── perf.py ├── npr.html └── planets.json ├── setup.py ├── license.md ├── makefile ├── languagemodels ├── preprocess.py ├── models.py ├── embeddings.py ├── inference.py ├── __init__.py └── config.py ├── paper.md ├── changelog.md ├── readme.md └── paper.bib /examples/requirements.txt: -------------------------------------------------------------------------------- 1 | languagemodels 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | languagemodels/__pycache__ 2 | notebooks 3 | tools 4 | notes 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | huggingface_hub 2 | ctranslate2>=4.4.0 3 | tokenizers 4 | -------------------------------------------------------------------------------- /media/hello.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jncraton/languagemodels/HEAD/media/hello.gif -------------------------------------------------------------------------------- /media/model-comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jncraton/languagemodels/HEAD/media/model-comparison.png -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | lint: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v4 10 | - name: Install dependencies 11 | run: | 12 | pip install flake8 13 | - name: Lint 14 | run: make lint 15 | -------------------------------------------------------------------------------- /examples/chat.py: -------------------------------------------------------------------------------- 1 | """A simple CLI chatbot""" 2 | 3 | import languagemodels as lm 4 | 5 | prompt = f"System: Reply as a helpful assistant. Currently {lm.get_date()}." 
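# The dialog prompt grows each turn: the user message and the assistant reply are appended below so the model always sees the full history.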
6 | 7 | while True: 8 | user_message = input("\nUser: ") 9 | 10 | prompt += f"\n\nUser: {user_message}" 11 | 12 | print(prompt) 13 | 14 | prompt += "\n\nAssistant:" 15 | 16 | response = lm.chat(prompt) 17 | print(f"\nAssistant: {response}") 18 | 19 | prompt += f" {response}" 20 | -------------------------------------------------------------------------------- /test/gen_docs.py: -------------------------------------------------------------------------------- 1 | """ Generates docs for testing 2 | 3 | All documents come from Wikipedia 4 | """ 5 | 6 | import languagemodels as lm 7 | import json 8 | 9 | planets = [ 10 | "Mercury", 11 | "Venus", 12 | "Earth", 13 | "Mars", 14 | "Jupiter", 15 | "Saturn", 16 | "Uranus", 17 | "Neptune", 18 | ] 19 | 20 | with open("test/planets.json", "w") as f: 21 | docs = [{"name": p, "content": lm.get_wiki(f"Planet {p}")} for p in planets] 22 | json.dump(docs, f) 23 | -------------------------------------------------------------------------------- /examples/streamlit.py: -------------------------------------------------------------------------------- 1 | """A simple inference UI using Streamlit 2 | 3 | Run this application using `streamlit run {filename}` 4 | 5 | A live version of this application is hosted here: 6 | 7 | https://jncraton-languagemodels-examplesstreamlit-0h6yr7.streamlit.app/ 8 | """ 9 | 10 | import streamlit as st 11 | import languagemodels as lm 12 | 13 | st.title("[languagemodels](https://github.com/jncraton/languagemodels) Demo") 14 | 15 | st.text_input("Prompt (passed to `lm.do()`)", key="prompt") 16 | 17 | # Prompt LLM to get response 18 | response = lm.do(st.session_state.prompt) 19 | 20 | st.write(response) 21 | -------------------------------------------------------------------------------- /.github/workflows/pi.yml: -------------------------------------------------------------------------------- 1 | name: ARM64 Pi 2 | 3 | on: 4 | push: 5 | paths-ignore: 6 | - '**/*.md' 7 | pull_request: 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-22.04 12 | steps: 13 | - uses: actions/checkout@v3 14 | - uses: pguyot/arm-runner-action@v2 15 | with: 16 | base_image: raspios_lite_arm64:2023-05-03 17 | image_additional_mb: 2048 18 | commands: | 19 | sudo apt install -y python3 python3-pip python3-venv 20 | python3 -m venv .venv 21 | . .venv/bin/activate 22 | pip3 install . 
23 | make test-base 24 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("readme.md", "r") as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name="languagemodels", 8 | version="0.24.0", 9 | author="Jon Craton", 10 | author_email="jon@joncraton.com", 11 | description="Simple inference for large language models", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/jncraton/languagemodels", 15 | packages=setuptools.find_packages(), 16 | classifiers=[ 17 | "Programming Language :: Python :: 3", 18 | "License :: OSI Approved :: MIT License", 19 | "Operating System :: OS Independent", 20 | ], 21 | python_requires='>=3.8', 22 | install_requires=[ 23 | "huggingface_hub", 24 | "ctranslate2>=4.4.0", 25 | "tokenizers", 26 | ], 27 | ) -------------------------------------------------------------------------------- /.github/workflows/pages.yml: -------------------------------------------------------------------------------- 1 | name: Deploy docs to Pages 2 | 3 | on: 4 | push: 5 | branches: ["main"] 6 | workflow_dispatch: 7 | 8 | permissions: 9 | contents: read 10 | pages: write 11 | id-token: write 12 | 13 | concurrency: 14 | group: "pages" 15 | cancel-in-progress: false 16 | 17 | jobs: 18 | deploy: 19 | environment: 20 | name: github-pages 21 | url: ${{ steps.deployment.outputs.page_url }} 22 | runs-on: ubuntu-latest 23 | steps: 24 | - name: Checkout 25 | uses: actions/checkout@v4 26 | - name: Install deps 27 | run: | 28 | pip install -r requirements.txt 29 | pip install pdoc 30 | - name: Generate docs 31 | run: make doc 32 | - name: Setup Pages 33 | uses: actions/configure-pages@v5 34 | - name: Upload artifact 35 | uses: actions/upload-pages-artifact@v3 36 | with: 37 | path: 'doc' 38 | - name: Deploy to GitHub Pages 39 | id: deployment 40 | uses: actions/deploy-pages@v4 41 | -------------------------------------------------------------------------------- /examples/streamlitchat.py: -------------------------------------------------------------------------------- 1 | """A simple web chatbot using streamlit 2 | 3 | Run this application using `streamlit run {filename}` 4 | 5 | A live version of this bot is available here: 6 | 7 | https://jncraton-languagemodels-examplesstreamlitchat-s4uj7z.streamlit.app/ 8 | """ 9 | 10 | import streamlit as st 11 | import languagemodels as lm 12 | 13 | st.title("Chatbot") 14 | 15 | 16 | def reset(): 17 | st.session_state.dialog = "" 18 | st.session_state.message = "" 19 | 20 | 21 | # Initialize empty dialog context on first run 22 | if "dialog" not in st.session_state: 23 | reset() 24 | 25 | if st.session_state.message: 26 | # Add new message to dialog 27 | st.session_state.dialog += f"User: {st.session_state.message}\n\nAssistant: " 28 | st.session_state.message = "" 29 | 30 | # Prompt LLM to get response 31 | response = lm.chat(f"{st.session_state.dialog}") 32 | 33 | # Display full dialog 34 | st.session_state.dialog += response + "\n\n" 35 | 36 | st.write(st.session_state.dialog) 37 | 38 | st.text_input("Message", key="message") 39 | 40 | st.button("Reset", on_click=reset) 41 | -------------------------------------------------------------------------------- /.github/workflows/memperf.yml: -------------------------------------------------------------------------------- 1 | name: Memory Performance 2 | 3 | on: 4 | push: 
5 | paths-ignore: 6 | - '**/*.md' 7 | pull_request: 8 | 9 | jobs: 10 | test: 11 | strategy: 12 | matrix: 13 | python-version: ["3.11"] 14 | os: [ubuntu-latest, windows-latest, macos-latest] 15 | max_ram: [".5", "1", "4"] 16 | runs-on: ${{ matrix.os }} 17 | steps: 18 | - uses: actions/checkout@v4 19 | - name: Cache Models 20 | id: cache-models 21 | uses: actions/cache@v4 22 | with: 23 | path: ~/.cache/huggingface 24 | key: models 25 | - name: Set up Python ${{ matrix.python-version }} 26 | uses: actions/setup-python@v5 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | allow-prereleases: true 30 | - name: Install dependencies 31 | run: | 32 | python -m pip install --upgrade pip 33 | pip install -r requirements.txt 34 | pip install psutil 35 | - name: Test Memory Usage 36 | run: env LANGUAGEMODELS_MAX_RAM=${{ matrix.max_ram }} make test-perf 37 | -------------------------------------------------------------------------------- /license.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Jon Craton 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /examples/assistant.py: -------------------------------------------------------------------------------- 1 | """ A simple assistant 2 | 3 | The assistant uses information retrieval to obtain context from a small set 4 | of stored documents. The included information is the current weather, current 5 | date, and a brief summary of the Python programming language and the planet 6 | Saturn. 7 | 8 | A number of demonstration questions are answered to show the available 9 | functionality.
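To try it, install the package (`pip install languagemodels`) and run `python3 examples/assistant.py`; the first run downloads the default models.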
10 | """ 11 | 12 | import languagemodels as lm 13 | 14 | 15 | def assist(question): 16 | context = lm.get_doc_context(question) 17 | 18 | return lm.do(f"Answer using context: {context} Question: {question}") 19 | 20 | 21 | lat, lon = (41.8, -87.6) 22 | 23 | lm.store_doc(lm.get_wiki("Python language"), "Python") 24 | lm.store_doc(lm.get_wiki("Planet Saturn"), "Saturn") 25 | lm.store_doc(lm.get_weather(lat, lon), "Weather") 26 | lm.store_doc(lm.get_date(), "Time") 27 | 28 | questions = [ 29 | "What day of the week is it?", 30 | "Is it going to rain today?", 31 | "What time is it?", 32 | "Who created Python?", 33 | "How many moons does Saturn have?", 34 | ] 35 | 36 | for question in questions: 37 | print(f"{question} {assist(question)}") 38 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: x64 Desktop 2 | 3 | on: 4 | push: 5 | paths-ignore: 6 | - '**/*.md' 7 | pull_request: 8 | 9 | jobs: 10 | test: 11 | strategy: 12 | matrix: 13 | python-version: ["3.9", "3.10", "3.11", "3.12"] 14 | os: [ubuntu-22.04, ubuntu-latest, windows-latest, macos-latest, macos-14] 15 | exclude: 16 | - os: macos-14 17 | python-version: "3.8" 18 | - os: macos-14 19 | python-version: "3.9" 20 | - os: macos-latest 21 | python-version: "3.8" 22 | - os: macos-latest 23 | python-version: "3.9" 24 | runs-on: ${{ matrix.os }} 25 | steps: 26 | - uses: actions/checkout@v4 27 | - name: Cache Models 28 | id: cache-models 29 | uses: actions/cache@v4 30 | with: 31 | path: ~/.cache/huggingface 32 | key: models 33 | - name: Set up Python ${{ matrix.python-version }} 34 | uses: actions/setup-python@v5 35 | with: 36 | python-version: ${{ matrix.python-version }} 37 | allow-prereleases: true 38 | - name: Install dependencies 39 | run: | 40 | pip install .
41 | - name: Test 42 | run: make test 43 | -------------------------------------------------------------------------------- /test/embed.py: -------------------------------------------------------------------------------- 1 | import languagemodels as lm 2 | import numpy as np 3 | import time 4 | import json 5 | import os 6 | import psutil 7 | 8 | 9 | def mem_used_gb(): 10 | process = psutil.Process(os.getpid()) 11 | bytes = process.memory_info().rss 12 | gigabytes = bytes * 1e-9 13 | return gigabytes 14 | 15 | 16 | print(f"Memory used before loading models: {mem_used_gb():.2f}GB") 17 | 18 | print("\n# Embedding Tests\n") 19 | 20 | planets = json.load(open("test/planets.json"))[-4:] 21 | 22 | # Make sure the model is loaded before testing 23 | start = time.perf_counter_ns() 24 | lm.docs.store("just initializing") 25 | lm.docs.clear() 26 | print(f"Model load time: {(time.perf_counter_ns() - start) / 1e6:.0f}ms") 27 | 28 | start = time.perf_counter_ns() 29 | for planet in planets: 30 | lm.docs.store(planet["content"], planet["name"]) 31 | ms = (time.perf_counter_ns() - start) / 1e6 32 | print( 33 | f"Embedded {len(lm.docs.chunks)} chunks in {ms:.0f}ms ({ms/len(lm.docs.chunks):.0f}ms per chunk)" 34 | ) 35 | 36 | start = time.perf_counter_ns() 37 | print(lm.get_doc_context("Which planets have rings?")) 38 | print(f"Search time: {(time.perf_counter_ns() - start) / 1e6:.0f}ms") 39 | lm.docs.clear() 40 | 41 | # Create many fake docs to benchmark search 42 | # We create 10 unique docs then duplicate them 43 | # A fully random set of docs would be better, but takes a long time to generate 44 | docs = [lm.embeddings.Document(str(i), np.random.rand(384)) for i in range(10)] 45 | start = time.perf_counter_ns() 46 | lm.embeddings.search("Test", docs * 10000) 47 | print(f"100k search time: {(time.perf_counter_ns() - start) / 1e6:.0f}ms") 48 | docs = None 49 | 50 | max_ram = lm.config["max_ram"] 51 | print( 52 | f"Memory used after all tests: {mem_used_gb():.2f}GB (must be under {max_ram:.2f}GB)" 53 | ) 54 | 55 | # Confirm that we fit in max_ram after running all tests 56 | assert mem_used_gb() < max_ram 57 | -------------------------------------------------------------------------------- /examples/translate.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "collapsed_sections": [ 8 | "K1yoiesR8O24" 9 | ] 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "language_info": { 16 | "name": "python" 17 | } 18 | }, 19 | "cells": [ 20 | { 21 | "cell_type": "markdown", 22 | "source": [ 23 | "# Install and Import Package" 24 | ], 25 | "metadata": { 26 | "id": "K1yoiesR8O24" 27 | } 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "id": "qwyfeGSL7myi" 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "%pip install languagemodels\n", 38 | "\n", 39 | "import languagemodels as lm" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "source": [ 45 | "# Translation Example" 46 | ], 47 | "metadata": { 48 | "id": "XRguig1K8WD3" 49 | } 50 | }, 51 | { 52 | "cell_type": "code", 53 | "source": [ 54 | "lm.do(\"Translate to English: Hola, mundo!\")\n" 55 | ], 56 | "metadata": { 57 | "colab": { 58 | "base_uri": "https://localhost:8080/", 59 | "height": 35 60 | }, 61 | "id": "-K_JCHWZ7v8t", 62 | "outputId": "85b30980-26d0-4caf-d520-f4e72bb9f17e" 63 | }, 64 | "execution_count": 3, 65 | 
"outputs": [ 66 | { 67 | "output_type": "execute_result", 68 | "data": { 69 | "text/plain": [ 70 | "'Hello, world!'" 71 | ], 72 | "application/vnd.google.colaboratory.intrinsic+json": { 73 | "type": "string" 74 | } 75 | }, 76 | "metadata": {}, 77 | "execution_count": 3 78 | } 79 | ] 80 | } 81 | ] 82 | } -------------------------------------------------------------------------------- /test/perf.py: -------------------------------------------------------------------------------- 1 | import languagemodels as lm 2 | import time 3 | import json 4 | import os 5 | import psutil 6 | 7 | 8 | def mem_used_gb(): 9 | process = psutil.Process(os.getpid()) 10 | bytes = process.memory_info().rss 11 | gigabytes = bytes * 1e-9 12 | return gigabytes 13 | 14 | 15 | print(f"Memory used before loading models: {mem_used_gb():.2f}GB") 16 | 17 | 18 | print("\n# Completion Test\n") 19 | 20 | print(f'{lm.complete("They ran until")=}') 21 | 22 | print("\n# Chat Test\n") 23 | 24 | print( 25 | lm.chat( 26 | """ 27 | System: Respond helpfully. It is Monday 28 | 29 | User: What day is it? 30 | 31 | Assistant: 32 | """ 33 | ) 34 | ) 35 | 36 | 37 | print("\n# Instruction Tests\n") 38 | 39 | tests = [ 40 | ("What is the capital of France?", "Paris"), 41 | ("A game uses a bat and ball. Is it baseball or soccer?", "Baseball"), 42 | ("Is grass green or blue?", "Green"), 43 | ("Does a car have more wheels than a bike?", "Yes"), 44 | ] 45 | 46 | accuracy = 0 47 | 48 | 49 | start = time.perf_counter_ns() 50 | 51 | lm.do("Test first run time") 52 | 53 | print(f"Initialization time: {(time.perf_counter_ns() - start) / 1e6:.0f}ms") 54 | 55 | print(f"Memory used after running chat inference: {mem_used_gb():.2f}GB") 56 | 57 | start = time.perf_counter_ns() 58 | chars_generated = 0 59 | 60 | for test in tests: 61 | response = lm.do(test[0]) 62 | chars_generated += len(response) 63 | if test[1].lower() in response.lower(): 64 | accuracy += 1 / len(tests) 65 | print(test[0], response) 66 | 67 | print( 68 | f"Average inference time: {(time.perf_counter_ns() - start)/len(tests)/1e6:.0f}ms" 69 | ) 70 | 71 | print( 72 | f"{(time.perf_counter_ns() - start)/chars_generated/1e6:.0f}ms per character generated" 73 | ) 74 | 75 | print(f"Overall accuracy: {accuracy:.2f}") 76 | 77 | print(f"Memory used after running inference: {mem_used_gb():.2f}GB") 78 | 79 | max_ram = lm.config["max_ram"] 80 | print( 81 | f"Memory used after all tests: {mem_used_gb():.2f}GB (must be under {max_ram:.2f}GB)" 82 | ) 83 | 84 | # Confirm that we used the right model size and roughly fit in memory constraints 85 | # Note that memory usage will vary between operating systems and specific usage 86 | assert mem_used_gb() < max_ram * 1.10 87 | -------------------------------------------------------------------------------- /makefile: -------------------------------------------------------------------------------- 1 | all: lint test 2 | 3 | .PHONY: test test-base lint format spellcheck upload clean 4 | 5 | test-base: 6 | python3 -m doctest -o ELLIPSIS -o NORMALIZE_WHITESPACE languagemodels/*.py 7 | env LANGUAGEMODELS_MODEL_LICENSE="apache|mit|bsd" python3 -m doctest -o ELLIPSIS -o NORMALIZE_WHITESPACE languagemodels/__init__.py 8 | LANGUAGEMODELS_INSTRUCT_MODEL="Qwen2.5-0.5B-Instruct" python3 -m doctest -o ELLIPSIS -o NORMALIZE_WHITESPACE languagemodels/inference.py 9 | 10 | test: test-base 11 | env LANGUAGEMODELS_MAX_RAM=large python3 -m doctest -o ELLIPSIS -o NORMALIZE_WHITESPACE languagemodels/*.py 12 | env LANGUAGEMODELS_MAX_RAM=xl python3 -m doctest -o ELLIPSIS -o 
NORMALIZE_WHITESPACE languagemodels/*.py 13 | 14 | test-perf: 15 | PYTHONPATH=. python3 test/perf.py 16 | 17 | test-commercial: 18 | env LANGUAGEMODELS_SIZE=small LANGUAGEMODELS_MODEL_LICENSE="apache|mit|bsd" python3 -m doctest -o ELLIPSIS -o NORMALIZE_WHITESPACE languagemodels/*.py 19 | env LANGUAGEMODELS_MODEL_LICENSE="apache|mit|bsd" python3 -m doctest -o ELLIPSIS -o NORMALIZE_WHITESPACE languagemodels/*.py 20 | env LANGUAGEMODELS_SIZE=large LANGUAGEMODELS_MODEL_LICENSE="apache|mit|bsd" python3 -m doctest -o ELLIPSIS -o NORMALIZE_WHITESPACE languagemodels/*.py 21 | env LANGUAGEMODELS_SIZE=xl LANGUAGEMODELS_MODEL_LICENSE="apache|mit|bsd" python3 -m doctest -o ELLIPSIS -o NORMALIZE_WHITESPACE languagemodels/*.py 22 | 23 | lint: 24 | flake8 --max-line-length 88 --extend-ignore E203,F401 languagemodels/__init__.py 25 | flake8 --max-line-length 88 --extend-ignore E203 languagemodels/models.py languagemodels/inference.py languagemodels/embeddings.py languagemodels/config.py languagemodels/preprocess.py examples/*.py 26 | 27 | format: 28 | black languagemodels/*.py examples/*.py test/*.py 29 | 30 | doc: 31 | mkdir -p doc 32 | python3 -m pdoc -o doc languagemodels 33 | 34 | paper.pdf: paper.md paper.bib 35 | pandoc $< --citeproc --pdf-engine=xelatex -o $@ 36 | 37 | spellcheck: 38 | aspell -c --dont-backup readme.md 39 | aspell -c --dont-backup paper.md 40 | 41 | upload: 42 | python3 setup.py sdist bdist_wheel 43 | python3 -m twine upload dist/* 44 | 45 | clean: 46 | rm -rf tmp 47 | rm -rf languagemodels.egg-info 48 | rm -rf languagemodels/__pycache__ 49 | rm -rf dist 50 | rm -rf build 51 | rm -rf doc 52 | rm -rf .ipynb_checkpoints 53 | rm -rf examples/.ipynb_checkpoints 54 | -------------------------------------------------------------------------------- /languagemodels/preprocess.py: -------------------------------------------------------------------------------- 1 | from html import unescape 2 | from html.parser import HTMLParser 3 | 4 | 5 | def get_html_paragraphs(src: str): 6 | """ 7 | Return plain text paragraphs from an HTML source 8 | 9 | :param src: HTML document to convert to plain text paragraphs 10 | :return: Plain text paragraphs of document 11 | 12 | This function is designed to be quick rather than robust. 13 | 14 | It follows a simple approach to extracting text: 15 | 16 | 1. Ignore all content inside the following elements listed in `ignore`. 17 | 2. Merge inline text content into paragraphs from `inlines` set. 18 | 3. Convert any newly merged text element with at least `min_length` 19 | characters to a paragraph in the output text. 20 | 21 | >>> get_html_paragraphs(open("test/wp.html", encoding="utf-8").read()) 22 | 'Bolu Province (Turkish: Bolu ili) is a province...' 23 | 24 | >>> get_html_paragraphs(open("test/npr.html", encoding="utf-8").read()) 25 | "First, the good news. Netflix reported a record ..." 
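Note: the `min_length` threshold referenced above is currently hard-coded; only merged text runs longer than 140 characters are kept as output paragraphs.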
26 | """ 27 | 28 | class ParagraphExtractor(HTMLParser): 29 | paras = [""] 30 | ignoring = [] 31 | ignore = ("script", "style", "header", "footer") 32 | ignore_attrs = {('hidden', 'hidden'), } 33 | inlines = ("a", "b", "i", "span", "sup", "sub", "strong", "em") 34 | 35 | def handle_starttag(self, tag, attrs): 36 | if tag in self.ignore or self.ignore_attrs & set(attrs): 37 | self.ignoring.append(tag) 38 | 39 | if tag not in self.inlines and self.paras[-1]: 40 | self.paras.append("") 41 | 42 | def handle_endtag(self, tag): 43 | if self.ignoring and self.ignoring[-1] == tag: 44 | self.ignoring.pop() 45 | 46 | if tag not in self.inlines and self.paras[-1]: 47 | self.paras.append("") 48 | 49 | def handle_data(self, data): 50 | if not self.ignoring: 51 | if self.paras and self.paras[-1]: 52 | self.paras[-1] += unescape(data) 53 | else: 54 | self.paras.append(data) 55 | 56 | def get_plain(self): 57 | return "\n\n".join([p.rstrip() for p in self.paras if len(p.strip()) > 140]) 58 | 59 | extractor = ParagraphExtractor() 60 | extractor.feed(src) 61 | return extractor.get_plain() 62 | -------------------------------------------------------------------------------- /examples/summarize.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "gpuType": "T4" 8 | }, 9 | "kernelspec": { 10 | "name": "python3", 11 | "display_name": "Python 3" 12 | }, 13 | "language_info": { 14 | "name": "python" 15 | }, 16 | "accelerator": "GPU" 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "source": [ 22 | "# Install and import" 23 | ], 24 | "metadata": { 25 | "id": "5KNsOIs5qEaa" 26 | } 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": { 32 | "id": "IF-HnHg1ayYW" 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "%pip install languagemodels\n", 37 | "import languagemodels as lm" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "source": [ 43 | "# Summarize list of documents" 44 | ], 45 | "metadata": { 46 | "id": "fOKBZ7zwqKU2" 47 | } 48 | }, 49 | { 50 | "cell_type": "code", 51 | "source": [ 52 | "docs = [\n", 53 | " 'Python is a high-level, general-purpose programming language. Its design philosophy emphasizes code readability with the use of significant indentation.\\n\\nPython is dynamically typed and garbage-collected. It supports multiple programming paradigms, including structured (particularly procedural), object-oriented and functional programming. It is often described as a \"batteries included\" language due to its comprehensive standard library.\\n\\nGuido van Rossum began working on Python in the late 1980s as a successor to the ABC programming language and first released it in 1991 as Python\\xa00.9.0. Python\\xa02.0 was released in 2000. Python\\xa03.0, released in 2008, was a major revision not completely backward-compatible with earlier versions. Python\\xa02.7.18, released in 2020, was the last release of Python\\xa02.\\n\\nPython consistently ranks as one of the most popular programming languages.',\n", 54 | " 'Natural language processing (NLP) is an interdisciplinary subfield of computer science and linguistics. It is primarily concerned with giving computers the ability to support and manipulate human language. It involves processing natural language datasets, such as text corpora or speech corpora, using either rule-based or probabilistic (i.e. 
statistical and, most recently, neural network-based) machine learning approaches. The goal is a computer capable of \"understanding\" the contents of documents, including the contextual nuances of the language within them. The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.\\n\\nChallenges in natural language processing frequently involve speech recognition, natural-language understanding, and natural-language generation.',\n", 55 | "]\n", 56 | "\n", 57 | "for doc in docs:\n", 58 | " result = lm.do(f\"Summarize the following text in one short sentence. Text: {doc}\")\n", 59 | " print(result)" 60 | ], 61 | "metadata": { 62 | "id": "OcKsYj50a-u2" 63 | }, 64 | "execution_count": null, 65 | "outputs": [] 66 | } 67 | ] 68 | } -------------------------------------------------------------------------------- /examples/tools.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "cdc28c56-5631-4abf-b485-0f226962165f", 6 | "metadata": {}, 7 | "source": [ 8 | "# Tool Usage\n", 9 | "\n", 10 | "Language models are best suited for generating natural language. They don't have access to external knowledge, and may not be well suited for computational tasks. However, we can overcome some of these limits by augmenting models with tools.\n", 11 | "\n", 12 | "## Prompting for Tool Use\n", 13 | "\n", 14 | "The first step is to prompt the model in a way that allows it to make use of tools. We'll do this by providing few-shot examples of computations using eval. We can then replace these computations with their results." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "id": "85288258-3f83-46cd-9f96-d7e6a940be33", 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import languagemodels as lm\n", 25 | "import re" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 2, 31 | "id": "2c756fa4-1def-4c81-b86e-de2d2103566a", 32 | "metadata": { 33 | "tags": [] 34 | }, 35 | "outputs": [ 36 | { 37 | "data": { 38 | "text/plain": [ 39 | "'You have eval(28 + 51) cars.'" 40 | ] 41 | }, 42 | "execution_count": 2, 43 | "metadata": {}, 44 | "output_type": "execute_result" 45 | } 46 | ], 47 | "source": [ 48 | "def generate_answer_for_calculator(question):\n", 49 | " return lm.do(f\"\"\"\n", 50 | "Answer using eval as needed.\n", 51 | "\n", 52 | "Question: I had 17 apples and get 8 more. How many apples do I have?\n", 53 | "Answer: You have eval(17 + 8) apples.\n", 54 | "\n", 55 | "Question: How many dogs do I have if I start with 3 and get 2 more?\n", 56 | "Answer: You have eval(3 + 2) dogs.\n", 57 | "\n", 58 | "Question: I had 211 books and lose 154, how many books do I have?\n", 59 | "Answer: You have eval(211 - 154) books.\n", 60 | "\n", 61 | "Question: If I had 253 cats and got 101 more, how many cats do I have?\n", 62 | "Answer: You have eval(253 + 101) cats.\n", 63 | "\n", 64 | "Question: I buy 6 oranges and had 4 to begin with.
How many oranges do I have?\n", 65 | "Answer: You have eval(6 + 4) oranges.\n", 66 | "\n", 67 | "Question: {question}\n", 68 | "\"\"\".strip())\n", 69 | "\n", 70 | "reply = generate_answer_for_calculator(\"If I have 28 cars and buy 51 more, how many cars do I have?\")\n", 71 | "reply" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "id": "94f05290-3f42-4b46-9af6-5447f663a166", 77 | "metadata": {}, 78 | "source": [ 79 | "## Merging Tools and Results\n", 80 | "\n", 81 | "Now that we have a result from the LLM expecting tools to be used, we can use regular expressions to replace tools with their results." 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 3, 87 | "id": "179afbef-eab0-46c8-a6c9-c38f87d7570f", 88 | "metadata": {}, 89 | "outputs": [ 90 | { 91 | "data": { 92 | "text/plain": [ 93 | "'You have 79 cars.'" 94 | ] 95 | }, 96 | "execution_count": 3, 97 | "metadata": {}, 98 | "output_type": "execute_result" 99 | } 100 | ], 101 | "source": [ 102 | "def replace_expressions(reply):\n", 103 | " # Replace \"eval(1+2)\" with 3\n", 104 | " # Also replace \"eval(1+2) = 3\" with 3, as the model sometimes predicts an answer\n", 105 | " expressions = re.findall('(eval\\(([ 0-9\\.+\\-/\\*]+)\\)[ =0-9\\.]*)', reply)\n", 106 | "\n", 107 | " for exp in expressions:\n", 108 | " result = eval(exp[1])\n", 109 | " reply = reply.replace(exp[0].strip(), str(result))\n", 110 | " \n", 111 | " return reply\n", 112 | "\n", 113 | "replace_expressions(reply)" 114 | ] 115 | } 116 | ], 117 | "metadata": { 118 | "kernelspec": { 119 | "display_name": "Python 3 (ipykernel)", 120 | "language": "python", 121 | "name": "python3" 122 | }, 123 | "language_info": { 124 | "codemirror_mode": { 125 | "name": "ipython", 126 | "version": 3 127 | }, 128 | "file_extension": ".py", 129 | "mimetype": "text/x-python", 130 | "name": "python", 131 | "nbconvert_exporter": "python", 132 | "pygments_lexer": "ipython3", 133 | "version": "3.8.10" 134 | } 135 | }, 136 | "nbformat": 4, 137 | "nbformat_minor": 5 138 | } 139 | -------------------------------------------------------------------------------- /languagemodels/models.py: -------------------------------------------------------------------------------- 1 | import re 2 | from huggingface_hub import hf_hub_download, snapshot_download 3 | from tokenizers import Tokenizer 4 | import ctranslate2 5 | 6 | from languagemodels.config import config, models 7 | 8 | 9 | modelcache = {} 10 | 11 | 12 | class ModelException(Exception): 13 | pass 14 | 15 | 16 | def get_model_info(model_type="instruct"): 17 | """Gets info about the current model in use 18 | 19 | >>> get_model_info('instruct') 20 | {'name': 'LaMini-Flan-T5-248M', 'tuning': 'instruct'... 
21 | """ 22 | model_name = config[f"{model_type}_model"] 23 | 24 | m = [m for m in models if m["name"] == model_name][0] 25 | 26 | param_bits = int(re.search(r"\d+", m["quantization"]).group(0)) 27 | 28 | m["size_gb"] = m["params"] * param_bits / 8 / 1e9 29 | if "/" in m["name"]: 30 | m["path"] = m["name"] 31 | else: 32 | m["path"] = f"jncraton/{m['name']}-{m['backend']}-{m['quantization']}" 33 | 34 | return m 35 | 36 | 37 | def initialize_tokenizer(model_type, model_name): 38 | model_info = get_model_info(model_type) 39 | rev = model_info.get("revision", None) 40 | 41 | tok_config = hf_hub_download( 42 | model_info["path"], "tokenizer.json", revision=rev, local_files_only=True 43 | ) 44 | tokenizer = Tokenizer.from_file(tok_config) 45 | 46 | if model_type == "embedding": 47 | tokenizer.no_padding() 48 | tokenizer.no_truncation() 49 | 50 | return tokenizer 51 | 52 | 53 | def initialize_model(model_type, model_name, tokenizer_only=False): 54 | model_info = get_model_info(model_type) 55 | 56 | allowed = ["*.bin", "*.txt", "*.json"] 57 | rev = model_info.get("revision", None) 58 | 59 | # snapshot_download checks for updates by default 60 | # This can cause significant lag in offline usecases or high latency networks 61 | # To avoid this penalty, we try to use the local cache first. 62 | # If the files are not available, then we attempt a download 63 | try: 64 | path = snapshot_download( 65 | model_info["path"], 66 | max_workers=1, 67 | allow_patterns=allowed, 68 | revision=rev, 69 | local_files_only=True, 70 | ) 71 | except FileNotFoundError: 72 | path = snapshot_download( 73 | model_info["path"], max_workers=1, allow_patterns=allowed, revision=rev 74 | ) 75 | 76 | if tokenizer_only: 77 | return None 78 | 79 | if model_info["architecture"] == "encoder-only-transformer": 80 | return ctranslate2.Encoder( 81 | path, 82 | "cpu", 83 | compute_type="int8", 84 | ) 85 | elif model_info["architecture"] == "decoder-only-transformer": 86 | return ctranslate2.Generator(path, config["device"], compute_type="int8") 87 | else: 88 | return ctranslate2.Translator(path, config["device"], compute_type="int8") 89 | 90 | 91 | def get_model(model_type, tokenizer_only=False): 92 | """Gets a model from the loaded model cache 93 | 94 | If tokenizer_only, the model itself will not be (re)loaded 95 | 96 | >>> tokenizer, model = get_model("instruct") 97 | >>> type(tokenizer) 98 | 99 | 100 | >>> type(model) 101 | 102 | 103 | >>> tokenizer, model = get_model("embedding") 104 | >>> type(tokenizer) 105 | 106 | 107 | >>> type(model) 108 | 109 | """ 110 | 111 | model_name = config[f"{model_type}_model"] 112 | 113 | if config["max_ram"] < 4 and not tokenizer_only: 114 | for model in modelcache: 115 | if model != model_name: 116 | try: 117 | modelcache[model][1].unload_model() 118 | except AttributeError: 119 | # Encoder-only models can't be unloaded by ctranslate2 120 | pass 121 | 122 | if model_name not in modelcache: 123 | model = initialize_model(model_type, model_name, tokenizer_only) 124 | tokenizer = initialize_tokenizer(model_type, model_name) 125 | modelcache[model_name] = (tokenizer, model) 126 | elif not tokenizer_only: 127 | # Make sure model is loaded if we've never loaded it 128 | if not modelcache[model_name][1]: 129 | modelcache[model_name] = ( 130 | modelcache[model_name][0], 131 | initialize_model(model_type, model_name), 132 | ) 133 | # Make sure the model is reloaded if we've unloaded it 134 | try: 135 | modelcache[model_name][1].load_model() 136 | except AttributeError: 137 | # Encoder-only models can't be 
unloaded in ctranslate2 138 | pass 139 | 140 | return modelcache[model_name] 141 | -------------------------------------------------------------------------------- /paper.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'languagemodels: A Python Package for Exploring Modern Natural Language Processing' 3 | tags: 4 | - Python 5 | - machine learning 6 | - language modeling 7 | - nlp 8 | authors: 9 | - name: Jonathan L. Craton 10 | orcid: 0009-0007-6543-8571 11 | affiliation: 1 12 | affiliations: 13 | - name: Department of Computer Science, Anderson University (IN) 14 | index: 1 15 | date: 15 June 2023 16 | bibliography: paper.bib 17 | --- 18 | 19 | # Summary 20 | 21 | `languagemodels` is a Python package for educators and learners exploring the applications of large language models. It aims to be as easy to set up and use as possible, while providing many of the key building blocks used in modern LLM-driven applications. It is designed to be used in learning modules in introductory programming courses. 22 | 23 | # Statement of Need 24 | 25 | Large language models are having an impact on the way software is designed [@mialon2023augmented]. The development of the transformer [@vaswani2017attention] has led to rapid progress in many NLP and generative tasks [@zhao2023survey; @bert; @gpt2; @gpt3; @t5; @palm; @flan-t5; @bubeck2023sparks]. These models are becoming more powerful as they scale in both parameters [@kaplan2020scaling] and training data [@hoffmann2022training]. 26 | 27 | Early research suggests that there are many tasks performed by humans that can be transformed by LLMs [@eloundou2023gpts]. For example, large language models trained on code [@codex] are already being used as capable pair programmers via tools such as Microsoft's Copilot. To build with these technologies, students need to understand their capabilities and begin to learn new paradigms for programming. 28 | 29 | There are many software tools already available for working with large language models [@hftransformers; @pytorch; @tensorflow; @langchain; @llamacpp; @gpt4all]. While these options serve the needs of software engineers, researchers, and hobbyists, they may not be simple enough for new learners. This package aims to lower the barriers to entry for using these tools in an educational context. 30 | 31 | \newpage 32 | 33 | # Example Usage 34 | 35 | This package eliminates boilerplate and configuration options that create noise for new learners while using only basic types and simple functions. Here's an example from a Python REPL session: 36 | 37 | ```python 38 | >>> import languagemodels as lm 39 | 40 | >>> lm.do("Answer the question: What is the capital of France?") 41 | 'Paris.' 42 | 43 | >>> lm.do("Classify as positive or negative: I like games", 44 | ... choices=["positive", "negative"]) 45 | 'positive' 46 | 47 | >>> lm.extract_answer("What color is the ball?", 48 | ... "There is a green ball and a red box") 49 | 'green' 50 | 51 | >>> lm.get_wiki('Chemistry') 52 | 'Chemistry is the scientific study...' 53 | 54 | >>> lm.store_doc(lm.get_wiki("Python"), "Python") 55 | >>> lm.store_doc(lm.get_wiki("Javascript"), "Javascript") 56 | >>> lm.get_doc_context("What language is used on the web?") 57 | 'From Javascript document: Javascript engines were...' 58 | ``` 59 | 60 | # Features 61 | 62 | Despite its simplicity, this package provides a number of building blocks that can be combined to build applications that mimic the architectures of modern software products. 
Some of the tools included are: 63 | 64 | - Instruction following with the `do` function 65 | - Zero-shot classification with the `do` function and `choices` parameter 66 | - Semantic search using the `store_doc` and `get_doc_context` functions 67 | - Extractive question answering using the `extract_answer` function 68 | - Basic web retrieval using the `get_wiki` function 69 | 70 | The package includes the following features under the hood: 71 | 72 | - Local LLM inference on CPU for broad device support 73 | - Transparent model caching to allow fast repeated inference without explicit model initialization 74 | - Pre-selected models to allow the software to run easily and effectively on as many devices as possible 75 | 76 | \newpage 77 | 78 | # Implementation 79 | 80 | The design of this software package allows its interface to be loosely coupled to the models and inference engines it uses. Progress is being made to speed up inference on consumer hardware, and this package seeks to find a balance between inference efficiency, software stability, and broad hardware support. 81 | 82 | This package currently uses CTranslate2 [@ctranslate2] for efficient inference on CPU and GPU. The main models used include Flan-T5 [@flan-t5], LaMini-LM [@lamini-lm], and OpenChat [@openchat]. The default models used by this package can be swapped out in future versions to provide improved generation quality. 83 | 84 | # Future work 85 | 86 | This package provides a platform for creating simple NLP labs for use in introductory computer science courses. Additional work is needed to design specific learning modules to meet the needs of learners. 87 | 88 | Ongoing development efforts will focus on improving the accuracy and efficiency of inference, while keeping the interface stable and supporting all reasonable platforms. 
89 | 90 | # References 91 | -------------------------------------------------------------------------------- /changelog.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## 0.24 - 2024-02-14 4 | 5 | ### Changed 6 | 7 | - Remove special capitalization rules 8 | 9 | ### Added 10 | 11 | - Add lm.get_web helper function to load text from web pages 12 | - Support Qwen2 2.5 0.5B and 1.5B 13 | - Add support for Granite embedding models 14 | 15 | ## 0.23 - 2024-12-17 16 | 17 | ### Changed 18 | 19 | - Drop support for Python 3.8 20 | 21 | ### Fixed 22 | 23 | - Properly apply prompt format when providing `choices` 24 | - Do not add special tokens before `choices` 25 | 26 | ### Added 27 | 28 | - Support multilingual-e5-small embedding model 29 | - Support Falcon 3 Instruct 1B and 3B 30 | 31 | ## 0.22 - 2024-11-02 32 | 33 | ### Changed 34 | 35 | - Pin Llama 3.2 model versions 36 | - Decrease repetition penalty for Llama 3.2 models 37 | 38 | ### Added 39 | 40 | - Support SmolLM2 41 | - Add `embed` function 42 | - Support Llama 3.1 8B instruct 43 | - Use models directly from Huggingface with config.use_hf_model() 44 | - Add "echo" config option to allow streaming tokens to stdout as they are generated 45 | 46 | ## 0.21 - 2024-09-25 47 | 48 | ### Changed 49 | 50 | - Skip checking for model updates 51 | - Download entire model upfront even if we only need the tokenizer initially 52 | - Use most recent version of CTranslate2 53 | - Add per-model repetition penalties 54 | 55 | ### Added 56 | 57 | - Support Llama 3.2 1B and 3B 58 | - Support Danube3 59 | - Support SmolLM 60 | 61 | ## 0.20 - 2024-04-25 62 | 63 | ### Changed 64 | 65 | - Add new separators to document chunking heuristic 66 | 67 | ### Fixed 68 | 69 | - Allow missing query prefixes for embedding models 70 | 71 | ### Added 72 | 73 | - Support Phi-3-mini-4k-instruct 74 | - Support GIST-small-Embedding-v0 embedding model 75 | - Store model runtime stats to improve benchmarking and analysis 76 | 77 | ## 0.19 - 2024-04-18 78 | 79 | ### Added 80 | 81 | - Support Meta-Llama-3-8B-Instruct 82 | - Support gemma-2b-it 83 | - Support h2o-danube2-1.8b-chat 84 | - Support WizardLM-2-7B 85 | 86 | ## 0.18.0 - 2024-02-23 87 | 88 | ### Fixed 89 | 90 | - Correct issue causing `choices` to be scored improperly 91 | 92 | ## 0.17.0 - 2024-02-15 93 | 94 | ### Added 95 | 96 | - CUDA 12 support 97 | 98 | ## 0.16.0 - 2024-02-04 99 | 100 | ### Fixed 101 | 102 | - Run embedding models on CPU to work around memory copy issue 103 | 104 | ## 0.15.0 - 2024-02-04 105 | 106 | ### Changed 107 | 108 | - Improve embedding search performance 109 | 110 | ### Added 111 | 112 | - Add openchat-3.5-0106 model 113 | - Add h2o-danube-1.8b-chat model 114 | 115 | ## 0.14.0 - 2024-01-06 116 | 117 | ### Changed 118 | 119 | - Simplified dialogstudio system message 120 | 121 | ### Fixed 122 | 123 | - Correct missing instruction in openchat prompt 124 | 125 | ## 0.13.0 - 2024-01-05 126 | 127 | ### Changed 128 | 129 | - Improved search speed when searching many documents 130 | - Reduce memory usage for large document embeddings 131 | - Updated to TinyLlama Chat v1.0 132 | - Remove auto model scaling on Colab 133 | - Correct phi-1.5 prompt format 134 | - Correct model license metadata 135 | 136 | ### Added 137 | 138 | - Add Mistral-7B-Instruct-v0.2 model 139 | - Add openchat-3.5-1210 model 140 | - Add phi-2 model 141 | - Support static batching by passing lists to `do` 142 | - Support choices list on `do` to restrict possible outputs 143 | 
144 | ## 0.12.0 - 2023-12-02 145 | 146 | ### Changed 147 | 148 | - Remove explicit setuptools dependency (see [CTranslate2#1526](https://github.com/OpenNMT/CTranslate2/pull/1526)) 149 | 150 | ### Fixed 151 | 152 | - Reduce model size when not using a CPU in Colab 153 | 154 | ## 0.11.0 - 2023-12-02 155 | 156 | ### Changed 157 | 158 | - Default to 8GB model size on Colab 159 | - Allow 2048 token response by default on Colab 160 | - Use Colab GPU by default if available 161 | - Skip returning prompt for decoder-only models 162 | - Ensure whitespace is removed from decoder-only outputs 163 | 164 | ### Added 165 | 166 | - Add neural-chat-7b-v3-1 as default 8GB model 167 | - Add max_tokens config option 168 | 169 | ## 0.10.0 - 2023-10-29 170 | 171 | ### Added 172 | 173 | - Add gte-tiny embedding model 174 | - Properly support Python 3.12 175 | 176 | ### Fixed 177 | 178 | - Removed extra classification prompt when performing classification with generative models 179 | - Prevent doubling of special tokens during classification 180 | 181 | ## 0.9.0 - 2023-10-07 182 | 183 | ### Changed 184 | 185 | - Use per-model instruction formats 186 | - Batch chunk embeddings for faster performance embedding larger documents 187 | 188 | ### Added 189 | 190 | - Automatically use query prefixes as needed for embeddings 191 | - Add phi-1.5 model 192 | - Add dialogstudio base model 193 | - Add support for gte-small embeddings 194 | - Add support for bge-small-en embeddings 195 | 196 | ### Fixed 197 | 198 | - Allow token suppression on decoder-only models 199 | - Remove HTML comments appearing in some wiki pages 200 | 201 | ## 0.8.0 - 2023-08-04 202 | 203 | ### Changed 204 | 205 | - Model names no longer include backend and quantization info 206 | - Default to CPU inference unless GPU enabled using `lm.config["device"]="auto"` 207 | 208 | ### Added 209 | 210 | - Add quantization info to config and use it for memory usage calculation 211 | 212 | ### Fixed 213 | 214 | - Increase repetition penalty to 1.3 from 1.2 to help avoid repetition in smaller models 215 | 216 | ## 0.7.0 - 2023-07-27 217 | 218 | ### Changed 219 | 220 | - Improve semantic meaning of chunk heading 221 | - Remove sentencepiece dependency 222 | 223 | ### Added 224 | 225 | - Support GPT-based models 226 | - Add `code` generation function 227 | - Create new configuration system 228 | - Use CUDA if available 229 | 230 | ### Fixed 231 | 232 | - Use non-greedy sampling on `complete` function 233 | - Decrease chance of splitting chunks on decimal points 234 | - Correct assistant example 235 | 236 | ## 0.6.0 237 | 238 | ### Changed 239 | 240 | - Attempt to chunk context on semantic boundaries 241 | 242 | ### Added 243 | 244 | - Allow filtering by model license 245 | 246 | ### Fixed 247 | 248 | - Update classification to only allow valid classes to be returned 249 | 250 | ## 0.5.0 251 | 252 | ### Changed 253 | 254 | - Disable beam search for faster inference 255 | 256 | ## 0.4.0 257 | 258 | ### Changed 259 | 260 | - Normalize output 261 | - Rename some functions 262 | 263 | ### Added 264 | 265 | - Support xl models 266 | 267 | ## 0.2.0 268 | 269 | ### Changed 270 | 271 | - Less verbose chat syntax 272 | 273 | ### 0.1.0 274 | 275 | ### Changed 276 | 277 | - Use ctranslate2 for greater efficiency 278 | 279 | ### 0.0.0 280 | 281 | - Original version using HuggingFace Transformers -------------------------------------------------------------------------------- /examples/extractive_qa_embeddings.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "3a9a252f-5e79-4f33-b78c-c3a34e0f1a59", 6 | "metadata": { 7 | "tags": [] 8 | }, 9 | "source": [ 10 | "# Extractive Question Answering\n", 11 | "\n", 12 | "Language models are good at generating text, but generations are not always accurate. One way to increase accuracy is to provide context for answering a question within the prompt.\n", 13 | "\n", 14 | "First, we'll load some context." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "id": "8d925dca-a4f0-4770-a217-fbd2da492434", 21 | "metadata": { 22 | "tags": [] 23 | }, 24 | "outputs": [ 25 | { 26 | "name": "stdout", 27 | "output_type": "stream", 28 | "text": [ 29 | "Python is a high-level, general-purpose programming language. Its design philosophy emphasizes code readability with the use of significant indentation via the off-side rule.Python is dynamically typed and garbage-collected. It supports multiple programming paradigms, including structured (particularly procedural), object-oriented and functional programming. It is often described as a \"batteries included\" language due to its comprehensive standard library.Guido van Rossum began working on Python in the late 1980s as a successor to the ABC programming language and first released it in 1991 as Python 0.9.0. Python 2.0 was released in 2000. Python 3.0, released in 2008, was a major revision not completely backward-compatible with earlier versions. Python 2.7.18, released in 2020, was the last release of Python 2.Python consistently ranks as one of the most popular programming languages.\n" 30 | ] 31 | } 32 | ], 33 | "source": [ 34 | "import languagemodels as lm\n", 35 | "\n", 36 | "python_info = lm.get_wiki(\"Python\")\n", 37 | "print(python_info)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "id": "4a5830e4-5efc-4d3f-9452-c2f44d5c2e51", 43 | "metadata": {}, 44 | "source": [ 45 | "We can now prompt the model to answer the question using the context." 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 2, 51 | "id": "22ae1c03-7168-42db-a0f8-4057c79fd91d", 52 | "metadata": { 53 | "tags": [] 54 | }, 55 | "outputs": [ 56 | { 57 | "name": "stdout", 58 | "output_type": "stream", 59 | "text": [ 60 | "Guido van Rossum created Python.\n" 61 | ] 62 | } 63 | ], 64 | "source": [ 65 | "print(lm.do(f\"Answer from the context: Who created Python? {python_info}\"))" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "id": "e1bed464-cbab-41e2-90d6-2beb02f97c31", 71 | "metadata": { 72 | "tags": [] 73 | }, 74 | "source": [ 75 | "# Embeddings\n", 76 | "\n", 77 | "Language models are capable of answering questions based on a context, but we now need a way to provide them with appropriate context.\n", 78 | "\n", 79 | "One solution to this is to have a large amount of available context and retrieve only the meaningful bits when answering a question. Embeddings are a tool to achieve this.\n", 80 | "\n", 81 | "Embeddings provide a way to map a numeric vector to the meaning of some input. In the case of language models, embeddings are derived from documents.\n", 82 | "\n", 83 | "## Semantic Search\n", 84 | "\n", 85 | "Once we have mapped vectors to our documents, we can search for similar documents by meaning. 
If we've constructed our embedding model appropriately, documents that answer questions will be near the questions themselves in vector space.\n", 86 | "\n", 87 | "The math to achieve that is out of scope of this example, but the languagemodels package provides a few simple helper functions to facilated a document store capable of semantic search." 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 3, 93 | "id": "b5816039-7071-4faf-be40-2451c265945a", 94 | "metadata": { 95 | "tags": [] 96 | }, 97 | "outputs": [], 98 | "source": [ 99 | "# Load some programming language documents\n", 100 | "for topic in ['Python', 'Javascript', 'C++', 'SQL', 'HTML']:\n", 101 | " doc = lm.get_wiki(topic)\n", 102 | " lm.store_doc(doc)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 4, 108 | "id": "3412a004-2505-44ae-92b2-13e71ff6bcb9", 109 | "metadata": { 110 | "tags": [] 111 | }, 112 | "outputs": [ 113 | { 114 | "data": { 115 | "text/plain": [ 116 | "'It is often described as a \"batteries included\" language due to its comprehensive standard library.Guido van Rossum began working on Python in the late 1980s as a successor to the ABC programming language and first released it in 1991 as Python 0.9.\\n\\nPython is a high-level, general-purpose programming language. Its design philosophy emphasizes code readability with the use of significant indentation via the off-side rule.\\n\\n18, released in 2020, was the last release of Python 2.Python consistently ranks as one of the most popular programming languages.'" 117 | ] 118 | }, 119 | "execution_count": 4, 120 | "metadata": {}, 121 | "output_type": "execute_result" 122 | } 123 | ], 124 | "source": [ 125 | "# Perform semantic search\n", 126 | "lm.get_doc_context(\"Who created Python?\")" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 5, 132 | "id": "8442908a-2527-4279-be8e-55b6a64c2522", 133 | "metadata": { 134 | "tags": [] 135 | }, 136 | "outputs": [ 137 | { 138 | "data": { 139 | "text/plain": [ 140 | "'JavaScript is often associated with HTML and CSS.'" 141 | ] 142 | }, 143 | "execution_count": 5, 144 | "metadata": {}, 145 | "output_type": "execute_result" 146 | } 147 | ], 148 | "source": [ 149 | "# Put everything together to answer a general question about one of the languages\n", 150 | "question = \"What technologies are often associated with JS?\"\n", 151 | "\n", 152 | "context = lm.get_doc_context(question)\n", 153 | "\n", 154 | "lm.do(f\"Answer from the context: {question} {context}\")" 155 | ] 156 | } 157 | ], 158 | "metadata": { 159 | "kernelspec": { 160 | "display_name": "Python 3 (ipykernel)", 161 | "language": "python", 162 | "name": "python3" 163 | }, 164 | "language_info": { 165 | "codemirror_mode": { 166 | "name": "ipython", 167 | "version": 3 168 | }, 169 | "file_extension": ".py", 170 | "mimetype": "text/x-python", 171 | "name": "python", 172 | "nbconvert_exporter": "python", 173 | "pygments_lexer": "ipython3", 174 | "version": "3.8.10" 175 | } 176 | }, 177 | "nbformat": 4, 178 | "nbformat_minor": 5 179 | } 180 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | Language Models 2 | =============== 3 | 4 | [![PyPI version](https://badge.fury.io/py/languagemodels.svg)](https://badge.fury.io/py/languagemodels) 5 | [![docs](https://img.shields.io/badge/docs-online-brightgreen)](https://languagemodels.netlify.app/) 6 | [![x64 
Build](https://github.com/jncraton/languagemodels/actions/workflows/build.yml/badge.svg)](https://github.com/jncraton/languagemodels/actions/workflows/build.yml) 7 | [![ARM64 Build](https://github.com/jncraton/languagemodels/actions/workflows/pi.yml/badge.svg)](https://github.com/jncraton/languagemodels/actions/workflows/pi.yml) 8 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/jncraton/languagemodels/blob/master/examples/translate.ipynb) 9 | 10 | Python building blocks to explore large language models in as little as 512MB of RAM 11 | 12 | ![Translation hello world example](media/hello.gif) 13 | 14 | This package makes using large language models from Python as simple as possible. All inference is performed locally to keep your data private by default. 15 | 16 | Installation and Getting Started 17 | -------------------------------- 18 | 19 | This package can be installed using the following command: 20 | 21 | ```sh 22 | pip install languagemodels 23 | ``` 24 | 25 | Once installed, you should be able to interact with the package in Python as follows: 26 | 27 | ```python 28 | >>> import languagemodels as lm 29 | >>> lm.do("What color is the sky?") 30 | 'The color of the sky is blue.' 31 | ``` 32 | 33 | This will require downloading a significant amount of data (~250MB) on the first run. Models will be cached for later use and subsequent calls should be quick. 34 | 35 | Example Usage 36 | ------------- 37 | 38 | Here are some usage examples as Python REPL sessions. This should work in the REPL, notebooks, or in traditional scripts and applications. 39 | 40 | ### Instruction Following 41 | 42 | ```python 43 | >>> import languagemodels as lm 44 | 45 | >>> lm.do("Translate to English: Hola, mundo!") 46 | 'Hello, world!' 47 | 48 | >>> lm.do("What is the capital of France?") 49 | 'Paris.' 50 | ``` 51 | 52 | Outputs can be restricted to a list of choices if desired: 53 | 54 | ```python 55 | >>> lm.do("Is Mars larger than Saturn?", choices=["Yes", "No"]) 56 | 'No' 57 | ``` 58 | 59 | ### Adjusting Model Performance 60 | 61 | The base model should run quickly on any system with 512MB of memory, but this memory limit can be increased to select more powerful models that will consume more resources. Here's an example: 62 | 63 | ```python 64 | >>> import languagemodels as lm 65 | >>> lm.do("If I have 7 apples then eat 5, how many apples do I have?") 66 | 'You have 8 apples.' 67 | >>> lm.config["max_ram"] = "4gb" 68 | 4.0 69 | >>> lm.do("If I have 7 apples then eat 5, how many apples do I have?") 70 | 'I have 2 apples left.' 71 | ``` 72 | 73 | ### GPU Acceleration 74 | 75 | If you have an NVIDIA GPU with CUDA available, you can opt in to using the GPU for inference: 76 | 77 | ```python 78 | >>> import languagemodels as lm 79 | >>> lm.config["device"] = "auto" 80 | ``` 81 | 82 | ### Text Completions 83 | 84 | ```python 85 | >>> import languagemodels as lm 86 | 87 | >>> lm.complete("She hid in her room until") 88 | 'she was sure she was safe' 89 | ``` 90 | 91 | ### External Retrieval 92 | 93 | Helper functions are provided to retrieve text from external sources that can be used to augment prompt context. 94 | 95 | ```python 96 | >>> import languagemodels as lm 97 | 98 | >>> lm.get_wiki('Chemistry') 99 | 'Chemistry is the scientific study... 100 | 101 | >>> lm.get_weather(41.8, -87.6) 102 | 'Partly cloudy with a chance of rain... 
103 | 104 | >>> lm.get_date() 105 | 'Friday, May 12, 2023 at 09:27AM' 106 | ``` 107 | 108 | Here's an example showing how this can be used to augment a prompt: 109 | 110 | ```python 111 | >>> lm.do(f"It is {lm.get_date()}. What time is it?") 112 | 'The time is 12:53PM.' 113 | ``` 114 | 115 | ### Semantic Search 116 | 117 | Semantic search is provided to retrieve documents that may provide helpful context from a document store. 118 | 119 | ```python 120 | >>> import languagemodels as lm 121 | >>> lm.store_doc(lm.get_wiki("Python"), "Python") 122 | >>> lm.store_doc(lm.get_wiki("C language"), "C") 123 | >>> lm.store_doc(lm.get_wiki("Javascript"), "Javascript") 124 | >>> lm.get_doc_context("What does it mean for batteries to be included in a language?") 125 | 'From Python document: It is often described as a "batteries included" language due to its comprehensive standard library.Guido van Rossum began working on Python in the late 1980s as a successor to the ABC programming language and first released it in 1991 as Python 0.9. 126 | 127 | From C document: It was designed to be compiled to provide low-level access to memory and language constructs that map efficiently to machine instructions, all with minimal runtime support.' 128 | ``` 129 | 130 | [Full documentation](https://languagemodels.netlify.app/) 131 | 132 | ### Speed 133 | 134 | This package currently outperforms Hugging Face `transformers` for CPU inference thanks to int8 quantization and the [CTranslate2](https://github.com/OpenNMT/CTranslate2) backend. The following table compares CPU inference performance on identical models using the best available quantization on a 20 question test set. 135 | 136 | | Backend | Inference Time | Memory Used | 137 | |---------------------------|----------------|-------------| 138 | | Hugging Face transformers | 22s | 1.77GB | 139 | | This package | 11s | 0.34GB | 140 | 141 | Note that quantization slightly reduces output quality, but the effect should be negligible at this level. 142 | 143 | ### Models 144 | 145 | Sensible default models are provided. The package should improve over time as stronger models become available. The basic models used are 1000x smaller than the largest models in use today. They are useful as learning tools, but perform far below the current state of the art. 146 | 147 | Here are the current default models used by the package for a supplied `max_ram` value: 148 | 149 | | max_ram | Model Name | Parameters (B) 150 | | ------- | --------------------- | -------------- 151 | | 0.5 | LaMini-Flan-T5-248M | 0.248 152 | | 1.0 | LaMini-Flan-T5-783M | 0.783 153 | | 2.0 | LaMini-Flan-T5-783M | 0.783 154 | | 4.0 | flan-alpaca-gpt4-xl | 3.0 155 | | 8.0 | openchat-3.5-0106 | 7.0 156 | 157 | For code completions, the [CodeT5+](https://arxiv.org/abs/2305.07922) series of models is used. 158 | 159 | Commercial Use 160 | -------------- 161 | 162 | This package itself is licensed for commercial use, but the models used may not be compatible with commercial use. In order to use this package commercially, you can filter models by license type using the `require_model_license` function. 163 | 164 | ```python 165 | >>> import languagemodels as lm 166 | >>> lm.config['instruct_model'] 167 | 'LaMini-Flan-T5-248M-ct2-int8' 168 | >>> lm.require_model_license("apache|bsd|mit") 169 | >>> lm.config['instruct_model'] 170 | 'flan-t5-base-ct2-int8' 171 | ``` 172 | 173 | It is recommended to confirm that the models used meet the licensing requirements for your software. 
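One simple way to do this is to check which model is currently selected after applying a license filter. The short REPL sketch below only uses the configuration keys shown above; the exact model names will vary with the package version and the configured `max_ram`:

```python
>>> import languagemodels as lm
>>> lm.require_model_license("apache|bsd|mit")
>>> lm.config['instruct_model']  # confirm the active model before shipping
'flan-t5-base-ct2-int8'
```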
174 | 175 | Project Ideas 176 | -------------- 177 | 178 | One of the goals for this package is to be a straightforward tool for learners and educators exploring how large language models intersect with modern software development. It can be used to do the heavy lifting for a number of learning projects: 179 | 180 | - CLI Chatbot (see [examples/chat.py](examples/chat.py)) 181 | - Streamlit chatbot (see [examples/streamlitchat.py](examples/streamlitchat.py)) 182 | - Chatbot with information retrieval 183 | - Chatbot with access to real-time information 184 | - Tool use 185 | - Text classification 186 | - Extractive question answering 187 | - Semantic search over documents 188 | - Document question answering 189 | 190 | Several example programs and notebooks are included in the `examples` directory. 191 | -------------------------------------------------------------------------------- /test/npr.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Netflix raises monthly fees after a record jump in new subscribers 5 | 6 | 7 | 8 | 111 | 112 | 113 | 114 | 115 |
116 |

Text-Only Version Go To Full Site

117 |
118 | 119 | 120 |
121 |
122 |
123 |

124 | NPR > 125 | 126 | The Brief 127 | 128 |

129 |
130 |

Netflix raises monthly fees after a record jump in new subscribers

131 |

By Manuela López Restrepo

132 | 133 |

Wednesday, January 22, 2025 • 5:08 PM EST

134 | 135 | 136 |
137 |
138 |
139 |

First, the good news. Netflix reported a record increase in the number of new subscribers for the final quarter of 2024, attributing the success to high-profile live sports events and new programs. Now, the bad news: The company also announced it's raising its subscription prices across the board.

Three things to know:

  1. The company reported nearly 19 million new subscribers during the last fiscal quarter of 2024, their largest subscription jump ever during a three-month period. That puts them at a total of 302 million global subscribers, the most of any streaming platform.
  2. Netflix also announced a bump in prices for all subscription tiers in the U.S. The standard account with ads now costs $7.99 per month, ad-free subscriptions are $17.99, while the premium plan is $24.99. This reflects price hikes of $1, $2.50 and $2 per month, respectively.
  3. This comes months after Netflix's last price hike, when the company eliminated its least expensive, ad-free option.


Want more? The Pop Culture Happy Hour podcast suggests and dissects the buzziest new movies, TV, music, books, videogames and more, five days a week.


What's Netflix doing right?

Well, it seems that the company's forays into live sports have resonated with viewers. In their letter to investors, the company focused on the success of their highly promoted live-streamed boxing match between Mike Tyson and Jake Paul on Nov. 15. The match drew 60 million households and, according to Netflix, makes it the most-streamed sporting event in history (despite significant technical glitches, which left subscribers fuming and critics wondering if Netflix could pull off such large-scale events.)

Its next big sporting events went off with far fewer problems: two NFL matches that streamed on the platform weeks later, and included a widely talked about Beyoncé halftime show. A very expensive reboot of WWE wrestling is also in the mix for 2025.

Another factor for Netflix's success? Popular shows that outperformed the company's expectations, like the second season of Squid Game. The industry's largest streaming platform is hoping audiences are just as eager for other shows returning in 2025 with new seasons — such as Wednesday and Stranger Things.

Go deeper with NPR on all things entertainment

140 |
141 |
142 |
143 |
144 | 145 | 146 |
147 | 155 | 156 | 157 | 167 | 168 | 169 | 170 | -------------------------------------------------------------------------------- /paper.bib: -------------------------------------------------------------------------------- 1 | @misc{hftransformers, 2 | title={HuggingFace's Transformers: State-of-the-art Natural Language Processing}, 3 | author={Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and Rémi Louf and Morgan Funtowicz and Joe Davison and Sam Shleifer and Patrick von Platen and Clara Ma and Yacine Jernite and Julien Plu and Canwen Xu and Teven Le Scao and Sylvain Gugger and Mariama Drame and Quentin Lhoest and Alexander M. Rush}, 4 | year={2020}, 5 | eprint={1910.03771}, 6 | archivePrefix={arXiv}, 7 | primaryClass={cs.CL} 8 | } 9 | 10 | @incollection{pytorch, 11 | title = {PyTorch: An Imperative Style, High-Performance Deep Learning Library}, 12 | author = {Paszke, Adam and Gross, Sam and Massa, Francisco and Lerer, Adam and Bradbury, James and Chanan, Gregory and Killeen, Trevor and Lin, Zeming and Gimelshein, Natalia and Antiga, Luca and Desmaison, Alban and Kopf, Andreas and Yang, Edward and DeVito, Zachary and Raison, Martin and Tejani, Alykhan and Chilamkurthy, Sasank and Steiner, Benoit and Fang, Lu and Bai, Junjie and Chintala, Soumith}, 13 | booktitle = {Advances in Neural Information Processing Systems 32}, 14 | pages = {8024--8035}, 15 | year = {2019}, 16 | publisher = {Curran Associates, Inc.}, 17 | url = {http://papers.neurips.cc/paper/9015-pytorch-an-imperative-style-high-performance-deep-learning-library.pdf} 18 | } 19 | 20 | @article{llama, 21 | title={Llama: Open and efficient foundation language models}, 22 | author={Touvron, Hugo and Lavril, Thibaut and Izacard, Gautier and Martinet, Xavier and Lachaux, Marie-Anne and Lacroix, Timoth{\'e}e and Rozi{\`e}re, Baptiste and Goyal, Naman and Hambro, Eric and Azhar, Faisal and others}, 23 | journal={arXiv preprint arXiv:2302.13971}, 24 | year={2023} 25 | } 26 | 27 | @article{t5, 28 | title={Exploring the limits of transfer learning with a unified text-to-text transformer}, 29 | author={Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee, Katherine and Narang, Sharan and Matena, Michael and Zhou, Yanqi and Li, Wei and Liu, Peter J}, 30 | journal={The Journal of Machine Learning Research}, 31 | volume={21}, 32 | number={1}, 33 | pages={5485--5551}, 34 | year={2020}, 35 | publisher={JMLRORG} 36 | } 37 | 38 | @article{flan-t5, 39 | title={Scaling instruction-finetuned language models}, 40 | author={Chung, Hyung Won and Hou, Le and Longpre, Shayne and Zoph, Barret and Tay, Yi and Fedus, William and Li, Eric and Wang, Xuezhi and Dehghani, Mostafa and Brahma, Siddhartha and others}, 41 | journal={arXiv preprint arXiv:2210.11416}, 42 | year={2022} 43 | } 44 | 45 | @article{vaswani2017attention, 46 | title={Attention is all you need}, 47 | author={Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia}, 48 | journal={Advances in neural information processing systems}, 49 | volume={30}, 50 | year={2017} 51 | } 52 | 53 | @article{bert, 54 | title={Bert: Pre-training of deep bidirectional transformers for language understanding}, 55 | author={Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina}, 56 | journal={arXiv preprint arXiv:1810.04805}, 57 | year={2018} 58 | } 59 | 60 | @article{gpt2, 61 | title={Language models are 
unsupervised multitask learners}, 62 | author={Radford, Alec and Wu, Jeffrey and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya and others}, 63 | journal={OpenAI blog}, 64 | volume={1}, 65 | number={8}, 66 | pages={9}, 67 | year={2019} 68 | } 69 | 70 | @article{gpt3, 71 | title={Language models are few-shot learners}, 72 | author={Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and others}, 73 | journal={Advances in neural information processing systems}, 74 | volume={33}, 75 | pages={1877--1901}, 76 | year={2020} 77 | } 78 | 79 | @article{palm, 80 | title={Palm: Scaling language modeling with pathways}, 81 | author={Chowdhery, Aakanksha and Narang, Sharan and Devlin, Jacob and Bosma, Maarten and Mishra, Gaurav and Roberts, Adam and Barham, Paul and Chung, Hyung Won and Sutton, Charles and Gehrmann, Sebastian and others}, 82 | journal={arXiv preprint arXiv:2204.02311}, 83 | year={2022} 84 | } 85 | 86 | @article{codex, 87 | title={Evaluating large language models trained on code}, 88 | author={Chen, Mark and Tworek, Jerry and Jun, Heewoo and Yuan, Qiming and Pinto, Henrique Ponde de Oliveira and Kaplan, Jared and Edwards, Harri and Burda, Yuri and Joseph, Nicholas and Brockman, Greg and others}, 89 | journal={arXiv preprint arXiv:2107.03374}, 90 | year={2021} 91 | } 92 | 93 | @article{eloundou2023gpts, 94 | title={Gpts are gpts: An early look at the labor market impact potential of large language models}, 95 | author={Eloundou, Tyna and Manning, Sam and Mishkin, Pamela and Rock, Daniel}, 96 | journal={arXiv preprint arXiv:2303.10130}, 97 | year={2023} 98 | } 99 | 100 | @article{bubeck2023sparks, 101 | title={Sparks of artificial general intelligence: Early experiments with gpt-4}, 102 | author={Bubeck, S{\'e}bastien and Chandrasekaran, Varun and Eldan, Ronen and Gehrke, Johannes and Horvitz, Eric and Kamar, Ece and Lee, Peter and Lee, Yin Tat and Li, Yuanzhi and Lundberg, Scott and others}, 103 | journal={arXiv preprint arXiv:2303.12712}, 104 | year={2023} 105 | } 106 | 107 | @misc{tensorflow, 108 | title={ {TensorFlow}: Large-Scale Machine Learning on Heterogeneous Systems}, 109 | url={https://www.tensorflow.org/}, 110 | note={Software available from tensorflow.org}, 111 | author={ 112 | Mart\'{i}n~Abadi and 113 | Ashish~Agarwal and 114 | Paul~Barham and 115 | Eugene~Brevdo and 116 | Zhifeng~Chen and 117 | Craig~Citro and 118 | Greg~S.~Corrado and 119 | Andy~Davis and 120 | Jeffrey~Dean and 121 | Matthieu~Devin and 122 | Sanjay~Ghemawat and 123 | Ian~Goodfellow and 124 | Andrew~Harp and 125 | Geoffrey~Irving and 126 | Michael~Isard and 127 | Yangqing Jia and 128 | Rafal~Jozefowicz and 129 | Lukasz~Kaiser and 130 | Manjunath~Kudlur and 131 | Josh~Levenberg and 132 | Dandelion~Man\'{e} and 133 | Rajat~Monga and 134 | Sherry~Moore and 135 | Derek~Murray and 136 | Chris~Olah and 137 | Mike~Schuster and 138 | Jonathon~Shlens and 139 | Benoit~Steiner and 140 | Ilya~Sutskever and 141 | Kunal~Talwar and 142 | Paul~Tucker and 143 | Vincent~Vanhoucke and 144 | Vijay~Vasudevan and 145 | Fernanda~Vi\'{e}gas and 146 | Oriol~Vinyals and 147 | Pete~Warden and 148 | Martin~Wattenberg and 149 | Martin~Wicke and 150 | Yuan~Yu and 151 | Xiaoqiang~Zheng}, 152 | year={2015}, 153 | } 154 | 155 | @misc{llamacpp, title={llama.cpp: Port of facebook’s Llama model in C/C++}, url={https://github.com/ggerganov/llama.cpp}, journal={GitHub}, author={Gerganov, 
Georgi}, year={2023},} 156 | 157 | @misc{gpt4all, 158 | author = {Yuvanesh Anand and Zach Nussbaum and Brandon Duderstadt and Benjamin Schmidt and Andriy Mulyar}, 159 | title = {GPT4All: Training an Assistant-style Chatbot with Large Scale Data Distillation from GPT-3.5-Turbo}, 160 | year = {2023}, 161 | publisher = {GitHub}, 162 | journal = {GitHub repository}, 163 | howpublished = {\url{https://github.com/nomic-ai/gpt4all}}, 164 | } 165 | 166 | @misc{langchain, title={LangChain: Building applications with LLMs through composability}, url={https://github.com/hwchase17/langchain}, journal={GitHub}, author={Chase, Harrison}, year={2022},} 167 | 168 | @article{kaplan2020scaling, 169 | title={Scaling laws for neural language models}, 170 | author={Kaplan, Jared and McCandlish, Sam and Henighan, Tom and Brown, Tom B and Chess, Benjamin and Child, Rewon and Gray, Scott and Radford, Alec and Wu, Jeffrey and Amodei, Dario}, 171 | journal={arXiv preprint arXiv:2001.08361}, 172 | year={2020} 173 | } 174 | 175 | @article{hoffmann2022training, 176 | title={Training compute-optimal large language models}, 177 | author={Hoffmann, Jordan and Borgeaud, Sebastian and Mensch, Arthur and Buchatskaya, Elena and Cai, Trevor and Rutherford, Eliza and Casas, Diego de Las and Hendricks, Lisa Anne and Welbl, Johannes and Clark, Aidan and others}, 178 | journal={arXiv preprint arXiv:2203.15556}, 179 | year={2022} 180 | } 181 | 182 | @article{mialon2023augmented, 183 | title={Augmented language models: a survey}, 184 | author={Mialon, Gr{\'e}goire and Dess{\`\i}, Roberto and Lomeli, Maria and Nalmpantis, Christoforos and Pasunuru, Ram and Raileanu, Roberta and Rozi{\`e}re, Baptiste and Schick, Timo and Dwivedi-Yu, Jane and Celikyilmaz, Asli and others}, 185 | journal={arXiv preprint arXiv:2302.07842}, 186 | year={2023} 187 | } 188 | 189 | @article{zhao2023survey, 190 | title={A survey of large language models}, 191 | author={Zhao, Wayne Xin and Zhou, Kun and Li, Junyi and Tang, Tianyi and Wang, Xiaolei and Hou, Yupeng and Min, Yingqian and Zhang, Beichen and Zhang, Junjie and Dong, Zican and others}, 192 | journal={arXiv preprint arXiv:2303.18223}, 193 | year={2023} 194 | } 195 | 196 | @inproceedings{ctranslate2, 197 | title={The OpenNMT neural machine translation toolkit: 2020 edition}, 198 | author={Klein, Guillaume and Hernandez, Fran{\c{c}}ois and Nguyen, Vincent and Senellart, Jean}, 199 | booktitle={Proceedings of the 14th Conference of the Association for Machine Translation in the Americas (Volume 1: Research Track)}, 200 | pages={102--109}, 201 | year={2020} 202 | } 203 | 204 | @article{lamini-lm, 205 | author = {Minghao Wu and 206 | Abdul Waheed and 207 | Chiyu Zhang and 208 | Muhammad Abdul-Mageed and 209 | Alham Fikri Aji 210 | }, 211 | title = {LaMini-LM: A Diverse Herd of Distilled Models from Large-Scale Instructions}, 212 | journal = {CoRR}, 213 | volume = {abs/2304.14402}, 214 | year = {2023}, 215 | url = {https://arxiv.org/abs/2304.14402}, 216 | eprinttype = {arXiv}, 217 | eprint = {2304.14402} 218 | } 219 | 220 | @article{openchat, 221 | title={OpenChat: Advancing Open-source Language Models with Mixed-Quality Data}, 222 | author={Wang, Guan and Cheng, Sijie and Zhan, Xianyuan and Li, Xiangang and Song, Sen and Liu, Yang}, 223 | journal={arXiv preprint arXiv:2309.11235}, 224 | year={2023} 225 | } -------------------------------------------------------------------------------- /languagemodels/embeddings.py: -------------------------------------------------------------------------------- 1 | import 
numpy as np 2 | from time import perf_counter 3 | 4 | from languagemodels.models import get_model, get_model_info 5 | 6 | 7 | def embed(docs): 8 | """Compute embeddings for a batch of documents 9 | 10 | >>> embed(["I love Python!"])[0].shape 11 | (384,) 12 | 13 | >>> embed(["I love Python!"])[0][-3:] 14 | array([0.1..., 0.1..., 0.0...], dtype=float32) 15 | 16 | >>> float(np.linalg.norm(embed(["I love Python!"])[0])) 17 | 1.0 18 | 19 | Embeddings are computed by running the first 512 tokens of each doc 20 | through a forward pass of the embedding model. The last hidden state 21 | of the model is mean pooled to produce a single vector. 22 | 23 | Documents will be processed in batches. The batch size is fixed at 64 24 | as this size was found to maximize throughput on a number of test 25 | systems while limiting memory usage. 26 | """ 27 | 28 | tokenizer, model = get_model("embedding") 29 | model_info = get_model_info("embedding") 30 | 31 | start_time = perf_counter() 32 | 33 | tokens = [tokenizer.encode(doc[:8192]).ids[:512] for doc in docs] 34 | 35 | def mean_pool(last_hidden_state): 36 | embedding = np.mean(last_hidden_state, axis=0) 37 | embedding = embedding / np.linalg.norm(embedding) 38 | return embedding 39 | 40 | bs = 64 41 | embeddings = [] 42 | for i in range(0, len(docs), bs): 43 | outputs = model.forward_batch(tokens[i : i + bs]) 44 | embeddings += [mean_pool(lhs) for lhs in np.array(outputs.last_hidden_state)] 45 | 46 | model_info["requests"] = model_info.get("requests", 0) + len(tokens) 47 | 48 | in_toks = sum(len(d) for d in tokens) 49 | model_info["input_tokens"] = model_info.get("input_tokens", 0) + in_toks 50 | 51 | runtime = perf_counter() - start_time 52 | model_info["runtime"] = model_info.get("runtime", 0) + runtime 53 | 54 | return embeddings 55 | 56 | 57 | def search(query, docs, count=16): 58 | """Return `count` `docs` sorted by match against `query` 59 | 60 | :param query: Input to match in search 61 | :param docs: List of docs to search against 62 | :param count: Number of documents to return 63 | :return: List of (doc_num, score) tuples sorted by score descending 64 | """ 65 | 66 | prefix = get_model_info("embedding").get("query_prefix", "") 67 | 68 | query_embedding = embed([f"{prefix}{query}"])[0] 69 | 70 | scores = np.dot([d.embedding for d in docs], query_embedding) 71 | 72 | return [(i, scores[i]) for i in reversed(np.argsort(scores)[-count:])] 73 | 74 | 75 | def get_token_ids(doc): 76 | """Return list of token ids for a document 77 | 78 | Note that the tokenizer used here is from the generative model. 79 | 80 | This is used for token counting for the context, not for tokenization 81 | before embedding. 82 | """ 83 | 84 | generative_tokenizer, _ = get_model("instruct", tokenizer_only=True) 85 | 86 | # We need to disable and re-enable truncation here 87 | # This allows us to tokenize very large documents 88 | # We won't be feeding the tokens themselves to a model, so this 89 | # shouldn't cause any problems. 
90 | trunk = generative_tokenizer.truncation 91 | if trunk: 92 | generative_tokenizer.no_truncation() 93 | ids = generative_tokenizer.encode(doc, add_special_tokens=False).ids 94 | if trunk: 95 | generative_tokenizer.enable_truncation( 96 | trunk["max_length"], stride=trunk["stride"], strategy=trunk["strategy"] 97 | ) 98 | 99 | return ids 100 | 101 | 102 | def chunk_doc(doc, name="", chunk_size=64, chunk_overlap=8): 103 | """Break a document into chunks 104 | 105 | :param doc: Document to chunk 106 | :param name: Optional document name 107 | :param chunk_size: Length of individual chunks in tokens 108 | :param chunk_overlap: Number of tokens to overlap when breaking chunks 109 | :return: List of strings representing the chunks 110 | 111 | The simple chunking approach used here consists of the following: 112 | 113 | 1. Attempt to chunk the remainder of the document. 114 | 2. If we can't fit all tokens in chunk_size, backtrack to look for a 115 | meaningful cut point. 116 | 3. If a cut point is found, use that as the chunk boundary. There will 117 | be no overlap between this chunk and the next in this case. 118 | 4. If a cut point is not found, use chunk_size as the boundary. There 119 | will be chunk_overlap overlapping tokens starting the next chunk. 120 | 5. Repeat until the entire document has been split into chunks. 121 | 122 | >>> chunk_doc("") 123 | [] 124 | 125 | >>> chunk_doc( 126 | ... "It was the best of times, it was the worst of times, it was the age " 127 | ... "of wisdom, it was the age of foolishness, it was the epoch of belief, " 128 | ... "it was the epoch of incredulity, it was the season of Light, it was " 129 | ... "the season of Darkness, it was the spring of hope, it was the winter " 130 | ... "of despair, we had everything before us, we had nothing before us, we " 131 | ... "were all going direct to Heaven, we were all going direct the other " 132 | ... "way—in short, the period was so far like the present period, that " 133 | ... "some of its noisiest authorities insisted on its being received, for " 134 | ... "good or for evil, in the superlative degree of comparison only.") 135 | ['It was the best of times...'] 136 | 137 | >>> chunk_doc( 138 | ... "One morning, when Gregor Samsa woke from troubled dreams, he found " 139 | ... "himself transformed in his bed into a horrible vermin. He lay on his " 140 | ... "armour-like back, and if he lifted his head a little he could see " 141 | ... "his brown belly, slightly domed and divided by arches into stiff " 142 | ... "sections. The bedding was hardly able to cover it and seemed ready " 143 | ... "to slide off any moment. His many legs, pitifully thin compared with " 144 | ... "the size of the rest of him, waved about helplessly as he looked.") 145 | ['One morning, ...'] 146 | 147 | >>> chunk_doc("Hello") 148 | ['Hello'] 149 | 150 | >>> chunk_doc("Hello " * 65) 151 | ['Hello Hello...', 'Hello...'] 152 | 153 | >>> chunk_doc("Hello world. " * 24)[0] 154 | 'Hello world. ...Hello world.' 155 | 156 | >>> len(chunk_doc("Hello world. " * 20)) 157 | 1 158 | 159 | >>> len(chunk_doc("Hello world. " * 24)) 160 | 2 161 | 162 | # Check to make sure sentences aren't broken on decimal points 163 | >>> chunk_doc(('z. ' + ' 37.468 ' * 5) * 3)[0] 164 | 'z. 37.468 ...z.' 
165 | """ 166 | generative_tokenizer, _ = get_model("instruct", tokenizer_only=True) 167 | 168 | tokens = get_token_ids(doc) 169 | 170 | separator_tokens = [".", "!", "?", ").", "\n\n", "\n", '."'] 171 | 172 | separators = [get_token_ids(t)[-1] for t in separator_tokens] 173 | 174 | name_tokens = [] 175 | 176 | label = f"From {name} document:" if name else "" 177 | 178 | if name: 179 | name_tokens = get_token_ids(label) 180 | 181 | i = 0 182 | chunks = [] 183 | chunk = name_tokens.copy() 184 | while i < len(tokens): 185 | token = tokens[i] 186 | chunk.append(token) 187 | i += 1 188 | 189 | # Save the last chunk if we're done 190 | if i == len(tokens): 191 | chunks.append(generative_tokenizer.decode(chunk)) 192 | break 193 | 194 | if len(chunk) == chunk_size: 195 | # Backtrack to find a reasonable cut point 196 | for j in range(1, chunk_size // 2): 197 | if chunk[chunk_size - j] in separators: 198 | ctx = generative_tokenizer.decode( 199 | chunk[chunk_size - j : chunk_size - j + 2] 200 | ) 201 | if " " in ctx or "\n" in ctx: 202 | # Found a good separator 203 | text = generative_tokenizer.decode(chunk[: chunk_size - j + 1]) 204 | chunks.append(text) 205 | chunk = name_tokens + chunk[chunk_size - j + 1 :] 206 | break 207 | else: 208 | # No semantically meaningful cutpoint found 209 | # Default to a hard cut 210 | text = generative_tokenizer.decode(chunk) 211 | chunks.append(text) 212 | # Share some overlap with next chunk 213 | overlap = max( 214 | chunk_overlap, chunk_size - len(name_tokens) - (len(tokens) - i) 215 | ) 216 | chunk = name_tokens + chunk[-overlap:] 217 | 218 | return chunks 219 | 220 | 221 | class Document: 222 | """ 223 | A document used for semantic search 224 | 225 | Documents have content and an embedding that is used to match the content 226 | against other semantically similar documents. 227 | """ 228 | 229 | def __init__(self, content, name="", embedding=None): 230 | self.content = content 231 | self.embedding = embedding if embedding is not None else embed([content])[0] 232 | self.name = name 233 | 234 | 235 | class RetrievalContext: 236 | """ 237 | Provides a context for document retrieval 238 | 239 | Documents are embedded and cached for later search. 240 | 241 | Example usage: 242 | 243 | >>> rc = RetrievalContext() 244 | >>> rc.store("Paris is in France.") 245 | >>> rc.store("The sky is blue.") 246 | >>> rc.store("Mars is a planet.") 247 | >>> rc.get_match("Paris is in France.") 248 | 'Paris is in France.' 249 | 250 | >>> rc.get_match("Where is Paris?") 251 | 'Paris is in France.' 252 | 253 | >>> rc.clear() 254 | >>> rc.get_match("Where is Paris?") 255 | 256 | >>> rc.clear() 257 | >>> rc.store(' '.join(['Python'] * 4096)) 258 | >>> len(rc.chunks) 259 | 73 260 | 261 | >>> rc.clear() 262 | >>> rc.store(' '.join(['Python'] * 232)) 263 | >>> len(rc.chunks) 264 | 4 265 | 266 | >>> rc.get_context("What is Python?") 267 | 'Python Python Python...' 
268 | 269 | >>> [len(c.content.split()) for c in rc.chunks] 270 | [64, 64, 64, 64] 271 | 272 | >>> len(rc.get_context("What is Python?").split()) 273 | 128 274 | """ 275 | 276 | def __init__(self, chunk_size=64, chunk_overlap=8): 277 | self.chunk_size = chunk_size 278 | self.chunk_overlap = chunk_overlap 279 | self.clear() 280 | 281 | def clear(self): 282 | self.docs = [] 283 | self.chunks = [] 284 | 285 | def store(self, doc, name=""): 286 | """Stores a document along with embeddings 287 | 288 | This stores both the document as well as document chunks 289 | 290 | >>> rc = RetrievalContext() 291 | >>> rc.clear() 292 | >>> rc.store(' '.join(['Python'] * 233)) 293 | >>> len(rc.chunks) 294 | 5 295 | 296 | >>> rc.clear() 297 | >>> rc.store(' '.join(['Python'] * 232)) 298 | >>> len(rc.chunks) 299 | 4 300 | 301 | >>> rc.clear() 302 | >>> rc.store('Python') 303 | >>> len(rc.chunks) 304 | 1 305 | 306 | >>> rc.clear() 307 | >>> rc.store('It is a language.', 'Python') 308 | >>> len(rc.chunks) 309 | 1 310 | >>> [c.content for c in rc.chunks] 311 | ['From Python document: It is a language.'] 312 | 313 | >>> rc = RetrievalContext() 314 | >>> rc.clear() 315 | >>> rc.store(' '.join(['details'] * 217), 'Python') 316 | >>> len(rc.chunks) 317 | 5 318 | 319 | >>> rc.clear() 320 | >>> rc.store(' '.join(['details'] * 216), 'Python') 321 | >>> len(rc.chunks) 322 | 4 323 | >>> [c.content for c in rc.chunks] 324 | ['From Python document: details details details...'] 325 | """ 326 | 327 | if doc not in self.docs: 328 | self.docs.append(Document(doc, name=name)) 329 | self.store_chunks(doc, name) 330 | 331 | def store_chunks(self, doc, name=""): 332 | chunks = chunk_doc(doc, name, self.chunk_size, self.chunk_overlap) 333 | 334 | embeddings = embed(chunks) 335 | 336 | for embedding, chunk in zip(embeddings, chunks): 337 | self.chunks.append(Document(chunk, embedding=embedding)) 338 | 339 | def get_context(self, query, max_tokens=128): 340 | """Gets context matching a query 341 | 342 | Context is capped by token length and is retrieved from stored 343 | document chunks 344 | """ 345 | 346 | if len(self.chunks) == 0: 347 | return None 348 | 349 | results = search(query, self.chunks) 350 | 351 | chunks = [] 352 | tokens = 0 353 | 354 | for chunk_id, score in results: 355 | chunk = self.chunks[chunk_id].content 356 | chunk_tokens = len(get_token_ids(chunk)) 357 | if tokens + chunk_tokens <= max_tokens and score > 0.1: 358 | chunks.append(chunk) 359 | tokens += chunk_tokens 360 | 361 | context = "\n\n".join(chunks) 362 | 363 | return context 364 | 365 | def get_match(self, query): 366 | if len(self.docs) == 0: 367 | return None 368 | 369 | return self.docs[search(query, self.docs)[0][0]].content 370 | -------------------------------------------------------------------------------- /languagemodels/inference.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import requests 3 | import re 4 | import os 5 | import sys 6 | from time import perf_counter 7 | 8 | from languagemodels.models import get_model, get_model_info 9 | from languagemodels.config import config 10 | 11 | 12 | class InferenceException(Exception): 13 | pass 14 | 15 | 16 | def truncate_prompt(prompt): 17 | """Truncates a prompt to the maximum length allowed by the config""" 18 | max_prompt_length = config["max_prompt_length"] 19 | if len(prompt) > max_prompt_length: 20 | print( 21 | f"Warning: Prompt truncated from {len(prompt)} to " 22 | f"{max_prompt_length} characters to avoid OOM." 
23 | ) 24 | return prompt[:max_prompt_length] 25 | return prompt 26 | 27 | 28 | def list_tokens(prompt): 29 | """Generates a list of tokens for a supplied prompt 30 | 31 | >>> list_tokens("Hello, world!") # doctest: +SKIP 32 | [('▁Hello', 8774), (',', 6), ('▁world', 296), ('!', 55)] 33 | 34 | >>> list_tokens("Hello, world!") 35 | [('...Hello', ...), ... ('...world', ...), ...] 36 | """ 37 | prompt = truncate_prompt(prompt) 38 | tokenizer, _ = get_model("instruct") 39 | 40 | output = tokenizer.encode(prompt, add_special_tokens=False) 41 | tokens = output.tokens 42 | ids = output.ids 43 | 44 | return list(zip(tokens, ids)) 45 | 46 | 47 | def generate_ts(engine, prompt, max_tokens=200): 48 | """Generates a single text response for a prompt from a textsynth server 49 | 50 | The server and API key are provided as environment variables: 51 | 52 | LANGUAGEMODELS_TS_SERVER is the server such as http://localhost:8080 53 | LANGUAGEMODELS_TS_KEY is the API key 54 | """ 55 | apikey = os.environ.get("LANGUAGEMODELS_TS_KEY") or "" 56 | server = os.environ.get("LANGUAGEMODELS_TS_SERVER") or "https://api.textsynth.com" 57 | 58 | response = requests.post( 59 | f"{server}/v1/engines/{engine}/completions", 60 | headers={"Authorization": f"Bearer {apikey}"}, 61 | json={"prompt": prompt, "max_tokens": max_tokens}, 62 | ) 63 | resp = response.json() 64 | if "text" in resp: 65 | return resp["text"] 66 | else: 67 | raise InferenceException(f"TextSynth error: {resp}") 68 | 69 | 70 | def generate_oa(engine, prompt, max_tokens=200, temperature=0): 71 | """Generates a single text response for a prompt using OpenAI 72 | 73 | The server and API key are provided as environment variables: 74 | 75 | LANGUAGEMODELS_OA_KEY is the API key 76 | """ 77 | apikey = os.environ.get("LANGUAGEMODELS_OA_KEY") 78 | 79 | response = requests.post( 80 | "https://api.openai.com/v1/completions", 81 | headers={ 82 | "Authorization": f"Bearer {apikey}", 83 | "Content-Type": "application/json", 84 | }, 85 | json={ 86 | "model": engine, 87 | "prompt": prompt, 88 | "max_tokens": max_tokens, 89 | "temperature": temperature, 90 | }, 91 | ) 92 | resp = response.json() 93 | 94 | try: 95 | return resp["choices"][0]["text"] 96 | except KeyError: 97 | raise InferenceException(f"OpenAI error: {resp}") 98 | 99 | 100 | def chat_oa(engine, prompt, max_tokens=200, temperature=0): 101 | """Generates a single text response for a prompt using OpenAI 102 | 103 | The server and API key are provided as environment variables: 104 | 105 | LANGUAGEMODELS_OA_KEY is the API key 106 | """ 107 | apikey = os.environ.get("LANGUAGEMODELS_OA_KEY") 108 | 109 | response = requests.post( 110 | "https://api.openai.com/v1/chat/completions", 111 | headers={ 112 | "Authorization": f"Bearer {apikey}", 113 | "Content-Type": "application/json", 114 | }, 115 | json={ 116 | "model": engine, 117 | "messages": [{"role": "user", "content": prompt}], 118 | "max_tokens": max_tokens, 119 | "temperature": temperature, 120 | }, 121 | ) 122 | resp = response.json() 123 | 124 | try: 125 | return resp["choices"][0]["message"]["content"] 126 | except KeyError: 127 | raise InferenceException(f"OpenAI error: {resp}") 128 | 129 | 130 | def stream_results(results, tokenizer): 131 | """Map a token iterator to a substring iterator""" 132 | tokens = [] 133 | last_len = 0 134 | 135 | for result in results: 136 | tokens.append(result.token_id) 137 | text = tokenizer.decode(tokens) 138 | yield text[last_len:] 139 | last_len = len(text) 140 | 141 | 142 | def echo_results(results, tokenizer): 143 | """Output 
results to stderr as they are collected""" 144 | tokens = [] 145 | last_len = 0 146 | 147 | for result in results: 148 | tokens.append(result.token_id) 149 | text = tokenizer.decode(tokens) 150 | sys.stderr.write(text[last_len:]) 151 | sys.stderr.flush() 152 | last_len = len(text) 153 | 154 | sys.stderr.write("\n\n") 155 | sys.stderr.flush() 156 | return tokens 157 | 158 | 159 | def generate( 160 | instructions: List[str], 161 | max_tokens: int = 200, 162 | temperature: float = 0.1, 163 | topk: int = 1, 164 | repetition_penalty: float = 0.0, 165 | prefix: str = "", 166 | suppress: List[str] = [], 167 | model: str = "instruct", 168 | stream: bool = False, 169 | ): 170 | """Generates completions for a prompt 171 | 172 | This may use a local model, or it may make an API call to an external 173 | model if API keys are available. 174 | 175 | >>> generate(["What is the capital of France?"]) 176 | ['...Paris...'] 177 | 178 | >>> list(generate(["What is the capital of France?"], stream=True)) 179 | ['...Paris...'] 180 | """ 181 | if os.environ.get("LANGUAGEMODELS_TS_KEY") or os.environ.get( 182 | "LANGUAGEMODELS_TS_SERVER" 183 | ): 184 | return generate_ts("flan_t5_xxl_q4", instructions, max_tokens).strip() 185 | 186 | if os.environ.get("LANGUAGEMODELS_OA_KEY"): 187 | return chat_oa("gpt-3.5-turbo", instructions, max_tokens).strip() 188 | 189 | tokenizer, model = get_model(model) 190 | 191 | start_time = perf_counter() 192 | 193 | suppress = [tokenizer.encode(s, add_special_tokens=False).tokens for s in suppress] 194 | 195 | model_info = get_model_info("instruct") 196 | 197 | fmt = model_info.get("prompt_fmt", "{instruction}") 198 | 199 | if repetition_penalty == 0.0: 200 | repetition_penalty = model_info.get("repetition_penalty", 1.3) 201 | 202 | prompts = [fmt.replace("{instruction}", inst) for inst in instructions] 203 | truncated_prompts = [truncate_prompt(p) for p in prompts] 204 | 205 | prompts_tok = [tokenizer.encode(p).tokens for p in truncated_prompts] 206 | 207 | outputs_ids = [] 208 | if hasattr(model, "translate_batch"): 209 | prefix = tokenizer.encode(prefix, add_special_tokens=False).tokens 210 | if stream or (config["echo"] and len(prompts_tok) == 1): 211 | results = model.generate_tokens( 212 | prompts_tok[0], 213 | target_prefix=prefix, 214 | repetition_penalty=repetition_penalty, 215 | max_decoding_length=max_tokens, 216 | sampling_temperature=temperature, 217 | sampling_topk=topk, 218 | suppress_sequences=suppress, 219 | ) 220 | 221 | if stream: 222 | return stream_results(results, tokenizer) 223 | else: 224 | outputs_ids = [echo_results(results, tokenizer)] 225 | else: 226 | results = model.translate_batch( 227 | prompts_tok, 228 | target_prefix=[prefix] * len(prompts), 229 | repetition_penalty=repetition_penalty, 230 | max_decoding_length=max_tokens, 231 | sampling_temperature=temperature, 232 | sampling_topk=topk, 233 | suppress_sequences=suppress, 234 | beam_size=1, 235 | ) 236 | outputs_tokens = [r.hypotheses[0] for r in results] 237 | for output in outputs_tokens: 238 | outputs_ids.append([tokenizer.token_to_id(t) for t in output]) 239 | else: 240 | if stream or (config["echo"] and len(prompts_tok) == 1): 241 | results = model.generate_tokens( 242 | prompts_tok, 243 | repetition_penalty=repetition_penalty, 244 | max_length=max_tokens, 245 | sampling_temperature=temperature, 246 | sampling_topk=topk, 247 | suppress_sequences=suppress, 248 | ) 249 | 250 | if stream: 251 | return stream_results(results, tokenizer) 252 | else: 253 | outputs_ids = [echo_results(results, 
tokenizer)] 254 | else: 255 | results = model.generate_batch( 256 | prompts_tok, 257 | repetition_penalty=repetition_penalty, 258 | max_length=max_tokens, 259 | sampling_temperature=temperature, 260 | sampling_topk=topk, 261 | suppress_sequences=suppress, 262 | beam_size=1, 263 | include_prompt_in_result=False, 264 | ) 265 | outputs_ids = [r.sequences_ids[0] for r in results] 266 | 267 | model_info["requests"] = model_info.get("requests", 0) + len(prompts) 268 | 269 | in_toks = sum(len(p) for p in prompts_tok) 270 | model_info["input_tokens"] = model_info.get("input_tokens", 0) + in_toks 271 | 272 | out_toks = sum(len(o) for o in outputs_ids) 273 | model_info["output_tokens"] = model_info.get("output_tokens", 0) + out_toks 274 | 275 | elapsed_time = perf_counter() - start_time 276 | model_info["runtime"] = model_info.get("runtime", 0) + elapsed_time 277 | 278 | return [tokenizer.decode(i, skip_special_tokens=True).lstrip() for i in outputs_ids] 279 | 280 | 281 | def rank_instruct(inputs, targets): 282 | """Sorts a list of targets by their probabilities 283 | 284 | >>> rank_instruct(["Classify positive or negative: I love python. Classification:"], 285 | ... ['positive', 'negative']) 286 | [['positive', 'negative']] 287 | 288 | >>> rank_instruct(["Classify fantasy or documentary: " 289 | ... "The wizard raised their wand. Classification:"], 290 | ... ['fantasy', 'documentary']) 291 | [['fantasy', 'documentary']] 292 | 293 | >>> rank_instruct(["Say six", "Say seven"], ["six", "seven"]) 294 | [['six', 'seven'], ['seven', 'six']] 295 | """ 296 | tokenizer, model = get_model("instruct") 297 | 298 | model_info = get_model_info("instruct") 299 | fmt = model_info.get("prompt_fmt", "{instruction}") 300 | inputs = [fmt.replace("{instruction}", inst) for inst in inputs] 301 | inputs = [truncate_prompt(i) for i in inputs] 302 | 303 | targ_tok = [tokenizer.encode(t, add_special_tokens=False).tokens for t in targets] 304 | targ_tok *= len(inputs) 305 | 306 | in_tok = [] 307 | for input in inputs: 308 | toks = [tokenizer.encode(input).tokens] 309 | in_tok += toks * len(targets) 310 | 311 | if "Generator" in str(type(model)): 312 | scores = model.score_batch([i + t for i, t in zip(in_tok, targ_tok)]) 313 | else: 314 | scores = model.score_batch(in_tok, target=targ_tok) 315 | 316 | ret = [] 317 | for i in range(0, len(inputs) * len(targets), len(targets)): 318 | logprobs = [sum(r.log_probs) for r in scores[i : i + len(targets)]] 319 | results = sorted(zip(targets, logprobs), key=lambda r: -r[1]) 320 | ret.append([r[0] for r in results]) 321 | 322 | return ret 323 | 324 | 325 | def parse_chat(prompt): 326 | """Converts a chat prompt using special tokens to a plain-text prompt 327 | 328 | This is useful for prompting generic models that have not been fine-tuned 329 | for chat using specialized tokens. 330 | 331 | >>> parse_chat('User: What time is it?') 332 | Traceback (most recent call last): 333 | .... 334 | inference.InferenceException: Chat prompt must end with 'Assistant:' 335 | 336 | >>> parse_chat('''User: What time is it? 337 | ... 338 | ... Assistant:''') 339 | [{'role': 'user', 'content': 'What time is it?'}] 340 | 341 | >>> parse_chat(''' 342 | ... A helpful assistant 343 | ... 344 | ... User: What time is it? 345 | ... 346 | ... Assistant: 347 | ... ''') 348 | [{'role': 'system', 'content': 'A helpful assistant'}, 349 | {'role': 'user', 'content': 'What time is it?'}] 350 | 351 | >>> parse_chat(''' 352 | ... A helpful assistant 353 | ... 354 | ... User: What time is it? 355 | ... 356 | ... 
Assistant: The time is 357 | ... ''') 358 | Traceback (most recent call last): 359 | .... 360 | inference.InferenceException: Final assistant message must be blank 361 | 362 | >>> parse_chat(''' 363 | ... A helpful assistant 364 | ... 365 | ... User: First para 366 | ... 367 | ... Second para 368 | ... 369 | ... Assistant: 370 | ... ''') 371 | [{'role': 'system', 'content': 'A helpful assistant'}, 372 | {'role': 'user', 'content': 'First para\\n\\nSecond para'}] 373 | 374 | >>> parse_chat(''' 375 | ... A helpful assistant 376 | ... 377 | ... User: What time is it? 378 | ... 379 | ... InvalidRole: Nothing 380 | ... 381 | ... Assistant: 382 | ... ''') 383 | Traceback (most recent call last): 384 | .... 385 | inference.InferenceException: Invalid chat role: invalidrole 386 | """ 387 | 388 | if not re.match(r"^\s*\w+:", prompt): 389 | prompt = "System: " + prompt 390 | 391 | prompt = "\n\n" + prompt 392 | 393 | chunks = re.split(r"[\r\n]\s*(\w+):", prompt, flags=re.M) 394 | chunks = [m.strip() for m in chunks if m.strip()] 395 | 396 | messages = [] 397 | 398 | for i in range(0, len(chunks), 2): 399 | role = chunks[i].lower() 400 | 401 | try: 402 | content = chunks[i + 1] 403 | content = re.sub(r"\s*\n\n\s*", "\n\n", content) 404 | except IndexError: 405 | content = "" 406 | messages.append({"role": role, "content": content}) 407 | 408 | for message in messages: 409 | if message["role"] not in ["system", "user", "assistant"]: 410 | raise InferenceException(f"Invalid chat role: {message['role']}") 411 | 412 | if messages[-1]["role"] != "assistant": 413 | raise InferenceException("Chat prompt must end with 'Assistant:'") 414 | 415 | if messages[-1]["content"] != "": 416 | raise InferenceException("Final assistant message must be blank") 417 | 418 | return messages[:-1] 419 | -------------------------------------------------------------------------------- /languagemodels/__init__.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import datetime 3 | import json 4 | import re 5 | from typing import overload 6 | 7 | from languagemodels.config import config 8 | from languagemodels.preprocess import get_html_paragraphs 9 | from languagemodels.inference import ( 10 | generate, 11 | rank_instruct, 12 | parse_chat, 13 | list_tokens, 14 | ) 15 | from languagemodels import embeddings 16 | 17 | docs = embeddings.RetrievalContext() 18 | 19 | 20 | def complete(prompt: str) -> str: 21 | """Provide one completion for a given open-ended prompt 22 | 23 | :param prompt: Prompt to use as input to the model 24 | :return: Completion returned from the language model 25 | 26 | Examples: 27 | 28 | >>> complete("Luke thought that he") #doctest: +SKIP 29 | 'was going to be a doctor.' 30 | 31 | >>> complete("There are many mythical creatures who") #doctest: +SKIP 32 | 'are able to fly' 33 | 34 | >>> complete("She hid in her room until") #doctest: +SKIP 35 | 'she was sure she was safe' 36 | """ 37 | 38 | result = generate( 39 | ["Write a sentence"], 40 | prefix=prompt, 41 | max_tokens=config["max_tokens"], 42 | temperature=0.7, 43 | topk=40, 44 | )[0] 45 | 46 | if result.startswith(prompt): 47 | prefix_length = len(prompt) 48 | return result[prefix_length:] 49 | else: 50 | return result 51 | 52 | 53 | @overload 54 | def do(prompt: list) -> list: 55 | ... 56 | 57 | 58 | @overload 59 | def do(prompt: str) -> str: 60 | ... 
61 | 62 | 63 | def do(prompt, choices=None): 64 | """Follow a single-turn instructional prompt 65 | 66 | :param prompt: Instructional prompt(s) to follow 67 | :param choices: If provided, outputs are restricted to values in choices 68 | :return: Completion returned from the language model 69 | 70 | Note that this function is overloaded to return a list of results if 71 | a list of prompts is provided and a single string if a single 72 | prompt is provided as a string 73 | 74 | Examples: 75 | 76 | >>> do("Translate Spanish to English: Hola mundo!") #doctest: +SKIP 77 | 'Hello world!' 78 | 79 | >>> do("Pick the planet from the list: baseball, Texas, Saturn") 80 | '...Saturn...' 81 | 82 | >>> do(["Pick the planet from the list: baseball, Texas, Saturn"] * 2) 83 | ['...Saturn...', '...Saturn...'] 84 | 85 | >>> do(["Say red", "Say blue"], choices=["red", "blue"]) 86 | ['red', 'blue'] 87 | 88 | >>> do("Classify as positive or negative: LLMs are bad", 89 | ... choices=["Positive", "Negative"]) 90 | 'Negative' 91 | 92 | >>> do("Classify as positive or negative: LLMs are great", 93 | ... choices=["Positive", "Negative"]) 94 | 'Positive' 95 | """ 96 | 97 | prompts = [prompt] if isinstance(prompt, str) else prompt 98 | 99 | if choices: 100 | results = [r[0] for r in rank_instruct(prompts, choices)] 101 | else: 102 | results = generate(prompts, max_tokens=config["max_tokens"], topk=1) 103 | 104 | return results[0] if isinstance(prompt, str) else results 105 | 106 | 107 | @overload 108 | def embed(doc: list) -> list: 109 | ... 110 | 111 | 112 | @overload 113 | def embed(doc: str) -> str: 114 | ... 115 | 116 | 117 | def embed(doc): 118 | """Create embedding for a document 119 | 120 | :param doc: Document(s) to embed 121 | :return: Embedding 122 | 123 | Note that this function is overloaded to return a list of embeddings if 124 | a list of docs is provided and a single embedding if a single 125 | doc is provided as a string 126 | 127 | Examples: 128 | 129 | >>> embed("Hello, world") 130 | [-0.0...] 131 | 132 | >>> embed(["Hello", "world"]) 133 | [[-0.0...]] 134 | """ 135 | 136 | docs = [doc] if isinstance(doc, str) else doc 137 | 138 | # Create embeddings and convert to lists of floats 139 | emb = [[float(n) for n in e] for e in embeddings.embed(docs)] 140 | 141 | return emb[0] if isinstance(doc, str) else emb 142 | 143 | 144 | def chat(prompt: str) -> str: 145 | """Get new message from chat-optimized language model 146 | 147 | The `prompt` for this model is provided as a series of messages as a single 148 | plain-text string. Several special tokens are used to delineate chat 149 | messages. 150 | 151 | - `system:` - Indicates the start of a system message providing 152 | instructions about how the assistant should behave. 153 | - `user:` - Indicates the start of a prompter (typically user) 154 | message. 155 | - `assistant:` - Indicates the start of an assistant message. 156 | 157 | A complete prompt may look something like this: 158 | 159 | ``` 160 | Assistant is helpful and harmless 161 | 162 | User: What is the capital of Germany? 163 | 164 | Assistant: The capital of Germany is Berlin. 165 | 166 | User: How many people live there? 167 | 168 | Assistant: 169 | ``` 170 | 171 | The completion from the language model is returned. 172 | 173 | :param prompt: Prompt using formatting described above 174 | :return: Completion returned from the language model 175 | 176 | Examples: 177 | 178 | >>> response = chat(''' 179 | ... System: Respond as a helpful assistant. It is 5:00pm. 180 | ... 181 | ... 
User: What time is it? 182 | ... 183 | ... Assistant: 184 | ... ''') # doctest: +SKIP 185 | "It's 5:00pm." 186 | """ 187 | 188 | messages = parse_chat(prompt) 189 | 190 | # Suppress starts of all assistant messages to avoid repeat generation 191 | suppress = [ 192 | "Assistant: " + m["content"].split(" ")[0] 193 | for m in messages 194 | if m["role"] in ["assistant", "user"] 195 | ] 196 | 197 | # Suppress all user messages to avoid repeating them 198 | suppress += [m["content"] for m in messages if m["role"] == "user"] 199 | 200 | system_msgs = [m for m in messages if m["role"] == "system"] 201 | assistant_msgs = [m for m in messages if m["role"] == "assistant"] 202 | user_msgs = [m for m in messages if m["role"] == "user"] 203 | 204 | # The current model is tuned on instructions and tends to get 205 | # lost if it sees too many questions 206 | # Use only the most recent user and assistant message for context 207 | # Keep all system messages 208 | messages = system_msgs + assistant_msgs[-1:] + user_msgs[-1:] 209 | 210 | rolemap = { 211 | "system": "System", 212 | "user": "Question", 213 | "assistant": "Assistant", 214 | } 215 | 216 | messages = [f"{rolemap[m['role']]}: {m['content']}" for m in messages] 217 | 218 | prompt = "\n\n".join(messages) + "\n\n" + "Assistant:" 219 | 220 | if prompt.startswith("System:"): 221 | prompt = prompt[7:].strip() 222 | 223 | response = generate( 224 | [prompt], 225 | max_tokens=config["max_tokens"], 226 | temperature=0.3, 227 | topk=40, 228 | prefix="Assistant:", 229 | suppress=suppress, 230 | )[0] 231 | 232 | # Remove duplicate assistant being generated 233 | if response.startswith("Assistant:"): 234 | response = response[10:] 235 | 236 | return response.strip() 237 | 238 | 239 | def code(prompt: str) -> str: 240 | """Complete a code prompt 241 | 242 | This assumes that users are expecting Python completions. Default models 243 | are fine-tuned on Python where applicable. 244 | 245 | :param prompt: Code context to complete 246 | :return: Completion returned from the language model 247 | 248 | Examples: 249 | 250 | >>> code("# Print Hello, world!\\n") 251 | 'print("Hello, world!")\\n' 252 | 253 | >>> code("def return_4():") 254 | '...return 4...' 255 | """ 256 | return generate([prompt], max_tokens=config["max_tokens"], topk=1, model="code")[0] 257 | 258 | 259 | def extract_answer(question: str, context: str) -> str: 260 | """Extract an answer to a `question` from a provided `context` 261 | 262 | :param question: A question to answer using knowledge from context 263 | :param context: Knowledge used to answer the question 264 | :return: Answer to the question. 265 | 266 | Examples: 267 | 268 | >>> context = "There is a green ball and a red box" 269 | >>> extract_answer("What color is the ball?", context).lower() 270 | '...green...' 271 | 272 | >>> extract_answer("Who created Python?", get_wiki('Python')) #doctest: +SKIP 273 | '...Guido van Rossum...' 274 | """ 275 | 276 | return generate([f"{context}\n\n{question}"])[0] 277 | 278 | 279 | def classify(doc: str, label1: str, label2: str) -> str: 280 | """Performs binary classification on an input 281 | 282 | :param doc: A plain text input document to classify 283 | :param label1: The first label to classify against 284 | :param label2: The second label to classify against 285 | :return: The closest matching class. 
The return value will always be 286 | `label1` or `label2` 287 | 288 | Examples: 289 | 290 | >>> classify("That book was good.","positive","negative") 291 | 'positive' 292 | >>> classify("That movie was terrible.","positive","negative") 293 | 'negative' 294 | """ 295 | 296 | return do( 297 | f"Classify as {label1} or {label2}: {doc}\n\nClassification:", 298 | choices=[label1, label2], 299 | ) 300 | 301 | 302 | def store_doc(doc: str, name: str = "") -> None: 303 | """Store document for later retrieval 304 | 305 | :param doc: A plain text document to store. 306 | :param name: Optional name for the document. This is used as a chunk prefix. 307 | 308 | Examples: 309 | 310 | >>> store_doc("The sky is blue.") 311 | """ 312 | docs.store(doc, name) 313 | 314 | 315 | def load_doc(query: str) -> str: 316 | """Load a matching document 317 | 318 | A single document that best matches `query` will be returned. 319 | 320 | :param query: Query to compare to stored documents 321 | :return: Content of the closest matching document 322 | 323 | Examples: 324 | 325 | >>> store_doc("Paris is in France.") 326 | >>> store_doc("The sky is blue.") 327 | >>> load_doc("Where is Paris?") 328 | 'Paris is in France.' 329 | """ 330 | return docs.get_match(query) 331 | 332 | 333 | def get_doc_context(query: str) -> str: 334 | """Loads context from documents 335 | 336 | A string representing the most relevant content from all stored documents 337 | will be returned. This may be a blend of chunks from multiple documents. 338 | 339 | :param query: Query to compare to stored documents 340 | :return: Up to 128 tokens of context 341 | 342 | Examples: 343 | 344 | >>> store_doc("Paris is in France.") 345 | >>> store_doc("Paris is nice.") 346 | >>> store_doc("The sky is blue.") 347 | >>> get_doc_context("Where is Paris?") 348 | 'Paris is in France.\\n\\nParis is nice.' 349 | """ 350 | return docs.get_context(query) 351 | 352 | 353 | def get_web(url: str) -> str: 354 | """ 355 | Return the text of paragraphs from a web page 356 | 357 | :param url: The URL to load 358 | :return str: Plain text content from the URL 359 | 360 | Note that it is difficult to return only the human-readable 361 | content from an HTML page. This function takes a basic and quick 362 | approach. It will not work perfectly on all sites, but will 363 | often do a reasonable job of returning the plain text content 364 | of a page. 365 | 366 | If the `url` points to a plain text page, the page content 367 | will be returned verbatim. 368 | """ 369 | 370 | res = requests.get( 371 | url, headers={"User-Agent": "Mozilla/5.0 (compatible; languagemodels)"} 372 | ) 373 | 374 | if "text/plain" in res.raw.getheader("content-type"): 375 | return res.text 376 | elif "text/html" in res.raw.getheader("content-type"): 377 | return get_html_paragraphs(res.text) 378 | 379 | return "" 380 | 381 | 382 | def get_wiki(topic: str) -> str: 383 | """ 384 | Return Wikipedia summary for a topic 385 | 386 | This function ignores the complexity of disambiguation pages and simply 387 | returns the first result that is not a disambiguation page 388 | 389 | :param topic: Topic to search for on Wikipedia 390 | :return: Text content of the lead section of the most popular matching article 391 | 392 | Examples: 393 | 394 | >>> get_wiki('Python language') 395 | 'Python is a high-level...' 396 | 397 | >>> get_wiki('Chemistry') 398 | 'Chemistry is the scientific study...' 
399 | """ 400 | 401 | url = "https://api.wikimedia.org/core/v1/wikipedia/en/search/title" 402 | response = requests.get(url, params={"q": topic, "limit": 5}) 403 | response = json.loads(response.text) 404 | 405 | for page in response["pages"]: 406 | wiki_result = requests.get( 407 | f"https://en.wikipedia.org/w/api.php?action=query&prop=extracts|pageprops&" 408 | f"exintro&redirects=1&titles={page['title']}&format=json" 409 | ).json() 410 | 411 | first = wiki_result["query"]["pages"].popitem()[1] 412 | if "disambiguation" in first["pageprops"]: 413 | continue 414 | 415 | summary = first["extract"] 416 | 417 | cutoffs = [ 418 | "See_also", 419 | "Notes", 420 | "References", 421 | "Further_reading", 422 | "External_links", 423 | ] 424 | 425 | for cutoff in cutoffs: 426 | summary = summary.split(f'<h2><span id="{cutoff}">', 1)[0] 427 | 428 | summary = re.sub(r"<p>
", "\n\n", summary, flags=re.I) 429 | summary = re.sub(r"", "", summary, flags=re.I | re.DOTALL) 430 | summary = re.sub(r"<.*?>", "", summary, flags=re.I) 431 | summary = re.sub(r"\s*[\n\r]+\s*[\r\n]+[\s\r\n]*", "\n\n", summary, flags=re.I) 432 | summary = summary.strip() 433 | return summary 434 | else: 435 | return "No matching wiki page found." 436 | 437 | 438 | def get_weather(latitude, longitude): 439 | """Fetch the current weather for a supplied longitude and latitude 440 | 441 | Weather is provided by the US government and this function only supports 442 | locations in the United States. 443 | 444 | :param latitude: Latitude value representing this location 445 | :param longitude: Longitude value representing this location 446 | :return: Plain text description of the current weather forecast 447 | 448 | Examples: 449 | 450 | >>> get_weather(41.8, -87.6) # doctest: +SKIP 451 | 'Scattered showers and thunderstorms before 1pm with a high of 73.' 452 | """ 453 | 454 | res = requests.get(f"https://api.weather.gov/points/{latitude},{longitude}") 455 | points = json.loads(res.text) 456 | forecast_url = points["properties"]["forecast"] 457 | 458 | res = requests.get(forecast_url) 459 | forecast = json.loads(res.text) 460 | current = forecast["properties"]["periods"][0] 461 | 462 | return current["detailedForecast"] 463 | 464 | 465 | def get_date() -> str: 466 | """Returns the current date and time in natural language 467 | 468 | >>> get_date() # doctest: +SKIP 469 | 'Friday, May 12, 2023 at 09:27AM' 470 | """ 471 | 472 | now = datetime.datetime.now() 473 | 474 | return now.strftime("%A, %B %d, %Y at %I:%M%p") 475 | 476 | 477 | def print_tokens(prompt: str) -> None: 478 | """Prints a list of tokens in a prompt 479 | 480 | :param prompt: Prompt to use as input to tokenizer 481 | :return: Nothing 482 | 483 | Examples: 484 | 485 | >>> print_tokens("Hello world") 486 | ' Hello' (token 8774) 487 | ' world' (token 296) 488 | 489 | >>> print_tokens("Hola mundo") 490 | ' Hol' (token 5838) 491 | 'a' (token 9) 492 | ' mun' (token 13844) 493 | 'd' (token 26) 494 | 'o' (token 32) 495 | """ 496 | 497 | tokens = list_tokens(prompt) 498 | 499 | for token in tokens: 500 | print(f"'{token[0].replace('▁', ' ')}' (token {token[1]})") 501 | 502 | 503 | def count_tokens(prompt: str) -> None: 504 | """Counts tokens in a prompt 505 | 506 | :param prompt: Prompt to use as input to tokenizer 507 | :return: Nothing 508 | 509 | Examples: 510 | 511 | >>> count_tokens("Hello world") 512 | 2 513 | 514 | >>> count_tokens("Hola mundo") 515 | 5 516 | """ 517 | 518 | return len(list_tokens(prompt)) 519 | 520 | 521 | def set_max_ram(value): 522 | """Sets max allowed RAM 523 | 524 | This value takes priority over environment variables 525 | 526 | Returns the numeric value set in GB 527 | 528 | >>> set_max_ram(16) 529 | 16.0 530 | 531 | >>> set_max_ram('512mb') 532 | 0.5 533 | """ 534 | 535 | config["max_ram"] = value 536 | 537 | return config["max_ram"] 538 | 539 | 540 | def require_model_license(match_re): 541 | """Require models to match supplied regex 542 | 543 | This can be used to enforce certain licensing constraints when using this 544 | package. 545 | """ 546 | config["model_license"] = match_re 547 | -------------------------------------------------------------------------------- /test/planets.json: -------------------------------------------------------------------------------- 1 | [{"name": "Mercury", "content": "Mercury is the first planet from the Sun and the smallest planet in the Solar System. 
It is a terrestrial planet with a heavily cratered surface due to the planet having no geological activity and an extremely tenuous atmosphere (called an exosphere). Despite being the smallest planet in the Solar System with a mean diameter of 4,880 km (3,030 mi), 38% of that of Earth's, Mercury is dense enough to have roughly the same surface gravity as Mars. Mercury has a dynamic magnetic field with a strength about 1% of that of Earth's and has no natural satellites. \nAccording to current theories, Mercury may have a solid silicate crust and mantle overlying a solid outer core, a deeper liquid core layer, and a solid inner core. Having almost no atmosphere to retain heat, Mercury has surface temperatures that change wildly during the day, ranging from 100 K (\u2212173 \u00b0C; \u2212280 \u00b0F) at night to 700 K (427 \u00b0C; 800 \u00b0F) during sunlight across the equator regions. At Mercury's poles though, there are large reservoirs of water ices that are never exposed to direct sunlight, which has an estimated mass of about 0.025\u20130.25% the Antarctic ice sheet. There are many competing hypotheses about Mercury's origins and development, some of which incorporate collision with planetesimal and rock vaporization. \nBecause Mercury is very close to the Sun, the intensity of sunlight on its surface is between 4.59 and 10.61 times the solar constant (amount of the Sun's energy received at 1 astronomical unit, which is roughly the distance between Earth and the Sun). Mercury orbits the Sun in a 3:2 spin\u2013orbit resonance, meaning that relative to the background stars, it rotates on its axis exactly three times for every two revolutions it makes around the Sun. Counterintuitively, due to Mercury's slow rotation, an observer on the planet would see only one Mercurian solar day (176 Earth days) every two Mercurian solar years (88 Earth days each). Mercury's axis has the smallest tilt of any of the Solar System's planets (about 1\u204430 of a degree), and its orbital eccentricity is the largest of all known planets in the Solar System.Like Venus, Mercury orbits the Sun within Earth's orbit, making it appear in Earth's sky only as a \"morning star\" or \"evening star\" that's relatively close to the Sun. In English, it is named after the Roman god Mercurius (Mercury), god of commerce, communication and the messenger of gods. Mercury is the most difficult planet to reach from Earth because it requires the greatest change in spacecraft's velocity. Only two spacecraft have visited Mercury as of 2023: Mariner 10 flew by in 1974 and 1975, and MESSENGER launched in 2004 and orbited Mercury over 4,000 times in four years. The BepiColombo spacecraft is planned to arrive at Mercury in 2025.\n\n"}, {"name": "Venus", "content": "Venus is the second planet from the Sun. It is a rocky planet with the densest atmosphere of all the rocky bodies in the Solar System, and the only one with a mass and size that is close to that of its orbital neighbour Earth. Orbiting inferiorly (inside of Earth's orbit), it appears in Earth's sky always close to the Sun, as either a \"morning star\" or an \"evening star\". While this is also true for Mercury, Venus appears as such much more prominently, since it is the third brightest object in Earth's sky after the Moon and the Sun, appearing brighter than any other star-like classical planet or any fixed star. 
With such prominent appearances in Earth's sky, Venus has historically been a common and important object for humans, in both their cultures and astronomy.\nVenus retains, despite having only a weak induced magnetosphere, an especially thick atmosphere mainly of carbon dioxide, which, together with its global sulfuric acid cloud cover, creates an extreme greenhouse effect. These cause at the surface a mean temperature of 737 K (464 \u00b0C; 867 \u00b0F) and a crushing pressure of 92 times that of Earth's at sea level, turning the air into a supercritical fluid, though at cloudy altitudes of 50 km (30 mi) Earthlike levels are found. Conditions possibly favourable for life on Venus have been identified at its cloud layers, while recent research has found indicative, but not convincing, evidence. Early in Venus's history, water may have been abundant enough to form oceans, but any liquid water there will have evaporated when greenhouse effects cascaded and then been taken away into space by the solar wind. Internally Venus is thought to consist, like Earth, of a core, mantle, and crust, the latter releasing internal heat through its active volcanism, shaping the surface with large resurfacing instead of, as on Earth, plate tectonics.\nLike Mercury, Venus has no moons. Like Uranus's, its rotation is retrograde, against its orbital direction. Having been slowed by the strong currents and drag of the atmosphere, it completes a sidereal rotation, relative to the stars, in 243 Earth days. Therefore it rotates more slowly than it is orbiting the Sun, having a solar year of 224.7 Earth days, so that its solar day, or the time the Sun takes to cross the same meridian twice, is 117 Earth days long. Venus and Earth approach each other in synodic periods of 1.6 years. While coming closer to each other at inferior conjunction than any other pair of the Sun's planets, since they have the closest two planetary orbits, they each still on average stay closer to Mercury than to any other planet, as Mercury passes by more frequently because of its more central, thus more rapid, orbit. That said, Venus and Earth have between them a lower difference in gravitational potential than exists between either of them and any other planet. This fact has allowed Venus to be the most accessible destination and attractive gravity assist waypoint for interplanetary flights.\nIn 1961, Venus became the target of the first interplanetary flight in human history, followed by many essential interplanetary firsts, confirming in 1970 Venus's inhospitable surface conditions with the first soft landing on another planet. This finding aborted any later representations of Venus as suitable for human habitation, once a popular theme in science fiction. Actual proposals, however, have suggested sending crews either on flybys, as gravity assists for crewed missions to Mars, or to enter the Venusian atmosphere and stay aloft, where, at sufficient altitude, conditions are more comparable to those on Earth's surface, including in respect of radiation and gravitation, than anywhere else in the Solar System. Currently, robotic probes are studying and more will be sent to study Venus, to provide crucial knowledge, particularly about greenhouse effects, and so inform predictions about global warming on Earth."}, {"name": "Earth", "content": "Earth is the third planet from the Sun and the only place known in the universe where life has originated and found habitability. 
Earth is the only planet known to sustain liquid surface water, with ocean water extending over 70.8% of the planet, making it an ocean world. Most of all other water is retained in Earth's polar regions, with large sheets of ice covering ocean and land, dwarfing Earth's groundwater, lakes, rivers and atmospheric water. The other 29.2% of the Earth's surface is land, consisting of continents and islands, and is widely covered by vegetation. Below the planet's surface lies the crust, consisting of several slowly moving tectonic plates, which interact to produce mountain ranges, volcanoes, and earthquakes. Inside the Earth's crust is a liquid outer core that generates the magnetosphere, deflecting most of the destructive solar winds and cosmic radiation.\nEarth has a dynamic atmosphere, which sustains Earth's surface conditions and protects it from most meteoroids and UV-light at entry. It has a composition of primarily nitrogen and oxygen. Water vapor is widely present in the atmosphere, forming clouds that cover most of the planet. The water vapor acts as a greenhouse gas and, together with other greenhouse gases in the atmosphere, particularly carbon dioxide (CO2), creates the conditions for both liquid surface water and water vapor to persist via the capturing of energy from the Sun's light. This process maintains the current average surface temperature of 14.76 \u00b0C, at which water is liquid under atmospheric pressure. Differences in the amount of captured energy between geographic regions (as with the equatorial region receiving more sunlight than the polar regions) drive atmospheric and ocean currents, producing a global climate system with different climate regions, and a range of weather phenomena such as precipitation, allowing components such as nitrogen to cycle.\nEarth is rounded into an ellipsoid with a circumference of about 40,000 km. It is the densest planet in the Solar System. Of the four rocky planets, it is the largest and most massive. Earth is about eight light-minutes away from the Sun and orbits it, taking a year (about 365.25 days) to complete one revolution. The Earth rotates around its own axis in slightly less than a day (in about 23 hours and 56 minutes). The Earth's axis of rotation is tilted with respect to the perpendicular to its orbital plane around the Sun, producing seasons. Earth is orbited by one permanent natural satellite, the Moon, which orbits Earth at 384,400 km (1.28 light seconds) and is roughly a quarter as wide as Earth. Through tidal locking, the Moon always faces the Earth with the same side, which causes tides, stabilizes Earth's axis, and gradually slows its rotation.\nEarth, like most other bodies in the Solar System, formed 4.5 billion years ago from gas in the early Solar System. During the first billion years of Earth's history, the ocean formed and then life developed within it. Life spread globally and has been altering Earth's atmosphere and surface, leading to the Great Oxidation Event two billion years ago. Humans emerged 300,000 years ago in Africa and have spread across every continent on Earth with the exception of Antarctica. Humans depend on Earth's biosphere and natural resources for their survival, but have increasingly impacted the planet's environment. 
Humanity's current impact on Earth's climate and biosphere is unsustainable, threatening the livelihood of humans and many other forms of life, and causing widespread extinctions."}, {"name": "Mars", "content": "Mars is the fourth planet and the furthest terrestrial planet from the Sun. The reddish color of its surface is due to finely grained iron(III) oxide dust in the soil, giving it the nickname \"the Red Planet\". Mars has a second smallest radius among the planets in the Solar System at 3,389.5 km (2,106 mi) and has a surface gravity of 3.72 m/s2 (12.2 ft/s2), which is 38% of Earth's gravity. The Martian dichotomy can be clearly seen on the surface: on average, the terrain on Mars northern hemisphere is flatter and lower than Mars southern hemisphere. Mars has a very thin atmosphere made primarily of carbon dioxide and two irregularly shaped natural satellites: Phobos and Deimos.\nGeologically, Mars is fairly active, with dust devils sweeping across the landscape and marsquakes (Martian analog to earthquakes) trembling underneath the ground. The surface of Mars hosts a shield volcano (Olympus Mons) and one of the largest canyons in the Solar System (Valles Marineris). Mars's celestial motion is comparable to that of Earth, with a slightly eccentric orbit and an axial tilt only slightly greater than Earth's. This motion causes seasonal changes to the polar ice caps' coverage and temperature swings between \u2212110 \u00b0C (\u2212166 \u00b0F) to 35 \u00b0C (95 \u00b0F) on the surface. A Martian solar day (sol) is equal to 24.5 hours and a Martian solar year is equal to 1.88 Earth years.\nLike the other planets in the Solar System, Mars was formed 4.5 billion years ago. During the Noachian period from about 4.1 to 3.7 billion years ago, Mars's surface was marked by meteor impacts, valley formation, erosion, and the possible presence of water oceans. The Hesperian period from 3.7 to 3.2\u20132 billion years ago was dominated by widespread volcanic activity and flooding that carved immense outflow channels. The Amazonian period, which continues today, was marked by the wind's influence on geological processes. It is not yet known whether life has ever existed on Mars, though search for evidences of life on Mars is still ongoing.\nMars is among the brightest objects in Earth's sky, and thus has been known from the ancient times. The Romans named it for the god of war, M\u0101rs, as did Greeks (Ares) and Mesopotamians (Nergal), likely because its color suggested blood. Its high-contrast albedo features make it an attractive target for viewing with a telescope.\nSince the late 20th century, Mars has been explored by uncrewed spacecraft and rovers, with the first flyby by the Mariner 4 probe in 1965, the first Mars orbiter by the Mars 2 probe in 1971, and the first landing by the Viking 1 in 1976. As of 2023, there are at least 11 active probes orbiting Mars or at the Martian surface. Currently, Mars is an attractive target for the first future interplanetary human missions."}, {"name": "Jupiter", "content": "Jupiter is the fifth planet from the Sun and the largest in the Solar System. It is a gas giant with a mass more than two and a half times that of all the other planets in the Solar System combined, and slightly less than one one-thousandth the mass of the Sun. Jupiter is the third brightest natural object in the Earth's night sky after the Moon and Venus, and it has been observed since prehistoric times. 
It was named after Jupiter, the chief deity of ancient Roman religion.\nJupiter is primarily composed of hydrogen (90% by volume), followed by helium, which constitutes a quarter of its mass and a tenth of its volume. The ongoing contraction of Jupiter's interior generates more heat than the planet receives from the Sun. Because of its rapid rotation rate of 1 rotation per 10 hours, the planet's shape is an oblate spheroid: it has a slight but noticeable bulge around the equator. The outer atmosphere is divided into a series of latitudinal bands, with turbulence and storms along their interacting boundaries. The most obvious result of this is the Great Red Spot, a giant storm which has been observed since 1831 and possibly earlier.\nJupiter is surrounded by a faint planetary ring system and has a powerful magnetosphere, the largest contiguous structure in the Solar System after the heliosphere. Jupiter forms a system of 95 known moons and probably many more, including the four large moons discovered by Galileo Galilei in 1610: Io, Europa, Ganymede, and Callisto. Ganymede, the largest of the four, is larger than the planet Mercury. Callisto is the second largest; Io and Europa are approximately the size of Earth's Moon.\nSince 1973, Jupiter has been visited by nine robotic probes: seven flybys and two dedicated orbiters, with two more either en route or awaiting launch."}, {"name": "Saturn", "content": "Saturn is the sixth planet from the Sun and the second-largest in the Solar System, after Jupiter. It is a gas giant with an average radius of about nine and a half times that of Earth. It has only one-eighth the average density of Earth, but is over 95 times more massive.Saturn's interior is thought to be composed of a rocky core, surrounded by a deep layer of metallic hydrogen, an intermediate layer of liquid hydrogen and liquid helium, and finally, a gaseous outer layer. Saturn has a pale yellow hue due to ammonia crystals in its upper atmosphere. An electrical current within the metallic hydrogen layer is thought to give rise to Saturn's planetary magnetic field, which is weaker than Earth's, but which has a magnetic moment 580 times that of Earth due to Saturn's larger size. Saturn's magnetic field strength is around one-twentieth of Jupiter's. The outer atmosphere is generally bland and lacking in contrast, although long-lived features can appear. Wind speeds on Saturn can reach 1,800 kilometres per hour (1,100 miles per hour).\nThe planet has a prominent ring system, which is composed mainly of ice particles, with a smaller amount of rocky debris and dust. At least 146 moons are known to orbit the planet, of which 63 are officially named; this does not include the hundreds of moonlets in its rings. Titan, Saturn's largest moon and the second largest in the Solar System, is larger (while less massive) than the planet Mercury and is the only moon in the Solar System to have a substantial atmosphere.\n\n"}, {"name": "Uranus", "content": "Uranus is the seventh planet from the Sun and is a gaseous cyan ice giant. Most of Uranus is made out of water, ammonia, and methane in a supercritical phase of matter, which in astronomy is called 'ice' or volatiles. The planet's atmosphere has a complex layered cloud structure and has the lowest minimum temperature of 49 K (\u2212224 \u00b0C; \u2212371 \u00b0F) out of all Solar System's planets. Uranus has a marked axial tilt of 97.8\u00b0 with a retrograde rotation rate of 17 hours. 
This means that in an 84 Earth years orbital period around the Sun, its poles get around 42 years of continuous sunlight, followed by 42 years of continuous darkness. \nUranus has the third-largest diameter and fourth-largest mass among the Solar System's planets. Based on current models, inside Uranus's volatile mantle layer is a rocky core, and surrounding it is a thick hydrogen and helium atmosphere. Trace amount of hydrocarbons (thought to be produced via hydrolysis) and carbon monoxide along with carbon dioxide (thought to have been originated from comets) have been detected in the upper atmosphere. There are many unexplained climate phenomena in Uranus's atmosphere, such as its peak wind speed of 900 km/h (560 mph), variations in its polar cap and its erratic cloud formation. Uranus also has a very low internal heat compared to other giant planets, which is still unexplained. \nLike the other giant planets, Uranus has a ring system, orbiting natural satellites and a magnetosphere. Uranus's ring system is extremely dark, with only about 2% of the incoming light is reflected, and contains the known 13 inner moons. Further out are the larger 5 major moons of the planet: Miranda, Ariel, Umbriel, Titania, and Oberon; and orbit at much greater distance from Uranus are the known 9 irregular moons. Uranus magnetosphere is highly asymmetric and has many charged particles, which may cause the darkening of its rings and moons.\nUranus is visible to the naked eye, but it is very dim and was not classified as a planet until 1781, when it was first observed by William Herschel. About seven decades after its discovery, consensus was reached that the planet be named from the Greek god Uranus (Ouranos), one of the Greek primordial deities. As of 2023, Uranus was visited up close only one time when in 1986 the Voyager 2 probe flew by the planet. Though nowadays Uranus can be resolved and observed by telescopes, there is much desire to revisit the planet, as shown by Planetary Science Decadal Survey's decision to make the proposed Uranus Orbiter and Probe mission a top priority in the 2023\u20132032 survey."}, {"name": "Neptune", "content": "Neptune is the eighth planet from the Sun and the farthest known planet in the Solar System. It is the fourth-largest planet in the Solar System by diameter, the third-most-massive planet, and the densest giant planet. It is 17 times the mass of Earth, and slightly more massive than its near-twin Uranus. Neptune is denser and physically smaller than Uranus because its greater mass causes more gravitational compression of its atmosphere. Being composed primarily of gases and liquids, it has no well-defined solid surface. The planet orbits the Sun once every 164.8 years at an average distance of 30.1 astronomical units (4.5 billion kilometres; 2.8 billion miles). It is named after the Roman god of the sea and has the astronomical symbol , representing Neptune's trident.Neptune is not visible to the unaided eye and is the only planet in the Solar System found by mathematical prediction rather than by empirical observation. Unexpected changes in the orbit of Uranus led Alexis Bouvard to hypothesise that its orbit was subject to gravitational perturbation by an unknown planet. After Bouvard's death, the position of Neptune was predicted from his observations, independently, by John Couch Adams and Urbain Le Verrier. Neptune was subsequently observed with a telescope on 23 September 1846 by Johann Galle within a degree of the position predicted by Le Verrier. 
Its largest moon, Triton, was discovered shortly thereafter, though none of the planet's remaining 13 known moons were located telescopically until the 20th century. The planet's distance from Earth gives it a very small apparent size, making it challenging to study with Earth-based telescopes. Neptune was visited by Voyager 2, when it flew by the planet on 25 August 1989; Voyager 2 remains the only spacecraft to have visited Neptune. The advent of the Hubble Space Telescope and large ground-based telescopes with adaptive optics has recently allowed for additional detailed observations from afar.\nLike the gas giants (Jupiter and Saturn), Neptune's atmosphere is composed primarily of hydrogen and helium, along with traces of hydrocarbons and possibly nitrogen, but contains a higher proportion of ices such as water, ammonia and methane. Similar to Uranus, its interior is primarily composed of ices and rock; both planets are normally considered \"ice giants\" to distinguish them. Along with Rayleigh scattering, traces of methane in the outermost regions in part account for the planet's blue appearance. Newest data from the Gemini observatory shows the blue colour is more saturated than the one present on Uranus due to thinner haze of Neptune's more active atmosphere.In contrast to the hazy, relatively featureless atmosphere of Uranus, Neptune's atmosphere has active and visible weather patterns. For example, at the time of the Voyager 2 flyby in 1989, the planet's southern hemisphere had a Great Dark Spot comparable to the Great Red Spot on Jupiter. More recently, in 2018, a newer main dark spot and smaller dark spot were identified and studied. In addition, these weather patterns are driven by the strongest sustained winds of any planet in the Solar System, with recorded wind speeds as high as 2,100 km/h (580 m/s; 1,300 mph). Because of its great distance from the Sun, Neptune's outer atmosphere is one of the coldest places in the Solar System, with temperatures at its cloud tops approaching 55 K (\u2212218 \u00b0C; \u2212361 \u00b0F). Temperatures at the planet's centre are approximately 5,400 K (5,100 \u00b0C; 9,300 \u00b0F). Neptune has a faint and fragmented ring system (labelled \"arcs\"), which was discovered in 1984, then later confirmed by Voyager 2."}] -------------------------------------------------------------------------------- /languagemodels/config.py: -------------------------------------------------------------------------------- 1 | """Global model and inference configuration 2 | 3 | This module manages the global configuration object shared between other 4 | modules in the package. It implements a dictionary with data validation 5 | on the keys and values. 6 | 7 | Note that this module provides access to many implementation details 8 | that are not expected to be used by average users. Specific models that 9 | have never been the default for the package may be removed at any time. 
10 | """ 11 | 12 | import re 13 | import os 14 | from collections import namedtuple 15 | from huggingface_hub import hf_hub_download 16 | import json 17 | 18 | ConfigItem = namedtuple("ConfigItem", "initfn default") 19 | 20 | 21 | class ModelFilterException(Exception): 22 | pass 23 | 24 | 25 | # Model list 26 | # This list is sorted in priority order, with the best models first 27 | # The best model that fits in the memory bounds and matches the model filter 28 | # will be selected 29 | models = [ 30 | { 31 | "name": "openchat-3.5-0106", 32 | "tuning": "instruct", 33 | "datasets": ["mistral", "openorca", "flan"], 34 | "params": 7e9, 35 | "quantization": "int8", 36 | "backend": "ct2", 37 | "architecture": "decoder-only-transformer", 38 | "license": "apache-2.0", 39 | "prompt_fmt": ( 40 | "GPT4 Correct User: {instruction}<|end_of_turn|>" "GPT4 Correct Assistant:" 41 | ), 42 | }, 43 | { 44 | "name": "Llama-3.1-8B-Instruct", 45 | "tuning": "instruct", 46 | "revision": "d02fc85", 47 | "datasets": ["llama3"], 48 | "params": 8e9, 49 | "quantization": "int8", 50 | "backend": "ct2", 51 | "architecture": "decoder-only-transformer", 52 | "license": "llama3", 53 | "prompt_fmt": ( 54 | "<|start_header_id|>user<|end_header_id|>\n\n" 55 | "{instruction}<|eot_id|>" 56 | "<|start_header_id|>assistant<|end_header_id|>\n\n" 57 | ), 58 | }, 59 | { 60 | "name": "Meta-Llama-3-8B-Instruct", 61 | "tuning": "instruct", 62 | "datasets": ["llama3"], 63 | "params": 8e9, 64 | "quantization": "int8", 65 | "backend": "ct2", 66 | "architecture": "decoder-only-transformer", 67 | "license": "llama3", 68 | "prompt_fmt": ( 69 | "<|start_header_id|>user<|end_header_id|>\n\n" 70 | "{instruction}<|eot_id|>" 71 | "<|start_header_id|>assistant<|end_header_id|>\n\n" 72 | ), 73 | }, 74 | { 75 | "name": "openchat-3.5-1210", 76 | "tuning": "instruct", 77 | "datasets": ["mistral", "openorca", "flan"], 78 | "params": 7e9, 79 | "quantization": "int8", 80 | "backend": "ct2", 81 | "architecture": "decoder-only-transformer", 82 | "license": "apache-2.0", 83 | "prompt_fmt": ( 84 | "GPT4 Correct User: {instruction}<|end_of_turn|>" "GPT4 Correct Assistant:" 85 | ), 86 | }, 87 | { 88 | "name": "WizardLM-2-7B", 89 | "tuning": "instruct", 90 | "datasets": ["mistral", "wizardlm"], 91 | "params": 7e9, 92 | "quantization": "int8", 93 | "backend": "ct2", 94 | "architecture": "decoder-only-transformer", 95 | "license": "apache-2.0", 96 | "prompt_fmt": "USER: {instruction} ASSISTANT:", 97 | }, 98 | { 99 | "name": "neural-chat-7b-v3-1", 100 | "tuning": "instruct", 101 | "datasets": ["mistral", "slimorca"], 102 | "params": 7e9, 103 | "quantization": "int8", 104 | "backend": "ct2", 105 | "architecture": "decoder-only-transformer", 106 | "license": "apache-2.0", 107 | "prompt_fmt": ( 108 | "### System:\n" 109 | "Be helpful\n" 110 | "### User:\n{instruction}\n" 111 | "### Assistant:\n" 112 | ), 113 | }, 114 | { 115 | "name": "Mistral-7B-Instruct-v0.2", 116 | "tuning": "instruct", 117 | "datasets": ["mistral"], 118 | "params": 7e9, 119 | "quantization": "int8", 120 | "backend": "ct2", 121 | "architecture": "decoder-only-transformer", 122 | "license": "apache-2.0", 123 | "prompt_fmt": "[INST] {instruction} [/INST]", 124 | }, 125 | { 126 | "name": "flan-alpaca-gpt4-xl", 127 | "tuning": "instruct", 128 | "datasets": ["c4", "flan", "gpt4-alpaca"], 129 | "params": 3e9, 130 | "quantization": "int8", 131 | "backend": "ct2", 132 | "architecture": "encoder-decoder-transformer", 133 | "license": "apache-2.0", 134 | }, 135 | { 136 | "name": "flan-alpaca-xl", 137 | 
"tuning": "instruct", 138 | "datasets": ["c4", "flan", "alpaca"], 139 | "params": 3e9, 140 | "quantization": "int8", 141 | "backend": "ct2", 142 | "architecture": "encoder-decoder-transformer", 143 | "license": "apache-2.0", 144 | }, 145 | { 146 | "name": "flan-t5-xl", 147 | "tuning": "instruct", 148 | "datasets": ["c4", "flan"], 149 | "params": 3e9, 150 | "quantization": "int8", 151 | "backend": "ct2", 152 | "architecture": "encoder-decoder-transformer", 153 | "license": "apache-2.0", 154 | }, 155 | { 156 | "name": "Llama-3.2-3B-Instruct", 157 | "tuning": "instruct", 158 | "revision": "5da4ba8", 159 | "datasets": ["llama3"], 160 | "params": 1e9, 161 | "quantization": "int8", 162 | "backend": "ct2", 163 | "architecture": "decoder-only-transformer", 164 | "license": "llama3.2", 165 | "repetition_penalty": 1.1, 166 | "prompt_fmt": ( 167 | "<|start_header_id|>user<|end_header_id|>\n\n" 168 | "{instruction}<|eot_id|>" 169 | "<|start_header_id|>assistant<|end_header_id|>\n\n" 170 | ), 171 | }, 172 | { 173 | "name": "fastchat-t5-3b-v1.0", 174 | "tuning": "instruct", 175 | "datasets": ["c4", "flan", "sharegpt"], 176 | "params": 3e9, 177 | "quantization": "int8", 178 | "backend": "ct2", 179 | "architecture": "encoder-decoder-transformer", 180 | "license": "apache-2.0", 181 | }, 182 | { 183 | "name": "LaMini-Flan-T5-783M", 184 | "tuning": "instruct", 185 | "revision": "e5e20a1", 186 | "datasets": ["c4", "flan", "lamini"], 187 | "params": 783e6, 188 | "quantization": "int8", 189 | "backend": "ct2", 190 | "architecture": "encoder-decoder-transformer", 191 | "license": "cc-by-nc-4.0", 192 | }, 193 | { 194 | "name": "flan-t5-large", 195 | "tuning": "instruct", 196 | "datasets": ["c4", "flan"], 197 | "params": 783e6, 198 | "quantization": "int8", 199 | "backend": "ct2", 200 | "architecture": "encoder-decoder-transformer", 201 | "license": "apache-2.0", 202 | }, 203 | { 204 | "name": "Llama-3.2-1B-Instruct", 205 | "tuning": "instruct", 206 | "revision": "6e3e3a1", 207 | "datasets": ["llama3"], 208 | "params": 1e9, 209 | "quantization": "int8", 210 | "backend": "ct2", 211 | "architecture": "decoder-only-transformer", 212 | "license": "llama3.2", 213 | "repetition_penalty": 1.1, 214 | "prompt_fmt": ( 215 | "<|start_header_id|>user<|end_header_id|>\n\n" 216 | "{instruction}<|eot_id|>" 217 | "<|start_header_id|>assistant<|end_header_id|>\n\n" 218 | ), 219 | }, 220 | { 221 | "name": "LaMini-Flan-T5-248M", 222 | "tuning": "instruct", 223 | "revision": "96cfe99", 224 | "datasets": ["c4", "flan", "lamini"], 225 | "params": 248e6, 226 | "quantization": "int8", 227 | "backend": "ct2", 228 | "architecture": "encoder-decoder-transformer", 229 | "license": "cc-by-nc-4.0", 230 | }, 231 | { 232 | "name": "flan-t5-base", 233 | "tuning": "instruct", 234 | "datasets": ["c4", "flan"], 235 | "params": 248e6, 236 | "quantization": "int8", 237 | "backend": "ct2", 238 | "architecture": "encoder-decoder-transformer", 239 | "license": "apache-2.0", 240 | }, 241 | { 242 | "name": "flan-alpaca-base", 243 | "tuning": "instruct", 244 | "datasets": ["c4", "flan", "alpaca"], 245 | "params": 248e6, 246 | "quantization": "int8", 247 | "backend": "ct2", 248 | "architecture": "encoder-decoder-transformer", 249 | "license": "apache-2.0", 250 | }, 251 | { 252 | "name": "dialogstudio-t5-base-v1.0", 253 | "tuning": "instruct", 254 | "datasets": ["c4", "flan", "dialogstudio"], 255 | "params": 248e6, 256 | "quantization": "int8", 257 | "backend": "ct2", 258 | "architecture": "encoder-decoder-transformer", 259 | "license": "apache-2.0", 260 | 
"prompt_fmt": ("Instruction: Be helpful. {instruction}"), 261 | }, 262 | { 263 | "name": "LaMini-Flan-T5-77M", 264 | "tuning": "instruct", 265 | "datasets": ["c4", "flan", "lamini"], 266 | "params": 77e6, 267 | "backend": "ct2", 268 | "quantization": "int8", 269 | "architecture": "encoder-decoder-transformer", 270 | "license": "cc-by-nc-4.0", 271 | }, 272 | { 273 | "name": "flan-t5-small", 274 | "tuning": "instruct", 275 | "datasets": ["c4", "flan"], 276 | "params": 77e6, 277 | "quantization": "int8", 278 | "backend": "ct2", 279 | "architecture": "encoder-decoder-transformer", 280 | "license": "apache-2.0", 281 | }, 282 | { 283 | "name": "Phi-3-mini-4k-instruct-20240701", 284 | "tuning": "instruct", 285 | "datasets": ["phi-3"], 286 | "params": 3.8e9, 287 | "quantization": "int8", 288 | "backend": "ct2", 289 | "architecture": "decoder-only-transformer", 290 | "license": "mit", 291 | "prompt_fmt": "<|user|>\n{instruction}<|end|>\n<|assistant|>", 292 | "repetition_penalty": 1.1, 293 | }, 294 | { 295 | "name": "Phi-3-mini-4k-instruct", 296 | "tuning": "instruct", 297 | "datasets": ["phi-3"], 298 | "params": 3.8e9, 299 | "quantization": "int8", 300 | "backend": "ct2", 301 | "architecture": "decoder-only-transformer", 302 | "license": "mit", 303 | "prompt_fmt": "<|user|>\n{instruction}<|end|>\n<|assistant|>", 304 | "repetition_penalty": 1.1, 305 | }, 306 | { 307 | "name": "phi-2", 308 | "tuning": "instruct", 309 | "datasets": ["phi-2"], 310 | "params": 2.7e9, 311 | "quantization": "int8", 312 | "backend": "ct2", 313 | "architecture": "decoder-only-transformer", 314 | "license": "microsoft-research-license", 315 | "prompt_fmt": "Instruct: {instruction}\nOutput:", 316 | }, 317 | { 318 | "name": "gemma-2b-it", 319 | "tuning": "instruct", 320 | "datasets": ["gemma"], 321 | "params": 2.5e9, 322 | "quantization": "int8", 323 | "backend": "ct2", 324 | "architecture": "decoder-only-transformer", 325 | "license": "gemma-terms-of-use", 326 | "prompt_fmt": "user\n" 327 | "{instruction}\n" 328 | "model", 329 | }, 330 | { 331 | "name": "h2o-danube3-4b-chat", 332 | "tuning": "instruct", 333 | "datasets": [], 334 | "params": 4.0e9, 335 | "quantization": "int8", 336 | "backend": "ct2", 337 | "architecture": "decoder-only-transformer", 338 | "license": "apache-2.0", 339 | "prompt_fmt": "<|prompt|>{instruction}<|answer|>", 340 | }, 341 | { 342 | "name": "h2o-danube2-1.8b-chat", 343 | "tuning": "instruct", 344 | "datasets": [], 345 | "params": 1.8e9, 346 | "quantization": "int8", 347 | "backend": "ct2", 348 | "architecture": "decoder-only-transformer", 349 | "license": "other", 350 | "prompt_fmt": "<|prompt|>{instruction}<|answer|>", 351 | }, 352 | { 353 | "name": "h2o-danube-1.8b-chat", 354 | "tuning": "instruct", 355 | "datasets": [], 356 | "params": 1.8e9, 357 | "quantization": "int8", 358 | "backend": "ct2", 359 | "architecture": "decoder-only-transformer", 360 | "license": "other", 361 | "prompt_fmt": "<|prompt|>{instruction}<|answer|>", 362 | }, 363 | { 364 | "name": "Falcon3-3B-Instruct", 365 | "tuning": "instruct", 366 | "languages": ["en", "fr", "es", "pt"], 367 | "revision": "b183d4d", 368 | "datasets": [], 369 | "params": 3.23e9, 370 | "quantization": "int8", 371 | "backend": "ct2", 372 | "context_length": 8192, 373 | "repetition_penalty": 1.1, 374 | "architecture": "decoder-only-transformer", 375 | "license": "falcon", 376 | "prompt_fmt": ( 377 | "<|system|>\nAnswer concisely.\n<|user|>\n{instruction}\n<|assistant|>\n" 378 | ), 379 | }, 380 | { 381 | "name": "phi-1_5", 382 | "tuning": "instruct", 383 | 
"datasets": ["phi-1_5"], 384 | "params": 1.4e9, 385 | "quantization": "int8", 386 | "backend": "ct2", 387 | "architecture": "decoder-only-transformer", 388 | "license": "other", 389 | "prompt_fmt": "{instruction}\n\nAnswer:", 390 | }, 391 | { 392 | "name": "h2o-danube3-500m-chat", 393 | "tuning": "instruct", 394 | "datasets": [], 395 | "params": 0.5e9, 396 | "quantization": "int8", 397 | "backend": "ct2", 398 | "architecture": "decoder-only-transformer", 399 | "license": "apache-2.0", 400 | "prompt_fmt": "<|prompt|>{instruction}<|answer|>", 401 | }, 402 | { 403 | "name": "SmolLM2-1.7B-Instruct", 404 | "tuning": "instruct", 405 | "revision": "83b1658", 406 | "datasets": [], 407 | "params": 1.7e9, 408 | "quantization": "int8", 409 | "backend": "ct2", 410 | "context_length": 2048, 411 | "repetition_penalty": 1.0, 412 | "architecture": "decoder-only-transformer", 413 | "license": "apache-2.0", 414 | "prompt_fmt": ( 415 | "<|im_start|>system\nAnswer concisely.<|im_end|>\n" 416 | "<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n" 417 | ), 418 | }, 419 | { 420 | "name": "SmolLM-1.7B-Instruct", 421 | "tuning": "instruct", 422 | "revision": "dc3dfe2", 423 | "datasets": [], 424 | "params": 1.7e9, 425 | "quantization": "int8", 426 | "backend": "ct2", 427 | "context_length": 2048, 428 | "repetition_penalty": 1.1, 429 | "architecture": "decoder-only-transformer", 430 | "license": "apache-2.0", 431 | "prompt_fmt": ( 432 | "<|im_start|>system\nAnswer concisely.<|im_end|>\n" 433 | "<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n" 434 | ), 435 | }, 436 | { 437 | "name": "Falcon3-1B-Instruct", 438 | "tuning": "instruct", 439 | "languages": ["en", "fr", "es", "pt"], 440 | "revision": "74391aa", 441 | "datasets": [], 442 | "params": 1.7e9, 443 | "quantization": "int8", 444 | "backend": "ct2", 445 | "context_length": 8192, 446 | "repetition_penalty": 1.1, 447 | "architecture": "decoder-only-transformer", 448 | "license": "falcon", 449 | "prompt_fmt": ( 450 | "<|system|>\nAnswer concisely.\n<|user|>\n{instruction}\n<|assistant|>\n" 451 | ), 452 | }, 453 | { 454 | "name": "Qwen2.5-1.5B-Instruct", 455 | "tuning": "instruct", 456 | "languages": [ 457 | "zh", 458 | "en", 459 | "fr", 460 | "es", 461 | "pt", 462 | "de", 463 | "it", 464 | "ru", 465 | "ja", 466 | "ko", 467 | "vi", 468 | "th", 469 | "ar", 470 | ], 471 | "revision": "5de22ab", 472 | "datasets": [], 473 | "params": 1.5e9, 474 | "quantization": "int8", 475 | "backend": "ct2", 476 | "context_length": 32 * 1024, 477 | "repetition_penalty": 1.1, 478 | "architecture": "decoder-only-transformer", 479 | "license": "apache-2.0", 480 | "prompt_fmt": ( 481 | "<|im_start|>system\nAnswer concisely.<|im_end|>\n" 482 | "<|im_start|>user\n{instruction}<|im_end|>\n" 483 | "<|im_start|>assistant\n" 484 | ), 485 | }, 486 | { 487 | "name": "Qwen2.5-0.5B-Instruct", 488 | "tuning": "instruct", 489 | "languages": [ 490 | "zh", 491 | "en", 492 | "fr", 493 | "es", 494 | "pt", 495 | "de", 496 | "it", 497 | "ru", 498 | "ja", 499 | "ko", 500 | "vi", 501 | "th", 502 | "ar", 503 | ], 504 | "revision": "554ffe5", 505 | "datasets": [], 506 | "params": 0.5e9, 507 | "quantization": "int8", 508 | "backend": "ct2", 509 | "context_length": 32 * 1024, 510 | "repetition_penalty": 1.1, 511 | "architecture": "decoder-only-transformer", 512 | "license": "apache-2.0", 513 | "prompt_fmt": ( 514 | "<|im_start|>system\nAnswer concisely.<|im_end|>\n" 515 | "<|im_start|>user\n{instruction}<|im_end|>\n" 516 | "<|im_start|>assistant\n" 517 | ), 518 | }, 519 | { 520 | 
"name": "SmolLM2-360M-Instruct", 521 | "tuning": "instruct", 522 | "revision": "ed9c4fe", 523 | "datasets": [], 524 | "params": 360e6, 525 | "quantization": "int8", 526 | "backend": "ct2", 527 | "context_length": 2048, 528 | "repetition_penalty": 1.0, 529 | "architecture": "decoder-only-transformer", 530 | "license": "apache-2.0", 531 | "prompt_fmt": ( 532 | "<|im_start|>system\nAnswer concisely.<|im_end|>\n" 533 | "<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n" 534 | ), 535 | }, 536 | { 537 | "name": "SmolLM-360M-Instruct", 538 | "tuning": "instruct", 539 | "revision": "0b0e861", 540 | "datasets": [], 541 | "params": 360e6, 542 | "quantization": "int8", 543 | "backend": "ct2", 544 | "context_length": 2048, 545 | "repetition_penalty": 1.1, 546 | "architecture": "decoder-only-transformer", 547 | "license": "apache-2.0", 548 | "prompt_fmt": ( 549 | "<|im_start|>system\nAnswer concisely.<|im_end|>\n" 550 | "<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n" 551 | ), 552 | }, 553 | { 554 | "name": "SmolLM2-135M-Instruct", 555 | "tuning": "instruct", 556 | "revision": "e52a3dc", 557 | "datasets": [], 558 | "params": 135e6, 559 | "quantization": "int8", 560 | "backend": "ct2", 561 | "context_length": 2048, 562 | "repetition_penalty": 1.0, 563 | "architecture": "decoder-only-transformer", 564 | "license": "apache-2.0", 565 | "prompt_fmt": ( 566 | "<|im_start|>system\nAnswer concisely.<|im_end|>\n" 567 | "<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n" 568 | ), 569 | }, 570 | { 571 | "name": "SmolLM-135M-Instruct", 572 | "tuning": "instruct", 573 | "revision": "90046ba", 574 | "datasets": [], 575 | "params": 135e6, 576 | "quantization": "int8", 577 | "backend": "ct2", 578 | "context_length": 2048, 579 | "repetition_penalty": 1.3, 580 | "architecture": "decoder-only-transformer", 581 | "license": "apache-2.0", 582 | "prompt_fmt": ( 583 | "<|im_start|>system\nAnswer concisely.<|im_end|>\n" 584 | "<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n" 585 | ), 586 | }, 587 | { 588 | "name": "LaMini-GPT-774M", 589 | "tuning": "instruct", 590 | "datasets": ["webtext", "lamini"], 591 | "params": 774e6, 592 | "quantization": "int8", 593 | "backend": "ct2", 594 | "architecture": "decoder-only-transformer", 595 | "license": "mit", 596 | "prompt_fmt": ( 597 | "Below is an instruction that describes a task.\n" 598 | "Write a response that completes the request.\n\n" 599 | "### Instruction:\n{instruction}\n\n### Response:" 600 | ), 601 | }, 602 | { 603 | "name": "LaMini-GPT-124M", 604 | "tuning": "instruct", 605 | "datasets": ["webtext", "lamini"], 606 | "params": 124e6, 607 | "quantization": "int8", 608 | "backend": "ct2", 609 | "architecture": "decoder-only-transformer", 610 | "license": "mit", 611 | "prompt_fmt": ( 612 | "Below is an instruction that describes a task.\n" 613 | "Write a response that completes the request.\n\n" 614 | "### Instruction:\n{instruction}\n\n### Response:" 615 | ), 616 | }, 617 | { 618 | "name": "TinyLlama-1.1B-Chat-v1.0", 619 | "tuning": "instruct", 620 | "datasets": ["slimpajama", "starcoderdata"], 621 | "params": 1.1e9, 622 | "quantization": "int8", 623 | "backend": "ct2", 624 | "architecture": "decoder-only-transformer", 625 | "license": "mit", 626 | "prompt_fmt": ("<|user|>{instruction}<|assistant|>"), 627 | }, 628 | { 629 | "name": "codet5p-770m-py", 630 | "tuning": "code", 631 | "datasets": ["github-code"], 632 | "params": 770e6, 633 | "quantization": "int8", 634 | "backend": "ct2", 635 | 
"architecture": "encoder-decoder-transformer", 636 | "license": "bsd-3-clause", 637 | }, 638 | { 639 | "name": "codet5p-220m-py", 640 | "tuning": "code", 641 | "datasets": ["github-code"], 642 | "params": 220e6, 643 | "quantization": "int8", 644 | "backend": "ct2", 645 | "architecture": "encoder-decoder-transformer", 646 | "license": "bsd-3-clause", 647 | }, 648 | { 649 | "name": "all-MiniLM-L6-v2", 650 | "tuning": "embedding", 651 | "revision": "28efeb4", 652 | "params": 22e6, 653 | "quantization": "int8", 654 | "backend": "ct2", 655 | "architecture": "encoder-only-transformer", 656 | "license": "apache-2.0", 657 | }, 658 | { 659 | "name": "gte-tiny", 660 | "tuning": "embedding", 661 | "params": 22e6, 662 | "quantization": "int8", 663 | "backend": "ct2", 664 | "architecture": "encoder-only-transformer", 665 | "license": "mit", 666 | }, 667 | { 668 | "name": "gte-small", 669 | "tuning": "embedding", 670 | "params": 33e6, 671 | "quantization": "int8", 672 | "backend": "ct2", 673 | "architecture": "encoder-only-transformer", 674 | "license": "mit", 675 | }, 676 | { 677 | "name": "GIST-small-Embedding-v0", 678 | "tuning": "embedding", 679 | "params": 33e6, 680 | "quantization": "int8", 681 | "backend": "ct2", 682 | "architecture": "encoder-only-transformer", 683 | "license": "mit", 684 | }, 685 | { 686 | "name": "bge-small-en", 687 | "tuning": "embedding", 688 | "query_prefix": "Represent this sentence for searching relevant passages: ", 689 | "params": 33e6, 690 | "quantization": "int8", 691 | "backend": "ct2", 692 | "architecture": "encoder-only-transformer", 693 | "license": "mit", 694 | }, 695 | { 696 | "name": "e5-small-v2", 697 | "tuning": "embedding", 698 | "params": 33e6, 699 | "quantization": "int8", 700 | "backend": "ct2", 701 | "architecture": "encoder-only-transformer", 702 | "license": "mit", 703 | }, 704 | { 705 | "name": "granite-embedding-125m-english", 706 | "tuning": "embedding", 707 | "params": 30e6, 708 | "quantization": "int8", 709 | "backend": "ct2", 710 | "architecture": "encoder-only-transformer", 711 | "license": "apache-2.0", 712 | }, 713 | { 714 | "name": "granite-embedding-107m-multilingual", 715 | "tuning": "embedding", 716 | "params": 30e6, 717 | "quantization": "int8", 718 | "backend": "ct2", 719 | "architecture": "encoder-only-transformer", 720 | "license": "apache-2.0", 721 | }, 722 | { 723 | "name": "granite-embedding-30m-english", 724 | "tuning": "embedding", 725 | "params": 30e6, 726 | "quantization": "int8", 727 | "backend": "ct2", 728 | "architecture": "encoder-only-transformer", 729 | "license": "apache-2.0", 730 | }, 731 | { 732 | "name": "multilingual-e5-small", 733 | "tuning": "embedding", 734 | "params": 120e6, 735 | "quantization": "int8", 736 | "backend": "ct2", 737 | "architecture": "encoder-only-transformer", 738 | "license": "mit", 739 | }, 740 | ] 741 | 742 | 743 | class Config(dict): 744 | """ 745 | Store configuration information for the package. 746 | 747 | This is a dictionary that provides data basic data validation. 748 | 749 | Only appropriate keys and values are allowed to be set. 750 | 751 | >>> c = Config({'max_ram': '4gb'}) 752 | >>> c 753 | {...'max_ram': 4.0...} 754 | 755 | >>> c = Config({'instruct_model': 'flan-t5-small'}) 756 | >>> c 757 | {...'instruct_model': 'flan-t5-small'...} 758 | 759 | >>> c = Config({'model_license': 'apache|mit|bsd'}) 760 | >>> c 761 | {...'model_license': re.compile('apache|mit|bsd')...} 762 | 763 | >>> c = Config({'instruct_model': 'flan-t5-bad'}) 764 | Traceback (most recent call last): 765 | ... 
766 | KeyError: 'flan-t5-bad' 767 | 768 | >>> c = Config({'bad_value': 1}) 769 | Traceback (most recent call last): 770 | ... 771 | KeyError: 'bad_value' 772 | 773 | >>> c = Config() 774 | >>> c.update({'bad_value': 1}) 775 | Traceback (most recent call last): 776 | ... 777 | KeyError: 'bad_value' 778 | 779 | """ 780 | 781 | model_names = {m["name"]: m for m in models} 782 | 783 | def __init__(self, config={}): 784 | # Defaults are loaded first 785 | for key in Config.schema: 786 | self[key] = self.schema[key].default 787 | 788 | # Environment variables override defaults 789 | for key in Config.schema: 790 | value = os.environ.get(f"LANGUAGEMODELS_{key.upper()}") 791 | if value: 792 | self[key] = value 793 | 794 | # Any values passed in the config dict override environment vars 795 | for key in config.keys(): 796 | self[key] = config[key] 797 | 798 | def __setitem__(self, key, value): 799 | super().__setitem__(key, Config.schema[key].initfn(value)) 800 | 801 | # Auto-adjust instruct_model when filters change 802 | if key == "max_ram" or key == "model_license": 803 | found = set() 804 | for model in models: 805 | if model["quantization"] == "int8": 806 | memsize = model["params"] / 1e9 807 | elif model["quantization"] == "q3_k_m": 808 | memsize = model["params"] * 0.48 / 1e9 809 | elif model["quantization"] == "q4_k_m": 810 | memsize = model["params"] * 0.59 / 1e9 811 | 812 | sizefit = memsize < self["max_ram"] 813 | 814 | if "model_license" in self: 815 | licensematch = self["model_license"].match(model["license"]) 816 | else: 817 | licensematch = True 818 | 819 | if model["tuning"] not in found and sizefit and licensematch: 820 | self[model["tuning"] + "_model"] = model["name"] 821 | found.add(model["tuning"]) 822 | 823 | if len(found) < 3: 824 | raise ModelFilterException("Unable to find models to match filters") 825 | 826 | def update(self, other): 827 | for key in other: 828 | self[key] = other[key] 829 | 830 | def use_hf_model(self, hf_path, revision, model_type="instruct"): 831 | """Load and use a model from Huggingface 832 | 833 | :param hf_path: Path for the model e.g. 
"org/model" 834 | :param revision: The model git revision to load 835 | :param model_type: Model type to load 836 | """ 837 | 838 | assert "ct2" in hf_path.lower() 839 | assert "int8" in hf_path.lower() 840 | 841 | # We defer importing jinja2 until this point as it is only needed 842 | # for interpolating hf model chat templates and does not need 843 | # to be installed unless this method is used 844 | from jinja2 import Environment, BaseLoader 845 | 846 | tok_config = hf_hub_download( 847 | hf_path, "tokenizer_config.json", revision=revision 848 | ) 849 | 850 | with open(tok_config) as f: 851 | chat_template = json.load(f)["chat_template"] 852 | 853 | env = Environment(loader=BaseLoader()) 854 | 855 | template = env.from_string(chat_template) 856 | 857 | prompt_fmt = template.render( 858 | messages=[{"role": "user", "content": "{instruction}"}], 859 | add_generation_prompt=True, 860 | ) 861 | 862 | model = { 863 | "name": hf_path, 864 | "backend": "ct2", 865 | "quantization": "int8", 866 | "architecture": "decoder-only-transformer", 867 | "max_tokens": 2048, 868 | "params": 0, 869 | "prompt_fmt": prompt_fmt, 870 | } 871 | 872 | models.insert(0, model) 873 | self.model_names[model["name"]] = model 874 | self[f"{model_type}_model"] = model["name"] 875 | 876 | @staticmethod 877 | def validate_model(model_name): 878 | return Config.model_names[model_name]["name"] 879 | 880 | @staticmethod 881 | def validate_device(device): 882 | assert device in ["auto", "cpu"] 883 | 884 | return device 885 | 886 | @staticmethod 887 | def convert_to_gb(space): 888 | """Convert max RAM string to int 889 | 890 | Output will be in gigabytes 891 | 892 | If not specified, input is assumed to be in gigabytes 893 | 894 | >>> Config.convert_to_gb("512") 895 | 512.0 896 | 897 | >>> Config.convert_to_gb(".5") 898 | 0.5 899 | 900 | >>> Config.convert_to_gb("4G") 901 | 4.0 902 | 903 | >>> Config.convert_to_gb("256mb") 904 | 0.25 905 | 906 | >>> Config.convert_to_gb("256M") 907 | 0.25 908 | 909 | >>> Config.convert_to_gb("small") 910 | 0.2 911 | 912 | >>> Config.convert_to_gb("base") 913 | 0.48 914 | 915 | >>> Config.convert_to_gb("large") 916 | 1.0 917 | 918 | >>> Config.convert_to_gb("xl") 919 | 4.0 920 | 921 | >>> Config.convert_to_gb("xxl") 922 | 16.0 923 | """ 924 | 925 | if isinstance(space, int) or isinstance(space, float): 926 | return float(space) 927 | 928 | size_names = { 929 | "small": 0.2, 930 | "base": 0.48, 931 | "large": 1.0, 932 | "xl": 4.0, 933 | "xxl": 16.0, 934 | } 935 | 936 | if space.lower().strip() in size_names: 937 | return size_names[space.lower().strip()] 938 | 939 | multipliers = { 940 | "g": 1.0, 941 | "m": 2**-10, 942 | } 943 | 944 | space = space.lower() 945 | space = space.rstrip("b") 946 | 947 | if space[-1] in multipliers: 948 | return float(space[:-1]) * multipliers[space[-1]] 949 | else: 950 | return float(space) 951 | 952 | 953 | Config.schema = { 954 | "max_ram": ConfigItem(Config.convert_to_gb, 0.48), 955 | "max_tokens": ConfigItem(int, 200), 956 | "echo": ConfigItem(int, False), 957 | "device": ConfigItem(Config.validate_device, "cpu"), 958 | "model_license": ConfigItem(re.compile, ".*"), 959 | "instruct_model": ConfigItem(Config.validate_model, "LaMini-Flan-T5-248M"), 960 | "embedding_model": ConfigItem(Config.validate_model, "all-MiniLM-L6-v2"), 961 | "code_model": ConfigItem(Config.validate_model, "codet5p-220m-py"), 962 | "max_prompt_length": ConfigItem(int, 50_000), 963 | } 964 | 965 | config = Config() 966 | 967 | if "COLAB_GPU" in os.environ: 968 | if 
len(os.environ["COLAB_GPU"]) > 0: 969 | # We have a Colab GPU, so default to using it 970 | config["device"] = "auto" 971 | --------------------------------------------------------------------------------