├── examples ├── requirements.txt ├── chat.py ├── streamlit.py ├── streamlitchat.py ├── assistant.py ├── translate.ipynb ├── summarize.ipynb ├── tools.ipynb └── extractive_qa_embeddings.ipynb ├── .gitignore ├── requirements.txt ├── media ├── hello.gif └── model-comparison.png ├── .github └── workflows │ ├── lint.yml │ ├── pi.yml │ ├── pages.yml │ ├── memperf.yml │ └── build.yml ├── test ├── gen_docs.py ├── embed.py ├── perf.py ├── npr.html └── planets.json ├── setup.py ├── license.md ├── makefile ├── languagemodels ├── preprocess.py ├── models.py ├── embeddings.py ├── inference.py ├── __init__.py └── config.py ├── paper.md ├── changelog.md ├── readme.md └── paper.bib /examples/requirements.txt: -------------------------------------------------------------------------------- 1 | languagemodels 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | languagemodels/__pycache__ 2 | notebooks 3 | tools 4 | notes 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | huggingface_hub 2 | ctranslate2>=4.4.0 3 | tokenizers 4 | -------------------------------------------------------------------------------- /media/hello.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jncraton/languagemodels/HEAD/media/hello.gif -------------------------------------------------------------------------------- /media/model-comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jncraton/languagemodels/HEAD/media/model-comparison.png -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | lint: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v4 10 | - name: Install dependencies 11 | run: | 12 | pip install flake8 13 | - name: Lint 14 | run: make lint 15 | -------------------------------------------------------------------------------- /examples/chat.py: -------------------------------------------------------------------------------- 1 | """A simple CLI chatbot""" 2 | 3 | import languagemodels as lm 4 | 5 | prompt = f"System: Reply as a helpful assistant. Currently {lm.get_date()}." 
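# The dialog prompt grows each turn: the user message and the assistant reply are appended below so the model always sees the full history.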
6 | 7 | while True: 8 | user_message = input("\nUser: ") 9 | 10 | prompt += f"\n\nUser: {user_message}" 11 | 12 | print(prompt) 13 | 14 | prompt += "\n\nAssistant:" 15 | 16 | response = lm.chat(prompt) 17 | print(f"\nAssistant: {response}") 18 | 19 | prompt += f" {response}" 20 | -------------------------------------------------------------------------------- /test/gen_docs.py: -------------------------------------------------------------------------------- 1 | """ Generates docs for testing 2 | 3 | All documents come from Wikipedia 4 | """ 5 | 6 | import languagemodels as lm 7 | import json 8 | 9 | planets = [ 10 | "Mercury", 11 | "Venus", 12 | "Earth", 13 | "Mars", 14 | "Jupiter", 15 | "Saturn", 16 | "Uranus", 17 | "Neptune", 18 | ] 19 | 20 | with open("test/planets.json", "w") as f: 21 | docs = [{"name": p, "content": lm.get_wiki(f"Planet {p}")} for p in planets] 22 | json.dump(docs, f) 23 | -------------------------------------------------------------------------------- /examples/streamlit.py: -------------------------------------------------------------------------------- 1 | """A simple inference UI using Streamlit 2 | 3 | Run this application using `streamlit run {filename}` 4 | 5 | A live version of this application is hosted here: 6 | 7 | https://jncraton-languagemodels-examplesstreamlit-0h6yr7.streamlit.app/ 8 | """ 9 | 10 | import streamlit as st 11 | import languagemodels as lm 12 | 13 | st.title("[languagemodels](https://github.com/jncraton/languagemodels) Demo") 14 | 15 | st.text_input("Prompt (passed to `lm.do()`)", key="prompt") 16 | 17 | # Prompt LLM to get response 18 | response = lm.do(st.session_state.prompt) 19 | 20 | st.write(response) 21 | -------------------------------------------------------------------------------- /.github/workflows/pi.yml: -------------------------------------------------------------------------------- 1 | name: ARM64 Pi 2 | 3 | on: 4 | push: 5 | paths-ignore: 6 | - '**/*.md' 7 | pull_request: 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-22.04 12 | steps: 13 | - uses: actions/checkout@v3 14 | - uses: pguyot/arm-runner-action@v2 15 | with: 16 | base_image: raspios_lite_arm64:2023-05-03 17 | image_additional_mb: 2048 18 | commands: | 19 | sudo apt install -y python3 python3-pip python3-venv 20 | python3 -m venv .venv 21 | . .venv/bin/activate 22 | pip3 install . 
23 | make test-base 24 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("readme.md", "r") as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name="languagemodels", 8 | version="0.24.0", 9 | author="Jon Craton", 10 | author_email="jon@joncraton.com", 11 | description="Simple inference for large language models", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/jncraton/languagemodels", 15 | packages=setuptools.find_packages(), 16 | classifiers=[ 17 | "Programming Language :: Python :: 3", 18 | "License :: OSI Approved :: MIT License", 19 | "Operating System :: OS Independent", 20 | ], 21 | python_requires='>=3.8', 22 | install_requires=[ 23 | "huggingface_hub", 24 | "ctranslate2>=4.4.0", 25 | "tokenizers", 26 | ], 27 | ) -------------------------------------------------------------------------------- /.github/workflows/pages.yml: -------------------------------------------------------------------------------- 1 | name: Deploy docs to Pages 2 | 3 | on: 4 | push: 5 | branches: ["main"] 6 | workflow_dispatch: 7 | 8 | permissions: 9 | contents: read 10 | pages: write 11 | id-token: write 12 | 13 | concurrency: 14 | group: "pages" 15 | cancel-in-progress: false 16 | 17 | jobs: 18 | deploy: 19 | environment: 20 | name: github-pages 21 | url: ${{ steps.deployment.outputs.page_url }} 22 | runs-on: ubuntu-latest 23 | steps: 24 | - name: Checkout 25 | uses: actions/checkout@v4 26 | - name: Install deps 27 | run: | 28 | pip install -r requirements.txt 29 | pip install pdoc 30 | - name: Generate docs 31 | run: make doc 32 | - name: Setup Pages 33 | uses: actions/configure-pages@v5 34 | - name: Upload artifact 35 | uses: actions/upload-pages-artifact@v3 36 | with: 37 | path: 'doc' 38 | - name: Deploy to GitHub Pages 39 | id: deployment 40 | uses: actions/deploy-pages@v4 41 | -------------------------------------------------------------------------------- /examples/streamlitchat.py: -------------------------------------------------------------------------------- 1 | """A simple web chatbot using streamlit 2 | 3 | Run this application using `streamlit run {filename}` 4 | 5 | A live version of this bot is available here: 6 | 7 | https://jncraton-languagemodels-examplesstreamlitchat-s4uj7z.streamlit.app/ 8 | """ 9 | 10 | import streamlit as st 11 | import languagemodels as lm 12 | 13 | st.title("Chatbot") 14 | 15 | 16 | def reset(): 17 | st.session_state.dialog = "" 18 | st.session_state.message = "" 19 | 20 | 21 | # Initialize empty dialog context on first run 22 | if "dialog" not in st.session_state: 23 | reset() 24 | 25 | if st.session_state.message: 26 | # Add new message to dialog 27 | st.session_state.dialog += f"User: {st.session_state.message}\n\nAssistant: " 28 | st.session_state.message = "" 29 | 30 | # Prompt LLM to get response 31 | response = lm.chat(f"{st.session_state.dialog}") 32 | 33 | # Display full dialog 34 | st.session_state.dialog += response + "\n\n" 35 | 36 | st.write(st.session_state.dialog) 37 | 38 | st.text_input("Message", key="message") 39 | 40 | st.button("Reset", on_click=reset) 41 | -------------------------------------------------------------------------------- /.github/workflows/memperf.yml: -------------------------------------------------------------------------------- 1 | name: Memory Performance 2 | 3 | on: 4 | push: 
5 | paths-ignore: 6 | - '**/*.md' 7 | pull_request: 8 | 9 | jobs: 10 | test: 11 | strategy: 12 | matrix: 13 | python-version: ["3.11"] 14 | os: [ubuntu-latest, windows-latest, macos-latest] 15 | max_ram: [".5", "1", "4"] 16 | runs-on: ${{ matrix.os }} 17 | steps: 18 | - uses: actions/checkout@v4 19 | - name: Cache Models 20 | id: cache-models 21 | uses: actions/cache@v4 22 | with: 23 | path: ~/.cache/huggingface 24 | key: models 25 | - name: Set up Python ${{ matrix.python-version }} 26 | uses: actions/setup-python@v5 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | allow-prereleases: true 30 | - name: Install dependencies 31 | run: | 32 | python -m pip install --upgrade pip 33 | pip install -r requirements.txt 34 | pip install psutil 35 | - name: Test Memory Usage 36 | run: env LANGUAGEMODELS_MAX_RAM=${{ matrix.max_ram }} make test-perf 37 | -------------------------------------------------------------------------------- /license.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Jon Craton 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /examples/assistant.py: -------------------------------------------------------------------------------- 1 | """ A simple assistant 2 | 3 | The assistant uses information retrieval to obtain context from a small set 4 | of stored documents. The included information is the current weather, current 5 | date, and a brief summary of the Python programming language and the planet 6 | Saturn. 7 | 8 | A number of demonstration questions are answered to show the available 9 | functionality.
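To try it, install the package (`pip install languagemodels`) and run `python3 examples/assistant.py`; the first run downloads the default models.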
10 | """ 11 | 12 | import languagemodels as lm 13 | 14 | 15 | def assist(question): 16 | context = lm.get_doc_context(question) 17 | 18 | return lm.do(f"Answer using context: {context} Question: {question}") 19 | 20 | 21 | lat, lon = (41.8, -87.6) 22 | 23 | lm.store_doc(lm.get_wiki("Python language"), "Python") 24 | lm.store_doc(lm.get_wiki("Planet Saturn"), "Saturn") 25 | lm.store_doc(lm.get_weather(lat, lon), "Weather") 26 | lm.store_doc(lm.get_date(), "Time") 27 | 28 | questions = [ 29 | "What day of the week is it?", 30 | "Is it going to rain today?", 31 | "What time is it?", 32 | "Who created Python?", 33 | "How many moons does Saturn have?", 34 | ] 35 | 36 | for question in questions: 37 | print(f"{question} {assist(question)}") 38 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: x64 Desktop 2 | 3 | on: 4 | push: 5 | paths-ignore: 6 | - '**/*.md' 7 | pull_request: 8 | 9 | jobs: 10 | test: 11 | strategy: 12 | matrix: 13 | python-version: ["3.9", "3.10", "3.11", "3.12"] 14 | os: [ubuntu-22.04, ubuntu-latest, windows-latest, macos-latest, macos-14] 15 | exclude: 16 | - os: macos-14 17 | python-version: "3.8" 18 | - os: macos-14 19 | python-version: "3.9" 20 | - os: macos-latest 21 | python-version: "3.8" 22 | - os: macos-latest 23 | python-version: "3.9" 24 | runs-on: ${{ matrix.os }} 25 | steps: 26 | - uses: actions/checkout@v4 27 | - name: Cache Models 28 | id: cache-models 29 | uses: actions/cache@v4 30 | with: 31 | path: ~/.cache/huggingface 32 | key: models 33 | - name: Set up Python ${{ matrix.python-version }} 34 | uses: actions/setup-python@v5 35 | with: 36 | python-version: ${{ matrix.python-version }} 37 | allow-prereleases: true 38 | - name: Install dependencies 39 | run: | 40 | pip install .
41 | - name: Test 42 | run: make test 43 | -------------------------------------------------------------------------------- /test/embed.py: -------------------------------------------------------------------------------- 1 | import languagemodels as lm 2 | import numpy as np 3 | import time 4 | import json 5 | import os 6 | import psutil 7 | 8 | 9 | def mem_used_gb(): 10 | process = psutil.Process(os.getpid()) 11 | bytes = process.memory_info().rss 12 | gigabytes = bytes * 1e-9 13 | return gigabytes 14 | 15 | 16 | print(f"Memory used before loading models: {mem_used_gb():.2f}GB") 17 | 18 | print("\n# Embedding Tests\n") 19 | 20 | planets = json.load(open("test/planets.json"))[-4:] 21 | 22 | # Make sure the model is loaded before testing 23 | start = time.perf_counter_ns() 24 | lm.docs.store("just initializing") 25 | lm.docs.clear() 26 | print(f"Model load time: {(time.perf_counter_ns() - start) / 1e6:.0f}ms") 27 | 28 | start = time.perf_counter_ns() 29 | for planet in planets: 30 | lm.docs.store(planet["content"], planet["name"]) 31 | ms = (time.perf_counter_ns() - start) / 1e6 32 | print( 33 | f"Embedded {len(lm.docs.chunks)} chunks in {ms:.0f}ms ({ms/len(lm.docs.chunks):.0f}ms per chunk)" 34 | ) 35 | 36 | start = time.perf_counter_ns() 37 | print(lm.get_doc_context("Which planets have rings?")) 38 | print(f"Search time: {(time.perf_counter_ns() - start) / 1e6:.0f}ms") 39 | lm.docs.clear() 40 | 41 | # Create many fake docs to benchmark search 42 | # We create 10 unique docs then duplicate them 43 | # A fully random set of docs would be better, but takes a long time to generate 44 | docs = [lm.embeddings.Document(str(i), np.random.rand(384)) for i in range(10)] 45 | start = time.perf_counter_ns() 46 | lm.embeddings.search("Test", docs * 10000) 47 | print(f"100k search time: {(time.perf_counter_ns() - start) / 1e6:.0f}ms") 48 | docs = None 49 | 50 | max_ram = lm.config["max_ram"] 51 | print( 52 | f"Memory used after all tests: {mem_used_gb():.2f}GB (must be under {max_ram:.2f}GB)" 53 | ) 54 | 55 | # Confirm that we fit in max_ram after running all tests 56 | assert mem_used_gb() < max_ram 57 | -------------------------------------------------------------------------------- /examples/translate.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "collapsed_sections": [ 8 | "K1yoiesR8O24" 9 | ] 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "language_info": { 16 | "name": "python" 17 | } 18 | }, 19 | "cells": [ 20 | { 21 | "cell_type": "markdown", 22 | "source": [ 23 | "# Install and Import Package" 24 | ], 25 | "metadata": { 26 | "id": "K1yoiesR8O24" 27 | } 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "id": "qwyfeGSL7myi" 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "%pip install languagemodels\n", 38 | "\n", 39 | "import languagemodels as lm" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "source": [ 45 | "# Translation Example" 46 | ], 47 | "metadata": { 48 | "id": "XRguig1K8WD3" 49 | } 50 | }, 51 | { 52 | "cell_type": "code", 53 | "source": [ 54 | "lm.do(\"Translate to English: Hola, mundo!\")\n" 55 | ], 56 | "metadata": { 57 | "colab": { 58 | "base_uri": "https://localhost:8080/", 59 | "height": 35 60 | }, 61 | "id": "-K_JCHWZ7v8t", 62 | "outputId": "85b30980-26d0-4caf-d520-f4e72bb9f17e" 63 | }, 64 | "execution_count": 3, 65 | 
"outputs": [ 66 | { 67 | "output_type": "execute_result", 68 | "data": { 69 | "text/plain": [ 70 | "'Hello, world!'" 71 | ], 72 | "application/vnd.google.colaboratory.intrinsic+json": { 73 | "type": "string" 74 | } 75 | }, 76 | "metadata": {}, 77 | "execution_count": 3 78 | } 79 | ] 80 | } 81 | ] 82 | } -------------------------------------------------------------------------------- /test/perf.py: -------------------------------------------------------------------------------- 1 | import languagemodels as lm 2 | import time 3 | import json 4 | import os 5 | import psutil 6 | 7 | 8 | def mem_used_gb(): 9 | process = psutil.Process(os.getpid()) 10 | bytes = process.memory_info().rss 11 | gigabytes = bytes * 1e-9 12 | return gigabytes 13 | 14 | 15 | print(f"Memory used before loading models: {mem_used_gb():.2f}GB") 16 | 17 | 18 | print("\n# Completion Test\n") 19 | 20 | print(f'{lm.complete("They ran until")=}') 21 | 22 | print("\n# Chat Test\n") 23 | 24 | print( 25 | lm.chat( 26 | """ 27 | System: Respond helpfully. It is Monday 28 | 29 | User: What day is it? 30 | 31 | Assistant: 32 | """ 33 | ) 34 | ) 35 | 36 | 37 | print("\n# Instruction Tests\n") 38 | 39 | tests = [ 40 | ("What is the capital of France?", "Paris"), 41 | ("A game uses a bat and ball. Is it baseball or soccer?", "Baseball"), 42 | ("Is grass green or blue?", "Green"), 43 | ("Does a car have more wheels than a bike?", "Yes"), 44 | ] 45 | 46 | accuracy = 0 47 | 48 | 49 | start = time.perf_counter_ns() 50 | 51 | lm.do("Test first run time") 52 | 53 | print(f"Initialization time: {(time.perf_counter_ns() - start) / 1e6:.0f}ms") 54 | 55 | print(f"Memory used after running chat inference: {mem_used_gb():.2f}GB") 56 | 57 | start = time.perf_counter_ns() 58 | chars_generated = 0 59 | 60 | for test in tests: 61 | response = lm.do(test[0]) 62 | chars_generated += len(response) 63 | if test[1].lower() in response.lower(): 64 | accuracy += 1 / len(tests) 65 | print(test[0], response) 66 | 67 | print( 68 | f"Average inference time: {(time.perf_counter_ns() - start)/len(tests)/1e6:.0f}ms" 69 | ) 70 | 71 | print( 72 | f"{(time.perf_counter_ns() - start)/chars_generated/1e6:.0f}ms per character generated" 73 | ) 74 | 75 | print(f"Overall accuracy: {accuracy:.2f}") 76 | 77 | print(f"Memory used after running inference: {mem_used_gb():.2f}GB") 78 | 79 | max_ram = lm.config["max_ram"] 80 | print( 81 | f"Memory used after all tests: {mem_used_gb():.2f}GB (must be under {max_ram:.2f}GB)" 82 | ) 83 | 84 | # Confirm that we used the right model size and roughly fit in memory constraints 85 | # Note that memory usage will vary between operating systems and specific usage 86 | assert mem_used_gb() < max_ram * 1.10 87 | -------------------------------------------------------------------------------- /makefile: -------------------------------------------------------------------------------- 1 | all: lint test 2 | 3 | .PHONY: test test-base lint format spellcheck upload clean 4 | 5 | test-base: 6 | python3 -m doctest -o ELLIPSIS -o NORMALIZE_WHITESPACE languagemodels/*.py 7 | env LANGUAGEMODELS_MODEL_LICENSE="apache|mit|bsd" python3 -m doctest -o ELLIPSIS -o NORMALIZE_WHITESPACE languagemodels/__init__.py 8 | LANGUAGEMODELS_INSTRUCT_MODEL="Qwen2.5-0.5B-Instruct" python3 -m doctest -o ELLIPSIS -o NORMALIZE_WHITESPACE languagemodels/inference.py 9 | 10 | test: test-base 11 | env LANGUAGEMODELS_MAX_RAM=large python3 -m doctest -o ELLIPSIS -o NORMALIZE_WHITESPACE languagemodels/*.py 12 | env LANGUAGEMODELS_MAX_RAM=xl python3 -m doctest -o ELLIPSIS -o 
NORMALIZE_WHITESPACE languagemodels/*.py 13 | 14 | test-perf: 15 | PYTHONPATH=. python3 test/perf.py 16 | 17 | test-commercial: 18 | env LANGUAGEMODELS_SIZE=small LANGUAGEMODELS_MODEL_LICENSE="apache|mit|bsd" python3 -m doctest -o ELLIPSIS -o NORMALIZE_WHITESPACE languagemodels/*.py 19 | env LANGUAGEMODELS_MODEL_LICENSE="apache|mit|bsd" python3 -m doctest -o ELLIPSIS -o NORMALIZE_WHITESPACE languagemodels/*.py 20 | env LANGUAGEMODELS_SIZE=large LANGUAGEMODELS_MODEL_LICENSE="apache|mit|bsd" python3 -m doctest -o ELLIPSIS -o NORMALIZE_WHITESPACE languagemodels/*.py 21 | env LANGUAGEMODELS_SIZE=xl LANGUAGEMODELS_MODEL_LICENSE="apache|mit|bsd" python3 -m doctest -o ELLIPSIS -o NORMALIZE_WHITESPACE languagemodels/*.py 22 | 23 | lint: 24 | flake8 --max-line-length 88 --extend-ignore E203,F401 languagemodels/__init__.py 25 | flake8 --max-line-length 88 --extend-ignore E203 languagemodels/models.py languagemodels/inference.py languagemodels/embeddings.py languagemodels/config.py languagemodels/preprocess.py examples/*.py 26 | 27 | format: 28 | black languagemodels/*.py examples/*.py test/*.py 29 | 30 | doc: 31 | mkdir -p doc 32 | python3 -m pdoc -o doc languagemodels 33 | 34 | paper.pdf: paper.md paper.bib 35 | pandoc $< --citeproc --pdf-engine=xelatex -o $@ 36 | 37 | spellcheck: 38 | aspell -c --dont-backup readme.md 39 | aspell -c --dont-backup paper.md 40 | 41 | upload: 42 | python3 setup.py sdist bdist_wheel 43 | python3 -m twine upload dist/* 44 | 45 | clean: 46 | rm -rf tmp 47 | rm -rf languagemodels.egg-info 48 | rm -rf languagemodels/__pycache__ 49 | rm -rf dist 50 | rm -rf build 51 | rm -rf doc 52 | rm -rf .ipynb_checkpoints 53 | rm -rf examples/.ipynb_checkpoints 54 | -------------------------------------------------------------------------------- /languagemodels/preprocess.py: -------------------------------------------------------------------------------- 1 | from html import unescape 2 | from html.parser import HTMLParser 3 | 4 | 5 | def get_html_paragraphs(src: str): 6 | """ 7 | Return plain text paragraphs from an HTML source 8 | 9 | :param src: HTML document to convert to plain text paragraphs 10 | :return: Plain text paragraphs of document 11 | 12 | This function is designed to be quick rather than robust. 13 | 14 | It follows a simple approach to extracting text: 15 | 16 | 1. Ignore all content inside the following elements listed in `ignore`. 17 | 2. Merge inline text content into paragraphs from `inlines` set. 18 | 3. Convert any newly merged text element with at least `min_length` 19 | characters to a paragraph in the output text. 20 | 21 | >>> get_html_paragraphs(open("test/wp.html", encoding="utf-8").read()) 22 | 'Bolu Province (Turkish: Bolu ili) is a province...' 23 | 24 | >>> get_html_paragraphs(open("test/npr.html", encoding="utf-8").read()) 25 | "First, the good news. Netflix reported a record ..." 
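Note: the `min_length` threshold referenced above is currently hard-coded; only merged text runs longer than 140 characters are kept as output paragraphs.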
26 | """ 27 | 28 | class ParagraphExtractor(HTMLParser): 29 | paras = [""] 30 | ignoring = [] 31 | ignore = ("script", "style", "header", "footer") 32 | ignore_attrs = {('hidden', 'hidden'), } 33 | inlines = ("a", "b", "i", "span", "sup", "sub", "strong", "em") 34 | 35 | def handle_starttag(self, tag, attrs): 36 | if tag in self.ignore or self.ignore_attrs & set(attrs): 37 | self.ignoring.append(tag) 38 | 39 | if tag not in self.inlines and self.paras[-1]: 40 | self.paras.append("") 41 | 42 | def handle_endtag(self, tag): 43 | if self.ignoring and self.ignoring[-1] == tag: 44 | self.ignoring.pop() 45 | 46 | if tag not in self.inlines and self.paras[-1]: 47 | self.paras.append("") 48 | 49 | def handle_data(self, data): 50 | if not self.ignoring: 51 | if self.paras and self.paras[-1]: 52 | self.paras[-1] += unescape(data) 53 | else: 54 | self.paras.append(data) 55 | 56 | def get_plain(self): 57 | return "\n\n".join([p.rstrip() for p in self.paras if len(p.strip()) > 140]) 58 | 59 | extractor = ParagraphExtractor() 60 | extractor.feed(src) 61 | return extractor.get_plain() 62 | -------------------------------------------------------------------------------- /examples/summarize.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "gpuType": "T4" 8 | }, 9 | "kernelspec": { 10 | "name": "python3", 11 | "display_name": "Python 3" 12 | }, 13 | "language_info": { 14 | "name": "python" 15 | }, 16 | "accelerator": "GPU" 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "source": [ 22 | "# Install and import" 23 | ], 24 | "metadata": { 25 | "id": "5KNsOIs5qEaa" 26 | } 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": { 32 | "id": "IF-HnHg1ayYW" 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "%pip install languagemodels\n", 37 | "import languagemodels as lm" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "source": [ 43 | "# Summarize list of documents" 44 | ], 45 | "metadata": { 46 | "id": "fOKBZ7zwqKU2" 47 | } 48 | }, 49 | { 50 | "cell_type": "code", 51 | "source": [ 52 | "docs = [\n", 53 | " 'Python is a high-level, general-purpose programming language. Its design philosophy emphasizes code readability with the use of significant indentation.\\n\\nPython is dynamically typed and garbage-collected. It supports multiple programming paradigms, including structured (particularly procedural), object-oriented and functional programming. It is often described as a \"batteries included\" language due to its comprehensive standard library.\\n\\nGuido van Rossum began working on Python in the late 1980s as a successor to the ABC programming language and first released it in 1991 as Python\\xa00.9.0. Python\\xa02.0 was released in 2000. Python\\xa03.0, released in 2008, was a major revision not completely backward-compatible with earlier versions. Python\\xa02.7.18, released in 2020, was the last release of Python\\xa02.\\n\\nPython consistently ranks as one of the most popular programming languages.',\n", 54 | " 'Natural language processing (NLP) is an interdisciplinary subfield of computer science and linguistics. It is primarily concerned with giving computers the ability to support and manipulate human language. It involves processing natural language datasets, such as text corpora or speech corpora, using either rule-based or probabilistic (i.e. 
statistical and, most recently, neural network-based) machine learning approaches. The goal is a computer capable of \"understanding\" the contents of documents, including the contextual nuances of the language within them. The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.\\n\\nChallenges in natural language processing frequently involve speech recognition, natural-language understanding, and natural-language generation.',\n", 55 | "]\n", 56 | "\n", 57 | "for doc in docs:\n", 58 | " result = lm.do(f\"Summarize the following text in one short sentence. Text: {doc}\")\n", 59 | " print(result)" 60 | ], 61 | "metadata": { 62 | "id": "OcKsYj50a-u2" 63 | }, 64 | "execution_count": null, 65 | "outputs": [] 66 | } 67 | ] 68 | } -------------------------------------------------------------------------------- /examples/tools.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "cdc28c56-5631-4abf-b485-0f226962165f", 6 | "metadata": {}, 7 | "source": [ 8 | "# Tool Usage\n", 9 | "\n", 10 | "Language models are best suited for generating natural language. They don't have access to external knowledge, and may not be well suited for computational tasks. However, we can overcome some of these limits by augmenting models with tools.\n", 11 | "\n", 12 | "## Prompting for Tool Use\n", 13 | "\n", 14 | "The first step is to prompt the model in a way that allows it to make use of tools. We'll do this by providing few-shot examples of computations using eval. We can then replace these computations with their results." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "id": "85288258-3f83-46cd-9f96-d7e6a940be33", 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import languagemodels as lm\n", 25 | "import re" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 2, 31 | "id": "2c756fa4-1def-4c81-b86e-de2d2103566a", 32 | "metadata": { 33 | "tags": [] 34 | }, 35 | "outputs": [ 36 | { 37 | "data": { 38 | "text/plain": [ 39 | "'You have eval(28 + 51) cars.'" 40 | ] 41 | }, 42 | "execution_count": 2, 43 | "metadata": {}, 44 | "output_type": "execute_result" 45 | } 46 | ], 47 | "source": [ 48 | "def generate_answer_for_calculator(question):\n", 49 | " return lm.do(f\"\"\"\n", 50 | "Answer using eval as needed.\n", 51 | "\n", 52 | "Question: I had 17 apples and get 8 more. How many apples do I have?\n", 53 | "Answer: You have eval(17 + 8) apples.\n", 54 | "\n", 55 | "Question: How many dogs do I have if I start with 3 and get 2 more?\n", 56 | "Answer: You have eval(3 + 2) dogs.\n", 57 | "\n", 58 | "Question: I had 211 books and lose 154, how many books do I have?\n", 59 | "Answer: You have eval(211 - 154) books.\n", 60 | "\n", 61 | "Question: If I had 253 cats and got 101 more, how many cats do I have?\n", 62 | "Answer: You have eval(253 + 101) cats.\n", 63 | "\n", 64 | "Question: I buy 6 oranges and had 4 to begin with.
How many oranges do I have?\n", 65 | "Answer: You have eval(6 + 4) oranges.\n", 66 | "\n", 67 | "Question: {question}\n", 68 | "\"\"\".strip())\n", 69 | "\n", 70 | "reply = generate_answer_for_calculator(\"If I have 28 cars and buy 51 more, how many cars do I have?\")\n", 71 | "reply" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "id": "94f05290-3f42-4b46-9af6-5447f663a166", 77 | "metadata": {}, 78 | "source": [ 79 | "## Merging Tools and Results\n", 80 | "\n", 81 | "Now that we have a result from the LLM expecting tools to be used, we can use regular expressions to replace tools with their results." 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 3, 87 | "id": "179afbef-eab0-46c8-a6c9-c38f87d7570f", 88 | "metadata": {}, 89 | "outputs": [ 90 | { 91 | "data": { 92 | "text/plain": [ 93 | "'You have 79 cars.'" 94 | ] 95 | }, 96 | "execution_count": 3, 97 | "metadata": {}, 98 | "output_type": "execute_result" 99 | } 100 | ], 101 | "source": [ 102 | "def replace_expressions(reply):\n", 103 | " # Replace \"eval(1+2)\" with 3\n", 104 | " # Also replace \"eval(1+2) = 3\" with 3, as the model sometimes predicts an answer\n", 105 | " expressions = re.findall('(eval\\(([ 0-9\\.+\\-/\\*]+)\\)[ =0-9\\.]*)', reply)\n", 106 | "\n", 107 | " for exp in expressions:\n", 108 | " result = eval(exp[1])\n", 109 | " reply = reply.replace(exp[0].strip(), str(result))\n", 110 | " \n", 111 | " return reply\n", 112 | "\n", 113 | "replace_expressions(reply)" 114 | ] 115 | } 116 | ], 117 | "metadata": { 118 | "kernelspec": { 119 | "display_name": "Python 3 (ipykernel)", 120 | "language": "python", 121 | "name": "python3" 122 | }, 123 | "language_info": { 124 | "codemirror_mode": { 125 | "name": "ipython", 126 | "version": 3 127 | }, 128 | "file_extension": ".py", 129 | "mimetype": "text/x-python", 130 | "name": "python", 131 | "nbconvert_exporter": "python", 132 | "pygments_lexer": "ipython3", 133 | "version": "3.8.10" 134 | } 135 | }, 136 | "nbformat": 4, 137 | "nbformat_minor": 5 138 | } 139 | -------------------------------------------------------------------------------- /languagemodels/models.py: -------------------------------------------------------------------------------- 1 | import re 2 | from huggingface_hub import hf_hub_download, snapshot_download 3 | from tokenizers import Tokenizer 4 | import ctranslate2 5 | 6 | from languagemodels.config import config, models 7 | 8 | 9 | modelcache = {} 10 | 11 | 12 | class ModelException(Exception): 13 | pass 14 | 15 | 16 | def get_model_info(model_type="instruct"): 17 | """Gets info about the current model in use 18 | 19 | >>> get_model_info('instruct') 20 | {'name': 'LaMini-Flan-T5-248M', 'tuning': 'instruct'... 
21 | """ 22 | model_name = config[f"{model_type}_model"] 23 | 24 | m = [m for m in models if m["name"] == model_name][0] 25 | 26 | param_bits = int(re.search(r"\d+", m["quantization"]).group(0)) 27 | 28 | m["size_gb"] = m["params"] * param_bits / 8 / 1e9 29 | if "/" in m["name"]: 30 | m["path"] = m["name"] 31 | else: 32 | m["path"] = f"jncraton/{m['name']}-{m['backend']}-{m['quantization']}" 33 | 34 | return m 35 | 36 | 37 | def initialize_tokenizer(model_type, model_name): 38 | model_info = get_model_info(model_type) 39 | rev = model_info.get("revision", None) 40 | 41 | tok_config = hf_hub_download( 42 | model_info["path"], "tokenizer.json", revision=rev, local_files_only=True 43 | ) 44 | tokenizer = Tokenizer.from_file(tok_config) 45 | 46 | if model_type == "embedding": 47 | tokenizer.no_padding() 48 | tokenizer.no_truncation() 49 | 50 | return tokenizer 51 | 52 | 53 | def initialize_model(model_type, model_name, tokenizer_only=False): 54 | model_info = get_model_info(model_type) 55 | 56 | allowed = ["*.bin", "*.txt", "*.json"] 57 | rev = model_info.get("revision", None) 58 | 59 | # snapshot_download checks for updates by default 60 | # This can cause significant lag in offline usecases or high latency networks 61 | # To avoid this penalty, we try to use the local cache first. 62 | # If the files are not available, then we attempt a download 63 | try: 64 | path = snapshot_download( 65 | model_info["path"], 66 | max_workers=1, 67 | allow_patterns=allowed, 68 | revision=rev, 69 | local_files_only=True, 70 | ) 71 | except FileNotFoundError: 72 | path = snapshot_download( 73 | model_info["path"], max_workers=1, allow_patterns=allowed, revision=rev 74 | ) 75 | 76 | if tokenizer_only: 77 | return None 78 | 79 | if model_info["architecture"] == "encoder-only-transformer": 80 | return ctranslate2.Encoder( 81 | path, 82 | "cpu", 83 | compute_type="int8", 84 | ) 85 | elif model_info["architecture"] == "decoder-only-transformer": 86 | return ctranslate2.Generator(path, config["device"], compute_type="int8") 87 | else: 88 | return ctranslate2.Translator(path, config["device"], compute_type="int8") 89 | 90 | 91 | def get_model(model_type, tokenizer_only=False): 92 | """Gets a model from the loaded model cache 93 | 94 | If tokenizer_only, the model itself will not be (re)loaded 95 | 96 | >>> tokenizer, model = get_model("instruct") 97 | >>> type(tokenizer) 98 | 99 | 100 | >>> type(model) 101 | 102 | 103 | >>> tokenizer, model = get_model("embedding") 104 | >>> type(tokenizer) 105 | 106 | 107 | >>> type(model) 108 | 109 | """ 110 | 111 | model_name = config[f"{model_type}_model"] 112 | 113 | if config["max_ram"] < 4 and not tokenizer_only: 114 | for model in modelcache: 115 | if model != model_name: 116 | try: 117 | modelcache[model][1].unload_model() 118 | except AttributeError: 119 | # Encoder-only models can't be unloaded by ctranslate2 120 | pass 121 | 122 | if model_name not in modelcache: 123 | model = initialize_model(model_type, model_name, tokenizer_only) 124 | tokenizer = initialize_tokenizer(model_type, model_name) 125 | modelcache[model_name] = (tokenizer, model) 126 | elif not tokenizer_only: 127 | # Make sure model is loaded if we've never loaded it 128 | if not modelcache[model_name][1]: 129 | modelcache[model_name] = ( 130 | modelcache[model_name][0], 131 | initialize_model(model_type, model_name), 132 | ) 133 | # Make sure the model is reloaded if we've unloaded it 134 | try: 135 | modelcache[model_name][1].load_model() 136 | except AttributeError: 137 | # Encoder-only models can't be 
unloaded in ctranslate2 138 | pass 139 | 140 | return modelcache[model_name] 141 | -------------------------------------------------------------------------------- /paper.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'languagemodels: A Python Package for Exploring Modern Natural Language Processing' 3 | tags: 4 | - Python 5 | - machine learning 6 | - language modeling 7 | - nlp 8 | authors: 9 | - name: Jonathan L. Craton 10 | orcid: 0009-0007-6543-8571 11 | affiliation: 1 12 | affiliations: 13 | - name: Department of Computer Science, Anderson University (IN) 14 | index: 1 15 | date: 15 June 2023 16 | bibliography: paper.bib 17 | --- 18 | 19 | # Summary 20 | 21 | `languagemodels` is a Python package for educators and learners exploring the applications of large language models. It aims to be as easy to set up and use as possible, while providing many of the key building blocks used in modern LLM-driven applications. It is designed to be used in learning modules in introductory programming courses. 22 | 23 | # Statement of Need 24 | 25 | Large language models are having an impact on the way software is designed [@mialon2023augmented]. The development of the transformer [@vaswani2017attention] has led to rapid progress in many NLP and generative tasks [@zhao2023survey; @bert; @gpt2; @gpt3; @t5; @palm; @flan-t5; @bubeck2023sparks]. These models are becoming more powerful as they scale in both parameters [@kaplan2020scaling] and training data [@hoffmann2022training]. 26 | 27 | Early research suggests that there are many tasks performed by humans that can be transformed by LLMs [@eloundou2023gpts]. For example, large language models trained on code [@codex] are already being used as capable pair programmers via tools such as Microsoft's Copilot. To build with these technologies, students need to understand their capabilities and begin to learn new paradigms for programming. 28 | 29 | There are many software tools already available for working with large language models [@hftransformers; @pytorch; @tensorflow; @langchain; @llamacpp; @gpt4all]. While these options serve the needs of software engineers, researchers, and hobbyists, they may not be simple enough for new learners. This package aims to lower the barriers to entry for using these tools in an educational context. 30 | 31 | \newpage 32 | 33 | # Example Usage 34 | 35 | This package eliminates boilerplate and configuration options that create noise for new learners while using only basic types and simple functions. Here's an example from a Python REPL session: 36 | 37 | ```python 38 | >>> import languagemodels as lm 39 | 40 | >>> lm.do("Answer the question: What is the capital of France?") 41 | 'Paris.' 42 | 43 | >>> lm.do("Classify as positive or negative: I like games", 44 | ... choices=["positive", "negative"]) 45 | 'positive' 46 | 47 | >>> lm.extract_answer("What color is the ball?", 48 | ... "There is a green ball and a red box") 49 | 'green' 50 | 51 | >>> lm.get_wiki('Chemistry') 52 | 'Chemistry is the scientific study...' 53 | 54 | >>> lm.store_doc(lm.get_wiki("Python"), "Python") 55 | >>> lm.store_doc(lm.get_wiki("Javascript"), "Javascript") 56 | >>> lm.get_doc_context("What language is used on the web?") 57 | 'From Javascript document: Javascript engines were...' 58 | ``` 59 | 60 | # Features 61 | 62 | Despite its simplicity, this package provides a number of building blocks that can be combined to build applications that mimic the architectures of modern software products. 
Some of the tools included are: 63 | 64 | - Instruction following with the `do` function 65 | - Zero-shot classification with the `do` function and `choices` parameter 66 | - Semantic search using the `store_doc` and `get_doc_context` functions 67 | - Extractive question answering using the `extract_answer` function 68 | - Basic web retrieval using the `get_wiki` function 69 | 70 | The package includes the following features under the hood: 71 | 72 | - Local LLM inference on CPU for broad device support 73 | - Transparent model caching to allow fast repeated inference without explicit model initialization 74 | - Pre-selected models to allow the software to run easily and effectively on as many devices as possible 75 | 76 | \newpage 77 | 78 | # Implementation 79 | 80 | The design of this software package allows its interface to be loosely coupled to the models and inference engines it uses. Progress is being made to speed up inference on consumer hardware, and this package seeks to find a balance between inference efficiency, software stability, and broad hardware support. 81 | 82 | This package currently uses CTranslate2 [@ctranslate2] for efficient inference on CPU and GPU. The main models used include Flan-T5 [@flan-t5], LaMini-LM [@lamini-lm], and OpenChat [@openchat]. The default models used by this package can be swapped out in future versions to provide improved generation quality. 83 | 84 | # Future work 85 | 86 | This package provides a platform for creating simple NLP labs for use in introductory computer science courses. Additional work is needed to design specific learning modules to meet the needs of learners. 87 | 88 | Ongoing development efforts will focus on improving the accuracy and efficiency of inference, while keeping the interface stable and supporting all reasonable platforms. 
89 | 90 | # References 91 | -------------------------------------------------------------------------------- /changelog.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## 0.24 - 2024-02-14 4 | 5 | ### Changed 6 | 7 | - Remove special capitalization rules 8 | 9 | ### Added 10 | 11 | - Add lm.get_web helper function to load text from web pages 12 | - Support Qwen2 2.5 0.5B and 1.5B 13 | - Add support for Granite embedding models 14 | 15 | ## 0.23 - 2024-12-17 16 | 17 | ### Changed 18 | 19 | - Drop support for Python 3.8 20 | 21 | ### Fixed 22 | 23 | - Properly apply prompt format when providing `choices` 24 | - Do not add special tokens before `choices` 25 | 26 | ### Added 27 | 28 | - Support multilingual-e5-small embedding model 29 | - Support Falcon 3 Instruct 1B and 3B 30 | 31 | ## 0.22 - 2024-11-02 32 | 33 | ### Changed 34 | 35 | - Pin Llama 3.2 model versions 36 | - Decrease repetition penalty for Llama 3.2 models 37 | 38 | ### Added 39 | 40 | - Support SmolLM2 41 | - Add `embed` function 42 | - Support Llama 3.1 8B instruct 43 | - Use models directly from Huggingface with config.use_hf_model() 44 | - Add "echo" config option to allow streaming tokens to stdout as they are generated 45 | 46 | ## 0.21 - 2024-09-25 47 | 48 | ### Changed 49 | 50 | - Skip checking for model updates 51 | - Download entire model upfront even if we only need the tokenizer initially 52 | - Use most recent version of CTranslate2 53 | - Add per-model repetition penalties 54 | 55 | ### Added 56 | 57 | - Support Llama 3.2 1B and 3B 58 | - Support Danube3 59 | - Support SmolLM 60 | 61 | ## 0.20 - 2024-04-25 62 | 63 | ### Changed 64 | 65 | - Add new separators to document chunking heuristic 66 | 67 | ### Fixed 68 | 69 | - Allow missing query prefixes for embedding models 70 | 71 | ### Added 72 | 73 | - Support Phi-3-mini-4k-instruct 74 | - Support GIST-small-Embedding-v0 embedding model 75 | - Store model runtime stats to improve benchmarking and analysis 76 | 77 | ## 0.19 - 2024-04-18 78 | 79 | ### Added 80 | 81 | - Support Meta-Llama-3-8B-Instruct 82 | - Support gemma-2b-it 83 | - Support h2o-danube2-1.8b-chat 84 | - Support WizardLM-2-7B 85 | 86 | ## 0.18.0 - 2024-02-23 87 | 88 | ### Fixed 89 | 90 | - Correct issue causing `choices` to be scored improperly 91 | 92 | ## 0.17.0 - 2024-02-15 93 | 94 | ### Added 95 | 96 | - CUDA 12 support 97 | 98 | ## 0.16.0 - 2024-02-04 99 | 100 | ### Fixed 101 | 102 | - Run embedding models on CPU to work around memory copy issue 103 | 104 | ## 0.15.0 - 2024-02-04 105 | 106 | ### Changed 107 | 108 | - Improve embedding search performance 109 | 110 | ### Added 111 | 112 | - Add openchat-3.5-0106 model 113 | - Add h2o-danube-1.8b-chat model 114 | 115 | ## 0.14.0 - 2024-01-06 116 | 117 | ### Changed 118 | 119 | - Simplified dialogstudio system message 120 | 121 | ### Fixed 122 | 123 | - Correct missing instruction in openchat prompt 124 | 125 | ## 0.13.0 - 2024-01-05 126 | 127 | ### Changed 128 | 129 | - Improved search speed when searching many documents 130 | - Reduce memory usage for large document embeddings 131 | - Updated to TinyLlama Chat v1.0 132 | - Remove auto model scaling on Colab 133 | - Correct phi-1.5 prompt format 134 | - Correct model license metadata 135 | 136 | ### Added 137 | 138 | - Add Mistral-7B-Instruct-v0.2 model 139 | - Add openchat-3.5-1210 model 140 | - Add phi-2 model 141 | - Support static batching by passing lists to `do` 142 | - Support choices list on `do` to restrict possible outputs 143 | 
144 | ## 0.12.0 - 2023-12-02 145 | 146 | ### Changed 147 | 148 | - Remove explicit setuptools dependency (see [CTranslate2#1526](https://github.com/OpenNMT/CTranslate2/pull/1526)) 149 | 150 | ### Fixed 151 | 152 | - Reduce model size when not using a CPU in Colab 153 | 154 | ## 0.11.0 - 2023-12-02 155 | 156 | ### Changed 157 | 158 | - Default to 8GB model size on Colab 159 | - Allow 2048 token response by default on Colab 160 | - Use Colab GPU by default if available 161 | - Skip returning prompt for decoder-only models 162 | - Ensure whitespace is removed from decoder-only outputs 163 | 164 | ### Added 165 | 166 | - Add neural-chat-7b-v3-1 as default 8GB model 167 | - Add max_tokens config option 168 | 169 | ## 0.10.0 - 2023-10-29 170 | 171 | ### Added 172 | 173 | - Add gte-tiny embedding model 174 | - Properly support Python 3.12 175 | 176 | ### Fixed 177 | 178 | - Removed extra classification prompt when performing classification with generative models 179 | - Prevent doubling of special tokens during classification 180 | 181 | ## 0.9.0 - 2023-10-07 182 | 183 | ### Changed 184 | 185 | - Use per-model instruction formats 186 | - Batch chunk embeddings for faster performance embedding larger documents 187 | 188 | ### Added 189 | 190 | - Automatically use query prefixes as needed for embeddings 191 | - Add phi-1.5 model 192 | - Add dialogstudio base model 193 | - Add support for gte-small embeddings 194 | - Add support for bge-small-en embeddings 195 | 196 | ### Fixed 197 | 198 | - Allow token suppression on decoder-only models 199 | - Remove HTML comments appearing in some wiki pages 200 | 201 | ## 0.8.0 - 2023-08-04 202 | 203 | ### Changed 204 | 205 | - Model names no longer include backend and quantization info 206 | - Default to CPU inference unless GPU enabled using `lm.config["device"]="auto"` 207 | 208 | ### Added 209 | 210 | - Add quantization info to config and use it for memory usage calculation 211 | 212 | ### Fixed 213 | 214 | - Increase repetition penalty to 1.3 from 1.2 to help avoid repetition in smaller models 215 | 216 | ## 0.7.0 - 2023-07-27 217 | 218 | ### Changed 219 | 220 | - Improve semantic meaning of chunk heading 221 | - Remove sentencepiece dependency 222 | 223 | ### Added 224 | 225 | - Support GPT-based models 226 | - Add `code` generation function 227 | - Create new configuration system 228 | - Use CUDA if available 229 | 230 | ### Fixed 231 | 232 | - Use non-greedy sampling on `complete` function 233 | - Decrease chance of splitting chunks on decimal points 234 | - Correct assistant example 235 | 236 | ## 0.6.0 237 | 238 | ### Changed 239 | 240 | - Attempt to chunk context on semantic boundaries 241 | 242 | ### Added 243 | 244 | - Allow filtering by model license 245 | 246 | ### Fixed 247 | 248 | - Update classification to only allow valid classes to be returned 249 | 250 | ## 0.5.0 251 | 252 | ### Changed 253 | 254 | - Disable beam search for faster inference 255 | 256 | ## 0.4.0 257 | 258 | ### Changed 259 | 260 | - Normalize output 261 | - Rename some functions 262 | 263 | ### Added 264 | 265 | - Support xl models 266 | 267 | ## 0.2.0 268 | 269 | ### Changed 270 | 271 | - Less verbose chat syntax 272 | 273 | ### 0.1.0 274 | 275 | ### Changed 276 | 277 | - Use ctranslate2 for greater efficiency 278 | 279 | ### 0.0.0 280 | 281 | - Original version using HuggingFace Transformers -------------------------------------------------------------------------------- /examples/extractive_qa_embeddings.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "3a9a252f-5e79-4f33-b78c-c3a34e0f1a59", 6 | "metadata": { 7 | "tags": [] 8 | }, 9 | "source": [ 10 | "# Extractive Question Answering\n", 11 | "\n", 12 | "Language models are good at generating text, but generations are not always accurate. One way to increase accuracy is to provide context for answering a question within the prompt.\n", 13 | "\n", 14 | "First, we'll load some context." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "id": "8d925dca-a4f0-4770-a217-fbd2da492434", 21 | "metadata": { 22 | "tags": [] 23 | }, 24 | "outputs": [ 25 | { 26 | "name": "stdout", 27 | "output_type": "stream", 28 | "text": [ 29 | "Python is a high-level, general-purpose programming language. Its design philosophy emphasizes code readability with the use of significant indentation via the off-side rule.Python is dynamically typed and garbage-collected. It supports multiple programming paradigms, including structured (particularly procedural), object-oriented and functional programming. It is often described as a \"batteries included\" language due to its comprehensive standard library.Guido van Rossum began working on Python in the late 1980s as a successor to the ABC programming language and first released it in 1991 as Python 0.9.0. Python 2.0 was released in 2000. Python 3.0, released in 2008, was a major revision not completely backward-compatible with earlier versions. Python 2.7.18, released in 2020, was the last release of Python 2.Python consistently ranks as one of the most popular programming languages.\n" 30 | ] 31 | } 32 | ], 33 | "source": [ 34 | "import languagemodels as lm\n", 35 | "\n", 36 | "python_info = lm.get_wiki(\"Python\")\n", 37 | "print(python_info)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "id": "4a5830e4-5efc-4d3f-9452-c2f44d5c2e51", 43 | "metadata": {}, 44 | "source": [ 45 | "We can now prompt the model to answer the question using the context." 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 2, 51 | "id": "22ae1c03-7168-42db-a0f8-4057c79fd91d", 52 | "metadata": { 53 | "tags": [] 54 | }, 55 | "outputs": [ 56 | { 57 | "name": "stdout", 58 | "output_type": "stream", 59 | "text": [ 60 | "Guido van Rossum created Python.\n" 61 | ] 62 | } 63 | ], 64 | "source": [ 65 | "print(lm.do(f\"Answer from the context: Who created Python? {python_info}\"))" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "id": "e1bed464-cbab-41e2-90d6-2beb02f97c31", 71 | "metadata": { 72 | "tags": [] 73 | }, 74 | "source": [ 75 | "# Embeddings\n", 76 | "\n", 77 | "Language models are capable of answering questions based on a context, but we now need a way to provide them with appropriate context.\n", 78 | "\n", 79 | "One solution to this is to have a large amount of available context and retrieve only the meaningful bits when answering a question. Embeddings are a tool to achieve this.\n", 80 | "\n", 81 | "Embeddings provide a way to map a numeric vector to the meaning of some input. In the case of language models, embeddings are derived from documents.\n", 82 | "\n", 83 | "## Semantic Search\n", 84 | "\n", 85 | "Once we have mapped vectors to our documents, we can search for similar documents by meaning. 
If we've constructed our embedding model appropriately, documents that answer questions will be near the questions themselves in vector space.\n", 86 | "\n", 87 | "The math to achieve that is out of scope of this example, but the languagemodels package provides a few simple helper functions to facilated a document store capable of semantic search." 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 3, 93 | "id": "b5816039-7071-4faf-be40-2451c265945a", 94 | "metadata": { 95 | "tags": [] 96 | }, 97 | "outputs": [], 98 | "source": [ 99 | "# Load some programming language documents\n", 100 | "for topic in ['Python', 'Javascript', 'C++', 'SQL', 'HTML']:\n", 101 | " doc = lm.get_wiki(topic)\n", 102 | " lm.store_doc(doc)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 4, 108 | "id": "3412a004-2505-44ae-92b2-13e71ff6bcb9", 109 | "metadata": { 110 | "tags": [] 111 | }, 112 | "outputs": [ 113 | { 114 | "data": { 115 | "text/plain": [ 116 | "'It is often described as a \"batteries included\" language due to its comprehensive standard library.Guido van Rossum began working on Python in the late 1980s as a successor to the ABC programming language and first released it in 1991 as Python 0.9.\\n\\nPython is a high-level, general-purpose programming language. Its design philosophy emphasizes code readability with the use of significant indentation via the off-side rule.\\n\\n18, released in 2020, was the last release of Python 2.Python consistently ranks as one of the most popular programming languages.'" 117 | ] 118 | }, 119 | "execution_count": 4, 120 | "metadata": {}, 121 | "output_type": "execute_result" 122 | } 123 | ], 124 | "source": [ 125 | "# Perform semantic search\n", 126 | "lm.get_doc_context(\"Who created Python?\")" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 5, 132 | "id": "8442908a-2527-4279-be8e-55b6a64c2522", 133 | "metadata": { 134 | "tags": [] 135 | }, 136 | "outputs": [ 137 | { 138 | "data": { 139 | "text/plain": [ 140 | "'JavaScript is often associated with HTML and CSS.'" 141 | ] 142 | }, 143 | "execution_count": 5, 144 | "metadata": {}, 145 | "output_type": "execute_result" 146 | } 147 | ], 148 | "source": [ 149 | "# Put everything together to answer a general question about one of the languages\n", 150 | "question = \"What technologies are often associated with JS?\"\n", 151 | "\n", 152 | "context = lm.get_doc_context(question)\n", 153 | "\n", 154 | "lm.do(f\"Answer from the context: {question} {context}\")" 155 | ] 156 | } 157 | ], 158 | "metadata": { 159 | "kernelspec": { 160 | "display_name": "Python 3 (ipykernel)", 161 | "language": "python", 162 | "name": "python3" 163 | }, 164 | "language_info": { 165 | "codemirror_mode": { 166 | "name": "ipython", 167 | "version": 3 168 | }, 169 | "file_extension": ".py", 170 | "mimetype": "text/x-python", 171 | "name": "python", 172 | "nbconvert_exporter": "python", 173 | "pygments_lexer": "ipython3", 174 | "version": "3.8.10" 175 | } 176 | }, 177 | "nbformat": 4, 178 | "nbformat_minor": 5 179 | } 180 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | Language Models 2 | =============== 3 | 4 | [![PyPI version](https://badge.fury.io/py/languagemodels.svg)](https://badge.fury.io/py/languagemodels) 5 | [![docs](https://img.shields.io/badge/docs-online-brightgreen)](https://languagemodels.netlify.app/) 6 | [![x64 
Build](https://github.com/jncraton/languagemodels/actions/workflows/build.yml/badge.svg)](https://github.com/jncraton/languagemodels/actions/workflows/build.yml) 7 | [![ARM64 Build](https://github.com/jncraton/languagemodels/actions/workflows/pi.yml/badge.svg)](https://github.com/jncraton/languagemodels/actions/workflows/pi.yml) 8 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/jncraton/languagemodels/blob/master/examples/translate.ipynb) 9 | 10 | Python building blocks to explore large language models in as little as 512MB of RAM 11 | 12 | ![Translation hello world example](media/hello.gif) 13 | 14 | This package makes using large language models from Python as simple as possible. All inference is performed locally to keep your data private by default. 15 | 16 | Installation and Getting Started 17 | -------------------------------- 18 | 19 | This package can be installed using the following command: 20 | 21 | ```sh 22 | pip install languagemodels 23 | ``` 24 | 25 | Once installed, you should be able to interact with the package in Python as follows: 26 | 27 | ```python 28 | >>> import languagemodels as lm 29 | >>> lm.do("What color is the sky?") 30 | 'The color of the sky is blue.' 31 | ``` 32 | 33 | This will require downloading a significant amount of data (~250MB) on the first run. Models will be cached for later use and subsequent calls should be quick. 34 | 35 | Example Usage 36 | ------------- 37 | 38 | Here are some usage examples as Python REPL sessions. This should work in the REPL, notebooks, or in traditional scripts and applications. 39 | 40 | ### Instruction Following 41 | 42 | ```python 43 | >>> import languagemodels as lm 44 | 45 | >>> lm.do("Translate to English: Hola, mundo!") 46 | 'Hello, world!' 47 | 48 | >>> lm.do("What is the capital of France?") 49 | 'Paris.' 50 | ``` 51 | 52 | Outputs can be restricted to a list of choices if desired: 53 | 54 | ```python 55 | >>> lm.do("Is Mars larger than Saturn?", choices=["Yes", "No"]) 56 | 'No' 57 | ``` 58 | 59 | ### Adjusting Model Performance 60 | 61 | The base model should run quickly on any system with 512MB of memory, but this memory limit can be increased to select more powerful models that will consume more resources. Here's an example: 62 | 63 | ```python 64 | >>> import languagemodels as lm 65 | >>> lm.do("If I have 7 apples then eat 5, how many apples do I have?") 66 | 'You have 8 apples.' 67 | >>> lm.config["max_ram"] = "4gb" 68 | 4.0 69 | >>> lm.do("If I have 7 apples then eat 5, how many apples do I have?") 70 | 'I have 2 apples left.' 71 | ``` 72 | 73 | ### GPU Acceleration 74 | 75 | If you have an NVIDIA GPU with CUDA available, you can opt in to using the GPU for inference: 76 | 77 | ```python 78 | >>> import languagemodels as lm 79 | >>> lm.config["device"] = "auto" 80 | ``` 81 | 82 | ### Text Completions 83 | 84 | ```python 85 | >>> import languagemodels as lm 86 | 87 | >>> lm.complete("She hid in her room until") 88 | 'she was sure she was safe' 89 | ``` 90 | 91 | ### External Retrieval 92 | 93 | Helper functions are provided to retrieve text from external sources that can be used to augment prompt context. 94 | 95 | ```python 96 | >>> import languagemodels as lm 97 | 98 | >>> lm.get_wiki('Chemistry') 99 | 'Chemistry is the scientific study... 100 | 101 | >>> lm.get_weather(41.8, -87.6) 102 | 'Partly cloudy with a chance of rain... 
103 | 104 | >>> lm.get_date() 105 | 'Friday, May 12, 2023 at 09:27AM' 106 | ``` 107 | 108 | Here's an example showing how this can be used to augment a prompt: 109 | 110 | ```python 111 | >>> lm.do(f"It is {lm.get_date()}. What time is it?") 112 | 'The time is 12:53PM.' 113 | ``` 114 | 115 | ### Semantic Search 116 | 117 | Semantic search is provided to retrieve documents that may provide helpful context from a document store. 118 | 119 | ```python 120 | >>> import languagemodels as lm 121 | >>> lm.store_doc(lm.get_wiki("Python"), "Python") 122 | >>> lm.store_doc(lm.get_wiki("C language"), "C") 123 | >>> lm.store_doc(lm.get_wiki("Javascript"), "Javascript") 124 | >>> lm.get_doc_context("What does it mean for batteries to be included in a language?") 125 | 'From Python document: It is often described as a "batteries included" language due to its comprehensive standard library.Guido van Rossum began working on Python in the late 1980s as a successor to the ABC programming language and first released it in 1991 as Python 0.9. 126 | 127 | From C document: It was designed to be compiled to provide low-level access to memory and language constructs that map efficiently to machine instructions, all with minimal runtime support.' 128 | ``` 129 | 130 | [Full documentation](https://languagemodels.netlify.app/) 131 | 132 | ### Speed 133 | 134 | This package currently outperforms Hugging Face `transformers` for CPU inference thanks to int8 quantization and the [CTranslate2](https://github.com/OpenNMT/CTranslate2) backend. The following table compares CPU inference performance on identical models using the best available quantization on a 20 question test set. 135 | 136 | | Backend | Inference Time | Memory Used | 137 | |---------------------------|----------------|-------------| 138 | | Hugging Face transformers | 22s | 1.77GB | 139 | | This package | 11s | 0.34GB | 140 | 141 | Note that quantization slightly reduces output quality, but the effect should be negligible at this level. 142 | 143 | ### Models 144 | 145 | Sensible default models are provided. The package should improve over time as stronger models become available. The basic models used are 1000x smaller than the largest models in use today. They are useful as learning tools, but perform far below the current state of the art. 146 | 147 | Here are the current default models used by the package for a supplied `max_ram` value: 148 | 149 | | max_ram | Model Name | Parameters (B) 150 | | ------- | --------------------- | -------------- 151 | | 0.5 | LaMini-Flan-T5-248M | 0.248 152 | | 1.0 | LaMini-Flan-T5-783M | 0.783 153 | | 2.0 | LaMini-Flan-T5-783M | 0.783 154 | | 4.0 | flan-alpaca-gpt4-xl | 3.0 155 | | 8.0 | openchat-3.5-0106 | 7.0 156 | 157 | For code completions, the [CodeT5+](https://arxiv.org/abs/2305.07922) series of models is used. 158 | 159 | Commercial Use 160 | -------------- 161 | 162 | This package itself is licensed for commercial use, but the models used may not be compatible with commercial use. In order to use this package commercially, you can filter models by license type using the `require_model_license` function. 163 | 164 | ```python 165 | >>> import languagemodels as lm 166 | >>> lm.config['instruct_model'] 167 | 'LaMini-Flan-T5-248M-ct2-int8' 168 | >>> lm.require_model_license("apache|bsd|mit") 169 | >>> lm.config['instruct_model'] 170 | 'flan-t5-base-ct2-int8' 171 | ``` 172 | 173 | It is recommended to confirm that the models used meet the licensing requirements for your software. 
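One simple way to do this is to check which model is currently selected after applying a license filter. The short REPL sketch below only uses the configuration keys shown above; the exact model names will vary with the package version and the configured `max_ram`:

```python
>>> import languagemodels as lm
>>> lm.require_model_license("apache|bsd|mit")
>>> lm.config['instruct_model']  # confirm the active model before shipping
'flan-t5-base-ct2-int8'
```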
174 | 175 | Project Ideas 176 | -------------- 177 | 178 | One of the goals for this package is to be a straightforward tool for learners and educators exploring how large language models intersect with modern software development. It can be used to do the heavy lifting for a number of learning projects: 179 | 180 | - CLI Chatbot (see [examples/chat.py](examples/chat.py)) 181 | - Streamlit chatbot (see [examples/streamlitchat.py](examples/streamlitchat.py)) 182 | - Chatbot with information retrieval 183 | - Chatbot with access to real-time information 184 | - Tool use 185 | - Text classification 186 | - Extractive question answering 187 | - Semantic search over documents 188 | - Document question answering 189 | 190 | Several example programs and notebooks are included in the `examples` directory. 191 | -------------------------------------------------------------------------------- /test/npr.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Netflix raises monthly fees after a record jump in new subscribers 5 | 6 | 7 | 8 | 111 | 112 | 113 | 114 | 115 |
116 |

Text-Only Version Go To Full Site

117 |
118 | 119 | 120 |
121 |
122 |
123 |

124 | NPR > 125 | 126 | The Brief 127 | 128 |

129 |
130 |

Netflix raises monthly fees after a record jump in new subscribers

131 |

By Manuela López Restrepo

132 | 133 |

Wednesday, January 22, 2025 • 5:08 PM EST

134 | 135 | 136 |
137 |
138 |
139 |

First, the good news. Netflix reported a record increase in the number of new subscribers for the final quarter of 2024, attributing the success to high-profile live sports events and new programs. Now, the bad news: The company also announced it's raising its subscription prices across the board.

Three things to know:

  1. The company reported nearly 19 million new subscribers during the last fiscal quarter of 2024, their largest subscription jump ever during a three-month period. That puts them at a total of 302 million global subscribers, the most of any streaming platform.
  2. Netflix also announced a bump in prices for all subscription tiers in the U.S. The standard account with ads now costs $7.99 per month, ad-free subscriptions are $17.99, while the premium plan is $24.99. This reflects price hikes of $1, $2.50 and $2 per month, respectively.
  3. This comes months after Netflix's last price hike, when the company eliminated its least expensive, ad-free option.


Want more? The Pop Culture Happy Hour podcast suggests and dissects the buzziest new movies, TV, music, books, videogames and more, five days a week.


What's Netflix doing right?

Well, it seems that the company's forays into live sports have resonated with viewers. In their letter to investors, the company focused on the success of their highly promoted live-streamed boxing match between Mike Tyson and Jake Paul on Nov. 15. The match drew 60 million households and, according to Netflix, makes it the most-streamed sporting event in history (despite significant technical glitches, which left subscribers fuming and critics wondering if Netflix could pull off such large-scale events.)

Its next big sporting events went off with far fewer problems: two NFL matches that streamed on the platform weeks later, and included a widely talked about Beyoncé halftime show. A very expensive reboot of WWE wrestling is also in the mix for 2025.

Another factor for Netflix's success? Popular shows that outperformed the company's expectations, like the second season of Squid Game. The industry's largest streaming platform is hoping audiences are just as eager for other shows returning in 2025 with new seasons — such as Wednesday and Stranger Things.

Go deeper with NPR on all things entertainment

140 |
141 |
142 |
143 |
144 | 145 | 146 |
147 | 155 | 156 | 157 | 167 | 168 | 169 | 170 | -------------------------------------------------------------------------------- /paper.bib: -------------------------------------------------------------------------------- 1 | @misc{hftransformers, 2 | title={HuggingFace's Transformers: State-of-the-art Natural Language Processing}, 3 | author={Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and Rémi Louf and Morgan Funtowicz and Joe Davison and Sam Shleifer and Patrick von Platen and Clara Ma and Yacine Jernite and Julien Plu and Canwen Xu and Teven Le Scao and Sylvain Gugger and Mariama Drame and Quentin Lhoest and Alexander M. Rush}, 4 | year={2020}, 5 | eprint={1910.03771}, 6 | archivePrefix={arXiv}, 7 | primaryClass={cs.CL} 8 | } 9 | 10 | @incollection{pytorch, 11 | title = {PyTorch: An Imperative Style, High-Performance Deep Learning Library}, 12 | author = {Paszke, Adam and Gross, Sam and Massa, Francisco and Lerer, Adam and Bradbury, James and Chanan, Gregory and Killeen, Trevor and Lin, Zeming and Gimelshein, Natalia and Antiga, Luca and Desmaison, Alban and Kopf, Andreas and Yang, Edward and DeVito, Zachary and Raison, Martin and Tejani, Alykhan and Chilamkurthy, Sasank and Steiner, Benoit and Fang, Lu and Bai, Junjie and Chintala, Soumith}, 13 | booktitle = {Advances in Neural Information Processing Systems 32}, 14 | pages = {8024--8035}, 15 | year = {2019}, 16 | publisher = {Curran Associates, Inc.}, 17 | url = {http://papers.neurips.cc/paper/9015-pytorch-an-imperative-style-high-performance-deep-learning-library.pdf} 18 | } 19 | 20 | @article{llama, 21 | title={Llama: Open and efficient foundation language models}, 22 | author={Touvron, Hugo and Lavril, Thibaut and Izacard, Gautier and Martinet, Xavier and Lachaux, Marie-Anne and Lacroix, Timoth{\'e}e and Rozi{\`e}re, Baptiste and Goyal, Naman and Hambro, Eric and Azhar, Faisal and others}, 23 | journal={arXiv preprint arXiv:2302.13971}, 24 | year={2023} 25 | } 26 | 27 | @article{t5, 28 | title={Exploring the limits of transfer learning with a unified text-to-text transformer}, 29 | author={Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee, Katherine and Narang, Sharan and Matena, Michael and Zhou, Yanqi and Li, Wei and Liu, Peter J}, 30 | journal={The Journal of Machine Learning Research}, 31 | volume={21}, 32 | number={1}, 33 | pages={5485--5551}, 34 | year={2020}, 35 | publisher={JMLRORG} 36 | } 37 | 38 | @article{flan-t5, 39 | title={Scaling instruction-finetuned language models}, 40 | author={Chung, Hyung Won and Hou, Le and Longpre, Shayne and Zoph, Barret and Tay, Yi and Fedus, William and Li, Eric and Wang, Xuezhi and Dehghani, Mostafa and Brahma, Siddhartha and others}, 41 | journal={arXiv preprint arXiv:2210.11416}, 42 | year={2022} 43 | } 44 | 45 | @article{vaswani2017attention, 46 | title={Attention is all you need}, 47 | author={Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia}, 48 | journal={Advances in neural information processing systems}, 49 | volume={30}, 50 | year={2017} 51 | } 52 | 53 | @article{bert, 54 | title={Bert: Pre-training of deep bidirectional transformers for language understanding}, 55 | author={Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina}, 56 | journal={arXiv preprint arXiv:1810.04805}, 57 | year={2018} 58 | } 59 | 60 | @article{gpt2, 61 | title={Language models are 
unsupervised multitask learners}, 62 | author={Radford, Alec and Wu, Jeffrey and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya and others}, 63 | journal={OpenAI blog}, 64 | volume={1}, 65 | number={8}, 66 | pages={9}, 67 | year={2019} 68 | } 69 | 70 | @article{gpt3, 71 | title={Language models are few-shot learners}, 72 | author={Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and others}, 73 | journal={Advances in neural information processing systems}, 74 | volume={33}, 75 | pages={1877--1901}, 76 | year={2020} 77 | } 78 | 79 | @article{palm, 80 | title={Palm: Scaling language modeling with pathways}, 81 | author={Chowdhery, Aakanksha and Narang, Sharan and Devlin, Jacob and Bosma, Maarten and Mishra, Gaurav and Roberts, Adam and Barham, Paul and Chung, Hyung Won and Sutton, Charles and Gehrmann, Sebastian and others}, 82 | journal={arXiv preprint arXiv:2204.02311}, 83 | year={2022} 84 | } 85 | 86 | @article{codex, 87 | title={Evaluating large language models trained on code}, 88 | author={Chen, Mark and Tworek, Jerry and Jun, Heewoo and Yuan, Qiming and Pinto, Henrique Ponde de Oliveira and Kaplan, Jared and Edwards, Harri and Burda, Yuri and Joseph, Nicholas and Brockman, Greg and others}, 89 | journal={arXiv preprint arXiv:2107.03374}, 90 | year={2021} 91 | } 92 | 93 | @article{eloundou2023gpts, 94 | title={Gpts are gpts: An early look at the labor market impact potential of large language models}, 95 | author={Eloundou, Tyna and Manning, Sam and Mishkin, Pamela and Rock, Daniel}, 96 | journal={arXiv preprint arXiv:2303.10130}, 97 | year={2023} 98 | } 99 | 100 | @article{bubeck2023sparks, 101 | title={Sparks of artificial general intelligence: Early experiments with gpt-4}, 102 | author={Bubeck, S{\'e}bastien and Chandrasekaran, Varun and Eldan, Ronen and Gehrke, Johannes and Horvitz, Eric and Kamar, Ece and Lee, Peter and Lee, Yin Tat and Li, Yuanzhi and Lundberg, Scott and others}, 103 | journal={arXiv preprint arXiv:2303.12712}, 104 | year={2023} 105 | } 106 | 107 | @misc{tensorflow, 108 | title={ {TensorFlow}: Large-Scale Machine Learning on Heterogeneous Systems}, 109 | url={https://www.tensorflow.org/}, 110 | note={Software available from tensorflow.org}, 111 | author={ 112 | Mart\'{i}n~Abadi and 113 | Ashish~Agarwal and 114 | Paul~Barham and 115 | Eugene~Brevdo and 116 | Zhifeng~Chen and 117 | Craig~Citro and 118 | Greg~S.~Corrado and 119 | Andy~Davis and 120 | Jeffrey~Dean and 121 | Matthieu~Devin and 122 | Sanjay~Ghemawat and 123 | Ian~Goodfellow and 124 | Andrew~Harp and 125 | Geoffrey~Irving and 126 | Michael~Isard and 127 | Yangqing Jia and 128 | Rafal~Jozefowicz and 129 | Lukasz~Kaiser and 130 | Manjunath~Kudlur and 131 | Josh~Levenberg and 132 | Dandelion~Man\'{e} and 133 | Rajat~Monga and 134 | Sherry~Moore and 135 | Derek~Murray and 136 | Chris~Olah and 137 | Mike~Schuster and 138 | Jonathon~Shlens and 139 | Benoit~Steiner and 140 | Ilya~Sutskever and 141 | Kunal~Talwar and 142 | Paul~Tucker and 143 | Vincent~Vanhoucke and 144 | Vijay~Vasudevan and 145 | Fernanda~Vi\'{e}gas and 146 | Oriol~Vinyals and 147 | Pete~Warden and 148 | Martin~Wattenberg and 149 | Martin~Wicke and 150 | Yuan~Yu and 151 | Xiaoqiang~Zheng}, 152 | year={2015}, 153 | } 154 | 155 | @misc{llamacpp, title={llama.cpp: Port of facebook’s Llama model in C/C++}, url={https://github.com/ggerganov/llama.cpp}, journal={GitHub}, author={Gerganov, 
Georgi}, year={2023},} 156 | 157 | @misc{gpt4all, 158 | author = {Yuvanesh Anand and Zach Nussbaum and Brandon Duderstadt and Benjamin Schmidt and Andriy Mulyar}, 159 | title = {GPT4All: Training an Assistant-style Chatbot with Large Scale Data Distillation from GPT-3.5-Turbo}, 160 | year = {2023}, 161 | publisher = {GitHub}, 162 | journal = {GitHub repository}, 163 | howpublished = {\url{https://github.com/nomic-ai/gpt4all}}, 164 | } 165 | 166 | @misc{langchain, title={LangChain: Building applications with LLMs through composability}, url={https://github.com/hwchase17/langchain}, journal={GitHub}, author={Chase, Harrison}, year={2022},} 167 | 168 | @article{kaplan2020scaling, 169 | title={Scaling laws for neural language models}, 170 | author={Kaplan, Jared and McCandlish, Sam and Henighan, Tom and Brown, Tom B and Chess, Benjamin and Child, Rewon and Gray, Scott and Radford, Alec and Wu, Jeffrey and Amodei, Dario}, 171 | journal={arXiv preprint arXiv:2001.08361}, 172 | year={2020} 173 | } 174 | 175 | @article{hoffmann2022training, 176 | title={Training compute-optimal large language models}, 177 | author={Hoffmann, Jordan and Borgeaud, Sebastian and Mensch, Arthur and Buchatskaya, Elena and Cai, Trevor and Rutherford, Eliza and Casas, Diego de Las and Hendricks, Lisa Anne and Welbl, Johannes and Clark, Aidan and others}, 178 | journal={arXiv preprint arXiv:2203.15556}, 179 | year={2022} 180 | } 181 | 182 | @article{mialon2023augmented, 183 | title={Augmented language models: a survey}, 184 | author={Mialon, Gr{\'e}goire and Dess{\`\i}, Roberto and Lomeli, Maria and Nalmpantis, Christoforos and Pasunuru, Ram and Raileanu, Roberta and Rozi{\`e}re, Baptiste and Schick, Timo and Dwivedi-Yu, Jane and Celikyilmaz, Asli and others}, 185 | journal={arXiv preprint arXiv:2302.07842}, 186 | year={2023} 187 | } 188 | 189 | @article{zhao2023survey, 190 | title={A survey of large language models}, 191 | author={Zhao, Wayne Xin and Zhou, Kun and Li, Junyi and Tang, Tianyi and Wang, Xiaolei and Hou, Yupeng and Min, Yingqian and Zhang, Beichen and Zhang, Junjie and Dong, Zican and others}, 192 | journal={arXiv preprint arXiv:2303.18223}, 193 | year={2023} 194 | } 195 | 196 | @inproceedings{ctranslate2, 197 | title={The OpenNMT neural machine translation toolkit: 2020 edition}, 198 | author={Klein, Guillaume and Hernandez, Fran{\c{c}}ois and Nguyen, Vincent and Senellart, Jean}, 199 | booktitle={Proceedings of the 14th Conference of the Association for Machine Translation in the Americas (Volume 1: Research Track)}, 200 | pages={102--109}, 201 | year={2020} 202 | } 203 | 204 | @article{lamini-lm, 205 | author = {Minghao Wu and 206 | Abdul Waheed and 207 | Chiyu Zhang and 208 | Muhammad Abdul-Mageed and 209 | Alham Fikri Aji 210 | }, 211 | title = {LaMini-LM: A Diverse Herd of Distilled Models from Large-Scale Instructions}, 212 | journal = {CoRR}, 213 | volume = {abs/2304.14402}, 214 | year = {2023}, 215 | url = {https://arxiv.org/abs/2304.14402}, 216 | eprinttype = {arXiv}, 217 | eprint = {2304.14402} 218 | } 219 | 220 | @article{openchat, 221 | title={OpenChat: Advancing Open-source Language Models with Mixed-Quality Data}, 222 | author={Wang, Guan and Cheng, Sijie and Zhan, Xianyuan and Li, Xiangang and Song, Sen and Liu, Yang}, 223 | journal={arXiv preprint arXiv:2309.11235}, 224 | year={2023} 225 | } -------------------------------------------------------------------------------- /languagemodels/embeddings.py: -------------------------------------------------------------------------------- 1 | import 
numpy as np 2 | from time import perf_counter 3 | 4 | from languagemodels.models import get_model, get_model_info 5 | 6 | 7 | def embed(docs): 8 | """Compute embeddings for a batch of documents 9 | 10 | >>> embed(["I love Python!"])[0].shape 11 | (384,) 12 | 13 | >>> embed(["I love Python!"])[0][-3:] 14 | array([0.1..., 0.1..., 0.0...], dtype=float32) 15 | 16 | >>> float(np.linalg.norm(embed(["I love Python!"])[0])) 17 | 1.0 18 | 19 | Embeddings are computed by running the first 512 tokens of each doc 20 | through a forward pass of the embedding model. The last hidden state 21 | of the model is mean pooled to produce a single vector. 22 | 23 | Documents will be processed in batches. The batch size is fixed at 64 24 | as this size was found to maximize throughput on a number of test 25 | systems while limiting memory usage. 26 | """ 27 | 28 | tokenizer, model = get_model("embedding") 29 | model_info = get_model_info("embedding") 30 | 31 | start_time = perf_counter() 32 | 33 | tokens = [tokenizer.encode(doc[:8192]).ids[:512] for doc in docs] 34 | 35 | def mean_pool(last_hidden_state): 36 | embedding = np.mean(last_hidden_state, axis=0) 37 | embedding = embedding / np.linalg.norm(embedding) 38 | return embedding 39 | 40 | bs = 64 41 | embeddings = [] 42 | for i in range(0, len(docs), bs): 43 | outputs = model.forward_batch(tokens[i : i + bs]) 44 | embeddings += [mean_pool(lhs) for lhs in np.array(outputs.last_hidden_state)] 45 | 46 | model_info["requests"] = model_info.get("requests", 0) + len(tokens) 47 | 48 | in_toks = sum(len(d) for d in tokens) 49 | model_info["input_tokens"] = model_info.get("input_tokens", 0) + in_toks 50 | 51 | runtime = perf_counter() - start_time 52 | model_info["runtime"] = model_info.get("runtime", 0) + runtime 53 | 54 | return embeddings 55 | 56 | 57 | def search(query, docs, count=16): 58 | """Return `count` `docs` sorted by match against `query` 59 | 60 | :param query: Input to match in search 61 | :param docs: List of docs to search against 62 | :param count: Number of documents to return 63 | :return: List of (doc_num, score) tuples sorted by score descending 64 | """ 65 | 66 | prefix = get_model_info("embedding").get("query_prefix", "") 67 | 68 | query_embedding = embed([f"{prefix}{query}"])[0] 69 | 70 | scores = np.dot([d.embedding for d in docs], query_embedding) 71 | 72 | return [(i, scores[i]) for i in reversed(np.argsort(scores)[-count:])] 73 | 74 | 75 | def get_token_ids(doc): 76 | """Return list of token ids for a document 77 | 78 | Note that the tokenizer used here is from the generative model. 79 | 80 | This is used for token counting for the context, not for tokenization 81 | before embedding. 82 | """ 83 | 84 | generative_tokenizer, _ = get_model("instruct", tokenizer_only=True) 85 | 86 | # We need to disable and re-enable truncation here 87 | # This allows us to tokenize very large documents 88 | # We won't be feeding the tokens themselves to a model, so this 89 | # shouldn't cause any problems. 
90 | trunk = generative_tokenizer.truncation 91 | if trunk: 92 | generative_tokenizer.no_truncation() 93 | ids = generative_tokenizer.encode(doc, add_special_tokens=False).ids 94 | if trunk: 95 | generative_tokenizer.enable_truncation( 96 | trunk["max_length"], stride=trunk["stride"], strategy=trunk["strategy"] 97 | ) 98 | 99 | return ids 100 | 101 | 102 | def chunk_doc(doc, name="", chunk_size=64, chunk_overlap=8): 103 | """Break a document into chunks 104 | 105 | :param doc: Document to chunk 106 | :param name: Optional document name 107 | :param chunk_size: Length of individual chunks in tokens 108 | :param chunk_overlap: Number of tokens to overlap when breaking chunks 109 | :return: List of strings representing the chunks 110 | 111 | The simple chunking approach used here consists of the following: 112 | 113 | 1. Attempt to chunk the remainder of the document. 114 | 2. If we can't fit all tokens in chunk_size, backtrack to look for a 115 | meaningful cut point. 116 | 3. If a cut point is found, use that as the chunk boundary. There will 117 | be no overlap between this chunk and the next in this case. 118 | 4. If a cut point is not found, use chunk_size as the boundary. There 119 | will be chunk_overlap overlapping tokens starting the next chunk. 120 | 5. Repeat until the entire document has been split into chunks. 121 | 122 | >>> chunk_doc("") 123 | [] 124 | 125 | >>> chunk_doc( 126 | ... "It was the best of times, it was the worst of times, it was the age " 127 | ... "of wisdom, it was the age of foolishness, it was the epoch of belief, " 128 | ... "it was the epoch of incredulity, it was the season of Light, it was " 129 | ... "the season of Darkness, it was the spring of hope, it was the winter " 130 | ... "of despair, we had everything before us, we had nothing before us, we " 131 | ... "were all going direct to Heaven, we were all going direct the other " 132 | ... "way—in short, the period was so far like the present period, that " 133 | ... "some of its noisiest authorities insisted on its being received, for " 134 | ... "good or for evil, in the superlative degree of comparison only.") 135 | ['It was the best of times...'] 136 | 137 | >>> chunk_doc( 138 | ... "One morning, when Gregor Samsa woke from troubled dreams, he found " 139 | ... "himself transformed in his bed into a horrible vermin. He lay on his " 140 | ... "armour-like back, and if he lifted his head a little he could see " 141 | ... "his brown belly, slightly domed and divided by arches into stiff " 142 | ... "sections. The bedding was hardly able to cover it and seemed ready " 143 | ... "to slide off any moment. His many legs, pitifully thin compared with " 144 | ... "the size of the rest of him, waved about helplessly as he looked.") 145 | ['One morning, ...'] 146 | 147 | >>> chunk_doc("Hello") 148 | ['Hello'] 149 | 150 | >>> chunk_doc("Hello " * 65) 151 | ['Hello Hello...', 'Hello...'] 152 | 153 | >>> chunk_doc("Hello world. " * 24)[0] 154 | 'Hello world. ...Hello world.' 155 | 156 | >>> len(chunk_doc("Hello world. " * 20)) 157 | 1 158 | 159 | >>> len(chunk_doc("Hello world. " * 24)) 160 | 2 161 | 162 | # Check to make sure sentences aren't broken on decimal points 163 | >>> chunk_doc(('z. ' + ' 37.468 ' * 5) * 3)[0] 164 | 'z. 37.468 ...z.' 
165 | """ 166 | generative_tokenizer, _ = get_model("instruct", tokenizer_only=True) 167 | 168 | tokens = get_token_ids(doc) 169 | 170 | separator_tokens = [".", "!", "?", ").", "\n\n", "\n", '."'] 171 | 172 | separators = [get_token_ids(t)[-1] for t in separator_tokens] 173 | 174 | name_tokens = [] 175 | 176 | label = f"From {name} document:" if name else "" 177 | 178 | if name: 179 | name_tokens = get_token_ids(label) 180 | 181 | i = 0 182 | chunks = [] 183 | chunk = name_tokens.copy() 184 | while i < len(tokens): 185 | token = tokens[i] 186 | chunk.append(token) 187 | i += 1 188 | 189 | # Save the last chunk if we're done 190 | if i == len(tokens): 191 | chunks.append(generative_tokenizer.decode(chunk)) 192 | break 193 | 194 | if len(chunk) == chunk_size: 195 | # Backtrack to find a reasonable cut point 196 | for j in range(1, chunk_size // 2): 197 | if chunk[chunk_size - j] in separators: 198 | ctx = generative_tokenizer.decode( 199 | chunk[chunk_size - j : chunk_size - j + 2] 200 | ) 201 | if " " in ctx or "\n" in ctx: 202 | # Found a good separator 203 | text = generative_tokenizer.decode(chunk[: chunk_size - j + 1]) 204 | chunks.append(text) 205 | chunk = name_tokens + chunk[chunk_size - j + 1 :] 206 | break 207 | else: 208 | # No semantically meaningful cutpoint found 209 | # Default to a hard cut 210 | text = generative_tokenizer.decode(chunk) 211 | chunks.append(text) 212 | # Share some overlap with next chunk 213 | overlap = max( 214 | chunk_overlap, chunk_size - len(name_tokens) - (len(tokens) - i) 215 | ) 216 | chunk = name_tokens + chunk[-overlap:] 217 | 218 | return chunks 219 | 220 | 221 | class Document: 222 | """ 223 | A document used for semantic search 224 | 225 | Documents have content and an embedding that is used to match the content 226 | against other semantically similar documents. 227 | """ 228 | 229 | def __init__(self, content, name="", embedding=None): 230 | self.content = content 231 | self.embedding = embedding if embedding is not None else embed([content])[0] 232 | self.name = name 233 | 234 | 235 | class RetrievalContext: 236 | """ 237 | Provides a context for document retrieval 238 | 239 | Documents are embedded and cached for later search. 240 | 241 | Example usage: 242 | 243 | >>> rc = RetrievalContext() 244 | >>> rc.store("Paris is in France.") 245 | >>> rc.store("The sky is blue.") 246 | >>> rc.store("Mars is a planet.") 247 | >>> rc.get_match("Paris is in France.") 248 | 'Paris is in France.' 249 | 250 | >>> rc.get_match("Where is Paris?") 251 | 'Paris is in France.' 252 | 253 | >>> rc.clear() 254 | >>> rc.get_match("Where is Paris?") 255 | 256 | >>> rc.clear() 257 | >>> rc.store(' '.join(['Python'] * 4096)) 258 | >>> len(rc.chunks) 259 | 73 260 | 261 | >>> rc.clear() 262 | >>> rc.store(' '.join(['Python'] * 232)) 263 | >>> len(rc.chunks) 264 | 4 265 | 266 | >>> rc.get_context("What is Python?") 267 | 'Python Python Python...' 
268 | 269 | >>> [len(c.content.split()) for c in rc.chunks] 270 | [64, 64, 64, 64] 271 | 272 | >>> len(rc.get_context("What is Python?").split()) 273 | 128 274 | """ 275 | 276 | def __init__(self, chunk_size=64, chunk_overlap=8): 277 | self.chunk_size = chunk_size 278 | self.chunk_overlap = chunk_overlap 279 | self.clear() 280 | 281 | def clear(self): 282 | self.docs = [] 283 | self.chunks = [] 284 | 285 | def store(self, doc, name=""): 286 | """Stores a document along with embeddings 287 | 288 | This stores both the document as well as document chunks 289 | 290 | >>> rc = RetrievalContext() 291 | >>> rc.clear() 292 | >>> rc.store(' '.join(['Python'] * 233)) 293 | >>> len(rc.chunks) 294 | 5 295 | 296 | >>> rc.clear() 297 | >>> rc.store(' '.join(['Python'] * 232)) 298 | >>> len(rc.chunks) 299 | 4 300 | 301 | >>> rc.clear() 302 | >>> rc.store('Python') 303 | >>> len(rc.chunks) 304 | 1 305 | 306 | >>> rc.clear() 307 | >>> rc.store('It is a language.', 'Python') 308 | >>> len(rc.chunks) 309 | 1 310 | >>> [c.content for c in rc.chunks] 311 | ['From Python document: It is a language.'] 312 | 313 | >>> rc = RetrievalContext() 314 | >>> rc.clear() 315 | >>> rc.store(' '.join(['details'] * 217), 'Python') 316 | >>> len(rc.chunks) 317 | 5 318 | 319 | >>> rc.clear() 320 | >>> rc.store(' '.join(['details'] * 216), 'Python') 321 | >>> len(rc.chunks) 322 | 4 323 | >>> [c.content for c in rc.chunks] 324 | ['From Python document: details details details...'] 325 | """ 326 | 327 | if doc not in self.docs: 328 | self.docs.append(Document(doc, name=name)) 329 | self.store_chunks(doc, name) 330 | 331 | def store_chunks(self, doc, name=""): 332 | chunks = chunk_doc(doc, name, self.chunk_size, self.chunk_overlap) 333 | 334 | embeddings = embed(chunks) 335 | 336 | for embedding, chunk in zip(embeddings, chunks): 337 | self.chunks.append(Document(chunk, embedding=embedding)) 338 | 339 | def get_context(self, query, max_tokens=128): 340 | """Gets context matching a query 341 | 342 | Context is capped by token length and is retrieved from stored 343 | document chunks 344 | """ 345 | 346 | if len(self.chunks) == 0: 347 | return None 348 | 349 | results = search(query, self.chunks) 350 | 351 | chunks = [] 352 | tokens = 0 353 | 354 | for chunk_id, score in results: 355 | chunk = self.chunks[chunk_id].content 356 | chunk_tokens = len(get_token_ids(chunk)) 357 | if tokens + chunk_tokens <= max_tokens and score > 0.1: 358 | chunks.append(chunk) 359 | tokens += chunk_tokens 360 | 361 | context = "\n\n".join(chunks) 362 | 363 | return context 364 | 365 | def get_match(self, query): 366 | if len(self.docs) == 0: 367 | return None 368 | 369 | return self.docs[search(query, self.docs)[0][0]].content 370 | -------------------------------------------------------------------------------- /languagemodels/inference.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import requests 3 | import re 4 | import os 5 | import sys 6 | from time import perf_counter 7 | 8 | from languagemodels.models import get_model, get_model_info 9 | from languagemodels.config import config 10 | 11 | 12 | class InferenceException(Exception): 13 | pass 14 | 15 | 16 | def truncate_prompt(prompt): 17 | """Truncates a prompt to the maximum length allowed by the config""" 18 | max_prompt_length = config["max_prompt_length"] 19 | if len(prompt) > max_prompt_length: 20 | print( 21 | f"Warning: Prompt truncated from {len(prompt)} to " 22 | f"{max_prompt_length} characters to avoid OOM." 
23 | ) 24 | return prompt[:max_prompt_length] 25 | return prompt 26 | 27 | 28 | def list_tokens(prompt): 29 | """Generates a list of tokens for a supplied prompt 30 | 31 | >>> list_tokens("Hello, world!") # doctest: +SKIP 32 | [('▁Hello', 8774), (',', 6), ('▁world', 296), ('!', 55)] 33 | 34 | >>> list_tokens("Hello, world!") 35 | [('...Hello', ...), ... ('...world', ...), ...] 36 | """ 37 | prompt = truncate_prompt(prompt) 38 | tokenizer, _ = get_model("instruct") 39 | 40 | output = tokenizer.encode(prompt, add_special_tokens=False) 41 | tokens = output.tokens 42 | ids = output.ids 43 | 44 | return list(zip(tokens, ids)) 45 | 46 | 47 | def generate_ts(engine, prompt, max_tokens=200): 48 | """Generates a single text response for a prompt from a textsynth server 49 | 50 | The server and API key are provided as environment variables: 51 | 52 | LANGUAGEMODELS_TS_SERVER is the server such as http://localhost:8080 53 | LANGUAGEMODELS_TS_KEY is the API key 54 | """ 55 | apikey = os.environ.get("LANGUAGEMODELS_TS_KEY") or "" 56 | server = os.environ.get("LANGUAGEMODELS_TS_SERVER") or "https://api.textsynth.com" 57 | 58 | response = requests.post( 59 | f"{server}/v1/engines/{engine}/completions", 60 | headers={"Authorization": f"Bearer {apikey}"}, 61 | json={"prompt": prompt, "max_tokens": max_tokens}, 62 | ) 63 | resp = response.json() 64 | if "text" in resp: 65 | return resp["text"] 66 | else: 67 | raise InferenceException(f"TextSynth error: {resp}") 68 | 69 | 70 | def generate_oa(engine, prompt, max_tokens=200, temperature=0): 71 | """Generates a single text response for a prompt using OpenAI 72 | 73 | The server and API key are provided as environment variables: 74 | 75 | LANGUAGEMODELS_OA_KEY is the API key 76 | """ 77 | apikey = os.environ.get("LANGUAGEMODELS_OA_KEY") 78 | 79 | response = requests.post( 80 | "https://api.openai.com/v1/completions", 81 | headers={ 82 | "Authorization": f"Bearer {apikey}", 83 | "Content-Type": "application/json", 84 | }, 85 | json={ 86 | "model": engine, 87 | "prompt": prompt, 88 | "max_tokens": max_tokens, 89 | "temperature": temperature, 90 | }, 91 | ) 92 | resp = response.json() 93 | 94 | try: 95 | return resp["choices"][0]["text"] 96 | except KeyError: 97 | raise InferenceException(f"OpenAI error: {resp}") 98 | 99 | 100 | def chat_oa(engine, prompt, max_tokens=200, temperature=0): 101 | """Generates a single text response for a prompt using OpenAI 102 | 103 | The server and API key are provided as environment variables: 104 | 105 | LANGUAGEMODELS_OA_KEY is the API key 106 | """ 107 | apikey = os.environ.get("LANGUAGEMODELS_OA_KEY") 108 | 109 | response = requests.post( 110 | "https://api.openai.com/v1/chat/completions", 111 | headers={ 112 | "Authorization": f"Bearer {apikey}", 113 | "Content-Type": "application/json", 114 | }, 115 | json={ 116 | "model": engine, 117 | "messages": [{"role": "user", "content": prompt}], 118 | "max_tokens": max_tokens, 119 | "temperature": temperature, 120 | }, 121 | ) 122 | resp = response.json() 123 | 124 | try: 125 | return resp["choices"][0]["message"]["content"] 126 | except KeyError: 127 | raise InferenceException(f"OpenAI error: {resp}") 128 | 129 | 130 | def stream_results(results, tokenizer): 131 | """Map a token iterator to a substring iterator""" 132 | tokens = [] 133 | last_len = 0 134 | 135 | for result in results: 136 | tokens.append(result.token_id) 137 | text = tokenizer.decode(tokens) 138 | yield text[last_len:] 139 | last_len = len(text) 140 | 141 | 142 | def echo_results(results, tokenizer): 143 | """Output 
results to stderr as they are collected""" 144 | tokens = [] 145 | last_len = 0 146 | 147 | for result in results: 148 | tokens.append(result.token_id) 149 | text = tokenizer.decode(tokens) 150 | sys.stderr.write(text[last_len:]) 151 | sys.stderr.flush() 152 | last_len = len(text) 153 | 154 | sys.stderr.write("\n\n") 155 | sys.stderr.flush() 156 | return tokens 157 | 158 | 159 | def generate( 160 | instructions: List[str], 161 | max_tokens: int = 200, 162 | temperature: float = 0.1, 163 | topk: int = 1, 164 | repetition_penalty: float = 0.0, 165 | prefix: str = "", 166 | suppress: List[str] = [], 167 | model: str = "instruct", 168 | stream: bool = False, 169 | ): 170 | """Generates completions for a prompt 171 | 172 | This may use a local model, or it may make an API call to an external 173 | model if API keys are available. 174 | 175 | >>> generate(["What is the capital of France?"]) 176 | ['...Paris...'] 177 | 178 | >>> list(generate(["What is the capital of France?"], stream=True)) 179 | ['...Paris...'] 180 | """ 181 | if os.environ.get("LANGUAGEMODELS_TS_KEY") or os.environ.get( 182 | "LANGUAGEMODELS_TS_SERVER" 183 | ): 184 | return generate_ts("flan_t5_xxl_q4", instructions, max_tokens).strip() 185 | 186 | if os.environ.get("LANGUAGEMODELS_OA_KEY"): 187 | return chat_oa("gpt-3.5-turbo", instructions, max_tokens).strip() 188 | 189 | tokenizer, model = get_model(model) 190 | 191 | start_time = perf_counter() 192 | 193 | suppress = [tokenizer.encode(s, add_special_tokens=False).tokens for s in suppress] 194 | 195 | model_info = get_model_info("instruct") 196 | 197 | fmt = model_info.get("prompt_fmt", "{instruction}") 198 | 199 | if repetition_penalty == 0.0: 200 | repetition_penalty = model_info.get("repetition_penalty", 1.3) 201 | 202 | prompts = [fmt.replace("{instruction}", inst) for inst in instructions] 203 | truncated_prompts = [truncate_prompt(p) for p in prompts] 204 | 205 | prompts_tok = [tokenizer.encode(p).tokens for p in truncated_prompts] 206 | 207 | outputs_ids = [] 208 | if hasattr(model, "translate_batch"): 209 | prefix = tokenizer.encode(prefix, add_special_tokens=False).tokens 210 | if stream or (config["echo"] and len(prompts_tok) == 1): 211 | results = model.generate_tokens( 212 | prompts_tok[0], 213 | target_prefix=prefix, 214 | repetition_penalty=repetition_penalty, 215 | max_decoding_length=max_tokens, 216 | sampling_temperature=temperature, 217 | sampling_topk=topk, 218 | suppress_sequences=suppress, 219 | ) 220 | 221 | if stream: 222 | return stream_results(results, tokenizer) 223 | else: 224 | outputs_ids = [echo_results(results, tokenizer)] 225 | else: 226 | results = model.translate_batch( 227 | prompts_tok, 228 | target_prefix=[prefix] * len(prompts), 229 | repetition_penalty=repetition_penalty, 230 | max_decoding_length=max_tokens, 231 | sampling_temperature=temperature, 232 | sampling_topk=topk, 233 | suppress_sequences=suppress, 234 | beam_size=1, 235 | ) 236 | outputs_tokens = [r.hypotheses[0] for r in results] 237 | for output in outputs_tokens: 238 | outputs_ids.append([tokenizer.token_to_id(t) for t in output]) 239 | else: 240 | if stream or (config["echo"] and len(prompts_tok) == 1): 241 | results = model.generate_tokens( 242 | prompts_tok, 243 | repetition_penalty=repetition_penalty, 244 | max_length=max_tokens, 245 | sampling_temperature=temperature, 246 | sampling_topk=topk, 247 | suppress_sequences=suppress, 248 | ) 249 | 250 | if stream: 251 | return stream_results(results, tokenizer) 252 | else: 253 | outputs_ids = [echo_results(results, 
tokenizer)] 254 | else: 255 | results = model.generate_batch( 256 | prompts_tok, 257 | repetition_penalty=repetition_penalty, 258 | max_length=max_tokens, 259 | sampling_temperature=temperature, 260 | sampling_topk=topk, 261 | suppress_sequences=suppress, 262 | beam_size=1, 263 | include_prompt_in_result=False, 264 | ) 265 | outputs_ids = [r.sequences_ids[0] for r in results] 266 | 267 | model_info["requests"] = model_info.get("requests", 0) + len(prompts) 268 | 269 | in_toks = sum(len(p) for p in prompts_tok) 270 | model_info["input_tokens"] = model_info.get("input_tokens", 0) + in_toks 271 | 272 | out_toks = sum(len(o) for o in outputs_ids) 273 | model_info["output_tokens"] = model_info.get("output_tokens", 0) + out_toks 274 | 275 | elapsed_time = perf_counter() - start_time 276 | model_info["runtime"] = model_info.get("runtime", 0) + elapsed_time 277 | 278 | return [tokenizer.decode(i, skip_special_tokens=True).lstrip() for i in outputs_ids] 279 | 280 | 281 | def rank_instruct(inputs, targets): 282 | """Sorts a list of targets by their probabilities 283 | 284 | >>> rank_instruct(["Classify positive or negative: I love python. Classification:"], 285 | ... ['positive', 'negative']) 286 | [['positive', 'negative']] 287 | 288 | >>> rank_instruct(["Classify fantasy or documentary: " 289 | ... "The wizard raised their wand. Classification:"], 290 | ... ['fantasy', 'documentary']) 291 | [['fantasy', 'documentary']] 292 | 293 | >>> rank_instruct(["Say six", "Say seven"], ["six", "seven"]) 294 | [['six', 'seven'], ['seven', 'six']] 295 | """ 296 | tokenizer, model = get_model("instruct") 297 | 298 | model_info = get_model_info("instruct") 299 | fmt = model_info.get("prompt_fmt", "{instruction}") 300 | inputs = [fmt.replace("{instruction}", inst) for inst in inputs] 301 | inputs = [truncate_prompt(i) for i in inputs] 302 | 303 | targ_tok = [tokenizer.encode(t, add_special_tokens=False).tokens for t in targets] 304 | targ_tok *= len(inputs) 305 | 306 | in_tok = [] 307 | for input in inputs: 308 | toks = [tokenizer.encode(input).tokens] 309 | in_tok += toks * len(targets) 310 | 311 | if "Generator" in str(type(model)): 312 | scores = model.score_batch([i + t for i, t in zip(in_tok, targ_tok)]) 313 | else: 314 | scores = model.score_batch(in_tok, target=targ_tok) 315 | 316 | ret = [] 317 | for i in range(0, len(inputs) * len(targets), len(targets)): 318 | logprobs = [sum(r.log_probs) for r in scores[i : i + len(targets)]] 319 | results = sorted(zip(targets, logprobs), key=lambda r: -r[1]) 320 | ret.append([r[0] for r in results]) 321 | 322 | return ret 323 | 324 | 325 | def parse_chat(prompt): 326 | """Converts a chat prompt using special tokens to a plain-text prompt 327 | 328 | This is useful for prompting generic models that have not been fine-tuned 329 | for chat using specialized tokens. 330 | 331 | >>> parse_chat('User: What time is it?') 332 | Traceback (most recent call last): 333 | .... 334 | inference.InferenceException: Chat prompt must end with 'Assistant:' 335 | 336 | >>> parse_chat('''User: What time is it? 337 | ... 338 | ... Assistant:''') 339 | [{'role': 'user', 'content': 'What time is it?'}] 340 | 341 | >>> parse_chat(''' 342 | ... A helpful assistant 343 | ... 344 | ... User: What time is it? 345 | ... 346 | ... Assistant: 347 | ... ''') 348 | [{'role': 'system', 'content': 'A helpful assistant'}, 349 | {'role': 'user', 'content': 'What time is it?'}] 350 | 351 | >>> parse_chat(''' 352 | ... A helpful assistant 353 | ... 354 | ... User: What time is it? 355 | ... 356 | ... 
Assistant: The time is 357 | ... ''') 358 | Traceback (most recent call last): 359 | .... 360 | inference.InferenceException: Final assistant message must be blank 361 | 362 | >>> parse_chat(''' 363 | ... A helpful assistant 364 | ... 365 | ... User: First para 366 | ... 367 | ... Second para 368 | ... 369 | ... Assistant: 370 | ... ''') 371 | [{'role': 'system', 'content': 'A helpful assistant'}, 372 | {'role': 'user', 'content': 'First para\\n\\nSecond para'}] 373 | 374 | >>> parse_chat(''' 375 | ... A helpful assistant 376 | ... 377 | ... User: What time is it? 378 | ... 379 | ... InvalidRole: Nothing 380 | ... 381 | ... Assistant: 382 | ... ''') 383 | Traceback (most recent call last): 384 | .... 385 | inference.InferenceException: Invalid chat role: invalidrole 386 | """ 387 | 388 | if not re.match(r"^\s*\w+:", prompt): 389 | prompt = "System: " + prompt 390 | 391 | prompt = "\n\n" + prompt 392 | 393 | chunks = re.split(r"[\r\n]\s*(\w+):", prompt, flags=re.M) 394 | chunks = [m.strip() for m in chunks if m.strip()] 395 | 396 | messages = [] 397 | 398 | for i in range(0, len(chunks), 2): 399 | role = chunks[i].lower() 400 | 401 | try: 402 | content = chunks[i + 1] 403 | content = re.sub(r"\s*\n\n\s*", "\n\n", content) 404 | except IndexError: 405 | content = "" 406 | messages.append({"role": role, "content": content}) 407 | 408 | for message in messages: 409 | if message["role"] not in ["system", "user", "assistant"]: 410 | raise InferenceException(f"Invalid chat role: {message['role']}") 411 | 412 | if messages[-1]["role"] != "assistant": 413 | raise InferenceException("Chat prompt must end with 'Assistant:'") 414 | 415 | if messages[-1]["content"] != "": 416 | raise InferenceException("Final assistant message must be blank") 417 | 418 | return messages[:-1] 419 | -------------------------------------------------------------------------------- /languagemodels/__init__.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import datetime 3 | import json 4 | import re 5 | from typing import overload 6 | 7 | from languagemodels.config import config 8 | from languagemodels.preprocess import get_html_paragraphs 9 | from languagemodels.inference import ( 10 | generate, 11 | rank_instruct, 12 | parse_chat, 13 | list_tokens, 14 | ) 15 | from languagemodels import embeddings 16 | 17 | docs = embeddings.RetrievalContext() 18 | 19 | 20 | def complete(prompt: str) -> str: 21 | """Provide one completion for a given open-ended prompt 22 | 23 | :param prompt: Prompt to use as input to the model 24 | :return: Completion returned from the language model 25 | 26 | Examples: 27 | 28 | >>> complete("Luke thought that he") #doctest: +SKIP 29 | 'was going to be a doctor.' 30 | 31 | >>> complete("There are many mythical creatures who") #doctest: +SKIP 32 | 'are able to fly' 33 | 34 | >>> complete("She hid in her room until") #doctest: +SKIP 35 | 'she was sure she was safe' 36 | """ 37 | 38 | result = generate( 39 | ["Write a sentence"], 40 | prefix=prompt, 41 | max_tokens=config["max_tokens"], 42 | temperature=0.7, 43 | topk=40, 44 | )[0] 45 | 46 | if result.startswith(prompt): 47 | prefix_length = len(prompt) 48 | return result[prefix_length:] 49 | else: 50 | return result 51 | 52 | 53 | @overload 54 | def do(prompt: list) -> list: 55 | ... 56 | 57 | 58 | @overload 59 | def do(prompt: str) -> str: 60 | ... 
61 | 62 | 63 | def do(prompt, choices=None): 64 | """Follow a single-turn instructional prompt 65 | 66 | :param prompt: Instructional prompt(s) to follow 67 | :param choices: If provided, outputs are restricted to values in choices 68 | :return: Completion returned from the language model 69 | 70 | Note that this function is overloaded to return a list of results if 71 | a list of prompts is provided and a single string if a single 72 | prompt is provided as a string 73 | 74 | Examples: 75 | 76 | >>> do("Translate Spanish to English: Hola mundo!") #doctest: +SKIP 77 | 'Hello world!' 78 | 79 | >>> do("Pick the planet from the list: baseball, Texas, Saturn") 80 | '...Saturn...' 81 | 82 | >>> do(["Pick the planet from the list: baseball, Texas, Saturn"] * 2) 83 | ['...Saturn...', '...Saturn...'] 84 | 85 | >>> do(["Say red", "Say blue"], choices=["red", "blue"]) 86 | ['red', 'blue'] 87 | 88 | >>> do("Classify as positive or negative: LLMs are bad", 89 | ... choices=["Positive", "Negative"]) 90 | 'Negative' 91 | 92 | >>> do("Classify as positive or negative: LLMs are great", 93 | ... choices=["Positive", "Negative"]) 94 | 'Positive' 95 | """ 96 | 97 | prompts = [prompt] if isinstance(prompt, str) else prompt 98 | 99 | if choices: 100 | results = [r[0] for r in rank_instruct(prompts, choices)] 101 | else: 102 | results = generate(prompts, max_tokens=config["max_tokens"], topk=1) 103 | 104 | return results[0] if isinstance(prompt, str) else results 105 | 106 | 107 | @overload 108 | def embed(doc: list) -> list: 109 | ... 110 | 111 | 112 | @overload 113 | def embed(doc: str) -> str: 114 | ... 115 | 116 | 117 | def embed(doc): 118 | """Create embedding for a document 119 | 120 | :param doc: Document(s) to embed 121 | :return: Embedding 122 | 123 | Note that this function is overloaded to return a list of embeddings if 124 | a list of docs is provided and a single embedding if a single 125 | doc is provided as a string 126 | 127 | Examples: 128 | 129 | >>> embed("Hello, world") 130 | [-0.0...] 131 | 132 | >>> embed(["Hello", "world"]) 133 | [[-0.0...]] 134 | """ 135 | 136 | docs = [doc] if isinstance(doc, str) else doc 137 | 138 | # Create embeddings and convert to lists of floats 139 | emb = [[float(n) for n in e] for e in embeddings.embed(docs)] 140 | 141 | return emb[0] if isinstance(doc, str) else emb 142 | 143 | 144 | def chat(prompt: str) -> str: 145 | """Get new message from chat-optimized language model 146 | 147 | The `prompt` for this model is provided as a series of messages as a single 148 | plain-text string. Several special tokens are used to delineate chat 149 | messages. 150 | 151 | - `system:` - Indicates the start of a system message providing 152 | instructions about how the assistant should behave. 153 | - `user:` - Indicates the start of a prompter (typically user) 154 | message. 155 | - `assistant:` - Indicates the start of an assistant message. 156 | 157 | A complete prompt may look something like this: 158 | 159 | ``` 160 | Assistant is helpful and harmless 161 | 162 | User: What is the capital of Germany? 163 | 164 | Assistant: The capital of Germany is Berlin. 165 | 166 | User: How many people live there? 167 | 168 | Assistant: 169 | ``` 170 | 171 | The completion from the language model is returned. 172 | 173 | :param prompt: Prompt using formatting described above 174 | :return: Completion returned from the language model 175 | 176 | Examples: 177 | 178 | >>> response = chat(''' 179 | ... System: Respond as a helpful assistant. It is 5:00pm. 180 | ... 181 | ... 
User: What time is it? 182 | ... 183 | ... Assistant: 184 | ... ''') # doctest: +SKIP 185 | "It's 5:00pm." 186 | """ 187 | 188 | messages = parse_chat(prompt) 189 | 190 | # Suppress starts of all assistant messages to avoid repeat generation 191 | suppress = [ 192 | "Assistant: " + m["content"].split(" ")[0] 193 | for m in messages 194 | if m["role"] in ["assistant", "user"] 195 | ] 196 | 197 | # Suppress all user messages to avoid repeating them 198 | suppress += [m["content"] for m in messages if m["role"] == "user"] 199 | 200 | system_msgs = [m for m in messages if m["role"] == "system"] 201 | assistant_msgs = [m for m in messages if m["role"] == "assistant"] 202 | user_msgs = [m for m in messages if m["role"] == "user"] 203 | 204 | # The current model is tuned on instructions and tends to get 205 | # lost if it sees too many questions 206 | # Use only the most recent user and assistant message for context 207 | # Keep all system messages 208 | messages = system_msgs + assistant_msgs[-1:] + user_msgs[-1:] 209 | 210 | rolemap = { 211 | "system": "System", 212 | "user": "Question", 213 | "assistant": "Assistant", 214 | } 215 | 216 | messages = [f"{rolemap[m['role']]}: {m['content']}" for m in messages] 217 | 218 | prompt = "\n\n".join(messages) + "\n\n" + "Assistant:" 219 | 220 | if prompt.startswith("System:"): 221 | prompt = prompt[7:].strip() 222 | 223 | response = generate( 224 | [prompt], 225 | max_tokens=config["max_tokens"], 226 | temperature=0.3, 227 | topk=40, 228 | prefix="Assistant:", 229 | suppress=suppress, 230 | )[0] 231 | 232 | # Remove duplicate assistant being generated 233 | if response.startswith("Assistant:"): 234 | response = response[10:] 235 | 236 | return response.strip() 237 | 238 | 239 | def code(prompt: str) -> str: 240 | """Complete a code prompt 241 | 242 | This assumes that users are expecting Python completions. Default models 243 | are fine-tuned on Python where applicable. 244 | 245 | :param prompt: Code context to complete 246 | :return: Completion returned from the language model 247 | 248 | Examples: 249 | 250 | >>> code("# Print Hello, world!\\n") 251 | 'print("Hello, world!")\\n' 252 | 253 | >>> code("def return_4():") 254 | '...return 4...' 255 | """ 256 | return generate([prompt], max_tokens=config["max_tokens"], topk=1, model="code")[0] 257 | 258 | 259 | def extract_answer(question: str, context: str) -> str: 260 | """Extract an answer to a `question` from a provided `context` 261 | 262 | :param question: A question to answer using knowledge from context 263 | :param context: Knowledge used to answer the question 264 | :return: Answer to the question. 265 | 266 | Examples: 267 | 268 | >>> context = "There is a green ball and a red box" 269 | >>> extract_answer("What color is the ball?", context).lower() 270 | '...green...' 271 | 272 | >>> extract_answer("Who created Python?", get_wiki('Python')) #doctest: +SKIP 273 | '...Guido van Rossum...' 274 | """ 275 | 276 | return generate([f"{context}\n\n{question}"])[0] 277 | 278 | 279 | def classify(doc: str, label1: str, label2: str) -> str: 280 | """Performs binary classification on an input 281 | 282 | :param doc: A plain text input document to classify 283 | :param label1: The first label to classify against 284 | :param label2: The second label to classify against 285 | :return: The closest matching class. 
The return value will always be 286 | `label1` or `label2` 287 | 288 | Examples: 289 | 290 | >>> classify("That book was good.","positive","negative") 291 | 'positive' 292 | >>> classify("That movie was terrible.","positive","negative") 293 | 'negative' 294 | """ 295 | 296 | return do( 297 | f"Classify as {label1} or {label2}: {doc}\n\nClassification:", 298 | choices=[label1, label2], 299 | ) 300 | 301 | 302 | def store_doc(doc: str, name: str = "") -> None: 303 | """Store document for later retrieval 304 | 305 | :param doc: A plain text document to store. 306 | :param name: Optional name for the document. This is used as a chunk prefix. 307 | 308 | Examples: 309 | 310 | >>> store_doc("The sky is blue.") 311 | """ 312 | docs.store(doc, name) 313 | 314 | 315 | def load_doc(query: str) -> str: 316 | """Load a matching document 317 | 318 | A single document that best matches `query` will be returned. 319 | 320 | :param query: Query to compare to stored documents 321 | :return: Content of the closest matching document 322 | 323 | Examples: 324 | 325 | >>> store_doc("Paris is in France.") 326 | >>> store_doc("The sky is blue.") 327 | >>> load_doc("Where is Paris?") 328 | 'Paris is in France.' 329 | """ 330 | return docs.get_match(query) 331 | 332 | 333 | def get_doc_context(query: str) -> str: 334 | """Loads context from documents 335 | 336 | A string representing the most relevant content from all stored documents 337 | will be returned. This may be a blend of chunks from multiple documents. 338 | 339 | :param query: Query to compare to stored documents 340 | :return: Up to 128 tokens of context 341 | 342 | Examples: 343 | 344 | >>> store_doc("Paris is in France.") 345 | >>> store_doc("Paris is nice.") 346 | >>> store_doc("The sky is blue.") 347 | >>> get_doc_context("Where is Paris?") 348 | 'Paris is in France.\\n\\nParis is nice.' 349 | """ 350 | return docs.get_context(query) 351 | 352 | 353 | def get_web(url: str) -> str: 354 | """ 355 | Return the text of paragraphs from a web page 356 | 357 | :param url: The URL to load 358 | :return str: Plain text content from the URL 359 | 360 | Note that it is difficult to return only the human-readable 361 | content from an HTML page. This function takes a basic and quick 362 | approach. It will not work perfectly on all sites, but will 363 | often do a reasonable job of returning the plain text content 364 | of a page. 365 | 366 | If the `url` points to a plain text page, the page content 367 | will be returned verbatim. 368 | """ 369 | 370 | res = requests.get( 371 | url, headers={"User-Agent": "Mozilla/5.0 (compatible; languagemodels)"} 372 | ) 373 | 374 | if "text/plain" in res.raw.getheader("content-type"): 375 | return res.text 376 | elif "text/html" in res.raw.getheader("content-type"): 377 | return get_html_paragraphs(res.text) 378 | 379 | return "" 380 | 381 | 382 | def get_wiki(topic: str) -> str: 383 | """ 384 | Return Wikipedia summary for a topic 385 | 386 | This function ignores the complexity of disambiguation pages and simply 387 | returns the first result that is not a disambiguation page 388 | 389 | :param topic: Topic to search for on Wikipedia 390 | :return: Text content of the lead section of the most popular matching article 391 | 392 | Examples: 393 | 394 | >>> get_wiki('Python language') 395 | 'Python is a high-level...' 396 | 397 | >>> get_wiki('Chemistry') 398 | 'Chemistry is the scientific study...' 
399 | """ 400 | 401 | url = "https://api.wikimedia.org/core/v1/wikipedia/en/search/title" 402 | response = requests.get(url, params={"q": topic, "limit": 5}) 403 | response = json.loads(response.text) 404 | 405 | for page in response["pages"]: 406 | wiki_result = requests.get( 407 | f"https://en.wikipedia.org/w/api.php?action=query&prop=extracts|pageprops&" 408 | f"exintro&redirects=1&titles={page['title']}&format=json" 409 | ).json() 410 | 411 | first = wiki_result["query"]["pages"].popitem()[1] 412 | if "disambiguation" in first["pageprops"]: 413 | continue 414 | 415 | summary = first["extract"] 416 | 417 | cutoffs = [ 418 | "See_also", 419 | "Notes", 420 | "References", 421 | "Further_reading", 422 | "External_links", 423 | ] 424 | 425 | for cutoff in cutoffs: 426 | summary = summary.split(f'<h2><span id="{cutoff}">', 1)[0] 427 | 428 | summary = re.sub(r"<p>
", "\n\n", summary, flags=re.I) 429 | summary = re.sub(r"", "", summary, flags=re.I | re.DOTALL) 430 | summary = re.sub(r"<.*?>", "", summary, flags=re.I) 431 | summary = re.sub(r"\s*[\n\r]+\s*[\r\n]+[\s\r\n]*", "\n\n", summary, flags=re.I) 432 | summary = summary.strip() 433 | return summary 434 | else: 435 | return "No matching wiki page found." 436 | 437 | 438 | def get_weather(latitude, longitude): 439 | """Fetch the current weather for a supplied longitude and latitude 440 | 441 | Weather is provided by the US government and this function only supports 442 | locations in the United States. 443 | 444 | :param latitude: Latitude value representing this location 445 | :param longitude: Longitude value representing this location 446 | :return: Plain text description of the current weather forecast 447 | 448 | Examples: 449 | 450 | >>> get_weather(41.8, -87.6) # doctest: +SKIP 451 | 'Scattered showers and thunderstorms before 1pm with a high of 73.' 452 | """ 453 | 454 | res = requests.get(f"https://api.weather.gov/points/{latitude},{longitude}") 455 | points = json.loads(res.text) 456 | forecast_url = points["properties"]["forecast"] 457 | 458 | res = requests.get(forecast_url) 459 | forecast = json.loads(res.text) 460 | current = forecast["properties"]["periods"][0] 461 | 462 | return current["detailedForecast"] 463 | 464 | 465 | def get_date() -> str: 466 | """Returns the current date and time in natural language 467 | 468 | >>> get_date() # doctest: +SKIP 469 | 'Friday, May 12, 2023 at 09:27AM' 470 | """ 471 | 472 | now = datetime.datetime.now() 473 | 474 | return now.strftime("%A, %B %d, %Y at %I:%M%p") 475 | 476 | 477 | def print_tokens(prompt: str) -> None: 478 | """Prints a list of tokens in a prompt 479 | 480 | :param prompt: Prompt to use as input to tokenizer 481 | :return: Nothing 482 | 483 | Examples: 484 | 485 | >>> print_tokens("Hello world") 486 | ' Hello' (token 8774) 487 | ' world' (token 296) 488 | 489 | >>> print_tokens("Hola mundo") 490 | ' Hol' (token 5838) 491 | 'a' (token 9) 492 | ' mun' (token 13844) 493 | 'd' (token 26) 494 | 'o' (token 32) 495 | """ 496 | 497 | tokens = list_tokens(prompt) 498 | 499 | for token in tokens: 500 | print(f"'{token[0].replace('▁', ' ')}' (token {token[1]})") 501 | 502 | 503 | def count_tokens(prompt: str) -> None: 504 | """Counts tokens in a prompt 505 | 506 | :param prompt: Prompt to use as input to tokenizer 507 | :return: Nothing 508 | 509 | Examples: 510 | 511 | >>> count_tokens("Hello world") 512 | 2 513 | 514 | >>> count_tokens("Hola mundo") 515 | 5 516 | """ 517 | 518 | return len(list_tokens(prompt)) 519 | 520 | 521 | def set_max_ram(value): 522 | """Sets max allowed RAM 523 | 524 | This value takes priority over environment variables 525 | 526 | Returns the numeric value set in GB 527 | 528 | >>> set_max_ram(16) 529 | 16.0 530 | 531 | >>> set_max_ram('512mb') 532 | 0.5 533 | """ 534 | 535 | config["max_ram"] = value 536 | 537 | return config["max_ram"] 538 | 539 | 540 | def require_model_license(match_re): 541 | """Require models to match supplied regex 542 | 543 | This can be used to enforce certain licensing constraints when using this 544 | package. 545 | """ 546 | config["model_license"] = match_re 547 | -------------------------------------------------------------------------------- /test/planets.json: -------------------------------------------------------------------------------- 1 | [{"name": "Mercury", "content": "Mercury is the first planet from the Sun and the smallest planet in the Solar System. 
It is a terrestrial planet with a heavily cratered surface due to the planet having no geological activity and an extremely tenuous atmosphere (called an exosphere). Despite being the smallest planet in the Solar System with a mean diameter of 4,880 km (3,030 mi), 38% of that of Earth's, Mercury is dense enough to have roughly the same surface gravity as Mars. Mercury has a dynamic magnetic field with a strength about 1% of that of Earth's and has no natural satellites. \nAccording to current theories, Mercury may have a solid silicate crust and mantle overlying a solid outer core, a deeper liquid core layer, and a solid inner core. Having almost no atmosphere to retain heat, Mercury has surface temperatures that change wildly during the day, ranging from 100 K (\u2212173 \u00b0C; \u2212280 \u00b0F) at night to 700 K (427 \u00b0C; 800 \u00b0F) during sunlight across the equator regions. At Mercury's poles though, there are large reservoirs of water ices that are never exposed to direct sunlight, which has an estimated mass of about 0.025\u20130.25% the Antarctic ice sheet. There are many competing hypotheses about Mercury's origins and development, some of which incorporate collision with planetesimal and rock vaporization. \nBecause Mercury is very close to the Sun, the intensity of sunlight on its surface is between 4.59 and 10.61 times the solar constant (amount of the Sun's energy received at 1 astronomical unit, which is roughly the distance between Earth and the Sun). Mercury orbits the Sun in a 3:2 spin\u2013orbit resonance, meaning that relative to the background stars, it rotates on its axis exactly three times for every two revolutions it makes around the Sun. Counterintuitively, due to Mercury's slow rotation, an observer on the planet would see only one Mercurian solar day (176 Earth days) every two Mercurian solar years (88 Earth days each). Mercury's axis has the smallest tilt of any of the Solar System's planets (about 1\u204430 of a degree), and its orbital eccentricity is the largest of all known planets in the Solar System.Like Venus, Mercury orbits the Sun within Earth's orbit, making it appear in Earth's sky only as a \"morning star\" or \"evening star\" that's relatively close to the Sun. In English, it is named after the Roman god Mercurius (Mercury), god of commerce, communication and the messenger of gods. Mercury is the most difficult planet to reach from Earth because it requires the greatest change in spacecraft's velocity. Only two spacecraft have visited Mercury as of 2023: Mariner 10 flew by in 1974 and 1975, and MESSENGER launched in 2004 and orbited Mercury over 4,000 times in four years. The BepiColombo spacecraft is planned to arrive at Mercury in 2025.\n\n"}, {"name": "Venus", "content": "Venus is the second planet from the Sun. It is a rocky planet with the densest atmosphere of all the rocky bodies in the Solar System, and the only one with a mass and size that is close to that of its orbital neighbour Earth. Orbiting inferiorly (inside of Earth's orbit), it appears in Earth's sky always close to the Sun, as either a \"morning star\" or an \"evening star\". While this is also true for Mercury, Venus appears as such much more prominently, since it is the third brightest object in Earth's sky after the Moon and the Sun, appearing brighter than any other star-like classical planet or any fixed star. 
With such prominent appearances in Earth's sky, Venus has historically been a common and important object for humans, in both their cultures and astronomy.\nVenus retains, despite having only a weak induced magnetosphere, an especially thick atmosphere mainly of carbon dioxide, which, together with its global sulfuric acid cloud cover, creates an extreme greenhouse effect. These cause at the surface a mean temperature of 737 K (464 \u00b0C; 867 \u00b0F) and a crushing pressure of 92 times that of Earth's at sea level, turning the air into a supercritical fluid, though at cloudy altitudes of 50 km (30 mi) Earthlike levels are found. Conditions possibly favourable for life on Venus have been identified at its cloud layers, while recent research has found indicative, but not convincing, evidence. Early in Venus's history, water may have been abundant enough to form oceans, but any liquid water there will have evaporated when greenhouse effects cascaded and then been taken away into space by the solar wind. Internally Venus is thought to consist, like Earth, of a core, mantle, and crust, the latter releasing internal heat through its active volcanism, shaping the surface with large resurfacing instead of, as on Earth, plate tectonics.\nLike Mercury, Venus has no moons. Like Uranus's, its rotation is retrograde, against its orbital direction. Having been slowed by the strong currents and drag of the atmosphere, it completes a sidereal rotation, relative to the stars, in 243 Earth days. Therefore it rotates more slowly than it is orbiting the Sun, having a solar year of 224.7 Earth days, so that its solar day, or the time the Sun takes to cross the same meridian twice, is 117 Earth days long. Venus and Earth approach each other in synodic periods of 1.6 years. While coming closer to each other at inferior conjunction than any other pair of the Sun's planets, since they have the closest two planetary orbits, they each still on average stay closer to Mercury than to any other planet, as Mercury passes by more frequently because of its more central, thus more rapid, orbit. That said, Venus and Earth have between them a lower difference in gravitational potential than exists between either of them and any other planet. This fact has allowed Venus to be the most accessible destination and attractive gravity assist waypoint for interplanetary flights.\nIn 1961, Venus became the target of the first interplanetary flight in human history, followed by many essential interplanetary firsts, confirming in 1970 Venus's inhospitable surface conditions with the first soft landing on another planet. This finding aborted any later representations of Venus as suitable for human habitation, once a popular theme in science fiction. Actual proposals, however, have suggested sending crews either on flybys, as gravity assists for crewed missions to Mars, or to enter the Venusian atmosphere and stay aloft, where, at sufficient altitude, conditions are more comparable to those on Earth's surface, including in respect of radiation and gravitation, than anywhere else in the Solar System. Currently, robotic probes are studying and more will be sent to study Venus, to provide crucial knowledge, particularly about greenhouse effects, and so inform predictions about global warming on Earth."}, {"name": "Earth", "content": "Earth is the third planet from the Sun and the only place known in the universe where life has originated and found habitability. 
Earth is the only planet known to sustain liquid surface water, with ocean water extending over 70.8% of the planet, making it an ocean world. Most of all other water is retained in Earth's polar regions, with large sheets of ice covering ocean and land, dwarfing Earth's groundwater, lakes, rivers and atmospheric water. The other 29.2% of the Earth's surface is land, consisting of continents and islands, and is widely covered by vegetation. Below the planet's surface lies the crust, consisting of several slowly moving tectonic plates, which interact to produce mountain ranges, volcanoes, and earthquakes. Inside the Earth's crust is a liquid outer core that generates the magnetosphere, deflecting most of the destructive solar winds and cosmic radiation.\nEarth has a dynamic atmosphere, which sustains Earth's surface conditions and protects it from most meteoroids and UV-light at entry. It has a composition of primarily nitrogen and oxygen. Water vapor is widely present in the atmosphere, forming clouds that cover most of the planet. The water vapor acts as a greenhouse gas and, together with other greenhouse gases in the atmosphere, particularly carbon dioxide (CO2), creates the conditions for both liquid surface water and water vapor to persist via the capturing of energy from the Sun's light. This process maintains the current average surface temperature of 14.76 \u00b0C, at which water is liquid under atmospheric pressure. Differences in the amount of captured energy between geographic regions (as with the equatorial region receiving more sunlight than the polar regions) drive atmospheric and ocean currents, producing a global climate system with different climate regions, and a range of weather phenomena such as precipitation, allowing components such as nitrogen to cycle.\nEarth is rounded into an ellipsoid with a circumference of about 40,000 km. It is the densest planet in the Solar System. Of the four rocky planets, it is the largest and most massive. Earth is about eight light-minutes away from the Sun and orbits it, taking a year (about 365.25 days) to complete one revolution. The Earth rotates around its own axis in slightly less than a day (in about 23 hours and 56 minutes). The Earth's axis of rotation is tilted with respect to the perpendicular to its orbital plane around the Sun, producing seasons. Earth is orbited by one permanent natural satellite, the Moon, which orbits Earth at 384,400 km (1.28 light seconds) and is roughly a quarter as wide as Earth. Through tidal locking, the Moon always faces the Earth with the same side, which causes tides, stabilizes Earth's axis, and gradually slows its rotation.\nEarth, like most other bodies in the Solar System, formed 4.5 billion years ago from gas in the early Solar System. During the first billion years of Earth's history, the ocean formed and then life developed within it. Life spread globally and has been altering Earth's atmosphere and surface, leading to the Great Oxidation Event two billion years ago. Humans emerged 300,000 years ago in Africa and have spread across every continent on Earth with the exception of Antarctica. Humans depend on Earth's biosphere and natural resources for their survival, but have increasingly impacted the planet's environment. 
Humanity's current impact on Earth's climate and biosphere is unsustainable, threatening the livelihood of humans and many other forms of life, and causing widespread extinctions."}, {"name": "Mars", "content": "Mars is the fourth planet and the furthest terrestrial planet from the Sun. The reddish color of its surface is due to finely grained iron(III) oxide dust in the soil, giving it the nickname \"the Red Planet\". Mars has a second smallest radius among the planets in the Solar System at 3,389.5 km (2,106 mi) and has a surface gravity of 3.72 m/s2 (12.2 ft/s2), which is 38% of Earth's gravity. The Martian dichotomy can be clearly seen on the surface: on average, the terrain on Mars northern hemisphere is flatter and lower than Mars southern hemisphere. Mars has a very thin atmosphere made primarily of carbon dioxide and two irregularly shaped natural satellites: Phobos and Deimos.\nGeologically, Mars is fairly active, with dust devils sweeping across the landscape and marsquakes (Martian analog to earthquakes) trembling underneath the ground. The surface of Mars hosts a shield volcano (Olympus Mons) and one of the largest canyons in the Solar System (Valles Marineris). Mars's celestial motion is comparable to that of Earth, with a slightly eccentric orbit and an axial tilt only slightly greater than Earth's. This motion causes seasonal changes to the polar ice caps' coverage and temperature swings between \u2212110 \u00b0C (\u2212166 \u00b0F) to 35 \u00b0C (95 \u00b0F) on the surface. A Martian solar day (sol) is equal to 24.5 hours and a Martian solar year is equal to 1.88 Earth years.\nLike the other planets in the Solar System, Mars was formed 4.5 billion years ago. During the Noachian period from about 4.1 to 3.7 billion years ago, Mars's surface was marked by meteor impacts, valley formation, erosion, and the possible presence of water oceans. The Hesperian period from 3.7 to 3.2\u20132 billion years ago was dominated by widespread volcanic activity and flooding that carved immense outflow channels. The Amazonian period, which continues today, was marked by the wind's influence on geological processes. It is not yet known whether life has ever existed on Mars, though search for evidences of life on Mars is still ongoing.\nMars is among the brightest objects in Earth's sky, and thus has been known from the ancient times. The Romans named it for the god of war, M\u0101rs, as did Greeks (Ares) and Mesopotamians (Nergal), likely because its color suggested blood. Its high-contrast albedo features make it an attractive target for viewing with a telescope.\nSince the late 20th century, Mars has been explored by uncrewed spacecraft and rovers, with the first flyby by the Mariner 4 probe in 1965, the first Mars orbiter by the Mars 2 probe in 1971, and the first landing by the Viking 1 in 1976. As of 2023, there are at least 11 active probes orbiting Mars or at the Martian surface. Currently, Mars is an attractive target for the first future interplanetary human missions."}, {"name": "Jupiter", "content": "Jupiter is the fifth planet from the Sun and the largest in the Solar System. It is a gas giant with a mass more than two and a half times that of all the other planets in the Solar System combined, and slightly less than one one-thousandth the mass of the Sun. Jupiter is the third brightest natural object in the Earth's night sky after the Moon and Venus, and it has been observed since prehistoric times. 
It was named after Jupiter, the chief deity of ancient Roman religion.\nJupiter is primarily composed of hydrogen (90% by volume), followed by helium, which constitutes a quarter of its mass and a tenth of its volume. The ongoing contraction of Jupiter's interior generates more heat than the planet receives from the Sun. Because of its rapid rotation rate of 1 rotation per 10 hours, the planet's shape is an oblate spheroid: it has a slight but noticeable bulge around the equator. The outer atmosphere is divided into a series of latitudinal bands, with turbulence and storms along their interacting boundaries. The most obvious result of this is the Great Red Spot, a giant storm which has been observed since 1831 and possibly earlier.\nJupiter is surrounded by a faint planetary ring system and has a powerful magnetosphere, the largest contiguous structure in the Solar System after the heliosphere. Jupiter forms a system of 95 known moons and probably many more, including the four large moons discovered by Galileo Galilei in 1610: Io, Europa, Ganymede, and Callisto. Ganymede, the largest of the four, is larger than the planet Mercury. Callisto is the second largest; Io and Europa are approximately the size of Earth's Moon.\nSince 1973, Jupiter has been visited by nine robotic probes: seven flybys and two dedicated orbiters, with two more either en route or awaiting launch."}, {"name": "Saturn", "content": "Saturn is the sixth planet from the Sun and the second-largest in the Solar System, after Jupiter. It is a gas giant with an average radius of about nine and a half times that of Earth. It has only one-eighth the average density of Earth, but is over 95 times more massive.Saturn's interior is thought to be composed of a rocky core, surrounded by a deep layer of metallic hydrogen, an intermediate layer of liquid hydrogen and liquid helium, and finally, a gaseous outer layer. Saturn has a pale yellow hue due to ammonia crystals in its upper atmosphere. An electrical current within the metallic hydrogen layer is thought to give rise to Saturn's planetary magnetic field, which is weaker than Earth's, but which has a magnetic moment 580 times that of Earth due to Saturn's larger size. Saturn's magnetic field strength is around one-twentieth of Jupiter's. The outer atmosphere is generally bland and lacking in contrast, although long-lived features can appear. Wind speeds on Saturn can reach 1,800 kilometres per hour (1,100 miles per hour).\nThe planet has a prominent ring system, which is composed mainly of ice particles, with a smaller amount of rocky debris and dust. At least 146 moons are known to orbit the planet, of which 63 are officially named; this does not include the hundreds of moonlets in its rings. Titan, Saturn's largest moon and the second largest in the Solar System, is larger (while less massive) than the planet Mercury and is the only moon in the Solar System to have a substantial atmosphere.\n\n"}, {"name": "Uranus", "content": "Uranus is the seventh planet from the Sun and is a gaseous cyan ice giant. Most of Uranus is made out of water, ammonia, and methane in a supercritical phase of matter, which in astronomy is called 'ice' or volatiles. The planet's atmosphere has a complex layered cloud structure and has the lowest minimum temperature of 49 K (\u2212224 \u00b0C; \u2212371 \u00b0F) out of all Solar System's planets. Uranus has a marked axial tilt of 97.8\u00b0 with a retrograde rotation rate of 17 hours. 
This means that in an 84 Earth years orbital period around the Sun, its poles get around 42 years of continuous sunlight, followed by 42 years of continuous darkness. \nUranus has the third-largest diameter and fourth-largest mass among the Solar System's planets. Based on current models, inside Uranus's volatile mantle layer is a rocky core, and surrounding it is a thick hydrogen and helium atmosphere. Trace amount of hydrocarbons (thought to be produced via hydrolysis) and carbon monoxide along with carbon dioxide (thought to have been originated from comets) have been detected in the upper atmosphere. There are many unexplained climate phenomena in Uranus's atmosphere, such as its peak wind speed of 900 km/h (560 mph), variations in its polar cap and its erratic cloud formation. Uranus also has a very low internal heat compared to other giant planets, which is still unexplained. \nLike the other giant planets, Uranus has a ring system, orbiting natural satellites and a magnetosphere. Uranus's ring system is extremely dark, with only about 2% of the incoming light is reflected, and contains the known 13 inner moons. Further out are the larger 5 major moons of the planet: Miranda, Ariel, Umbriel, Titania, and Oberon; and orbit at much greater distance from Uranus are the known 9 irregular moons. Uranus magnetosphere is highly asymmetric and has many charged particles, which may cause the darkening of its rings and moons.\nUranus is visible to the naked eye, but it is very dim and was not classified as a planet until 1781, when it was first observed by William Herschel. About seven decades after its discovery, consensus was reached that the planet be named from the Greek god Uranus (Ouranos), one of the Greek primordial deities. As of 2023, Uranus was visited up close only one time when in 1986 the Voyager 2 probe flew by the planet. Though nowadays Uranus can be resolved and observed by telescopes, there is much desire to revisit the planet, as shown by Planetary Science Decadal Survey's decision to make the proposed Uranus Orbiter and Probe mission a top priority in the 2023\u20132032 survey."}, {"name": "Neptune", "content": "Neptune is the eighth planet from the Sun and the farthest known planet in the Solar System. It is the fourth-largest planet in the Solar System by diameter, the third-most-massive planet, and the densest giant planet. It is 17 times the mass of Earth, and slightly more massive than its near-twin Uranus. Neptune is denser and physically smaller than Uranus because its greater mass causes more gravitational compression of its atmosphere. Being composed primarily of gases and liquids, it has no well-defined solid surface. The planet orbits the Sun once every 164.8 years at an average distance of 30.1 astronomical units (4.5 billion kilometres; 2.8 billion miles). It is named after the Roman god of the sea and has the astronomical symbol , representing Neptune's trident.Neptune is not visible to the unaided eye and is the only planet in the Solar System found by mathematical prediction rather than by empirical observation. Unexpected changes in the orbit of Uranus led Alexis Bouvard to hypothesise that its orbit was subject to gravitational perturbation by an unknown planet. After Bouvard's death, the position of Neptune was predicted from his observations, independently, by John Couch Adams and Urbain Le Verrier. Neptune was subsequently observed with a telescope on 23 September 1846 by Johann Galle within a degree of the position predicted by Le Verrier. 
Its largest moon, Triton, was discovered shortly thereafter, though none of the planet's remaining 13 known moons were located telescopically until the 20th century. The planet's distance from Earth gives it a very small apparent size, making it challenging to study with Earth-based telescopes. Neptune was visited by Voyager 2, when it flew by the planet on 25 August 1989; Voyager 2 remains the only spacecraft to have visited Neptune. The advent of the Hubble Space Telescope and large ground-based telescopes with adaptive optics has recently allowed for additional detailed observations from afar.\nLike the gas giants (Jupiter and Saturn), Neptune's atmosphere is composed primarily of hydrogen and helium, along with traces of hydrocarbons and possibly nitrogen, but contains a higher proportion of ices such as water, ammonia and methane. Similar to Uranus, its interior is primarily composed of ices and rock; both planets are normally considered \"ice giants\" to distinguish them. Along with Rayleigh scattering, traces of methane in the outermost regions in part account for the planet's blue appearance. Newest data from the Gemini observatory shows the blue colour is more saturated than the one present on Uranus due to thinner haze of Neptune's more active atmosphere.In contrast to the hazy, relatively featureless atmosphere of Uranus, Neptune's atmosphere has active and visible weather patterns. For example, at the time of the Voyager 2 flyby in 1989, the planet's southern hemisphere had a Great Dark Spot comparable to the Great Red Spot on Jupiter. More recently, in 2018, a newer main dark spot and smaller dark spot were identified and studied. In addition, these weather patterns are driven by the strongest sustained winds of any planet in the Solar System, with recorded wind speeds as high as 2,100 km/h (580 m/s; 1,300 mph). Because of its great distance from the Sun, Neptune's outer atmosphere is one of the coldest places in the Solar System, with temperatures at its cloud tops approaching 55 K (\u2212218 \u00b0C; \u2212361 \u00b0F). Temperatures at the planet's centre are approximately 5,400 K (5,100 \u00b0C; 9,300 \u00b0F). Neptune has a faint and fragmented ring system (labelled \"arcs\"), which was discovered in 1984, then later confirmed by Voyager 2."}] -------------------------------------------------------------------------------- /languagemodels/config.py: -------------------------------------------------------------------------------- 1 | """Global model and inference configuration 2 | 3 | This module manages the global configuration object shared between other 4 | modules in the package. It implements a dictionary with data validation 5 | on the keys and values. 6 | 7 | Note that this module provides access to many implementation details 8 | that are not expected to be used by average users. Specific models that 9 | have never been the default for the package may be removed at any time. 
10 | """ 11 | 12 | import re 13 | import os 14 | from collections import namedtuple 15 | from huggingface_hub import hf_hub_download 16 | import json 17 | 18 | ConfigItem = namedtuple("ConfigItem", "initfn default") 19 | 20 | 21 | class ModelFilterException(Exception): 22 | pass 23 | 24 | 25 | # Model list 26 | # This list is sorted in priority order, with the best models first 27 | # The best model that fits in the memory bounds and matches the model filter 28 | # will be selected 29 | models = [ 30 | { 31 | "name": "openchat-3.5-0106", 32 | "tuning": "instruct", 33 | "datasets": ["mistral", "openorca", "flan"], 34 | "params": 7e9, 35 | "quantization": "int8", 36 | "backend": "ct2", 37 | "architecture": "decoder-only-transformer", 38 | "license": "apache-2.0", 39 | "prompt_fmt": ( 40 | "GPT4 Correct User: {instruction}<|end_of_turn|>" "GPT4 Correct Assistant:" 41 | ), 42 | }, 43 | { 44 | "name": "Llama-3.1-8B-Instruct", 45 | "tuning": "instruct", 46 | "revision": "d02fc85", 47 | "datasets": ["llama3"], 48 | "params": 8e9, 49 | "quantization": "int8", 50 | "backend": "ct2", 51 | "architecture": "decoder-only-transformer", 52 | "license": "llama3", 53 | "prompt_fmt": ( 54 | "<|start_header_id|>user<|end_header_id|>\n\n" 55 | "{instruction}<|eot_id|>" 56 | "<|start_header_id|>assistant<|end_header_id|>\n\n" 57 | ), 58 | }, 59 | { 60 | "name": "Meta-Llama-3-8B-Instruct", 61 | "tuning": "instruct", 62 | "datasets": ["llama3"], 63 | "params": 8e9, 64 | "quantization": "int8", 65 | "backend": "ct2", 66 | "architecture": "decoder-only-transformer", 67 | "license": "llama3", 68 | "prompt_fmt": ( 69 | "<|start_header_id|>user<|end_header_id|>\n\n" 70 | "{instruction}<|eot_id|>" 71 | "<|start_header_id|>assistant<|end_header_id|>\n\n" 72 | ), 73 | }, 74 | { 75 | "name": "openchat-3.5-1210", 76 | "tuning": "instruct", 77 | "datasets": ["mistral", "openorca", "flan"], 78 | "params": 7e9, 79 | "quantization": "int8", 80 | "backend": "ct2", 81 | "architecture": "decoder-only-transformer", 82 | "license": "apache-2.0", 83 | "prompt_fmt": ( 84 | "GPT4 Correct User: {instruction}<|end_of_turn|>" "GPT4 Correct Assistant:" 85 | ), 86 | }, 87 | { 88 | "name": "WizardLM-2-7B", 89 | "tuning": "instruct", 90 | "datasets": ["mistral", "wizardlm"], 91 | "params": 7e9, 92 | "quantization": "int8", 93 | "backend": "ct2", 94 | "architecture": "decoder-only-transformer", 95 | "license": "apache-2.0", 96 | "prompt_fmt": "USER: {instruction} ASSISTANT:", 97 | }, 98 | { 99 | "name": "neural-chat-7b-v3-1", 100 | "tuning": "instruct", 101 | "datasets": ["mistral", "slimorca"], 102 | "params": 7e9, 103 | "quantization": "int8", 104 | "backend": "ct2", 105 | "architecture": "decoder-only-transformer", 106 | "license": "apache-2.0", 107 | "prompt_fmt": ( 108 | "### System:\n" 109 | "Be helpful\n" 110 | "### User:\n{instruction}\n" 111 | "### Assistant:\n" 112 | ), 113 | }, 114 | { 115 | "name": "Mistral-7B-Instruct-v0.2", 116 | "tuning": "instruct", 117 | "datasets": ["mistral"], 118 | "params": 7e9, 119 | "quantization": "int8", 120 | "backend": "ct2", 121 | "architecture": "decoder-only-transformer", 122 | "license": "apache-2.0", 123 | "prompt_fmt": "[INST] {instruction} [/INST]", 124 | }, 125 | { 126 | "name": "flan-alpaca-gpt4-xl", 127 | "tuning": "instruct", 128 | "datasets": ["c4", "flan", "gpt4-alpaca"], 129 | "params": 3e9, 130 | "quantization": "int8", 131 | "backend": "ct2", 132 | "architecture": "encoder-decoder-transformer", 133 | "license": "apache-2.0", 134 | }, 135 | { 136 | "name": "flan-alpaca-xl", 137 | 
"tuning": "instruct", 138 | "datasets": ["c4", "flan", "alpaca"], 139 | "params": 3e9, 140 | "quantization": "int8", 141 | "backend": "ct2", 142 | "architecture": "encoder-decoder-transformer", 143 | "license": "apache-2.0", 144 | }, 145 | { 146 | "name": "flan-t5-xl", 147 | "tuning": "instruct", 148 | "datasets": ["c4", "flan"], 149 | "params": 3e9, 150 | "quantization": "int8", 151 | "backend": "ct2", 152 | "architecture": "encoder-decoder-transformer", 153 | "license": "apache-2.0", 154 | }, 155 | { 156 | "name": "Llama-3.2-3B-Instruct", 157 | "tuning": "instruct", 158 | "revision": "5da4ba8", 159 | "datasets": ["llama3"], 160 | "params": 1e9, 161 | "quantization": "int8", 162 | "backend": "ct2", 163 | "architecture": "decoder-only-transformer", 164 | "license": "llama3.2", 165 | "repetition_penalty": 1.1, 166 | "prompt_fmt": ( 167 | "<|start_header_id|>user<|end_header_id|>\n\n" 168 | "{instruction}<|eot_id|>" 169 | "<|start_header_id|>assistant<|end_header_id|>\n\n" 170 | ), 171 | }, 172 | { 173 | "name": "fastchat-t5-3b-v1.0", 174 | "tuning": "instruct", 175 | "datasets": ["c4", "flan", "sharegpt"], 176 | "params": 3e9, 177 | "quantization": "int8", 178 | "backend": "ct2", 179 | "architecture": "encoder-decoder-transformer", 180 | "license": "apache-2.0", 181 | }, 182 | { 183 | "name": "LaMini-Flan-T5-783M", 184 | "tuning": "instruct", 185 | "revision": "e5e20a1", 186 | "datasets": ["c4", "flan", "lamini"], 187 | "params": 783e6, 188 | "quantization": "int8", 189 | "backend": "ct2", 190 | "architecture": "encoder-decoder-transformer", 191 | "license": "cc-by-nc-4.0", 192 | }, 193 | { 194 | "name": "flan-t5-large", 195 | "tuning": "instruct", 196 | "datasets": ["c4", "flan"], 197 | "params": 783e6, 198 | "quantization": "int8", 199 | "backend": "ct2", 200 | "architecture": "encoder-decoder-transformer", 201 | "license": "apache-2.0", 202 | }, 203 | { 204 | "name": "Llama-3.2-1B-Instruct", 205 | "tuning": "instruct", 206 | "revision": "6e3e3a1", 207 | "datasets": ["llama3"], 208 | "params": 1e9, 209 | "quantization": "int8", 210 | "backend": "ct2", 211 | "architecture": "decoder-only-transformer", 212 | "license": "llama3.2", 213 | "repetition_penalty": 1.1, 214 | "prompt_fmt": ( 215 | "<|start_header_id|>user<|end_header_id|>\n\n" 216 | "{instruction}<|eot_id|>" 217 | "<|start_header_id|>assistant<|end_header_id|>\n\n" 218 | ), 219 | }, 220 | { 221 | "name": "LaMini-Flan-T5-248M", 222 | "tuning": "instruct", 223 | "revision": "96cfe99", 224 | "datasets": ["c4", "flan", "lamini"], 225 | "params": 248e6, 226 | "quantization": "int8", 227 | "backend": "ct2", 228 | "architecture": "encoder-decoder-transformer", 229 | "license": "cc-by-nc-4.0", 230 | }, 231 | { 232 | "name": "flan-t5-base", 233 | "tuning": "instruct", 234 | "datasets": ["c4", "flan"], 235 | "params": 248e6, 236 | "quantization": "int8", 237 | "backend": "ct2", 238 | "architecture": "encoder-decoder-transformer", 239 | "license": "apache-2.0", 240 | }, 241 | { 242 | "name": "flan-alpaca-base", 243 | "tuning": "instruct", 244 | "datasets": ["c4", "flan", "alpaca"], 245 | "params": 248e6, 246 | "quantization": "int8", 247 | "backend": "ct2", 248 | "architecture": "encoder-decoder-transformer", 249 | "license": "apache-2.0", 250 | }, 251 | { 252 | "name": "dialogstudio-t5-base-v1.0", 253 | "tuning": "instruct", 254 | "datasets": ["c4", "flan", "dialogstudio"], 255 | "params": 248e6, 256 | "quantization": "int8", 257 | "backend": "ct2", 258 | "architecture": "encoder-decoder-transformer", 259 | "license": "apache-2.0", 260 | 
"prompt_fmt": ("Instruction: Be helpful. {instruction}"), 261 | }, 262 | { 263 | "name": "LaMini-Flan-T5-77M", 264 | "tuning": "instruct", 265 | "datasets": ["c4", "flan", "lamini"], 266 | "params": 77e6, 267 | "backend": "ct2", 268 | "quantization": "int8", 269 | "architecture": "encoder-decoder-transformer", 270 | "license": "cc-by-nc-4.0", 271 | }, 272 | { 273 | "name": "flan-t5-small", 274 | "tuning": "instruct", 275 | "datasets": ["c4", "flan"], 276 | "params": 77e6, 277 | "quantization": "int8", 278 | "backend": "ct2", 279 | "architecture": "encoder-decoder-transformer", 280 | "license": "apache-2.0", 281 | }, 282 | { 283 | "name": "Phi-3-mini-4k-instruct-20240701", 284 | "tuning": "instruct", 285 | "datasets": ["phi-3"], 286 | "params": 3.8e9, 287 | "quantization": "int8", 288 | "backend": "ct2", 289 | "architecture": "decoder-only-transformer", 290 | "license": "mit", 291 | "prompt_fmt": "<|user|>\n{instruction}<|end|>\n<|assistant|>", 292 | "repetition_penalty": 1.1, 293 | }, 294 | { 295 | "name": "Phi-3-mini-4k-instruct", 296 | "tuning": "instruct", 297 | "datasets": ["phi-3"], 298 | "params": 3.8e9, 299 | "quantization": "int8", 300 | "backend": "ct2", 301 | "architecture": "decoder-only-transformer", 302 | "license": "mit", 303 | "prompt_fmt": "<|user|>\n{instruction}<|end|>\n<|assistant|>", 304 | "repetition_penalty": 1.1, 305 | }, 306 | { 307 | "name": "phi-2", 308 | "tuning": "instruct", 309 | "datasets": ["phi-2"], 310 | "params": 2.7e9, 311 | "quantization": "int8", 312 | "backend": "ct2", 313 | "architecture": "decoder-only-transformer", 314 | "license": "microsoft-research-license", 315 | "prompt_fmt": "Instruct: {instruction}\nOutput:", 316 | }, 317 | { 318 | "name": "gemma-2b-it", 319 | "tuning": "instruct", 320 | "datasets": ["gemma"], 321 | "params": 2.5e9, 322 | "quantization": "int8", 323 | "backend": "ct2", 324 | "architecture": "decoder-only-transformer", 325 | "license": "gemma-terms-of-use", 326 | "prompt_fmt": "user\n" 327 | "{instruction}\n" 328 | "model", 329 | }, 330 | { 331 | "name": "h2o-danube3-4b-chat", 332 | "tuning": "instruct", 333 | "datasets": [], 334 | "params": 4.0e9, 335 | "quantization": "int8", 336 | "backend": "ct2", 337 | "architecture": "decoder-only-transformer", 338 | "license": "apache-2.0", 339 | "prompt_fmt": "<|prompt|>{instruction}<|answer|>", 340 | }, 341 | { 342 | "name": "h2o-danube2-1.8b-chat", 343 | "tuning": "instruct", 344 | "datasets": [], 345 | "params": 1.8e9, 346 | "quantization": "int8", 347 | "backend": "ct2", 348 | "architecture": "decoder-only-transformer", 349 | "license": "other", 350 | "prompt_fmt": "<|prompt|>{instruction}<|answer|>", 351 | }, 352 | { 353 | "name": "h2o-danube-1.8b-chat", 354 | "tuning": "instruct", 355 | "datasets": [], 356 | "params": 1.8e9, 357 | "quantization": "int8", 358 | "backend": "ct2", 359 | "architecture": "decoder-only-transformer", 360 | "license": "other", 361 | "prompt_fmt": "<|prompt|>{instruction}<|answer|>", 362 | }, 363 | { 364 | "name": "Falcon3-3B-Instruct", 365 | "tuning": "instruct", 366 | "languages": ["en", "fr", "es", "pt"], 367 | "revision": "b183d4d", 368 | "datasets": [], 369 | "params": 3.23e9, 370 | "quantization": "int8", 371 | "backend": "ct2", 372 | "context_length": 8192, 373 | "repetition_penalty": 1.1, 374 | "architecture": "decoder-only-transformer", 375 | "license": "falcon", 376 | "prompt_fmt": ( 377 | "<|system|>\nAnswer concisely.\n<|user|>\n{instruction}\n<|assistant|>\n" 378 | ), 379 | }, 380 | { 381 | "name": "phi-1_5", 382 | "tuning": "instruct", 383 | 
"datasets": ["phi-1_5"], 384 | "params": 1.4e9, 385 | "quantization": "int8", 386 | "backend": "ct2", 387 | "architecture": "decoder-only-transformer", 388 | "license": "other", 389 | "prompt_fmt": "{instruction}\n\nAnswer:", 390 | }, 391 | { 392 | "name": "h2o-danube3-500m-chat", 393 | "tuning": "instruct", 394 | "datasets": [], 395 | "params": 0.5e9, 396 | "quantization": "int8", 397 | "backend": "ct2", 398 | "architecture": "decoder-only-transformer", 399 | "license": "apache-2.0", 400 | "prompt_fmt": "<|prompt|>{instruction}<|answer|>", 401 | }, 402 | { 403 | "name": "SmolLM2-1.7B-Instruct", 404 | "tuning": "instruct", 405 | "revision": "83b1658", 406 | "datasets": [], 407 | "params": 1.7e9, 408 | "quantization": "int8", 409 | "backend": "ct2", 410 | "context_length": 2048, 411 | "repetition_penalty": 1.0, 412 | "architecture": "decoder-only-transformer", 413 | "license": "apache-2.0", 414 | "prompt_fmt": ( 415 | "<|im_start|>system\nAnswer concisely.<|im_end|>\n" 416 | "<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n" 417 | ), 418 | }, 419 | { 420 | "name": "SmolLM-1.7B-Instruct", 421 | "tuning": "instruct", 422 | "revision": "dc3dfe2", 423 | "datasets": [], 424 | "params": 1.7e9, 425 | "quantization": "int8", 426 | "backend": "ct2", 427 | "context_length": 2048, 428 | "repetition_penalty": 1.1, 429 | "architecture": "decoder-only-transformer", 430 | "license": "apache-2.0", 431 | "prompt_fmt": ( 432 | "<|im_start|>system\nAnswer concisely.<|im_end|>\n" 433 | "<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n" 434 | ), 435 | }, 436 | { 437 | "name": "Falcon3-1B-Instruct", 438 | "tuning": "instruct", 439 | "languages": ["en", "fr", "es", "pt"], 440 | "revision": "74391aa", 441 | "datasets": [], 442 | "params": 1.7e9, 443 | "quantization": "int8", 444 | "backend": "ct2", 445 | "context_length": 8192, 446 | "repetition_penalty": 1.1, 447 | "architecture": "decoder-only-transformer", 448 | "license": "falcon", 449 | "prompt_fmt": ( 450 | "<|system|>\nAnswer concisely.\n<|user|>\n{instruction}\n<|assistant|>\n" 451 | ), 452 | }, 453 | { 454 | "name": "Qwen2.5-1.5B-Instruct", 455 | "tuning": "instruct", 456 | "languages": [ 457 | "zh", 458 | "en", 459 | "fr", 460 | "es", 461 | "pt", 462 | "de", 463 | "it", 464 | "ru", 465 | "ja", 466 | "ko", 467 | "vi", 468 | "th", 469 | "ar", 470 | ], 471 | "revision": "5de22ab", 472 | "datasets": [], 473 | "params": 1.5e9, 474 | "quantization": "int8", 475 | "backend": "ct2", 476 | "context_length": 32 * 1024, 477 | "repetition_penalty": 1.1, 478 | "architecture": "decoder-only-transformer", 479 | "license": "apache-2.0", 480 | "prompt_fmt": ( 481 | "<|im_start|>system\nAnswer concisely.<|im_end|>\n" 482 | "<|im_start|>user\n{instruction}<|im_end|>\n" 483 | "<|im_start|>assistant\n" 484 | ), 485 | }, 486 | { 487 | "name": "Qwen2.5-0.5B-Instruct", 488 | "tuning": "instruct", 489 | "languages": [ 490 | "zh", 491 | "en", 492 | "fr", 493 | "es", 494 | "pt", 495 | "de", 496 | "it", 497 | "ru", 498 | "ja", 499 | "ko", 500 | "vi", 501 | "th", 502 | "ar", 503 | ], 504 | "revision": "554ffe5", 505 | "datasets": [], 506 | "params": 0.5e9, 507 | "quantization": "int8", 508 | "backend": "ct2", 509 | "context_length": 32 * 1024, 510 | "repetition_penalty": 1.1, 511 | "architecture": "decoder-only-transformer", 512 | "license": "apache-2.0", 513 | "prompt_fmt": ( 514 | "<|im_start|>system\nAnswer concisely.<|im_end|>\n" 515 | "<|im_start|>user\n{instruction}<|im_end|>\n" 516 | "<|im_start|>assistant\n" 517 | ), 518 | }, 519 | { 520 | 
"name": "SmolLM2-360M-Instruct", 521 | "tuning": "instruct", 522 | "revision": "ed9c4fe", 523 | "datasets": [], 524 | "params": 360e6, 525 | "quantization": "int8", 526 | "backend": "ct2", 527 | "context_length": 2048, 528 | "repetition_penalty": 1.0, 529 | "architecture": "decoder-only-transformer", 530 | "license": "apache-2.0", 531 | "prompt_fmt": ( 532 | "<|im_start|>system\nAnswer concisely.<|im_end|>\n" 533 | "<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n" 534 | ), 535 | }, 536 | { 537 | "name": "SmolLM-360M-Instruct", 538 | "tuning": "instruct", 539 | "revision": "0b0e861", 540 | "datasets": [], 541 | "params": 360e6, 542 | "quantization": "int8", 543 | "backend": "ct2", 544 | "context_length": 2048, 545 | "repetition_penalty": 1.1, 546 | "architecture": "decoder-only-transformer", 547 | "license": "apache-2.0", 548 | "prompt_fmt": ( 549 | "<|im_start|>system\nAnswer concisely.<|im_end|>\n" 550 | "<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n" 551 | ), 552 | }, 553 | { 554 | "name": "SmolLM2-135M-Instruct", 555 | "tuning": "instruct", 556 | "revision": "e52a3dc", 557 | "datasets": [], 558 | "params": 135e6, 559 | "quantization": "int8", 560 | "backend": "ct2", 561 | "context_length": 2048, 562 | "repetition_penalty": 1.0, 563 | "architecture": "decoder-only-transformer", 564 | "license": "apache-2.0", 565 | "prompt_fmt": ( 566 | "<|im_start|>system\nAnswer concisely.<|im_end|>\n" 567 | "<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n" 568 | ), 569 | }, 570 | { 571 | "name": "SmolLM-135M-Instruct", 572 | "tuning": "instruct", 573 | "revision": "90046ba", 574 | "datasets": [], 575 | "params": 135e6, 576 | "quantization": "int8", 577 | "backend": "ct2", 578 | "context_length": 2048, 579 | "repetition_penalty": 1.3, 580 | "architecture": "decoder-only-transformer", 581 | "license": "apache-2.0", 582 | "prompt_fmt": ( 583 | "<|im_start|>system\nAnswer concisely.<|im_end|>\n" 584 | "<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n" 585 | ), 586 | }, 587 | { 588 | "name": "LaMini-GPT-774M", 589 | "tuning": "instruct", 590 | "datasets": ["webtext", "lamini"], 591 | "params": 774e6, 592 | "quantization": "int8", 593 | "backend": "ct2", 594 | "architecture": "decoder-only-transformer", 595 | "license": "mit", 596 | "prompt_fmt": ( 597 | "Below is an instruction that describes a task.\n" 598 | "Write a response that completes the request.\n\n" 599 | "### Instruction:\n{instruction}\n\n### Response:" 600 | ), 601 | }, 602 | { 603 | "name": "LaMini-GPT-124M", 604 | "tuning": "instruct", 605 | "datasets": ["webtext", "lamini"], 606 | "params": 124e6, 607 | "quantization": "int8", 608 | "backend": "ct2", 609 | "architecture": "decoder-only-transformer", 610 | "license": "mit", 611 | "prompt_fmt": ( 612 | "Below is an instruction that describes a task.\n" 613 | "Write a response that completes the request.\n\n" 614 | "### Instruction:\n{instruction}\n\n### Response:" 615 | ), 616 | }, 617 | { 618 | "name": "TinyLlama-1.1B-Chat-v1.0", 619 | "tuning": "instruct", 620 | "datasets": ["slimpajama", "starcoderdata"], 621 | "params": 1.1e9, 622 | "quantization": "int8", 623 | "backend": "ct2", 624 | "architecture": "decoder-only-transformer", 625 | "license": "mit", 626 | "prompt_fmt": ("<|user|>{instruction}<|assistant|>"), 627 | }, 628 | { 629 | "name": "codet5p-770m-py", 630 | "tuning": "code", 631 | "datasets": ["github-code"], 632 | "params": 770e6, 633 | "quantization": "int8", 634 | "backend": "ct2", 635 | 
"architecture": "encoder-decoder-transformer", 636 | "license": "bsd-3-clause", 637 | }, 638 | { 639 | "name": "codet5p-220m-py", 640 | "tuning": "code", 641 | "datasets": ["github-code"], 642 | "params": 220e6, 643 | "quantization": "int8", 644 | "backend": "ct2", 645 | "architecture": "encoder-decoder-transformer", 646 | "license": "bsd-3-clause", 647 | }, 648 | { 649 | "name": "all-MiniLM-L6-v2", 650 | "tuning": "embedding", 651 | "revision": "28efeb4", 652 | "params": 22e6, 653 | "quantization": "int8", 654 | "backend": "ct2", 655 | "architecture": "encoder-only-transformer", 656 | "license": "apache-2.0", 657 | }, 658 | { 659 | "name": "gte-tiny", 660 | "tuning": "embedding", 661 | "params": 22e6, 662 | "quantization": "int8", 663 | "backend": "ct2", 664 | "architecture": "encoder-only-transformer", 665 | "license": "mit", 666 | }, 667 | { 668 | "name": "gte-small", 669 | "tuning": "embedding", 670 | "params": 33e6, 671 | "quantization": "int8", 672 | "backend": "ct2", 673 | "architecture": "encoder-only-transformer", 674 | "license": "mit", 675 | }, 676 | { 677 | "name": "GIST-small-Embedding-v0", 678 | "tuning": "embedding", 679 | "params": 33e6, 680 | "quantization": "int8", 681 | "backend": "ct2", 682 | "architecture": "encoder-only-transformer", 683 | "license": "mit", 684 | }, 685 | { 686 | "name": "bge-small-en", 687 | "tuning": "embedding", 688 | "query_prefix": "Represent this sentence for searching relevant passages: ", 689 | "params": 33e6, 690 | "quantization": "int8", 691 | "backend": "ct2", 692 | "architecture": "encoder-only-transformer", 693 | "license": "mit", 694 | }, 695 | { 696 | "name": "e5-small-v2", 697 | "tuning": "embedding", 698 | "params": 33e6, 699 | "quantization": "int8", 700 | "backend": "ct2", 701 | "architecture": "encoder-only-transformer", 702 | "license": "mit", 703 | }, 704 | { 705 | "name": "granite-embedding-125m-english", 706 | "tuning": "embedding", 707 | "params": 30e6, 708 | "quantization": "int8", 709 | "backend": "ct2", 710 | "architecture": "encoder-only-transformer", 711 | "license": "apache-2.0", 712 | }, 713 | { 714 | "name": "granite-embedding-107m-multilingual", 715 | "tuning": "embedding", 716 | "params": 30e6, 717 | "quantization": "int8", 718 | "backend": "ct2", 719 | "architecture": "encoder-only-transformer", 720 | "license": "apache-2.0", 721 | }, 722 | { 723 | "name": "granite-embedding-30m-english", 724 | "tuning": "embedding", 725 | "params": 30e6, 726 | "quantization": "int8", 727 | "backend": "ct2", 728 | "architecture": "encoder-only-transformer", 729 | "license": "apache-2.0", 730 | }, 731 | { 732 | "name": "multilingual-e5-small", 733 | "tuning": "embedding", 734 | "params": 120e6, 735 | "quantization": "int8", 736 | "backend": "ct2", 737 | "architecture": "encoder-only-transformer", 738 | "license": "mit", 739 | }, 740 | ] 741 | 742 | 743 | class Config(dict): 744 | """ 745 | Store configuration information for the package. 746 | 747 | This is a dictionary that provides data basic data validation. 748 | 749 | Only appropriate keys and values are allowed to be set. 750 | 751 | >>> c = Config({'max_ram': '4gb'}) 752 | >>> c 753 | {...'max_ram': 4.0...} 754 | 755 | >>> c = Config({'instruct_model': 'flan-t5-small'}) 756 | >>> c 757 | {...'instruct_model': 'flan-t5-small'...} 758 | 759 | >>> c = Config({'model_license': 'apache|mit|bsd'}) 760 | >>> c 761 | {...'model_license': re.compile('apache|mit|bsd')...} 762 | 763 | >>> c = Config({'instruct_model': 'flan-t5-bad'}) 764 | Traceback (most recent call last): 765 | ... 
766 | KeyError: 'flan-t5-bad' 767 | 768 | >>> c = Config({'bad_value': 1}) 769 | Traceback (most recent call last): 770 | ... 771 | KeyError: 'bad_value' 772 | 773 | >>> c = Config() 774 | >>> c.update({'bad_value': 1}) 775 | Traceback (most recent call last): 776 | ... 777 | KeyError: 'bad_value' 778 | 779 | """ 780 | 781 | model_names = {m["name"]: m for m in models} 782 | 783 | def __init__(self, config={}): 784 | # Defaults are loaded first 785 | for key in Config.schema: 786 | self[key] = self.schema[key].default 787 | 788 | # Environment variables override defaults 789 | for key in Config.schema: 790 | value = os.environ.get(f"LANGUAGEMODELS_{key.upper()}") 791 | if value: 792 | self[key] = value 793 | 794 | # Any values passed in the config dict override environment vars 795 | for key in config.keys(): 796 | self[key] = config[key] 797 | 798 | def __setitem__(self, key, value): 799 | super().__setitem__(key, Config.schema[key].initfn(value)) 800 | 801 | # Auto-adjust instruct_model when filters change 802 | if key == "max_ram" or key == "model_license": 803 | found = set() 804 | for model in models: 805 | if model["quantization"] == "int8": 806 | memsize = model["params"] / 1e9 807 | elif model["quantization"] == "q3_k_m": 808 | memsize = model["params"] * 0.48 / 1e9 809 | elif model["quantization"] == "q4_k_m": 810 | memsize = model["params"] * 0.59 / 1e9 811 | 812 | sizefit = memsize < self["max_ram"] 813 | 814 | if "model_license" in self: 815 | licensematch = self["model_license"].match(model["license"]) 816 | else: 817 | licensematch = True 818 | 819 | if model["tuning"] not in found and sizefit and licensematch: 820 | self[model["tuning"] + "_model"] = model["name"] 821 | found.add(model["tuning"]) 822 | 823 | if len(found) < 3: 824 | raise ModelFilterException("Unable to find models to match filters") 825 | 826 | def update(self, other): 827 | for key in other: 828 | self[key] = other[key] 829 | 830 | def use_hf_model(self, hf_path, revision, model_type="instruct"): 831 | """Load and use a model from Huggingface 832 | 833 | :param hf_path: Path for the model e.g. 
"org/model" 834 | :param revision: The model git revision to load 835 | :param model_type: Model type to load 836 | """ 837 | 838 | assert "ct2" in hf_path.lower() 839 | assert "int8" in hf_path.lower() 840 | 841 | # We defer importing jinja2 until this point as it is only needed 842 | # for interpolating hf model chat templates and does not need 843 | # to be installed unless this method is used 844 | from jinja2 import Environment, BaseLoader 845 | 846 | tok_config = hf_hub_download( 847 | hf_path, "tokenizer_config.json", revision=revision 848 | ) 849 | 850 | with open(tok_config) as f: 851 | chat_template = json.load(f)["chat_template"] 852 | 853 | env = Environment(loader=BaseLoader()) 854 | 855 | template = env.from_string(chat_template) 856 | 857 | prompt_fmt = template.render( 858 | messages=[{"role": "user", "content": "{instruction}"}], 859 | add_generation_prompt=True, 860 | ) 861 | 862 | model = { 863 | "name": hf_path, 864 | "backend": "ct2", 865 | "quantization": "int8", 866 | "architecture": "decoder-only-transformer", 867 | "max_tokens": 2048, 868 | "params": 0, 869 | "prompt_fmt": prompt_fmt, 870 | } 871 | 872 | models.insert(0, model) 873 | self.model_names[model["name"]] = model 874 | self[f"{model_type}_model"] = model["name"] 875 | 876 | @staticmethod 877 | def validate_model(model_name): 878 | return Config.model_names[model_name]["name"] 879 | 880 | @staticmethod 881 | def validate_device(device): 882 | assert device in ["auto", "cpu"] 883 | 884 | return device 885 | 886 | @staticmethod 887 | def convert_to_gb(space): 888 | """Convert max RAM string to int 889 | 890 | Output will be in gigabytes 891 | 892 | If not specified, input is assumed to be in gigabytes 893 | 894 | >>> Config.convert_to_gb("512") 895 | 512.0 896 | 897 | >>> Config.convert_to_gb(".5") 898 | 0.5 899 | 900 | >>> Config.convert_to_gb("4G") 901 | 4.0 902 | 903 | >>> Config.convert_to_gb("256mb") 904 | 0.25 905 | 906 | >>> Config.convert_to_gb("256M") 907 | 0.25 908 | 909 | >>> Config.convert_to_gb("small") 910 | 0.2 911 | 912 | >>> Config.convert_to_gb("base") 913 | 0.48 914 | 915 | >>> Config.convert_to_gb("large") 916 | 1.0 917 | 918 | >>> Config.convert_to_gb("xl") 919 | 4.0 920 | 921 | >>> Config.convert_to_gb("xxl") 922 | 16.0 923 | """ 924 | 925 | if isinstance(space, int) or isinstance(space, float): 926 | return float(space) 927 | 928 | size_names = { 929 | "small": 0.2, 930 | "base": 0.48, 931 | "large": 1.0, 932 | "xl": 4.0, 933 | "xxl": 16.0, 934 | } 935 | 936 | if space.lower().strip() in size_names: 937 | return size_names[space.lower().strip()] 938 | 939 | multipliers = { 940 | "g": 1.0, 941 | "m": 2**-10, 942 | } 943 | 944 | space = space.lower() 945 | space = space.rstrip("b") 946 | 947 | if space[-1] in multipliers: 948 | return float(space[:-1]) * multipliers[space[-1]] 949 | else: 950 | return float(space) 951 | 952 | 953 | Config.schema = { 954 | "max_ram": ConfigItem(Config.convert_to_gb, 0.48), 955 | "max_tokens": ConfigItem(int, 200), 956 | "echo": ConfigItem(int, False), 957 | "device": ConfigItem(Config.validate_device, "cpu"), 958 | "model_license": ConfigItem(re.compile, ".*"), 959 | "instruct_model": ConfigItem(Config.validate_model, "LaMini-Flan-T5-248M"), 960 | "embedding_model": ConfigItem(Config.validate_model, "all-MiniLM-L6-v2"), 961 | "code_model": ConfigItem(Config.validate_model, "codet5p-220m-py"), 962 | "max_prompt_length": ConfigItem(int, 50_000), 963 | } 964 | 965 | config = Config() 966 | 967 | if "COLAB_GPU" in os.environ: 968 | if 
len(os.environ["COLAB_GPU"]) > 0: 969 | # We have a Colab GPU, so default to using it 970 | config["device"] = "auto" 971 | --------------------------------------------------------------------------------