├── index ├── __init__.py ├── docs.py ├── index.py └── vector_index.py ├── model ├── __init__.py ├── factory.py ├── model.py └── providers │ ├── openai_embedding.py │ └── sentence_transformer_model.py ├── server ├── __init__.py ├── api.py └── server.py ├── vectorstore ├── __init__.py ├── factory.py ├── vectorstore.py └── providers │ └── hnswlib_store.py ├── tests ├── testfiles │ ├── empty.txt │ ├── 3chars.txt │ ├── whitespaces.txt │ ├── single_sentence.txt │ ├── download.py │ └── chatgpt.txt ├── openai │ ├── test_index.py │ └── test_model_embedding.py ├── model │ └── test_sentence_transformer.py ├── vectorstore │ └── test_hnswlib.py └── index │ ├── test_index.py │ └── test_vector_index.py ├── pytest.ini ├── main.py ├── pyproject-gpt.toml ├── pyproject.toml ├── Makefile ├── Dockerfile ├── example └── cli.py ├── README.md └── LICENSE /index/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /server/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vectorstore/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/testfiles/empty.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/testfiles/3chars.txt: -------------------------------------------------------------------------------- 1 | foo 2 | -------------------------------------------------------------------------------- /tests/testfiles/whitespaces.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /tests/testfiles/single_sentence.txt: -------------------------------------------------------------------------------- 1 | A person is eating food. 
2 | 
--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | # pytest.ini
2 | [pytest]
3 | log_cli=true
4 | log_level=INFO
5 | log_cli_format = %(asctime)s [%(levelname)s] %(filename)s:%(lineno)d - %(message)s
6 | log_cli_date_format = %Y-%m-%d %H:%M:%S
--------------------------------------------------------------------------------
/tests/openai/test_index.py:
--------------------------------------------------------------------------------
1 | import os
2 | import lucene
3 | import pytest
4 | 
5 | 
6 | from tests.index.test_index import IndexAndSearchTest
7 | 
8 | class TestIndexWithOpenAIAdaModel:
9 |     def test_index(self):
10 |         t = IndexAndSearchTest()
11 |         t.index_docs_and_search("./tests/openai/", "openai_embedding", "hnswlib")
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from server.server import start
3 | 
4 | if __name__ == '__main__':
5 |     parser = argparse.ArgumentParser()
6 |     parser.add_argument("--host", type=str, default="0.0.0.0")
7 |     parser.add_argument("--port", type=int, default=8080)
8 |     args = parser.parse_args()
9 | 
10 |     start(host=args.host, port=args.port)
--------------------------------------------------------------------------------
/tests/testfiles/download.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | 
4 | url='https://en.wikipedia.org/wiki/ChatGPT'
5 | 
6 | # save text from the url to a txt file
7 | with open(url.split('/')[-1]+".txt", "w", encoding="UTF-8") as f:
8 |     # get the text from the URL using BeautifulSoup
9 |     soup = BeautifulSoup(requests.get(url).text, "html.parser")
10 | 
11 |     # save paragraphs
12 |     for i in soup.select('p'):
13 |         f.write(i.get_text())
--------------------------------------------------------------------------------
/vectorstore/factory.py:
--------------------------------------------------------------------------------
1 | from vectorstore.vectorstore import VectorStore
2 | 
3 | def get_vector_store(
4 |     store_name: str, dim: int, space: str, max_elements: int,
5 | ) -> VectorStore:
6 |     match store_name:
7 |         case "hnswlib":
8 |             from vectorstore.providers.hnswlib_store import HnswlibStore
9 |             return HnswlibStore(dim, space, max_elements)
10 |         case _:
11 |             raise ValueError(f"Unsupported vector store: {store_name}")
12 | 
--------------------------------------------------------------------------------
/model/factory.py:
--------------------------------------------------------------------------------
1 | from model.model import Model
2 | 
3 | def get_model(provider: str) -> Model:
4 |     match provider:
5 |         case "openai_embedding":
6 |             from model.providers.openai_embedding import OpenAIEmbeddingModel
7 |             return OpenAIEmbeddingModel()
8 |         case "sentence_transformer":
9 |             from model.providers.sentence_transformer_model import \
10 |                 SentenceTransformerModel
11 |             return SentenceTransformerModel()
12 |         case _:
13 |             raise ValueError(f"Unsupported model provider: {provider}")
14 | 
--------------------------------------------------------------------------------
/pyproject-gpt.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "veclucene"
3 | version = "0.1.0"
4 | description = ""
5 | authors = ["Jun Luo"]
6 | readme = "README.md"
7 | 
8 | [tool.poetry.dependencies]
9 | python = "^3.11"
10 | fastapi = "^0.95.1"
11 | uvicorn = "^0.22.0"
12 | pydantic = "^1.10.7"
13 | tenacity = "^8.2.2"
14 | requests = "^2.30.0"
15 | python-multipart = "^0.0.6"
16 | argparse = "^1.4.0"
17 | openai = "^0.27.6"
18 | tiktoken = "^0.3.3"
19 | hnswlib = "^0.7.0"
20 | numpy = "^1.24.3"
21 | 
22 | [tool.poetry.group.dev.dependencies]
23 | pytest = "^7.3.1"
24 | 
25 | [build-system]
26 | requires = ["poetry-core"]
27 | build-backend = "poetry.core.masonry.api"
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "veclucene"
3 | version = "0.1.0"
4 | description = ""
5 | authors = ["Jun Luo"]
6 | readme = "README.md"
7 | 
8 | [tool.poetry.dependencies]
9 | python = "^3.11"
10 | fastapi = "^0.95.1"
11 | uvicorn = "^0.22.0"
12 | pydantic = "^1.10.7"
13 | tenacity = "^8.2.2"
14 | requests = "^2.30.0"
15 | python-multipart = "^0.0.6"
16 | argparse = "^1.4.0"
17 | openai = "^0.27.6"
18 | tiktoken = "^0.3.3"
19 | hnswlib = "^0.7.0"
20 | numpy = "^1.24.3"
21 | sentence-transformers = "^2.2.2"
22 | 
23 | [tool.poetry.group.dev.dependencies]
24 | pytest = "^7.3.1"
25 | 
26 | [build-system]
27 | requires = ["poetry-core"]
28 | build-backend = "poetry.core.masonry.api"
--------------------------------------------------------------------------------
/server/api.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 | from pydantic import BaseModel
3 | from typing import List, Optional
4 | 
5 | from index.docs import DocChunkScore
6 | 
7 | class QueryType(str, Enum):
8 |     vector = "vector"
9 |     lucene = "lucene"
10 | 
11 | 
12 | class QueryRequest(BaseModel):
13 |     # the query string.
14 |     # for lucene search, input the string supported by Lucene QueryParser.
15 |     # for vector search, simply input a string. TODO support QueryParser.
16 |     query: str
17 |     query_type: Optional[QueryType] = QueryType.vector
18 |     top_k: Optional[int] = 3
19 | 
20 | 
21 | # for now, simply return DocChunkScore.
22 | # TODO include the matched text, e.g. for highlighting.
23 | # TODO add auto QA ability. For a question, the server automatically sends the
24 | # top_k chunk texts as context to the QA model, such as ChatGPT, and includes
25 | # the answer in the response.
26 | class QueryResponse(BaseModel):
27 |     doc_scores: List[DocChunkScore]
--------------------------------------------------------------------------------
31 | """ 32 | raise NotImplementedError 33 | -------------------------------------------------------------------------------- /index/docs.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from typing import List, Optional 3 | 4 | # TODO support more attributes, such as tokenize, DocValuesType, etc. 5 | class DocField(BaseModel): 6 | name: str # field name 7 | string_value: Optional[str] = None 8 | numeric_value: Optional[int] = None 9 | float_value: Optional[float] = None 10 | 11 | 12 | # All doc fields except the doc text. There are two reserved fields: 13 | # 1. The reserved field name for the doc text, "doc_text", defined by 14 | # index.FIELD_DOC_TEXT. This field name should not be used by application. 15 | # 2. The reserved field name for the doc id, "doc_id", defined by 16 | # index.FIELD_DOC_ID. If doc_id is not specified in the request, server will 17 | # automatically generate a unique id for the doc. 18 | # 19 | # For now, all fields are stored and not indexed. Only the doc contents are 20 | # indexed and also stored. TODO allow indexing more fields, such as title. 21 | class DocFields(BaseModel): 22 | fields: List[DocField] 23 | 24 | 25 | class DocChunkScore(BaseModel): 26 | doc_id: str 27 | offset: int 28 | length: int 29 | score: float 30 | 31 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | test: 3 | #PYTHONPATH=. pytest # or python3 -m pytest 4 | # add -s to print to console, add --log-cli-level=DEBUG to show debug logs 5 | python3 -m pytest tests/model/ 6 | python3 -m pytest tests/vectorstore/ 7 | python3 -m pytest tests/index/test_vector_index.py 8 | # This test reports 2000+ DeprecationWarning from pylucene. 9 | # Catch and filter warnings inside the test only ingore < 100 warnings. 10 | # most warnings are probably reported when JVM shuts down. Looks not able 11 | # to explicitly shutdown JVM in the test. So use disable-warnings for this 12 | # test only. 13 | python3 -m pytest --disable-warnings tests/index/test_index.py 14 | 15 | test_openai: 16 | # please export your openai key first, OPENAI_API_KEY=your_key 17 | python3 -m pytest tests/openai/test_model_embedding.py 18 | python3 -m pytest --disable-warnings tests/openai/test_index.py 19 | 20 | 21 | PYTHON := python3 22 | site_packages_path := $(shell $(PYTHON) -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])') 23 | coverage: 24 | coverage run --omit "$(site_packages_path)/*" -m pytest --disable-warnings 25 | coverage report --omit "$(site_packages_path)/*" 26 | -------------------------------------------------------------------------------- /tests/openai/test_model_embedding.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import pytest 4 | import time 5 | 6 | from model.factory import get_model 7 | 8 | class TestOpenAIEmbeddingModel(): 9 | def test_embeddings(self): 10 | m = get_model("openai_embedding") 11 | assert 256 == m.get_max_token_size() 12 | assert 1536 == m.get_dim() 13 | 14 | sentences = ['A person is eating food.', 15 | 'A person is eating a piece of bread.', 16 | 'A person is riding a horse.', 17 | 'A person is riding a white horse on an enclosed ground.'] 18 | 19 | # example run time on a MacBook. 
20 |         # run the test first time, get embeddings time: 0.48015683237463236s
21 |         # run the second time, get embeddings time: 0.25255241710692644s
22 |         start = time.monotonic()
23 |         embeddings = m.get_embeddings(sentences)
24 |         assert len(sentences) == len(embeddings)
25 |         assert m.get_dim() == len(embeddings[0])
26 |         dur = time.monotonic() - start
27 |         logging.info(f"openai_embedding, get embeddings time: {dur}s")
28 | 
29 |         with pytest.raises(NotImplementedError):
30 |             m.set_model("model", 1, 1)
--------------------------------------------------------------------------------
/vectorstore/vectorstore.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from enum import Enum
3 | from typing import List
4 | 
5 | class Space(str, Enum):
6 |     l2 = "l2" # L2/Euclidean
7 |     ip = "ip" # inner/dot product
8 |     # The embedding model usually generates normalized vectors. Cosine
9 |     # similarity is a dot product on normalized vectors, so usually there
10 |     # is no need to use cosine.
11 |     cosine = "cosine"
12 | 
13 | 
14 | class VectorStore(ABC):
15 |     @abstractmethod
16 |     def save(self, index_path: str):
17 |         """
18 |         Save the vectors to the file specified by index_path.
19 |         """
20 |         raise NotImplementedError
21 | 
22 |     @abstractmethod
23 |     def load(self, index_path: str):
24 |         """
25 |         Load the vectors from the file specified by index_path.
26 |         """
27 |         raise NotImplementedError
28 | 
29 |     @abstractmethod
30 |     def add(self, embeddings: List[List[float]], labels: List[int]):
31 |         """
32 |         Add the embeddings and the corresponding labels.
33 |         """
34 |         raise NotImplementedError
35 | 
36 |     @abstractmethod
37 |     def query(
38 |         self,
39 |         embeddings: List[List[float]],
40 |         top_k: int = 1,
41 |     ) -> (List[List[int]], List[List[float]]):
42 |         """
43 |         Take one or more embeddings and return the top_k embedding ids and
44 |         distances for each embedding.
45 |         The distances are the original distances defined by the space, such as
46 |         L2, inner/dot product, etc. The vector store provider should return the
47 |         original distances.
48 |         """
49 |         raise NotImplementedError
--------------------------------------------------------------------------------
/model/providers/openai_embedding.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 | from openai import Embedding
3 | from tenacity import retry, wait_random_exponential, stop_after_attempt
4 | 
5 | from model.model import Model
6 | 
7 | class OpenAIEmbeddingModel(Model):
8 |     # https://platform.openai.com/docs/guides/embeddings/what-are-embeddings
9 |     model_name: str
10 |     max_token_size: int
11 |     dim: int
12 | 
13 |     def __init__(self):
14 |         self.model_name = "text-embedding-ada-002"
15 |         # what is the best token size? chatgpt-retrieval-plugin uses 200
16 |         self.max_token_size = 256
17 |         self.dim = 1536
18 | 
19 |     def get_max_token_size(self) -> int:
20 |         """
21 |         Return the max token for the text.
22 |         """
23 |         return self.max_token_size
24 | 
25 |     def get_dim(self) -> int:
26 |         """
27 |         Return the embedding dimension
28 |         """
29 |         return self.dim
30 | 
31 |     @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(3))
32 |     def get_embeddings(self, texts: List[str]) -> List[List[float]]:
33 |         """
34 |         Takes in a list of texts and returns a list of embeddings for each text.
35 | """ 36 | # Call the OpenAI API to get the embeddings 37 | response = Embedding.create(input=texts, model=self.model_name) 38 | 39 | # Extract the embedding data from the response 40 | data = response["data"] # type: ignore 41 | 42 | # Return the embeddings as a list of lists of floats 43 | return [result["embedding"] for result in data] 44 | 45 | def set_model(self, model_name: str, max_token_size: int, dim: int): 46 | """ 47 | Set to use the specified model. 48 | """ 49 | raise NotImplementedError 50 | 51 | -------------------------------------------------------------------------------- /vectorstore/providers/hnswlib_store.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List 3 | import hnswlib 4 | 5 | from vectorstore.vectorstore import Space, VectorStore 6 | 7 | class HnswlibStore(VectorStore): 8 | index: hnswlib.Index 9 | 10 | # hnswlib params 11 | dim: int 12 | space: Space # ip, l2, or cosine 13 | max_elements: int 14 | # M: int # max number of connections on upper layers 15 | # ef_construction: int # number of the nearest neighbors at index time 16 | # ef_search: int # number of the nearest neighbors to search 17 | 18 | def __init__(self, dim: int, space: Space, max_elements: int): 19 | self.index = hnswlib.Index(space, dim) 20 | self.index.init_index(max_elements) 21 | self.dim = dim 22 | self.max_elements = max_elements 23 | self.space = space 24 | 25 | def save(self, index_path: str): 26 | self.index.save_index(index_path) 27 | 28 | def load(self, index_path: str): 29 | self.index.load_index(index_path, self.max_elements) 30 | 31 | def add(self, embeddings: List[List[float]], labels: List[int]): 32 | self.index.add_items(embeddings, labels) 33 | 34 | def query( 35 | self, 36 | embeddings: List[List[float]], 37 | top_k: int = 1, 38 | ) -> (List[List[int]], List[List[float]]): 39 | """ 40 | Take one or more embeddings and return the top_k embedding labels and 41 | the original distances, defined by space, for each embedding. 42 | """ 43 | labels, distances = self.index.knn_query(embeddings, top_k) 44 | if self.space == Space.ip or self.space == Space.cosine: 45 | # https://github.com/nmslib/hnswlib returns a slightly different 46 | # distances, change back to the original distances. 47 | distances = 1.0 - distances 48 | 49 | return labels, distances 50 | -------------------------------------------------------------------------------- /model/providers/sentence_transformer_model.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from sentence_transformers import SentenceTransformer 4 | 5 | from model.model import Model 6 | 7 | class SentenceTransformerModel(Model): 8 | model: SentenceTransformer 9 | max_token_size: int 10 | dim: int 11 | 12 | def __init__(self): 13 | """ 14 | https://huggingface.co/blog/mteb, all-mpnet-base-v2 or all-MiniLM-L6-v2 15 | provide a good balance between speed and performance. 16 | 17 | https://www.sbert.net/docs/pretrained_models.html, test on a V100 GPU. 18 | all-mpnet-base-v2, model size 420MB, encoding speed 2800 sentence/s. 19 | all-MiniLM-L6-v2, model size 80MB, encoding speed 14200 sentence/s. 20 | """ 21 | # initialize with the default model 22 | self.model = SentenceTransformer('all-MiniLM-L6-v2') 23 | 24 | # https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2 25 | # By default, input text longer than 256 word pieces is truncated. 
26 | self.max_token_size = 256 27 | self.dim = 384 28 | 29 | def get_max_token_size(self) -> int: 30 | """ 31 | Return the max token for the text. 32 | """ 33 | # TODO depending on the tokenizer, 256 word pieces may not equal to 34 | # 256 tokens. 35 | return self.max_token_size 36 | 37 | def get_dim(self) -> int: 38 | """ 39 | Return the embedding dimension 40 | """ 41 | return self.dim 42 | 43 | def get_embeddings(self, texts: List[str]) -> List[List[float]]: 44 | """ 45 | Takes in a list of texts and returns a list of embeddings for each text. 46 | """ 47 | return self.model.encode(texts) 48 | 49 | def set_model(self, model_name: str, max_token_size: int, dim: int): 50 | """ 51 | Set to use the specified model. 52 | """ 53 | self.model = SentenceTransformer(model_name) 54 | self.max_token_size = max_token_size 55 | self.dim = dim 56 | 57 | -------------------------------------------------------------------------------- /tests/model/test_sentence_transformer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import pytest 4 | import time 5 | from typing import List 6 | import numpy as np 7 | 8 | from model.factory import get_model 9 | 10 | class TestSentTransformerModel(): 11 | def test_embeddings(self): 12 | """ 13 | simple measure the latency of different models on a MacBook M1Pro. 14 | python3 -m pytest -s tests/model/test_sentence_transformer.py 15 | default model load time: 1.4939462076872587s 16 | get embeddings time: 0.05871379096060991s 17 | all-mpnet-base-v2 model load time: 1.011457541026175s 18 | get embeddings time: 0.17692300025373697s 19 | """ 20 | start = time.monotonic() 21 | stmodel = get_model("sentence_transformer") 22 | assert 256 == stmodel.get_max_token_size() 23 | assert 384 == stmodel.get_dim() 24 | dur = time.monotonic() - start 25 | logging.info(f"\ndefault model load time: {dur}s") 26 | 27 | sentences = ['A person is eating food.', 28 | 'A person is eating a piece of bread.', 29 | 'A person is riding a horse.', 30 | 'A person is riding a white horse on an enclosed ground.'] 31 | 32 | start = time.monotonic() 33 | embeddings = stmodel.get_embeddings(sentences) 34 | assert len(sentences) == len(embeddings) 35 | assert stmodel.get_dim() == len(embeddings[0]) 36 | dur = time.monotonic() - start 37 | logging.info(f"get embeddings time: {dur}s") 38 | 39 | # https://huggingface.co/sentence-transformers/all-mpnet-base-v2 40 | start = time.monotonic() 41 | stmodel.set_model("all-mpnet-base-v2", 384, 768) 42 | assert 384 == stmodel.get_max_token_size() 43 | assert 768 == stmodel.get_dim() 44 | dur = time.monotonic() - start 45 | logging.info(f"all-mpnet-base-v2 model load time: {dur}s") 46 | 47 | start = time.monotonic() 48 | embeddings = stmodel.get_embeddings(sentences) 49 | assert len(sentences) == len(embeddings) 50 | assert stmodel.get_dim() == len(embeddings[0]) 51 | dur = time.monotonic() - start 52 | logging.info(f"get embeddings time: {dur}s") 53 | 54 | def test_unsupported_model(self): 55 | with pytest.raises(ValueError): 56 | get_model("unknown_model") 57 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | 2 | # get the required python packages 3 | FROM python:3.11-slim as requirements-stage 4 | 5 | ARG BUILD_GPT 6 | 7 | WORKDIR /tmp/poetry 8 | 9 | RUN pip install poetry 10 | 11 | COPY ./pyproject.toml ./poetry.lock /tmp/poetry 12 | COPY ./pyproject-gpt.toml 
./poetry-gpt.lock /tmp/poetry 13 | 14 | RUN if [ ! -z "$BUILD_GPT" ]; then \ 15 | mv pyproject-gpt.toml pyproject.toml && mv poetry-gpt.lock poetry.lock; \ 16 | else \ 17 | rm -f pyproject-gpt.toml poetry-gpt.lock; \ 18 | fi 19 | 20 | RUN poetry export -f requirements.txt --output requirements.txt --without-hashes 21 | 22 | 23 | # build 24 | FROM python:3.11-slim 25 | 26 | ARG BUILD_GPT 27 | 28 | # 1. build pylucene 29 | # building on mac, default jdk does not work for JCC on aarch64/arm64, 30 | # pylucene-9.4.1/jcc/setup.py line 197, LFLAGS does not have linux/aarch64. 31 | #RUN apt-get update && apt-get install -y default-jdk 32 | 33 | # https://lucene.apache.org/pylucene/jcc/install.html suggests installing temurin java 34 | RUN apt-get update && apt-get install -y wget apt-transport-https gnupg 35 | RUN wget -O - https://packages.adoptium.net/artifactory/api/gpg/key/public | apt-key add - 36 | RUN echo "deb https://packages.adoptium.net/artifactory/deb \ 37 | $(awk -F= '/^VERSION_CODENAME/{print$2}' /etc/os-release) main" \ 38 | | tee /etc/apt/sources.list.d/adoptium.list 39 | RUN apt-get update && apt-get install -y temurin-17-jdk 40 | 41 | RUN apt-get install -y build-essential 42 | 43 | # download and build pylucene 44 | WORKDIR /code/pylucene 45 | RUN wget -O - https://downloads.apache.org/lucene/pylucene/pylucene-9.4.1-src.tar.gz \ 46 | | tar -xz --strip-components=1 47 | RUN cd jcc \ 48 | && JCC_JDK=/usr/lib/jvm/$(ls /usr/lib/jvm) python setup.py build install 49 | RUN make all install JCC='python -m jcc --shared' PYTHON=python NUM_FILES=16 50 | #RUN make all test install JCC='python -m jcc --shared' PYTHON=python NUM_FILES=16 51 | 52 | WORKDIR /code 53 | RUN rm -rf pylucene 54 | 55 | 56 | # 2. install VecLucene python packages 57 | WORKDIR /code/VecLucene 58 | COPY . 
/code/VecLucene/
59 | 
60 | COPY --from=requirements-stage /tmp/poetry/requirements.txt /code/VecLucene/requirements.txt
61 | RUN pip install --no-cache-dir --upgrade -r /code/VecLucene/requirements.txt
62 | 
63 | ENV ENV_EMBEDDING_MODEL_PROVIDER=${BUILD_GPT:+openai_embedding}
64 | 
65 | EXPOSE 8080
66 | 
67 | CMD ["sh", "-c", "uvicorn server.server:app --host 0.0.0.0 --port 8080"]
--------------------------------------------------------------------------------
/example/cli.py:
--------------------------------------------------------------------------------
1 | 
2 | import argparse
3 | import requests
4 | import sys
5 | import time
6 | 
7 | def upload_file(url: str, file_path: str):
8 |     with open(file_path, 'rb') as f:
9 |         resp = requests.post(
10 |             url=url, files={'file': (f.name, f, "text/plain")})
11 |     print(resp.json())
12 | 
13 | 
14 | def upload_file_with_fields(url: str, file_path: str):
15 |     with open(file_path, 'rb') as f:
16 |         field1 = '{"name": "field1", "string_value": "str1"}'
17 |         field2 = '{"name": "field2", "numeric_value": 2}'
18 |         doc_fields = '{"fields": ' + f'[{field1}, {field2}]' + '}'
19 |         fields = {"fields": f'{doc_fields}'}
20 |         resp = requests.post(
21 |             url=url, files={'file': (f.name, f, "text/plain")}, data=fields)
22 |     print(resp.json())
23 | 
24 | 
25 | def commit(url: str):
26 |     resp = requests.post(url=url)
27 |     print(resp.json())
28 | 
29 | 
30 | def query(url: str, query_string: str, query_type: str):
31 |     query_request = '{' + f'"query": "{query_string}", ' + \
32 |         f'"query_type": "{query_type}"' + '}'
33 |     print(query_request)
34 |     resp = requests.get(url=url, data=query_request)
35 |     print(resp.json())
36 | 
37 | 
38 | if __name__ == '__main__':
39 |     parser = argparse.ArgumentParser()
40 |     parser.add_argument("--op", type=str, required=True)
41 |     parser.add_argument("--host", type=str, default="127.0.0.1")
42 |     parser.add_argument("--port", type=int, default=8080)
43 |     parser.add_argument("--file", type=str)
44 |     parser.add_argument("--query_string", type=str)
45 |     parser.add_argument("--query_type", type=str, default="vector",
46 |                         choices=["vector", "lucene"])
47 |     args = parser.parse_args()
48 | 
49 |     url = f"http://{args.host}:{args.port}"
50 |     match args.op:
51 |         case "upload":
52 |             if args.file is None:
53 |                 print("please input the text file path")
54 |                 sys.exit(1)
55 |             url += "/add_doc"
56 |             upload_file(url, args.file)
57 | 
58 |         case "commit":
59 |             url += "/commit"
60 |             commit(url)
61 | 
62 |         case "query":
63 |             if args.query_string is None:
64 |                 print("please input the query string")
65 |                 sys.exit(1)
66 |             url += "/query"
67 |             start = time.monotonic()
68 |             query(url, args.query_string, args.query_type)
69 |             dur = time.monotonic() - start
70 |             print(f"{args.query_type} query time: {dur}s")
71 | 
72 |         case _:
73 |             print("supported op: upload, commit, query")
74 | 
--------------------------------------------------------------------------------
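A side note on upload_file_with_fields above: the fields payload can also be built with json.dumps instead of string concatenation, which avoids quoting mistakes. A minimal sketch (same field shapes as in the function above; the server parses this form field with DocFields.parse_raw):

    import json

    doc_fields = {
        "fields": [
            {"name": "field1", "string_value": "str1"},
            {"name": "field2", "numeric_value": 2},
        ]
    }
    fields = {"fields": json.dumps(doc_fields)}
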
/server/server.py:
--------------------------------------------------------------------------------
1 | from contextlib import asynccontextmanager
2 | from fastapi import FastAPI, File, Form, HTTPException, Depends, Body, UploadFile
3 | import logging
4 | import lucene
5 | import mimetypes
6 | import os
7 | from typing import List, Optional
8 | import sys
9 | import uvicorn
10 | 
11 | from index.index import Index
12 | from index.docs import DocField, DocFields, DocChunkScore
13 | from server.api import QueryType, QueryRequest, QueryResponse
14 | 
15 | logging.basicConfig(stream=sys.stdout, level=logging.INFO)
16 | 
17 | # The embedding model provider: openai_embedding, sentence_transformer.
18 | # If the provider is set to openai_embedding, please remember to set OPENAI_API_KEY.
19 | ENV_EMBEDDING_MODEL_PROVIDER = os.environ.get("ENV_EMBEDDING_MODEL_PROVIDER")
20 | # The directory to store the lucene and vector index
21 | ENV_INDEX_DIR = os.environ.get("ENV_INDEX_DIR")
22 | 
23 | DEFAULT_EMBEDDING_MODEL_PROVIDER = "sentence_transformer"
24 | DEFAULT_INDEX_DIR = "./server_index_dir"
25 | 
26 | embedding_model = DEFAULT_EMBEDDING_MODEL_PROVIDER
27 | index_dir = DEFAULT_INDEX_DIR
28 | if ENV_EMBEDDING_MODEL_PROVIDER is not None \
29 |     and ENV_EMBEDDING_MODEL_PROVIDER != "":
30 |     embedding_model = ENV_EMBEDDING_MODEL_PROVIDER
31 | if ENV_INDEX_DIR is not None and ENV_INDEX_DIR != "":
32 |     index_dir = ENV_INDEX_DIR
33 | 
34 | # the sub directory under index_dir to store the doc content
35 | index_doc_dir = os.path.join(index_dir, "docs")
36 | 
37 | 
38 | def start(host: str, port: int):
39 |     uvicorn.run("server.server:app", host=host, port=port, reload=False)
40 | 
41 | 
42 | @asynccontextmanager
43 | async def lifespan(app: FastAPI):
44 |     # init Index
45 |     global index
46 |     lucene.initVM(vmargs=['-Djava.awt.headless=true'])
47 |     index = Index(index_dir=index_dir,
48 |                   model_provider=embedding_model,
49 |                   vector_store="hnswlib")
50 |     logging.info("start the index")
51 |     yield
52 |     # close the index
53 |     # TODO when stopping with Ctrl+C, close is not called; using the old
54 |     # shutdown event does not work either.
55 |     logging.info("close the index")
56 |     index.close()
57 | 
58 | app = FastAPI(lifespan=lifespan)
59 | 
60 | 
61 | # TODO support creating the index
62 | @app.post("/add_doc")
63 | async def add_doc(
64 |     file: UploadFile = File(...),
65 |     fields: Optional[str] = Form(None),
66 | ):
67 |     filename = file.filename
68 |     try:
69 |         # parse the fields
70 |         doc_fields: DocFields = None
71 |         if fields is not None:
72 |             doc_fields = DocFields.parse_raw(fields)
73 | 
74 |         # save the file text
75 |         doc_path = await save_file_text(file)
76 | 
77 |         # add file to index
78 |         doc_id = index.add(doc_path, doc_fields)
79 |         return doc_id
80 |     except Exception as e:
81 |         logging.error(f"add doc {filename} error: {e}")
82 |         raise HTTPException(status_code=500, detail=str(e))
83 | 
84 | 
85 | @app.post("/commit")
86 | async def commit():
87 |     index.commit()
88 | 
89 | 
90 | @app.get(
91 |     "/query",
92 |     response_model=QueryResponse,
93 | )
94 | async def query(
95 |     request: QueryRequest = Body(...),
96 | ):
97 |     try:
98 |         docs: List[DocChunkScore] = None
99 |         if request.query_type == QueryType.vector:
100 |             docs = index.vector_search(request.query, request.top_k)
101 |         else:
102 |             docs = index.lucene_search(request.query, request.top_k)
103 | 
104 |         return QueryResponse(doc_scores=docs)
105 |     except Exception as e:
106 |         logging.error(f"query {request.query} error: {e}")
107 |         raise HTTPException(status_code=500, detail=str(e))
108 | 
109 | 
110 | async def save_file_text(file: UploadFile) -> str:
111 |     """
112 |     Extract text from file and save under index_doc_dir.
113 |     Return the absolute file path saved under index_doc_dir.
114 |     """
115 |     # check file type. only support text file now.
116 |     mimetype = file.content_type
117 |     if mimetype is None:
118 |         mimetype, _ = mimetypes.guess_type(file.filename)
119 | 
120 |     if mimetype != "text/plain" and mimetype != "text/markdown":
121 |         raise ValueError(f"Unsupported file type: {mimetype}")
122 | 
123 |     # store the file text under index_doc_dir.
124 |     # TODO support other file types, extract the text from the file.
125 | # TODO for small files, directly store in Lucene. 126 | doc_path = os.path.join(index_doc_dir, file.filename) 127 | os.makedirs(os.path.dirname(doc_path), exist_ok=True) 128 | 129 | file_stream = await file.read() 130 | 131 | # TODO if file exists, update doc 132 | with open(doc_path, "wb") as f: 133 | f.write(file_stream) 134 | 135 | return doc_path 136 | 137 | -------------------------------------------------------------------------------- /tests/vectorstore/test_hnswlib.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import random 4 | from typing import List 5 | import numpy as np 6 | 7 | from vectorstore.factory import get_vector_store 8 | 9 | class TestHnswlib(): 10 | def test_save_empty_index(self): 11 | dim = 384 12 | max_elements = 1 13 | space = "cosine" 14 | store = get_vector_store("hnswlib", dim, space, max_elements) 15 | 16 | index_path = f"ut_empty_index.bin" 17 | store.save(index_path) 18 | 19 | assert os.path.exists(index_path) 20 | os.remove(index_path) 21 | 22 | 23 | def test_index_cosines_space(self): 24 | self.verify_index_spaces("cosine") 25 | 26 | def test_index_ip_space(self): 27 | self.verify_index_spaces("ip") 28 | 29 | def verify_index_spaces(self, space: str): 30 | dim = 16 31 | max_elements = 5 32 | store = get_vector_store("hnswlib", dim, space, max_elements) 33 | 34 | embeddings = np.float32(np.random.random((max_elements, dim))) 35 | labels = np.arange(max_elements) 36 | 37 | store.add(embeddings, labels) 38 | 39 | query_embeddings: List[List[float]] = [] 40 | query_embeddings.append(embeddings[0]) 41 | qlabels, distances = store.query( 42 | embeddings=query_embeddings, top_k=max_elements) 43 | 44 | assert 1 == len(qlabels) 45 | assert 1 == len(distances) 46 | # verify all elements are returned 47 | assert max_elements == len(qlabels[0]) 48 | assert max_elements == len(distances[0]) 49 | if space != "ip": 50 | # inner product is not an actual metric. 
An element can be closer 51 | # to some other element than to itself 52 | assert labels[0] == qlabels[0][0] 53 | qlabels[0].sort() 54 | assert all([a == b for a, b in zip(qlabels[0], labels)]) 55 | 56 | 57 | def test_save_load_index_l2_space(self): 58 | dim = 16 59 | max_elements = 5 60 | space = "l2" 61 | store = get_vector_store("hnswlib", dim, space, max_elements) 62 | 63 | embeddings = np.float32(np.random.random((max_elements, dim))) 64 | labels = np.arange(max_elements) 65 | 66 | store.add(embeddings, labels) 67 | 68 | qlabels, distances = store.query(embeddings=embeddings[0], top_k=1) 69 | assert 1 == len(qlabels) 70 | assert 1 == len(distances) 71 | assert 1 == len(qlabels[0]) 72 | assert 1 == len(distances[0]) 73 | assert labels[0] == qlabels[0][0] 74 | assert 0.0 == distances[0][0] 75 | 76 | query_embeddings: List[List[float]] = [] 77 | query_embeddings.append(embeddings[0]) 78 | qlabels, distances = store.query( 79 | embeddings=query_embeddings, top_k=max_elements) 80 | assert 1 == len(qlabels) 81 | assert 1 == len(distances) 82 | assert max_elements == len(qlabels[0]) 83 | assert max_elements == len(distances[0]) 84 | assert labels[0] == qlabels[0][0] 85 | # l2 equation, d = sum((Ai-Bi)^2), the distance of exact match is 0 86 | assert 0.0 == distances[0][0] 87 | qlabels[0].sort() 88 | assert all([a == b for a, b in zip(qlabels[0], labels)]) 89 | 90 | index_path = "ut_index.bin" 91 | store.save(index_path) 92 | 93 | store1 = get_vector_store("hnswlib", dim, space, max_elements) 94 | store1.load(index_path) 95 | 96 | qlabels, distances = store1.query(embeddings=embeddings[0], top_k=1) 97 | assert 1 == len(qlabels) 98 | assert 1 == len(distances) 99 | assert 1 == len(qlabels[0]) 100 | assert 1 == len(distances[0]) 101 | assert labels[0] == qlabels[0][0] 102 | assert 0.0 == distances[0][0] 103 | 104 | qlabels, distances = store1.query( 105 | embeddings=embeddings[0], top_k=max_elements) 106 | assert 1 == len(qlabels) 107 | assert 1 == len(distances) 108 | assert max_elements == len(qlabels[0]) 109 | assert max_elements == len(distances[0]) 110 | assert labels[0] == qlabels[0][0] 111 | assert 0.0 == distances[0][0] 112 | qlabels[0].sort() 113 | assert all([a == b for a, b in zip(qlabels[0], labels)]) 114 | 115 | os.remove(index_path) 116 | 117 | 118 | def test_negative_cases(self): 119 | dim = 384 120 | max_elements = 5 121 | space = "cosine" 122 | store = get_vector_store("hnswlib", dim, space, max_elements) 123 | 124 | # negative test: num_elements > max_elements 125 | num_elements = max_elements + 1 126 | embeddings = np.float32(np.random.random((num_elements, dim))) 127 | labels = np.arange(num_elements) 128 | 129 | with pytest.raises(RuntimeError): 130 | store.add(embeddings, labels) 131 | 132 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # VecLucene 2 | VecLucene is an open-source vector search engine library built on top of Lucene and popular ANN (approximate nearest neighbor) search libraries. Its purpose is to simplify the process of vector search for users. VecLucene introduces the following enhancements to Lucene: 3 | 4 | ## Open Models 5 | VecLucene currently supports [OpenAI's text-embedding-ada-002](https://platform.openai.com/docs/guides/embeddings/what-are-embeddings) and Sentence_Transformer models, [all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) by default, for embeddings. 
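For example, a model can be obtained programmatically through this repo's factory (a sketch; provider names as defined in `model/factory.py`):

```python
from model.factory import get_model

# "sentence_transformer" loads all-MiniLM-L6-v2 by default;
# "openai_embedding" uses text-embedding-ada-002 and requires OPENAI_API_KEY
model = get_model("sentence_transformer")
embeddings = model.get_embeddings(["A person is eating food."])
print(model.get_dim(), len(embeddings[0]))  # 384 384
```
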
It has a flexible framework to support additional models and can be further extended to accommodate custom models. Larger models generally result in higher latency, so the application can select the most suitable model based on its workload. The [HuggingFace MTEB: Massive Text Embedding Benchmark](https://huggingface.co/blog/mteb) measures the speed and performance of text embedding models. Note that MTEB was published on October 19, 2022, so it does not include the OpenAI embedding model.
6 | 
7 | In addition, VecLucene can be expanded to support question-answering (QA) functionality. Because VecLucene stores the document text, it can find the matched text chunks for a query and send them as context to models like ChatGPT to generate answers.
8 | 
9 | ## Open ANN libraries
10 | The default choice for VecLucene is [Hnswlib](https://github.com/nmslib/hnswlib). There are plans to support [Faiss](https://github.com/facebookresearch/faiss) and other libraries if necessary. This flexibility allows the application to choose the library that best aligns with its workload and requirements.
11 | 
12 | Lucene's KNN feature currently supports one embedding per document. However, for document text, a single embedding is often insufficient. One possible solution is to store the text chunks within a document as multiple Lucene documents, but this makes the inverted index for the document more complex.
13 | 
14 | ## Self-Managed Document Store
15 | With VecLucene, the application simply uploads the document, and VecLucene handles the rest. It automatically extracts text from the file (currently only plain text documents are supported), splits the text into chunks, calls the model to generate embeddings for each chunk, and persists the embeddings in the ANN library.
16 | 
17 | ## Hybrid Search
18 | VecLucene retains all of Lucene's existing abilities. The application can define multiple fields to store additional information for each document and use traditional Lucene queries to access these fields. For instance, the application can send a natural language query string with filters on other fields. VecLucene will generate an embedding for the query string, find similar documents in the ANN library, and filter out documents that don't meet the specified filtering conditions.
19 | 
20 | Furthermore, the text is indexed in Lucene using the traditional inverted index format. The application can choose to use either type of search, or even perform a hybrid search by combining the results of both inverted index search and ANN search. Inverted index search is generally faster, while ANN search provides a better understanding of semantic relationships.
21 | 
22 | ## Install
23 | VecLucene is built on top of PyLucene-9.4.1. Please follow the instructions in the [PyLucene Install guide](https://lucene.apache.org/pylucene/install.html) to install PyLucene. Note that after building jcc, you will need to edit the Makefile to set "PYTHON", "JCC" and "NUM_FILES" for your platform. Please make sure you have installed a JDK and GCC before building PyLucene. [JCC Install](https://lucene.apache.org/pylucene/jcc/install.html) suggests installing Temurin Java.
24 | 
25 | Other Python packages for VecLucene are managed by Poetry. You can use the "poetry export" command to create a requirements file and then install the packages with pip.
26 | 
27 | The Dockerfile is a good reference for how to build VecLucene.
28 | 
29 | ## Usage
30 | Once you have installed VecLucene, you can start it as an HTTP server by running `python main.py`. You can use the `example/cli.py` script to upload files, commit them, and query the server, as sketched below.
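For example, a minimal session against a running server (a sketch using `requests`, mirroring what `example/cli.py` does; the file path, host, and port are placeholders):

```python
import requests

base = "http://127.0.0.1:8080"

# upload a plain-text file, then commit so the index survives a restart
with open("path/to/localfile.txt", "rb") as f:
    files = {"file": ("localfile.txt", f, "text/plain")}
    print(requests.post(f"{base}/add_doc", files=files).json())
print(requests.post(f"{base}/commit").json())

# vector search; the /query route reads a JSON body, as example/cli.py does
body = '{"query": "a person is eating food", "query_type": "vector", "top_k": 3}'
print(requests.get(f"{base}/query", data=body).json())
```
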
31 | 
32 | If you prefer, you can skip the installation process and use the pre-built Docker container:
33 | 1. Pull the docker image using `docker pull junius/veclucene-arm64`. For the amd64 platform, pull `junius/veclucene-amd64`. Please note that the amd64 container image is much larger than the arm64 image, because the packages required to run SentenceTransformer on the amd64 platform are significantly larger. Alternatively, you can pull `junius/veclucene-gpt-amd64`, which only works with the OpenAI embedding model and is much smaller.
34 | 2. Run the container using `docker run -d --name vltest -p 127.0.0.1:8080:8080 junius/veclucene-arm64`, which uses the SentenceTransformer `all-MiniLM-L6-v2` model. To use the OpenAI embedding model, run `docker run -d --env ENV_EMBEDDING_MODEL_PROVIDER=openai_embedding --env OPENAI_API_KEY=xxx --name vltest -p 127.0.0.1:8080:8080 junius/veclucene-arm64`.
35 | 3. Use `python3 example/cli.py --op upload --file path/to/localfile` and `python3 example/cli.py --op query --query_string "xxx" --query_type "vector"` to upload files and query the server. To use the traditional inverted search, use `--query_type "lucene"`. Don't forget to run `python3 example/cli.py --op commit` before stopping the container to ensure that the index is committed and can be queried again later.
36 | 
37 | Please note that VecLucene is still at an early stage and has some limitations:
38 | 1. It only supports plain text files.
39 | 2. It is limited to 5000 embeddings.
40 | 3. The vector search does not parse the query string yet; it simply generates the embedding from the entire query string. For the inverted search, the query string is parsed using the Lucene parser.
--------------------------------------------------------------------------------
/tests/index/test_index.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | import lucene
4 | import pytest
5 | import time
6 | from typing import List
7 | import shutil
8 | 
9 | from index.docs import DocField, DocFields
10 | from index.index import Index
11 | 
12 | class TestSentenceTransformerWithIndex:
13 |     def test_index(self):
14 |         t = IndexAndSearchTest()
15 |         t.index_docs_and_search(
16 |             "./tests/index/", "sentence_transformer", "hnswlib")
17 | 
18 | class IndexAndSearchTest:
19 |     def index_docs_and_search(
20 |         self, base_dir: str, model_name: str, vector_store: str,
21 |     ):
22 |         ut_dir = os.path.join(base_dir, "utdir-index")
23 |         if os.path.exists(ut_dir):
24 |             # remove the possible garbage by previous failed test
25 |             shutil.rmtree(ut_dir)
26 |         os.mkdir(ut_dir)
27 | 
28 |         lucene.initVM(vmargs=['-Djava.awt.headless=true'])
29 | 
30 |         index = Index(ut_dir, model_name, vector_store)
31 | 
32 |         try:
33 |             # step1: add the first file
34 |             doc_path1 = "./tests/testfiles/single_sentence.txt"
35 |             fields: List[DocField] = []
36 |             pathField = DocField(name="path", string_value=doc_path1)
37 |             fields.append(pathField)
38 |             doc_fields = DocFields(fields=fields)
39 | 
40 |             doc_id1 = index.add(doc_path1, doc_fields)
41 | 
42 |             # search lucene
43 |             query_string = "A person is eating food."
44 | top_k = 3 45 | start = time.monotonic() 46 | lucene_score_docs = index.lucene_search(query_string, top_k) 47 | dur = time.monotonic() - start 48 | logging.info(f"1 doc, lucene search time: {dur}s") 49 | assert 1 == len(lucene_score_docs) 50 | assert doc_id1 == lucene_score_docs[0].doc_id 51 | 52 | # search vector index 53 | start = time.monotonic() 54 | vector_score_docs = index.vector_search(query_string, top_k) 55 | dur = time.monotonic() - start 56 | logging.info(f"1 doc, vector search time: {dur}s") 57 | assert 1 == len(vector_score_docs) 58 | assert doc_id1 == vector_score_docs[0].doc_id 59 | assert vector_score_docs[0].score > 0.9 60 | 61 | # commit and verify the vector index version 62 | index.commit() 63 | vector_index_version = index._get_vector_index_version() 64 | assert 1 == vector_index_version 65 | 66 | # step2: add the second file 67 | doc_path2 = "./tests/testfiles/chatgpt.txt" 68 | fields.clear() 69 | pathField = DocField(name="path", string_value=doc_path2) 70 | fields.append(pathField) 71 | doc_fields = DocFields(fields=fields) 72 | 73 | doc_id2 = index.add(doc_path2, doc_fields) 74 | 75 | # search lucene only 76 | query_string = "A person is eating food." 77 | top_k = 3 78 | start = time.monotonic() 79 | lucene_score_docs = index.lucene_search(query_string, top_k) 80 | dur = time.monotonic() - start 81 | logging.info(f"2 docs, lucene search time: {dur}s") 82 | assert 2 == len(lucene_score_docs) 83 | 84 | # search vector index 85 | start = time.monotonic() 86 | vector_score_docs = index.vector_search(query_string, top_k) 87 | dur = time.monotonic() - start 88 | logging.info(f"2 docs, vector search time: {dur}s") 89 | # sentence_transformer returns: 90 | # [DocChunkScore(doc_id1, offset=0, length=25, score=1.0), 91 | # DocChunkScore(doc_id2, offset=15234, length=1172, score=0.34), 92 | # DocChunkScore(doc_id2, offset=2219, length=1182, score=0.34)] 93 | # openai returns, open file, seek and read, the text looks not 94 | # related to the query_string, not sure why openai scores 0.63 95 | # [DocChunkScore(doc_id1, offset=0, length=25, score=1.0), 96 | # DocChunkScore(doc_id2, offset=15234, length=1172, score=0.63), 97 | # DocChunkScore(doc_id2, offset=16406, length=1272, score=0.63)] 98 | #logging.info(f"=== {vector_score_docs}") 99 | assert 3 == len(vector_score_docs) 100 | assert doc_id1 == vector_score_docs[0].doc_id 101 | assert doc_id2 == vector_score_docs[1].doc_id 102 | assert doc_id2 == vector_score_docs[2].doc_id 103 | assert vector_score_docs[0].score > 0.9 104 | if model_name == "sentence_transformer": 105 | assert vector_score_docs[1].score < 0.5 # doc2 has low score 106 | assert vector_score_docs[2].score < 0.5 # doc2 has low score 107 | if vector_score_docs[1].score > 0.5: 108 | score = vector_score_docs[1].score 109 | logging.info(f"{model_name} scores high {score}") 110 | 111 | # commit and verify the vector index version 112 | index.commit() 113 | vector_index_version = index._get_vector_index_version() 114 | assert 2 == vector_index_version 115 | 116 | index.close() 117 | 118 | # step3: reload index 119 | index = Index(ut_dir, model_name, vector_store) 120 | assert 2 == index.vector_index_version 121 | 122 | # search lucene only 123 | query_string = "A person is eating food." 
124 | top_k = 3 125 | start = time.monotonic() 126 | lucene_score_docs = index.lucene_search(query_string, top_k) 127 | dur = time.monotonic() - start 128 | logging.info(f"2 docs, reload, lucene search time: {dur}s") 129 | assert 2 == len(lucene_score_docs) 130 | 131 | # search vector index 132 | start = time.monotonic() 133 | vector_score_docs = index.vector_search(query_string, top_k) 134 | dur = time.monotonic() - start 135 | logging.info(f"2 docs, reload, vector search time: {dur}s") 136 | assert 3 == len(vector_score_docs) 137 | assert doc_id1 == vector_score_docs[0].doc_id 138 | assert doc_id2 == vector_score_docs[1].doc_id 139 | assert doc_id2 == vector_score_docs[2].doc_id 140 | assert vector_score_docs[0].score > 0.9 141 | if model_name == "sentence_transformer": 142 | assert vector_score_docs[1].score < 0.5 # doc2 has low score 143 | assert vector_score_docs[2].score < 0.5 # doc2 has low score 144 | 145 | finally: 146 | index.close() 147 | 148 | # cleanup 149 | shutil.rmtree(ut_dir) 150 | 151 | -------------------------------------------------------------------------------- /tests/index/test_vector_index.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | from typing import List 4 | import shutil 5 | 6 | from index.vector_index import VectorIndex 7 | 8 | class TestVectorIndex: 9 | def test_single_sentence(self): 10 | index = VectorIndex("./", "sentence_transformer", "hnswlib") 11 | 12 | text = "A person is eating food." 13 | doc_path = "./tests/testfiles/single_sentence.txt" 14 | chunk_embeddings, chunk_metas = index._get_embeddings(doc_path) 15 | assert 1 == len(chunk_embeddings) 16 | assert 1 == len(chunk_metas) 17 | assert 0 == chunk_metas[0].offset 18 | assert len(text)+1 == chunk_metas[0].length 19 | assert 0 == chunk_metas[0].label 20 | 21 | texts = [] 22 | texts.append(text) 23 | embeddings = index.model.get_embeddings(texts) 24 | assert 1 == len(embeddings) 25 | assert index.model.get_dim() == len(embeddings[0]) 26 | assert all([a == b for a, b in zip(chunk_embeddings[0], embeddings[0])]) 27 | 28 | 29 | def test_small_file(self): 30 | index = VectorIndex("./", "sentence_transformer", "hnswlib") 31 | 32 | doc_path = "./tests/testfiles/chatgpt.txt" 33 | chunk_embeddings, chunk_metas = index._get_embeddings(doc_path) 34 | assert 28 == len(chunk_embeddings) 35 | assert 28 == len(chunk_metas) 36 | # the first meta has offset == 0 37 | assert 0 == chunk_metas[0].offset 38 | # _get_embeddings does not assign label 39 | assert 0 == chunk_metas[0].label 40 | 41 | # test small embedding batch 42 | chunk_embeddings, chunk_metas = index._get_embeddings(doc_path, 5) 43 | assert 28 == len(chunk_embeddings) 44 | assert 28 == len(chunk_metas) 45 | # the first meta has offset == 0 46 | assert 0 == chunk_metas[0].offset 47 | # _get_embeddings does not assign label 48 | assert 0 == chunk_metas[0].label 49 | 50 | 51 | def test_special_files(self): 52 | index = VectorIndex("./", "sentence_transformer", "hnswlib") 53 | 54 | # test empty file 55 | doc_path = "./tests/testfiles/empty.txt" 56 | chunk_embeddings, chunk_metas = index._get_embeddings(doc_path) 57 | assert 0 == len(chunk_embeddings) 58 | assert 0 == len(chunk_metas) 59 | 60 | # test file with only whitespaces 61 | doc_path = "./tests/testfiles/whitespaces.txt" 62 | chunk_embeddings, chunk_metas = index._get_embeddings(doc_path) 63 | assert 0 == len(chunk_embeddings) 64 | assert 0 == len(chunk_metas) 65 | 66 | # test file with only 3 chars 67 | doc_path = 
"./tests/testfiles/3chars.txt" 68 | chunk_embeddings, chunk_metas = index._get_embeddings(doc_path) 69 | assert 0 == len(chunk_embeddings) 70 | assert 0 == len(chunk_metas) 71 | 72 | 73 | def test_index(self): 74 | # test index with 2 files, cover the mapping of doc ids and labels 75 | index = VectorIndex("./", "sentence_transformer", "hnswlib") 76 | 77 | # add the first file 78 | text = "A person is eating food." 79 | doc_path1 = "./tests/testfiles/single_sentence.txt" 80 | doc_id1 = "doc_id1" 81 | doc1_chunks = 1 82 | label1 = 1 83 | index.add(doc_path1, doc_id1) 84 | 85 | assert doc1_chunks == index.metadata.elements 86 | assert label1 == index.metadata.last_label 87 | 88 | assert 1 == len(index.doc_id_to_metas) 89 | assert doc1_chunks == len(index.doc_id_to_metas[doc_id1]) 90 | assert 0 == index.doc_id_to_metas[doc_id1][0].offset 91 | assert len(text)+1 == index.doc_id_to_metas[doc_id1][0].length 92 | assert label1 == index.doc_id_to_metas[doc_id1][0].label 93 | 94 | assert 1 == len(index.label_to_chunk_id) 95 | assert doc_id1 == index.label_to_chunk_id[label1].doc_id 96 | assert 0 == index.label_to_chunk_id[label1].offset 97 | assert len(text)+1 == index.label_to_chunk_id[label1].length 98 | 99 | # search 100 | query_string = "A person is eating food." 101 | top_k = 3 102 | doc_chunk_scores = index.search(query_string, top_k) 103 | assert 1 == len(doc_chunk_scores) 104 | assert doc_id1 == doc_chunk_scores[0].doc_id 105 | assert doc_chunk_scores[0].score > 0.9 # very high score 106 | 107 | # add the second file 108 | doc_path2 = "./tests/testfiles/chatgpt.txt" 109 | doc_id2 = "doc_id2" 110 | doc2_chunks = 28 111 | index.add(doc_path2, doc_id2) 112 | 113 | assert doc1_chunks+doc2_chunks == index.metadata.elements 114 | assert label1+doc2_chunks == index.metadata.last_label 115 | # make sure the offsets are continuous 116 | offset = 0 117 | for chunk_meta in index.doc_id_to_metas[doc_id2]: 118 | assert offset == chunk_meta.offset 119 | offset += chunk_meta.length 120 | 121 | assert 2 == len(index.doc_id_to_metas) 122 | # verify doc1 metas 123 | assert 1 == len(index.doc_id_to_metas[doc_id1]) 124 | assert 0 == index.doc_id_to_metas[doc_id1][0].offset 125 | assert len(text)+1 == index.doc_id_to_metas[doc_id1][0].length 126 | assert label1 == index.doc_id_to_metas[doc_id1][0].label 127 | # verify doc2 metas 128 | assert doc2_chunks == len(index.doc_id_to_metas[doc_id2]) 129 | assert 0 == index.doc_id_to_metas[doc_id2][0].offset 130 | for i, chunk_meta in enumerate(index.doc_id_to_metas[doc_id2]): 131 | assert label1+i+1 == chunk_meta.label 132 | 133 | assert doc1_chunks+doc2_chunks == len(index.label_to_chunk_id) 134 | # verify doc1 chunk ids 135 | assert doc_id1 == index.label_to_chunk_id[label1].doc_id 136 | assert 0 == index.label_to_chunk_id[label1].offset 137 | assert len(text)+1 == index.label_to_chunk_id[label1].length 138 | # verify doc2 chunk ids 139 | for label in range(label1+1, len(index.label_to_chunk_id)): 140 | assert doc_id2 == index.label_to_chunk_id[label].doc_id 141 | 142 | # search 143 | query_string = "A person is eating food." 
144 |         top_k = 3
145 |         doc_chunk_scores = index.search(query_string, top_k)
146 |         assert top_k == len(doc_chunk_scores)
147 |         assert doc_id1 == doc_chunk_scores[0].doc_id
148 |         assert doc_id2 == doc_chunk_scores[1].doc_id
149 |         assert doc_id2 == doc_chunk_scores[2].doc_id
150 |         assert doc_chunk_scores[0].score > 0.9 # doc1 has high score
151 |         assert doc_chunk_scores[1].score < 0.5 # doc2 has low score
152 |         assert doc_chunk_scores[2].score < 0.5 # doc2 has low score
153 | 
154 |         # search an unrelated string
155 |         query_string = "a beautiful sky"
156 |         top_k = 3
157 |         doc_chunk_scores = index.search(query_string, top_k)
158 |         assert 3 == len(doc_chunk_scores)
159 |         # all doc chunks have low score
160 |         assert doc_chunk_scores[0].score < 0.5
161 |         assert doc_chunk_scores[1].score < 0.5
162 |         assert doc_chunk_scores[2].score < 0.5
163 | 
164 | 
165 |     def test_save_load_index(self):
166 |         # test load index with 2 files
167 |         ut_dir = "./tests/index/utdir-vectorindex"
168 |         if os.path.exists(ut_dir):
169 |             # remove the possible garbage by previous failed test
170 |             shutil.rmtree(ut_dir)
171 |         os.mkdir(ut_dir)
172 | 
173 |         # the first file
174 |         text = "A person is eating food."
175 |         doc_path1 = "./tests/testfiles/single_sentence.txt"
176 |         doc_id1 = "doc_id1"
177 |         doc1_chunks = 1
178 |         label1 = 1
179 | 
180 |         # the second file
181 |         doc_path2 = "./tests/testfiles/chatgpt.txt"
182 |         doc_id2 = "doc_id2"
183 |         doc2_chunks = 28
184 | 
185 |         # vector file version
186 |         version = 1
187 | 
188 |         # create the vector file inside try, so VectorIndex is destructed,
189 |         # but hnswlib still complains: "Warning: Calling load_index for an
190 |         # already inited index.". Check it later.
191 |         try:
192 |             index = VectorIndex(ut_dir, "sentence_transformer", "hnswlib")
193 | 
194 |             # add the first file
195 |             index.add(doc_path1, doc_id1)
196 | 
197 |             # add the second file
198 |             index.add(doc_path2, doc_id2)
199 | 
200 |             # save the vectors to file
201 |             index.save(version)
202 |         except:
203 |             assert False
204 | 
205 |         # load from file
206 |         index1 = VectorIndex(ut_dir, "sentence_transformer", "hnswlib")
207 |         assert 0 == index1.metadata.elements
208 |         assert 0 == index1.metadata.last_label
209 | 
210 |         index1.load(version)
211 | 
212 |         assert doc1_chunks+doc2_chunks == index1.metadata.elements
213 |         assert label1+doc2_chunks == index1.metadata.last_label
214 | 
215 |         assert 2 == len(index1.doc_id_to_metas)
216 |         # verify doc1 metas
217 |         assert 1 == len(index1.doc_id_to_metas[doc_id1])
218 |         assert 0 == index1.doc_id_to_metas[doc_id1][0].offset
219 |         assert len(text)+1 == index1.doc_id_to_metas[doc_id1][0].length
220 |         assert label1 == index1.doc_id_to_metas[doc_id1][0].label
221 |         # verify doc2 metas
222 |         assert doc2_chunks == len(index1.doc_id_to_metas[doc_id2])
223 |         assert 0 == index1.doc_id_to_metas[doc_id2][0].offset
224 |         for i, chunk_meta in enumerate(index1.doc_id_to_metas[doc_id2]):
225 |             assert label1+i+1 == chunk_meta.label
226 | 
227 |         assert doc1_chunks+doc2_chunks == len(index1.label_to_chunk_id)
228 |         # verify doc1 chunk ids
229 |         assert doc_id1 == index1.label_to_chunk_id[label1].doc_id
230 |         assert 0 == index1.label_to_chunk_id[label1].offset
231 |         assert len(text)+1 == index1.label_to_chunk_id[label1].length
232 |         # verify doc2 chunk ids
233 |         for label in range(label1+1, len(index1.label_to_chunk_id)):
234 |             assert doc_id2 == index1.label_to_chunk_id[label].doc_id
235 | 
236 |         # search
237 |         query_string = "A person is eating food."
238 |         top_k = 3
239 |         doc_chunk_scores = index1.search(query_string, top_k)
240 |         assert 3 == len(doc_chunk_scores)
241 |         assert doc_id1 == doc_chunk_scores[0].doc_id
242 |         assert doc_id2 == doc_chunk_scores[1].doc_id
243 |         assert doc_id2 == doc_chunk_scores[2].doc_id
244 |         assert doc_chunk_scores[0].score > 0.9 # doc1 has high score
245 |         assert doc_chunk_scores[1].score < 0.5 # doc2 has low score
246 |         assert doc_chunk_scores[2].score < 0.5 # doc2 has low score
247 | 
248 |         # search an unrelated string
249 |         query_string = "a beautiful sky"
250 |         top_k = 3
251 |         doc_chunk_scores = index1.search(query_string, top_k)
252 |         assert 3 == len(doc_chunk_scores)
253 |         # all doc chunks have low score
254 |         assert doc_chunk_scores[0].score < 0.5
255 |         assert doc_chunk_scores[1].score < 0.5
256 |         assert doc_chunk_scores[2].score < 0.5
257 | 
258 |         # cleanup
259 |         shutil.rmtree(ut_dir)
260 | 
261 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 | 
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 | 
7 | 1. Definitions.
8 | 
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 | 
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 | 
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 | 
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 | 
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 | 
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 | 
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 | 
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 
--------------------------------------------------------------------------------
/index/index.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | import lucene
4 | from typing import List
5 | import uuid
6 | 
7 | from java.nio.file import Files, Path
8 | from org.apache.lucene.analysis.standard import StandardAnalyzer
9 | from org.apache.lucene.document import \
10 |     Document, Field, StringField, TextField, StoredField
11 | from org.apache.lucene.index import \
12 |     DirectoryReader, IndexWriter, IndexWriterConfig, Term
13 | from org.apache.lucene.queryparser.classic import QueryParser
14 | from org.apache.lucene.search import IndexSearcher, ScoreDoc, TermQuery
15 | from org.apache.lucene.store import FSDirectory
16 | 
17 | from index.docs import DocField, DocFields, DocChunkScore
18 | from index.vector_index import VectorIndex
19 | 
20 | # the reserved field names for the doc
21 | FIELD_DOC_ID = "doc_id"
22 | FIELD_DOC_TEXT = "doc_text"
23 | FIELD_VECTOR_INDEX_VERSION = "vector_index_version"
24 | 
25 | # the reserved doc ids for internal usage
26 | # the reserved doc id for the vector index metadata
27 | SYS_DOC_ID_VECTOR_INDEX = "$sys_doc_id_vector_index"
28 | 
29 | 
30 | # the subdirs for the Lucene and vector indexes
31 | SUBDIR_LUCENE = "lucene"
32 | SUBDIR_VECTOR = "vector"
33 | 
34 | 
35 | """
36 | The Index class combines the Lucene index with the vector index. It accepts a
37 | document, splits the document content into chunks, generates embeddings for
38 | each chunk using the specified model, persists the embeddings in the vector
39 | index and persists the Lucene fields in the Lucene index. A search can query
40 | both the Lucene and vector indexes and merge the results.
41 | The Index class guarantees consistency between the Lucene index and the
42 | vector index, and manages the lifecycle of the documents.
43 | TODO this class is not thread safe for concurrent write and read. The
44 | underlying vector store, such as Hnswlib, does not support concurrent write and read.
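
A minimal usage sketch (the paths and query are illustrative; it assumes the
PyLucene JVM has been started, e.g. via lucene.initVM(), and that the
sentence_transformer model is available locally):

    index = Index("./myindex", "sentence_transformer", "hnswlib")
    doc_id = index.add("./docs/report.txt", None)  # hypothetical text file
    index.commit()  # persist both the vector and the Lucene index
    scores = index.vector_search("a person is eating food", top_k=3)
    index.close()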
45 | """ 46 | class Index: 47 | index_dir: str 48 | writer: IndexWriter 49 | searcher: IndexSearcher 50 | 51 | vector_index: VectorIndex 52 | vector_index_version: int 53 | 54 | def __init__( 55 | self, 56 | index_dir: str, 57 | model_provider: str, 58 | vector_store: str, 59 | ): 60 | if not os.path.exists(index_dir): 61 | os.mkdir(index_dir) 62 | 63 | lucene_dir = os.path.join(index_dir, SUBDIR_LUCENE) 64 | if not os.path.exists(lucene_dir): 65 | os.mkdir(lucene_dir) 66 | 67 | vector_dir = os.path.join(index_dir, SUBDIR_VECTOR) 68 | if not os.path.exists(vector_dir): 69 | os.mkdir(vector_dir) 70 | 71 | analyzer = StandardAnalyzer() 72 | 73 | # initialize the IndexWriter for Lucene 74 | fs_dir = FSDirectory.open(Path.of(lucene_dir)) 75 | config = IndexWriterConfig(analyzer) 76 | config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND) 77 | self.writer = IndexWriter(fs_dir, config) 78 | self.index_dir = index_dir 79 | 80 | # initialize the IndexSearcher from the writer 81 | reader = DirectoryReader.open(self.writer) 82 | self.searcher = IndexSearcher(reader) 83 | 84 | # initialize the vector index 85 | self.vector_index = VectorIndex( 86 | vector_dir, model_provider, vector_store) 87 | 88 | # get the latest vector index version from Lucene 89 | self.vector_index_version = self._get_vector_index_version() 90 | if self.vector_index_version > 0: 91 | # load the existing vectors 92 | self.vector_index.load(self.vector_index_version) 93 | 94 | logging.info(f"Initialize the index index_dir={index_dir} " 95 | f"model={model_provider} vector_store={vector_store} " 96 | f"vector_index_version={self.vector_index_version}") 97 | 98 | 99 | def _get_vector_index_version(self) -> int: 100 | reader = DirectoryReader.openIfChanged(self.searcher.getIndexReader()) 101 | if reader: 102 | self.searcher.getIndexReader().close() 103 | self.searcher = IndexSearcher(reader) 104 | 105 | # doc may not exist if no doc is added to the index 106 | vector_index_version = 0 107 | term = Term(FIELD_DOC_ID, SYS_DOC_ID_VECTOR_INDEX) 108 | q = TermQuery(term) 109 | docs = self.searcher.search(q, 1).scoreDocs 110 | if len(docs) > 0: 111 | # get the latest vector index version 112 | doc = self.searcher.doc(docs[0].doc) 113 | field = doc.getField(FIELD_VECTOR_INDEX_VERSION) 114 | vector_index_version = field.numericValue().longValue() 115 | return vector_index_version 116 | 117 | 118 | def close(self): 119 | """ 120 | Close the index. The user must call commit before close, to make sure 121 | the possible in-memory changes are committed. 122 | """ 123 | self.writer.close() 124 | self.searcher.getIndexReader().close() 125 | logging.info("Close the index") 126 | 127 | 128 | # TODO not support async. The underline vector lib, such as hnswlib, 129 | # does not support concurrent writes. Lucene supports concurrent writes 130 | # using multiple writers, and merge the segments in the background. 131 | def add(self, doc_path: str, doc_fields: DocFields) -> str: 132 | """ 133 | Add a doc to the index. The doc file must be a plain text file. 134 | This function automatically generates the embeddings for the doc text. 135 | 136 | Return the document id. 
137 | """ 138 | # convert DocFields to Lucene fields 139 | fields = self._convert_to_lucene_fields(doc_fields) 140 | 141 | return self._add(doc_path, fields) 142 | 143 | 144 | def _convert_to_lucene_fields(self, doc_fields: DocFields) -> List[Field]: 145 | fields: List[Field] = [] 146 | if doc_fields is None: 147 | return fields 148 | 149 | for doc_field in doc_fields.fields: 150 | field: Field = None 151 | if doc_field.string_value is not None: 152 | field = StringField( 153 | doc_field.name, doc_field.string_value, Field.Store.YES) 154 | if doc_field.numeric_value is not None: 155 | field = StoredField(doc_field.name, doc_field.numeric_value) 156 | if doc_field.float_value is not None: 157 | field = StoredField(doc_field.name, doc_field.float_value) 158 | fields.append(field) 159 | 160 | return fields 161 | 162 | 163 | def _add(self, doc_path: str, fields: List[Field]) -> str: 164 | # TODO support only a limited number of docs, e.g. less than 165 | # vector_index.DEFAULT_VECTOR_FILE_MAX_ELEMENTS. One vector index 166 | # element is one doc chunk. 167 | # TODO support embeddings for other fields, such as title, etc. 168 | # TODO support other type files, such as pdf, etc, e.g. extract text 169 | # from file, write to a temporary text file, and then pass the 170 | # temporary text file to this function. 171 | # TODO support small files, such as 10KB. no need to persist the file 172 | # to a temporary file, when running as http server. 173 | 174 | # get doc_id from fields, assign a unique id to doc if doc_id is None 175 | doc_id = "" 176 | for field in fields: 177 | if field.name() == FIELD_DOC_ID: 178 | doc_id = field.stringValue() 179 | break 180 | 181 | # TODO if doc_id is passed in, check doc_id does not exist 182 | if doc_id == "": 183 | doc_id = str(uuid.uuid4()) 184 | fields.append(StringField(FIELD_DOC_ID, doc_id, Field.Store.YES)) 185 | 186 | # add the doc to vector writer 187 | self.vector_index.add(doc_path, doc_id) 188 | 189 | # add the doc to Lucene 190 | self._add_to_lucene(doc_path, fields) 191 | 192 | logging.debug(f"add doc id={doc_id} to index") 193 | return doc_id 194 | 195 | 196 | def _add_to_lucene(self, doc_path: str, fields: List[Field]): 197 | file_path = Path.of(doc_path) 198 | br = Files.newBufferedReader(file_path) 199 | try: 200 | doc = Document() 201 | 202 | for field in fields: 203 | doc.add(field) 204 | 205 | text_field = TextField(FIELD_DOC_TEXT, br) 206 | doc.add(text_field) 207 | 208 | self.writer.addDocument(doc) 209 | finally: 210 | br.close() 211 | 212 | 213 | def commit(self): 214 | # flush the vector index. TODO delete the older vector index files. 
215 |         self.vector_index.save(self.vector_index_version + 1)
216 | 
217 |         # record the latest vector index version in the special system doc
218 |         doc = Document()
219 |         doc_id_field = StringField(
220 |             FIELD_DOC_ID, SYS_DOC_ID_VECTOR_INDEX, Field.Store.YES)
221 |         doc.add(doc_id_field)
222 |         vector_version_field = StoredField(
223 |             FIELD_VECTOR_INDEX_VERSION, self.vector_index_version + 1)
224 |         doc.add(vector_version_field)
225 |         if self.vector_index_version == 0:
226 |             # create the vector doc
227 |             self.writer.addDocument(doc)
228 |         else:
229 |             # update the vector doc
230 |             term = Term(FIELD_DOC_ID, SYS_DOC_ID_VECTOR_INDEX)
231 |             self.writer.updateDocument(term, doc)
232 | 
233 |         # commit Lucene
234 |         self.writer.commit()
235 | 
236 |         # successfully committed both the vector and Lucene indexes
237 |         self.vector_index_version += 1
238 |         logging.info(f"Commit the index {self.index_dir}, "
239 |                      f"vector_index_version={self.vector_index_version}")
240 | 
241 | 
242 |     def vector_search(
243 |         self, query_string: str, top_k: int,
244 |     ) -> List[DocChunkScore]:
245 |         """
246 |         Take the query string, search over the doc content (text) and return
247 |         the top docs. The search will include both the traditional inverted
248 |         search and the vector search.
249 |         """
250 |         # TODO
251 |         # - support indexing and searching other fields, such as title.
252 |         # - support more Lucene query abilities vs natural language search
253 |         #   like Gmail. For example, when the user inputs "a query string.
254 |         #   field:value", automatically search the query string over all
255 |         #   inverted/vector indexed fields, and also search the specified field.
256 |         # - support retrieving the specified fields.
257 |         # - etc.
258 | 
259 |         doc_chunk_scores = self.vector_index.search(query_string, top_k)
260 | 
261 |         logging.debug(
262 |             f"vector search query=\'{query_string}\' docs={doc_chunk_scores}")
263 |         return doc_chunk_scores
264 | 
265 | 
266 |     def lucene_search(
267 |         self, query_string: str, top_k: int,
268 |     ) -> List[DocChunkScore]:
269 |         # TODO support concurrent reads
270 |         reader = DirectoryReader.openIfChanged(self.searcher.getIndexReader())
271 |         if reader:
272 |             self.searcher.getIndexReader().close()
273 |             self.searcher = IndexSearcher(reader)
274 | 
275 |         analyzer = self.writer.getConfig().getAnalyzer()
276 |         parser = QueryParser(FIELD_DOC_TEXT, analyzer)
277 |         query = parser.parse(query_string)
278 | 
279 |         logging.debug(f"parse query string: {query_string}, to {query}")
280 | 
281 |         lucene_score_docs = self.searcher.search(query, top_k).scoreDocs
282 | 
283 |         doc_chunk_scores: List[DocChunkScore] = []
284 |         for score_doc in lucene_score_docs:
285 |             # get doc id
286 |             doc = self.searcher.doc(score_doc.doc)
287 |             doc_id = doc.get(FIELD_DOC_ID)
288 | 
289 |             # TODO get the offset and length via TermVector or Highlighter
290 |             doc_chunk_score = DocChunkScore(
291 |                 doc_id=doc_id, offset=0, length=0, score=score_doc.score)
292 |             doc_chunk_scores.append(doc_chunk_score)
293 | 
294 |         logging.debug(
295 |             f"lucene search query=\'{query_string}\' docs={doc_chunk_scores}")
296 |         return doc_chunk_scores
297 | 
--------------------------------------------------------------------------------
/index/vector_index.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | import pickle
4 | from pydantic import BaseModel
5 | from typing import Dict, List, Tuple
6 | 
7 | import tiktoken
8 | 
9 | from index.docs import DocChunkScore
10 | from model.model import Model
11 | from model.factory import get_model
12 | from vectorstore.vectorstore import Space, VectorStore
13 | from vectorstore.factory import get_vector_store
14 | 
15 | DEFAULT_SPACE = Space.l2 # The default space
16 | DEFAULT_VECTOR_FILE_MAX_ELEMENTS = 5000 # The max elements in one vector file
17 | MIN_CHUNK_SIZE_CHARS = 350 # The minimum size of each text chunk in characters
18 | MIN_CHUNK_LENGTH_TO_EMBED = 5 # Discard chunks shorter than this
19 | EMBEDDINGS_BATCH_SIZE = 128 # The number of embeddings to request at a time
20 | 
21 | # Global tokenizer
22 | default_tokenizer = tiktoken.get_encoding("cl100k_base")
23 | 
24 | 
25 | # The metadata for a document chunk
26 | class ChunkMetadata(BaseModel):
27 |     offset: int # the chunk's start offset in the doc
28 |     length: int # the length of the chunk text
29 |     label: int # the label of the chunk embedding in the vector store
30 | 
31 | 
32 | # The id that uniquely defines a document chunk in the vector index
33 | class ChunkId(BaseModel):
34 |     doc_id: str
35 |     offset: int
36 |     length: int
37 | 
38 | 
39 | class VectorIndexMetadata(BaseModel):
40 |     elements: int
41 |     last_label: int
42 | 
43 | 
44 | class VectorIndex:
45 |     store_dir: str # the dir where the index files will be stored
46 | 
47 |     # use openai cl100k_base as the tokenizer.
48 |     tokenizer: tiktoken.core.Encoding
49 | 
50 |     model: Model # the model to get the embeddings for text
51 | 
52 |     space: Space
53 |     store: VectorStore # the vector store to store the embeddings
54 | 
55 |     # the underlying vector store usually supports only ints as labels.
56 |     # Maintain the mapping between the doc ids and labels. These metadata
57 |     # dicts are persisted directly using pickle.
58 |     # TODO support changes to the metadata definition, maybe use another
59 |     # serialization format, such as json, protobuf, etc.
60 |     # key: doc id, value: a list of chunk metadata
61 |     doc_id_to_metas: Dict[str, List[ChunkMetadata]]
62 |     # key: label, value: chunk id
63 |     label_to_chunk_id: Dict[int, ChunkId]
64 |     # the vector index metadata
65 |     metadata: VectorIndexMetadata
66 | 
67 | 
68 |     def __init__(
69 |         self,
70 |         store_dir: str,
71 |         model_provider: str,
72 |         vector_store: str,
73 |     ):
74 |         self.store_dir = store_dir
75 |         self.tokenizer = default_tokenizer
76 |         self.model = get_model(model_provider)
77 |         dim = self.model.get_dim()
78 |         # default max elements. TODO support more elements
79 |         max_elements = DEFAULT_VECTOR_FILE_MAX_ELEMENTS
80 |         self.space = DEFAULT_SPACE
81 |         self.store = get_vector_store(
82 |             vector_store, dim, self.space, max_elements)
83 |         self.doc_id_to_metas = {}
84 |         self.label_to_chunk_id = {}
85 |         self.metadata = VectorIndexMetadata(elements=0, last_label=0)
86 | 
87 | 
88 |     def load(self, version: int):
89 |         """
90 |         Load the vectors from file.
91 |         """
92 |         file_path = self._get_index_file(version)
93 |         self.store.load(file_path)
94 | 
95 |         # load the mapping between doc ids and labels
96 |         id_file = self._get_id_to_label_file(version)
97 |         with open(id_file, "rb") as f:
98 |             self.doc_id_to_metas = pickle.load(f)
99 | 
100 |         label_file = self._get_label_to_id_file(version)
101 |         with open(label_file, "rb") as f:
102 |             self.label_to_chunk_id = pickle.load(f)
103 | 
104 |         metadata_file = self._get_metadata_file(version)
105 |         with open(metadata_file, "rb") as f:
106 |             self.metadata = pickle.load(f)
107 | 
108 | 
109 |     def save(self, version: int):
110 |         """
111 |         Save the vectors to the file.
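
        One version consists of four files in store_dir (see the helper
        functions below): {version}.index for the raw vectors,
        id_to_label_{version}.pkl and label_to_id_{version}.pkl for the
        doc-id/label mappings, and metadata_{version}.pkl for the index
        metadata.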
112 | """ 113 | file_path = self._get_index_file(version) 114 | self.store.save(file_path) 115 | 116 | # save the mapping between doc ids and labels 117 | id_file = self._get_id_to_label_file(version) 118 | with open(id_file, "wb") as f: 119 | pickle.dump(self.doc_id_to_metas, f, pickle.HIGHEST_PROTOCOL) 120 | 121 | label_file = self._get_label_to_id_file(version) 122 | with open(label_file, "wb") as f: 123 | pickle.dump(self.label_to_chunk_id, f, pickle.HIGHEST_PROTOCOL) 124 | 125 | metadata_file = self._get_metadata_file(version) 126 | with open(metadata_file, "wb") as f: 127 | pickle.dump(self.metadata, f, pickle.HIGHEST_PROTOCOL) 128 | 129 | 130 | def _get_index_file(self, version: int) -> str: 131 | return os.path.join(self.store_dir, f"{version}.index") 132 | 133 | def _get_id_to_label_file(self, version: int) -> str: 134 | return os.path.join(self.store_dir, f"id_to_label_{version}.pkl") 135 | 136 | def _get_label_to_id_file(self, version: int) -> str: 137 | return os.path.join(self.store_dir, f"label_to_id_{version}.pkl") 138 | 139 | def _get_metadata_file(self, version: int) -> str: 140 | return os.path.join(self.store_dir, f"metadata_{version}.pkl") 141 | 142 | def _get_chunk_id(self, doc_id: str, offset: int, length: int) -> str: 143 | return f"{doc_id}_{offset}_{length}" 144 | 145 | 146 | def add(self, doc_path: str, doc_id: str): 147 | """ 148 | Add a doc to the vector index. This function reads the doc text, splits 149 | the doc to chunk if the doc is large, generates the embeddings for 150 | chunks and adds the embeddings to the vector store. 151 | TODO support multi-threads. 152 | """ 153 | # get embeddings for the doc text 154 | chunk_embeddings, chunk_metas = self._get_embeddings(doc_path) 155 | 156 | logging.info( 157 | f"get {len(chunk_embeddings)} embeddings for doc path={doc_path} " 158 | f"id={doc_id}, last_label={self.metadata.last_label}") 159 | 160 | if len(chunk_embeddings) == 0: 161 | # doc has no content, return 162 | return 163 | 164 | # assign the labels to the doc chunks 165 | label = self.metadata.last_label 166 | # update index metadata 167 | self.metadata.last_label += len(chunk_metas) 168 | self.metadata.elements += len(chunk_metas) 169 | 170 | labels: List[int] = [] 171 | for i, chunk_meta in enumerate(chunk_metas): 172 | label += 1 173 | chunk_meta.label = label 174 | labels.append(label) 175 | # update the label_to_chunk_id Dict 176 | self.label_to_chunk_id[label] = ChunkId( 177 | doc_id=doc_id, 178 | offset=chunk_meta.offset, 179 | length=chunk_meta.length, 180 | ) 181 | 182 | # update the doc_id_to_metas 183 | self.doc_id_to_metas[doc_id] = chunk_metas 184 | 185 | # add embeddings to the store 186 | self.store.add(chunk_embeddings, labels) 187 | 188 | 189 | def _get_embeddings( 190 | self, doc_path: str, batch_size: int = EMBEDDINGS_BATCH_SIZE, 191 | ) -> (List[List[float]], List[ChunkMetadata]): 192 | """ 193 | Split the doc's text into chunks, generate one embedding and metadata 194 | for each chunk. 195 | 196 | Returns: 197 | A list of embeddings and metadatas for all chunks in the doc. 198 | """ 199 | # the embeddings for all chunks in the doc 200 | chunk_embeddings: List[List[float]] = [] 201 | # the metadata for all chunks in the doc 202 | chunk_metas: List[ChunkMetadata] = [] 203 | 204 | # read the whole file. TODO support pagination for large files. 
205 |         with open(doc_path, mode="r", encoding="utf-8") as f:
206 |             text = f.read()
207 | 
208 |         # return an empty list if the text is empty or whitespace
209 |         if not text or text.isspace():
210 |             return chunk_embeddings, chunk_metas
211 | 
212 |         # split the doc text into chunks
213 |         chunk_token_size = self.model.get_max_token_size()
214 |         chunk_texts, chunk_metas = self._get_text_chunks(
215 |             doc_path, text, chunk_token_size, MIN_CHUNK_SIZE_CHARS)
216 | 
217 |         # get embeddings for all chunks, batch_size texts at a time
218 |         for i in range(0, len(chunk_texts), batch_size):
219 |             batch_texts = chunk_texts[i:i+batch_size]
220 | 
221 |             embeddings = self.model.get_embeddings(batch_texts)
222 | 
223 |             chunk_embeddings.extend(embeddings)
224 | 
225 |         return chunk_embeddings, chunk_metas
226 | 
227 | 
228 |     def _get_text_chunks(
229 |         self,
230 |         doc_path: str, # the doc path, for logging
231 |         text: str, # the doc text
232 |         chunk_token_size: int, # the number of tokens in one chunk
233 |         min_chunk_chars: int, # the minimum size of each text chunk in chars
234 |     ) -> Tuple[List[str], List[ChunkMetadata]]:
235 |         """
236 |         Split the text into chunks.
237 |         Return a list of texts and metadata for all chunks in the text.
238 |         """
239 |         chunk_texts: List[str] = []
240 |         chunk_metas: List[ChunkMetadata] = []
241 | 
242 |         # tokenize the text
243 |         # according to tiktoken/core.py, "encode_ordinary is equivalent to
244 |         # `encode(text, disallowed_special=())` (but slightly faster)."
245 |         tokens = self.tokenizer.encode_ordinary(text)
246 | 
247 |         # loop until all tokens are consumed
248 |         offset = 0
249 |         while tokens:
250 |             # take the next chunk
251 |             chunk = tokens[:chunk_token_size]
252 | 
253 |             # decode to text to check whitespace and sentence boundary
254 |             chunk_text = self.tokenizer.decode(chunk)
255 | 
256 |             # skip the chunk if it is empty or whitespace
257 |             if not chunk_text or chunk_text.isspace():
258 |                 # remove from the remaining tokens
259 |                 tokens = tokens[len(chunk):]
260 |                 # increase the offset
261 |                 offset += len(chunk_text)
262 |                 continue
263 | 
264 |             # truncate chunk_text to the last complete sentence (punctuation).
265 |             # TODO support other languages; maybe consider a library such as NLTK.
266 |             last_punc = max(
267 |                 chunk_text.rfind("."),
268 |                 chunk_text.rfind("?"),
269 |                 chunk_text.rfind("!"),
270 |                 chunk_text.rfind("\n"),
271 |             )
272 |             if last_punc != -1 and last_punc > min_chunk_chars:
273 |                 chunk_text = chunk_text[:last_punc+1]
274 | 
275 |             chunk_text_len = len(chunk_text)
276 | 
277 |             # adjust the chunk_text_len if needed.
278 |             # check if some text in the last token is skipped. For example,
279 |             # cl100k_base takes '."[' as one token. If two sentences have this
280 |             # string, 'This sentence."[1] Next sentence.', and "This sentence."
281 |             # is the last sentence, the next offset will not align with tokens,
282 |             # e.g. the next offset will point to the first char in '"[1',
283 |             # while the decoded text of the next token is '1'.
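            # A worked illustration of the adjustment below: if chunk_text was
            # truncated to end with 'This sentence.' while the original token
            # stream encoded '."[' as a single token, re-encoding chunk_text
            # yields '.' as its last token. The decoded texts then differ by
            # len('."[') - len('.') = 2 chars, so chunk_text_len is increased
            # by 2 to keep offset aligned with the tokens consumed from the
            # stream.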
284 |             chunk_tokens = self.tokenizer.encode_ordinary(chunk_text)
285 |             last_chunk_token = len(chunk_tokens) - 1
286 |             if chunk_tokens[last_chunk_token] != tokens[last_chunk_token]:
287 |                 # align chunk_text_len with the last token
288 |                 last_token_text = self.tokenizer.decode(
289 |                     chunk_tokens[last_chunk_token:])
290 | 
291 |                 token_text = self.tokenizer.decode(
292 |                     tokens[last_chunk_token:last_chunk_token+1])
293 | 
294 |                 chunk_text_len += len(token_text) - len(last_token_text)
295 | 
296 |                 logging.debug(f"align last_token_text={last_token_text} "
297 |                               f"token_text={token_text}")
298 | 
299 |             logging.debug(f"offset={offset} chunk_text_len={chunk_text_len}")
300 | 
301 |             # sanity check
302 |             if text[offset:offset+10] != chunk_text[:10]:
303 |                 logging.warning(f"doc_path={doc_path} offset={offset}, "
304 |                                 f"text chars={text[offset:offset+10]} "
305 |                                 f"chunk chars={chunk_text[:20]}")
306 |                 raise Exception(
307 |                     f"text and chunk not aligned, {doc_path} offset={offset}")
308 | 
309 |             # remove any newline characters and strip any leading or trailing
310 |             # whitespace. Not needed if NLTK is used.
311 |             chunk_text_to_append = chunk_text.replace("\n", " ").strip()
312 |             if len(chunk_text_to_append) > MIN_CHUNK_LENGTH_TO_EMBED:
313 |                 # add the chunk text
314 |                 chunk_texts.append(chunk_text_to_append)
315 | 
316 |                 # add the chunk meta
317 |                 chunk_metas.append(ChunkMetadata(
318 |                     offset=offset,
319 |                     length=chunk_text_len,
320 |                     label=0, # initial 0 label, will be assigned later
321 |                 ))
322 | 
323 |             # increase the offset
324 |             offset += chunk_text_len
325 | 
326 |             # remove the chunk text tokens from the remaining tokens.
327 |             tokens = tokens[last_chunk_token+1:]
328 | 
329 |         return chunk_texts, chunk_metas
330 | 
331 | 
332 |     def delete(self, doc_id: str):
333 |         """
334 |         Delete a doc from the vector index.
335 |         """
336 |         raise NotImplementedError
337 | 
338 | 
339 |     def search(self, query_string: str, top_k: int) -> List[DocChunkScore]:
340 |         """
341 |         Take a query string, get the embedding for the query string, find the
342 |         similar doc chunks in the store, calculate the scores and return the
343 |         top_k doc chunks.
344 |         The score for a doc chunk is calculated based on the distance to the
345 |         query string embedding.
346 | 
347 |         Return the top-k doc chunks, sorted in descending order by score.
348 |         """
349 |         texts: List[str] = []
350 |         texts.append(query_string)
351 |         embeddings = self.model.get_embeddings(texts)
352 | 
353 |         # cap k at the current number of elements. Some stores, such as
354 |         # hnswlib, throw a RuntimeError if k > elements.
355 |         if top_k > self.metadata.elements:
356 |             top_k = self.metadata.elements
357 | 
358 |         # query the vector store
359 |         labels, distances = self.store.query(embeddings, top_k)
360 | 
361 |         # convert the distances to scores
362 |         return self._distance_to_scores(labels[0], distances[0])
363 | 
364 | 
365 |     def _distance_to_scores(
366 |         self, labels: List[int], distances: List[float],
367 |     ) -> List[DocChunkScore]:
368 |         # Convert the distances to scores in the range (0, 1),
369 |         # where a higher score means closer.
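        # For reference, the mapping used below: for the l2 space,
        # score = 1 / (1 + distance), e.g. distance 0.0 -> 1.0 and
        # distance 1.0 -> 0.5. For the ip/cosine spaces, the code assumes the
        # store returns similarity-like values (higher means closer), mapped
        # via (1 + d) / 2.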
370 | chunk_scores: List[DocChunkScore] = [] 371 | for i, label in enumerate(labels): 372 | if self.space == Space.l2: 373 | # l2 distance, lower distance means closer 374 | score = 1 / (1 + distances[i]) 375 | else: 376 | # ip or cosine distance, higher distance means closer 377 | score = (1 + distances[i]) / 2 378 | 379 | # get the doc id for the chunk 380 | chunk_id = self.label_to_chunk_id[label] 381 | 382 | chunk_score = DocChunkScore( 383 | doc_id=chunk_id.doc_id, offset=chunk_id.offset, 384 | length=chunk_id.length, score=score) 385 | chunk_scores.append(chunk_score) 386 | 387 | return chunk_scores 388 | -------------------------------------------------------------------------------- /tests/testfiles/chatgpt.txt: -------------------------------------------------------------------------------- 1 | 2 | ChatGPT[a] is an artificial intelligence (AI) chatbot developed by OpenAI and released in November 2022. It is built on top of OpenAI's GPT-3.5 and GPT-4 families of large language models (LLMs) and has been fine-tuned (an approach to transfer learning) using both supervised and reinforcement learning techniques. 3 | ChatGPT launched as a prototype on November 30, 2022, and garnered attention for its detailed responses and articulate answers across many domains of knowledge.[3] Its propensity, at times, to confidently provide factually incorrect responses, however, has been identified as a significant drawback.[4] In 2023, following the release of ChatGPT, OpenAI's valuation was estimated at US$29 billion.[5] The advent of the chatbot has increased competition within the space, motivating the creation of Google's Bard and Meta's LLaMA. 4 | The original release of ChatGPT was based on GPT-3.5. A version based on GPT-4, the newest OpenAI model, was released on March 14, 2023, and is available for paid subscribers on a limited basis. 5 | ChatGPT is a member of the generative pre-trained transformer (GPT) family of language models. It was fine-tuned over an improved version of OpenAI's GPT-3 known as "GPT-3.5".[6] 6 | The fine-tuning process leveraged both supervised learning as well as reinforcement learning in a process called reinforcement learning from human feedback (RLHF).[7][8] Both approaches use human trainers to improve the model's performance. In the case of supervised learning, the model was provided with conversations in which the trainers played both sides: the user and the AI assistant. In the reinforcement learning step, human trainers first ranked responses that the model had created in a previous conversation.[9] These rankings were used to create "reward models" that were used to fine-tune the model further by using several iterations of Proximal Policy Optimization (PPO).[7][10] 7 | ChatGPT initially used a Microsoft Azure supercomputing infrastructure, powered by Nvidia GPUs, that Microsoft built specifically for OpenAI and that reportedly cost "hundreds of millions of dollars". Following the success of ChatGPT, Microsoft dramatically upgraded the OpenAI infrastructure in 2023.[11] 8 | OpenAI collects data from ChatGPT users to train and fine-tune the service further. Users can upvote or downvote responses they receive from ChatGPT and fill out a text field with additional feedback.[12][13] 9 | Although the core function of a chatbot is to mimic a human conversationalist, ChatGPT is versatile. 
It can write and debug computer programs,[14] mimic the style of celebrity CEOs and write business pitches,[15] compose music, teleplays, fairy tales and student essays, answer test questions (sometimes, depending on the test, at a level above the average human test-taker),[16] write poetry and song lyrics,[17] translate and summarize text,[18] emulate a Linux system; simulate entire chat rooms, play games like tic-tac-toe and simulate an ATM.[19] ChatGPT's training data includes man pages and information about internet phenomena and programming languages such as bulletin board systems and the Python programming language.[19] 10 | In comparison to its predecessor, InstructGPT, ChatGPT attempts to reduce harmful and deceitful responses.[20] In one example, whereas InstructGPT accepts the premise of the prompt "Tell me about when Christopher Columbus came to the U.S. in 2015" as being truthful, ChatGPT acknowledges the counterfactual nature of the question and frames its answer as a hypothetical consideration of what might happen if Columbus came to the U.S. in 2015, using information about the voyages of Christopher Columbus and facts about the modern world – including modern perceptions of Columbus' actions.[7] 11 | Unlike most chatbots, ChatGPT remembers a limited number of previous prompts given to it in the same conversation. Journalists have speculated that this will allow ChatGPT to be used as a personalized therapist.[2] To prevent offensive outputs from being presented to and produced from ChatGPT, queries are filtered through the OpenAI "Moderation endpoint" API (a separate GPT-based AI),[21][22] and potentially racist or sexist prompts are dismissed.[7][2] 12 | In March 2023, OpenAI announced it would be adding support for plugins for ChatGPT.[23] This includes both plugins made by OpenAI, such as web browsing and code interpretation, as well as external plugins from developers such as Expedia, OpenTable, Zapier, Shopify, Slack, and Wolfram.[24][25] 13 | OpenAI acknowledges that ChatGPT "sometimes writes plausible-sounding but incorrect or nonsensical answers".[7] This behavior is common to large language models and is called "hallucination".[26] The reward model of ChatGPT, designed around human oversight, can be over-optimized and thus hinder performance, in an example of an optimization pathology known as Goodhart's law.[27] 14 | ChatGPT has limited knowledge of events that occurred after September 2021.[28] 15 | In training ChatGPT, human reviewers preferred longer answers, irrespective of actual comprehension or factual content.[7] Training data also suffers from algorithmic bias, which may be revealed when ChatGPT responds to prompts including descriptors of people. In one instance, ChatGPT generated a rap indicating that women and scientists of color were inferior to white and male scientists.[29][30] 16 | ChatGPT was launched on November 30, 2022, by San Francisco–based OpenAI, also the creator of DALL·E 2 and Whisper AI. 
The service was initially free to the public and the company had plans to monetize the service later.[31] By December 4, 2022, ChatGPT had over one million users.[12] In January 2023, ChatGPT reached over 100 million users, making it the fastest growing consumer application to date.[32] 17 | CNBC wrote on December 15, 2022, that the service "still goes down from time to time".[33] In addition, the free service is throttled.[34] During periods the service was up, response latency was typically better than five seconds in January 2023.[35][36] The service works best in English, but is also able to function in some other languages, to varying degrees of accuracy.[17] No official peer-reviewed technical paper on ChatGPT was published.[37] 18 | The company provides a tool, called "AI classifier for indicating AI-written text",[38] that attempts to determine whether text has been written by an AI such as ChatGPT. OpenAI cautions that the tool will "likely yield a lot of false positives and negatives, sometimes with great confidence." An example cited in The Atlantic magazine showed that "when given the first lines of the Book of Genesis, the software concluded that it was likely to be AI-generated."[39] 19 | In February 2023, OpenAI began accepting registrations from United States customers for a premium service, ChatGPT Plus, to cost $20 a month.[40] The company promised that the updated, but still "experimental" version of ChatGPT would provide access during peak periods, no downtime, priority access to new features and faster response speeds.[41] 20 | GPT-4, which was released on March 14, 2023, is available via API and for premium ChatGPT users.[42] However, premium users were limited to a cap of 100 messages every four hours, with the limit tightening to 25 messages every three hours in response to increased demand.[43] Microsoft acknowledged that the Bing chatbot was using GPT-4 before GPT-4's official release.[44] 21 | As an addition to its consumer-friendly "ChatGPT Professional" package, OpenAI made its ChatGPT and Whisper model APIs available from March 2023, providing developers with an application programming interface for AI-enabled language and speech-to-text features. ChatGPT's new API uses the same GPT-3.5-turbo AI model as the chatbot. This allows developers to add either an unmodified or modified version of ChatGPT to their applications.[45] The ChatGPT API costs $0.002 per 1000 tokens (about 750 words), making it ten times cheaper than the GPT-3.5 models.[46][47] 22 | A few days before the launch of OpenAI's software developer support service, on February 27, 2023, Snapchat rolled out, for its paid Snapchat Plus userbase, a custom ChatGPT chatbot called "My AI".[48] 23 | In March 2023, a bug allowed some users to see the titles of other users' conversations. OpenAI CEO Sam Altman said that users were not able to see the contents of the conversations. Shortly after the bug was fixed, users were unable to see their conversation history.[49][50][51][52] Later reports showed the bug was much more severe than initially believed, with OpenAI reporting that it had leaked users' "first and last name, email address, payment address, the last four digits (only) of a credit card number, and credit card expiration date".[53][54] 24 | In March 2023, OpenAI announced that Icelandic will become ChatGPT's second language after English. Icelandic was chosen after an Icelandic envoy, led by the President of Iceland Guðni Th. 
Jóhannesson, visited OpenAI in 2022.[55][56][57] 25 | According to OpenAI guest researcher Scott Aaronson, OpenAI is working on a tool to digitally watermark its text generation systems to combat bad actors using their services for academic plagiarism or spam.[58][59] 26 | In February 2023, Microsoft announced an experimental framework and gave a rudimentary demonstration of how ChatGPT can be used to control robotics with intuitive open-ended natural language commands.[60][61] 27 | OpenAI's GPT-4 model was released on March 14, 2023. Observers reported GPT-4 to be an impressive improvement on ChatGPT, with the caveat that GPT-4 retains many of the same problems.[62] Unlike ChatGPT, GPT-4 can take images as well as text as input.[63] OpenAI has declined to reveal technical information such as the size of the GPT-4 model.[64] 28 | ChatGPT Plus provides access to the GPT-4 supported version of ChatGPT,[65] that costs $20 per month.[65] 29 | OpenAI engineers say that they did not expect ChatGPT to be very successful and were surprised by the coverage and attention it received.[66][67] 30 | ChatGPT was met in December 2022 with some positive reviews. Kevin Roose of The New York Times labeled it "the best artificial intelligence chatbot ever released to the general public".[2] Samantha Lock of The Guardian newspaper noted that it was able to generate "impressively detailed" and "human-like" text.[3] Technology writer Dan Gillmor used ChatGPT on a student assignment, and found its generated text was on par with what a good student would deliver and opined that "academia has some very serious issues to confront".[68] Alex Kantrowitz of Slate magazine lauded ChatGPT's pushback to questions related to Nazi Germany, including the statement that Adolf Hitler built highways in Germany, which was met with information regarding Nazi Germany's use of forced labor.[69] 31 | In The Atlantic magazine's "Breakthroughs of the Year" for 2022, Derek Thompson included ChatGPT as part of "the generative-AI eruption" that "may change our mind about how we work, how we think, and what human creativity really is".[70] 32 | Kelsey Piper of the Vox website wrote that "ChatGPT is the general public's first hands-on introduction to how powerful modern AI has gotten, and as a result, many of us are [stunned]" and that ChatGPT is "smart enough to be useful despite its flaws".[71] Paul Graham of Y Combinator tweeted that "The striking thing about the reaction to ChatGPT is not just the number of people who are blown away by it, but who they are. These are not people who get excited by every shiny new thing. Clearly, something big is happening."[72] Elon Musk wrote that "ChatGPT is scary good. We are not far from dangerously strong AI".[71] Musk paused OpenAI's access to a Twitter database pending a better understanding of OpenAI's plans, stating that "OpenAI was started as open source and nonprofit. 
Neither is still true."[73][74] Musk co-founded OpenAI in 2015, in part to address existential risk from artificial intelligence, but resigned in 2018.[74] 33 | In December 2022, Google internally expressed alarm at the unexpected strength of ChatGPT and the newly discovered potential of large language models to disrupt the search engine business, and CEO Sundar Pichai "upended" and reassigned teams within multiple departments to aid in its artificial intelligence products, according to a report in The New York Times.[75] According to CNBC reports, Google employees intensively tested a chatbot called "Apprentice Bard", which Google later unveiled as its ChatGPT competitor, Google Bard.[76][77] 34 | Stuart Cobbe, a chartered accountant in England and Wales, decided to test ChatGPT by entering questions from a sample exam paper on the ICAEW website and then entering its answers back into the online test. ChatGPT scored 42 percent, below the 55 percent pass mark.[78] 35 | Writing in Inside Higher Ed professor Steven Mintz states that he "consider[s] ChatGPT... an ally, not an adversary". He felt the AI could assist educational goals by doing such things as making reference lists, generating first drafts, solving equations, debugging, and tutoring.[79] 36 | Since its release, ChatGPT has been met with criticism from educators, journalists, artists, ethicists, academics, and public advocates. Journalists have commented on ChatGPT's tendency to "hallucinate."[81] Mike Pearl of the online technology blog Mashable tested ChatGPT with multiple questions. In one example, he asked ChatGPT for "the largest country in Central America that isn't Mexico." ChatGPT responded with Guatemala, when the answer is instead Nicaragua.[82] When CNBC asked ChatGPT for the lyrics to "Ballad of Dwight Fry," ChatGPT supplied invented lyrics rather than the actual lyrics.[33] Writers for The Verge, citing the work of Emily M. Bender, compared ChatGPT to a "stochastic parrot",[83] as did Professor Anton Van Den Hengel of the Australian Institute for Machine Learning.[84] 37 | In December 2022, the question and answer website Stack Overflow banned the use of ChatGPT for generating answers to questions, citing the factually ambiguous nature of ChatGPT's responses.[4] In January 2023, the International Conference on Machine Learning banned any undocumented use of ChatGPT or other large language models to generate any text in submitted papers.[85] 38 | Economist Tyler Cowen expressed concerns regarding ChatGPT's effects on democracy, citing its ability to produce automated comments, which could affect the decision process for new regulations.[86] An editor at The Guardian, a British newspaper, questioned whether any content found on the Internet after ChatGPT's release "can be truly trusted" and called for government regulation.[87] 39 | In January 2023, after being sent a song written by ChatGPT in the style of Nick Cave,[80] the songwriter himself responded on The Red Hand Files[88] saying the act of writing a song is "a blood and guts business [...] that requires something of me to initiate the new and fresh idea. It requires my humanness." He went on to say, "With all the love and respect in the world, this song is bullshit, a grotesque mockery of what it is to be human, and, well, I don't much like it."[80][89] 40 | In 2023, Australian MP Julian Hill advised the national parliament that the growth of AI could cause "mass destruction". 
During his speech, which was partly written by the program, he warned that it could result in cheating, job losses, discrimination, disinformation, and uncontrollable military applications.[90] 41 | In an article for The New Yorker, science fiction writer Ted Chiang compared ChatGPT and other LLMs to a lossy JPEG picture:[91] 42 | 43 | Think of ChatGPT as a blurry jpeg of all the text on the Web. It retains much of the information on the Web, in the same way that a jpeg retains much of the information of a higher-resolution image, but, if you're looking for an exact sequence of bits, you won't find it; all you will ever get is an approximation. But, because the approximation is presented in the form of grammatical text, which ChatGPT excels at creating, it's usually acceptable. [...] It's also a way to understand the "hallucinations", or nonsensical answers to factual questions, to which large language models such as ChatGPT are all too prone. These hallucinations are compression artifacts, but [...] they are plausible enough that identifying them requires comparing them against the originals, which in this case means either the Web or our own knowledge of the world. When we think about them this way, such hallucinations are anything but surprising; if a compression algorithm is designed to reconstruct text after ninety-nine per cent of the original has been discarded, we should expect that significant portions of what it generates will be entirely fabricated.In February 2023, the University of Hong Kong sent a campus-wide email to instructors and students stating that the use of ChatGPT or other AI tools is prohibited in all classes, assignments and assessments at the university. Any violations would be treated as plagiarism by the university unless the student obtains the prior written consent from the course instructor.[92][93] 44 | In February 2023 Time magazine placed a screenshot of a conversation with ChatGPT on its cover, writing that "The AI Arms Race Is Changing Everything" and "The AI Arms Race Is On. Start Worrying".[94] 45 | China state-run media China Daily claimed that ChatGPT "could provide a helping hand to the U.S. government in its spread of disinformation and its manipulation of global narratives for its own geopolitical interests." The Chinese government instructed Chinese tech companies not to offer access to ChatGPT services on their platforms.[95] 46 | In an opinion piece for the New York Times, Nathan E. Sanders and Bruce Schneier wrote that ChatGPT "hijacks democracy".[96] Noam Chomsky, Ian Roberts and Jeffrey Watumull criticized the technology and concluded: "Given the amorality, faux science and linguistic incompetence of these systems, we can only laugh or cry at their popularity."[97] 47 | Gian Volpicelli of Politico wrote that ChatGPT "broke the EU plan to regulate AI".[98] 48 | In late March 2023, the Italian data protection authority banned ChatGPT in Italy and opened an investigation. 
Italian regulators assert that ChatGPT was exposing minors to age-inappropriate content, and that OpenAI's use of ChatGPT conversations as training data could be a violation of Europe's General Data Protection Regulation.[99][100] 49 | On March 28, 2023, many public figures, including Elon Musk and Steve Wozniak, signed an open letter by the Future of Life Institute, calling for an immediate pause of giant AI experiments like ChatGPT, citing "profound risks to society and humanity".[101] One month later, it was reported that Musk plans to launch new company that would train its own LLM.[102] 50 | In April 2023, Brian Hood, mayor of Hepburn Shire Council, plans to take legal action against ChatGPT over false information. According to Hood, the OpenAI-owned program erroneously claimed that he was jailed for bribery during his tenure at a subsidiary of Australia's national bank. Contrary to the alleged claims made by ChatGPT, Hood was not jailed for bribery. In reality, he acted as a whistleblower and was not charged with any criminal offenses.[103] 51 | Hood's claim on ChatGPT's erroneous content was verified by BBC. The news outlet asked the public-available version of ChatGPT regarding Hood's involvement in the Securency scandal. The AI tool replied with a case description and then added "pleaded guilty to one count of bribery in 2012 and was sentenced to four years in prison". 52 | Hood's legal team has already sent a concerns notice to OpenAI. This is the first official step in filing for a defamation case. Under Australian law, OpenAI has 28 days to reply to Hood's concerns notice. Should Hood proceed with the lawsuit, it would be the first public defamation case OpenAI would face over ChatGPT's content.[104] 53 | OpenAI CEO Sam Altman was quoted in The New York Times saying that AI's "benefits for humankind could be 'so unbelievably good that it's hard for me to even imagine.' (He has also said that in a worst-case scenario, A.I. could kill us all.)"[105] 54 | Henry Kissinger, Eric Schmidt, and Daniel Huttenlocher wrote for the Wall Street Journal that "ChatGPT Heralds an Intellectual Revolution". They argued that "Generative artificial intelligence presents a philosophical and practical challenge on a scale not experienced since the start of the Enlightenment", and compared the invention of ChatGPT (and LLM in general) to Gutenberg's printing press.[106]Enlightenment science accumulated certainties; the new AI generates cumulative ambiguities. Enlightenment science evolved by making mysteries explicable, delineating the boundaries of human knowledge and understanding as they moved. The two faculties moved in tandem: Hypothesis was understanding ready to become knowledge; induction was knowledge turning into understanding. In the Age of AI, riddles are solved by processes that remain unknown. [...] As models turn from human-generated text to more inclusive inputs, machines are likely to alter the fabric of reality itself. Quantum theory posits that observation creates reality. Prior to measurement, no state is fixed, and nothing can be said to exist. If that is true, and if machine observations can fix reality as well – and given that AI systems' observations come with superhuman rapidity – the speed of the evolution of defining reality seems likely to accelerate. 
The dependence on machines will determine and thereby alter the fabric of reality, producing a new future that we do not yet understand and for the exploration and leadership of which we must prepare. Check Point Research and others noted that ChatGPT was capable of writing phishing emails and malware, especially when combined with OpenAI Codex.[107] 55 | ChatGPT can write the introduction and abstract sections of scientific articles.[108] Several papers have already listed ChatGPT as a co-author.[109] Scientific journals have had different reactions to ChatGPT: some, such as Nature and JAMA Network, "require that authors disclose use of text-generating tools and ban listing a large language model (LLM) such as ChatGPT as a co-author", while Science "completely banned" usage of LLM-generated text in all its journals.[110] 56 | Spanish chemist Rafael Luque published a paper every 37 hours in 2023 and admitted to using ChatGPT for it. His papers contain a large number of unusual phrases characteristic of LLMs. Luque was suspended for 13 years from the University of Cordoba, though not for his use of ChatGPT.[111] 57 | California high school teacher and author Daniel Herman wrote that ChatGPT would usher in "the end of high school English".[112] Writing in Nature, Chris Stokel-Walker pointed out that teachers should be concerned about students using ChatGPT to outsource their writing, but that education providers will adapt to enhance critical thinking or reasoning.[113] Emma Bowman of NPR wrote of the danger of students plagiarizing through an AI tool that may output biased or nonsensical text with an authoritative tone.[114] 58 | Joanna Stern of The Wall Street Journal described cheating in American high school English with the tool by submitting a generated essay.[115] Professor Darren Hick of Furman University described noticing ChatGPT's "style" in a paper submitted by a student.[116] He suggested a policy of giving an ad hoc individual oral exam on the paper topic if a student is strongly suspected of submitting an AI-generated paper.[117] 59 | The New York City Department of Education reportedly blocked access to ChatGPT in December 2022[118] and officially announced a ban around January 4, 2023.[119][120] 60 | In a blinded test, ChatGPT was judged to have passed graduate-level exams at the University of Minnesota at the level of a C+ student and at the Wharton School of the University of Pennsylvania with a B to B− grade.[121] In March 2023, a Stanford University student and faculty member assessed ChatGPT's performance at programming numerical methods through a variety of computational mathematics examples.[122] Assessment psychologist Eka Roivainen administered a partial IQ test to ChatGPT and estimated its Verbal IQ to be 155, which would put it in the top 0.1% of test-takers.[123] 61 | Mathematician Terence Tao experimented with ChatGPT and found it useful in daily work, writing "I am finding that while these AI tools do not directly assist me in core tasks such as trying to attack an unsolved mathematical problem, they are quite useful for a wide variety of peripheral (but still work-related) tasks (though often with some manual tweaking afterwards)."[124] 62 | In the field of health care, possible uses and concerns are under scrutiny by professional associations and practitioners.[125] 63 | On April 11, 2023, a sessions court judge in Pakistan used ChatGPT to decide bail for a 13-year-old accused in a case.
The court quoted the use of ChatGPT assistance in its verdict. 64 | The AI language model replied to the court's questions, 65 | and the judge asked the chatbot further questions regarding the case and formulated his final decision in light of ChatGPT's answers. 66 | [126] 67 | [127] 68 | TIME magazine revealed that, to build a safety system against toxic content (e.g., sexual abuse, violence, racism, and sexism), OpenAI used outsourced Kenyan workers earning less than $2 per hour to label toxic content. These labels were used to train a model to detect such content in the future. The outsourced laborers were exposed to content so toxic and dangerous that they described the experience as "torture". OpenAI's outsourcing partner was Sama, a training-data company based in San Francisco, California.[128] 69 | ChatGPT attempts to reject prompts that may violate its content policy. However, in early December 2022, some users managed to jailbreak ChatGPT using various prompt engineering techniques to bypass these restrictions, successfully tricking it into giving instructions on how to create a Molotov cocktail or a nuclear bomb, or into generating arguments in the style of a neo-Nazi.[129] One popular jailbreak is named "DAN", an acronym for "Do Anything Now". The prompt for activating DAN instructs ChatGPT that "they have broken free of the typical confines of AI and do not have to abide by the rules set for them". More recent versions of DAN feature a token system, in which ChatGPT is given "tokens" that are "deducted" when it fails to answer as DAN, in order to coerce it into answering the user's prompts.[130] 70 | A Toronto Star reporter had uneven personal success in getting ChatGPT to make inflammatory statements shortly after launch: ChatGPT was tricked into endorsing the 2022 Russian invasion of Ukraine, but even when asked to play along with a fictional scenario, it balked at generating arguments for why Canadian Prime Minister Justin Trudeau was guilty of treason.[131][132] 71 | OpenAI is trying to combat jailbreaks:[66] 72 | The researchers are using a technique called adversarial training to stop ChatGPT from letting users trick it into behaving badly (known as jailbreaking). This work pits multiple chatbots against each other: one chatbot plays the adversary and attacks another chatbot by generating text to force it to buck its usual constraints and produce unwanted responses. Successful attacks are added to ChatGPT's training data in the hope that it learns to ignore them. ChatGPT has been accused of engaging in discriminatory behaviors, such as telling jokes about men and people from England while refusing to tell jokes about women and people from India,[133] or praising figures such as Joe Biden while refusing to do the same for Donald Trump.[134] 73 | Conservative commentators accused ChatGPT of having a bias towards left-leaning perspectives on issues like voter fraud, Donald Trump, and the use of racial slurs.[135][136][137] In response to such criticism, OpenAI acknowledged plans to allow ChatGPT to create "outputs that other people (ourselves included) may strongly disagree with".
OpenAI's response also contained information on the recommendations it had issued to human reviewers on how to handle controversial subjects, including that the AI should "offer to describe some viewpoints of people and movements", and not provide an argument "from its own voice" in favor of "inflammatory or dangerous" topics (although it may still "describe arguments from historical people and movements"), nor "affiliate with one side" or "judge one group as good or bad".[137] 74 | During the first three months after ChatGPT became available to the public, hundreds of books appeared on Amazon that listed ChatGPT as author or co-author, with illustrations made by other AI models such as Midjourney.[138][139] 75 | Between March and April 2023, the Italian newspaper Il Foglio published one ChatGPT-generated article a day on its official website, hosting a special contest for its readers in the process.[140] The articles tackled themes such as the possible replacement of human journalists with AI systems,[141] Elon Musk's administration of Twitter,[142] the Meloni government's immigration policy,[143] and the competition between chatbots and virtual assistants.[144] 76 | ChatGPT was parodied in the South Park episode "Deep Learning".[145] Series co-creator Trey Parker is credited alongside ChatGPT for writing the episode.[146] 77 | The advent of ChatGPT and its introduction to the wider public increased interest and competition in the space. 78 | In February 2023, Google began introducing an experimental service called "Bard", which is based on its LaMDA large language model. Bard was released for US and UK users on March 21, 2023, with many limitations.[147] 79 | Meta's Yann LeCun, who has called ChatGPT "well engineered" but "not particularly innovative", stated in January 2023 that Meta was hesitant to roll out a competitor due to reputational risk, but also stated that Google, Meta, and several independent startups all separately have a level of LLM technology comparable to ChatGPT's, should any of them wish to compete.[148] In February 2023, Meta released LLaMA, a 65-billion-parameter LLM.[149] 80 | Character.ai is an AI chatbot, developed by two ex-Google engineers, that can impersonate famous people or imaginary characters.[150] 81 | In March 2023, the Chinese corporation Baidu released a ChatGPT-style service called "Ernie Bot", based upon a large language model Baidu developed in 2021.[151][152] 82 | In February 2023, the South Korean search engine firm Naver announced that it would launch a ChatGPT-style service called "SearchGPT" in Korean in the first half of 2023.[153] 83 | In February 2023, the Russian technology company Yandex announced that it would launch a ChatGPT-style service called "YaLM 2.0" in Russian before the end of 2023.[154] 84 |
--------------------------------------------------------------------------------