├── index ├── __init__.py ├── docs.py ├── index.py └── vector_index.py ├── model ├── __init__.py ├── factory.py ├── model.py └── providers │ ├── openai_embedding.py │ └── sentence_transformer_model.py ├── server ├── __init__.py ├── api.py └── server.py ├── vectorstore ├── __init__.py ├── factory.py ├── vectorstore.py └── providers │ └── hnswlib_store.py ├── tests ├── testfiles │ ├── empty.txt │ ├── 3chars.txt │ ├── whitespaces.txt │ ├── single_sentence.txt │ ├── download.py │ └── chatgpt.txt ├── openai │ ├── test_index.py │ └── test_model_embedding.py ├── model │ └── test_sentence_transformer.py ├── vectorstore │ └── test_hnswlib.py └── index │ ├── test_index.py │ └── test_vector_index.py ├── pytest.ini ├── main.py ├── pyproject-gpt.toml ├── pyproject.toml ├── Makefile ├── Dockerfile ├── example └── cli.py ├── README.md └── LICENSE /index/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /server/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vectorstore/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/testfiles/empty.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/testfiles/3chars.txt: -------------------------------------------------------------------------------- 1 | foo 2 | -------------------------------------------------------------------------------- /tests/testfiles/whitespaces.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /tests/testfiles/single_sentence.txt: -------------------------------------------------------------------------------- 1 | A person is eating food. 
2 | 
--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | # pytest.ini
2 | [pytest]
3 | log_cli=true
4 | log_level=INFO
5 | log_cli_format = %(asctime)s [%(levelname)s] %(filename)s:%(lineno)d - %(message)s
6 | log_cli_date_format = %Y-%m-%d %H:%M:%S
--------------------------------------------------------------------------------
/tests/openai/test_index.py:
--------------------------------------------------------------------------------
1 | import os
2 | import lucene
3 | import pytest
4 | 
5 | 
6 | from tests.index.test_index import IndexAndSearchTest
7 | 
8 | class TestIndexWithOpenAIAdaModel:
9 |     def test_index(self):
10 |         t = IndexAndSearchTest()
11 |         t.index_docs_and_search("./tests/openai/", "openai_embedding", "hnswlib")
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from server.server import start
3 | 
4 | if __name__ == '__main__':
5 |     parser = argparse.ArgumentParser()
6 |     parser.add_argument("--host", type=str, default="0.0.0.0")
7 |     parser.add_argument("--port", type=int, default=8080)
8 |     args = parser.parse_args()
9 | 
10 |     start(host=args.host, port=args.port)
--------------------------------------------------------------------------------
/tests/testfiles/download.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | 
4 | url='https://en.wikipedia.org/wiki/ChatGPT'
5 | 
6 | # save text from the url to a txt file
7 | with open(url.split('/')[-1]+".txt", "w", encoding="UTF-8") as f:
8 |     # get the text from the URL using BeautifulSoup
9 |     soup = BeautifulSoup(requests.get(url).text, "html.parser")
10 | 
11 |     # save paragraphs
12 |     for i in soup.select('p'):
13 |         f.write(i.get_text())
--------------------------------------------------------------------------------
/vectorstore/factory.py:
--------------------------------------------------------------------------------
1 | from vectorstore.vectorstore import VectorStore
2 | 
3 | def get_vector_store(
4 |     store_name: str, dim: int, space: str, max_elements: int,
5 | ) -> VectorStore:
6 |     match store_name:
7 |         case "hnswlib":
8 |             from vectorstore.providers.hnswlib_store import HnswlibStore
9 |             return HnswlibStore(dim, space, max_elements)
10 |         case _:
11 |             raise ValueError(f"Unsupported vector store: {store_name}")
12 | 
--------------------------------------------------------------------------------
/model/factory.py:
--------------------------------------------------------------------------------
1 | from model.model import Model
2 | 
3 | def get_model(provider: str) -> Model:
4 |     match provider:
5 |         case "openai_embedding":
6 |             from model.providers.openai_embedding import OpenAIEmbeddingModel
7 |             return OpenAIEmbeddingModel()
8 |         case "sentence_transformer":
9 |             from model.providers.sentence_transformer_model import \
10 |                 SentenceTransformerModel
11 |             return SentenceTransformerModel()
12 |         case _:
13 |             raise ValueError(f"Unsupported model provider: {provider}")
14 | 
--------------------------------------------------------------------------------
/pyproject-gpt.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "veclucene"
3 | version = "0.1.0"
4 | description = ""
5 | authors = ["Jun Luo"]
6 | readme = "README.md"
7 | 
8 | [tool.poetry.dependencies]
9 | python = "^3.11"
10 | fastapi = "^0.95.1"
11 | uvicorn = "^0.22.0"
12 | pydantic = "^1.10.7"
13 | tenacity = "^8.2.2"
14 | requests = "^2.30.0"
15 | python-multipart = "^0.0.6"
16 | argparse = "^1.4.0"
17 | openai = "^0.27.6"
18 | tiktoken = "^0.3.3"
19 | hnswlib = "^0.7.0"
20 | numpy = "^1.24.3"
21 | 
22 | [tool.poetry.group.dev.dependencies]
23 | pytest = "^7.3.1"
24 | 
25 | [build-system]
26 | requires = ["poetry-core"]
27 | build-backend = "poetry.core.masonry.api"
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "veclucene"
3 | version = "0.1.0"
4 | description = ""
5 | authors = ["Jun Luo"]
6 | readme = "README.md"
7 | 
8 | [tool.poetry.dependencies]
9 | python = "^3.11"
10 | fastapi = "^0.95.1"
11 | uvicorn = "^0.22.0"
12 | pydantic = "^1.10.7"
13 | tenacity = "^8.2.2"
14 | requests = "^2.30.0"
15 | python-multipart = "^0.0.6"
16 | argparse = "^1.4.0"
17 | openai = "^0.27.6"
18 | tiktoken = "^0.3.3"
19 | hnswlib = "^0.7.0"
20 | numpy = "^1.24.3"
21 | sentence-transformers = "^2.2.2"
22 | 
23 | [tool.poetry.group.dev.dependencies]
24 | pytest = "^7.3.1"
25 | 
26 | [build-system]
27 | requires = ["poetry-core"]
28 | build-backend = "poetry.core.masonry.api"
--------------------------------------------------------------------------------
/server/api.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 | from pydantic import BaseModel
3 | from typing import List, Optional
4 | 
5 | from index.docs import DocChunkScore
6 | 
7 | class QueryType(str, Enum):
8 |     vector = "vector"
9 |     lucene = "lucene"
10 | 
11 | 
12 | class QueryRequest(BaseModel):
13 |     # the query string.
14 |     # for lucene search, input the string supported by Lucene QueryParser.
15 |     # for vector search, simply input a string. TODO support QueryParser.
16 |     query: str
17 |     query_type: Optional[QueryType] = QueryType.vector
18 |     top_k: Optional[int] = 3
19 | 
20 | 
21 | # for now, simply return DocChunkScore.
22 | # TODO include the matched text, e.g. for highlighting.
23 | # TODO add auto QA ability. For a question, the server automatically sends the
24 | # top_k chunk texts as context to the QA model, such as ChatGPT, and includes
25 | # the answer in the response.
26 | class QueryResponse(BaseModel):
27 |     doc_scores: List[DocChunkScore]
--------------------------------------------------------------------------------
31 | """ 32 | raise NotImplementedError 33 | -------------------------------------------------------------------------------- /index/docs.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from typing import List, Optional 3 | 4 | # TODO support more attributes, such as tokenize, DocValuesType, etc. 5 | class DocField(BaseModel): 6 | name: str # field name 7 | string_value: Optional[str] = None 8 | numeric_value: Optional[int] = None 9 | float_value: Optional[float] = None 10 | 11 | 12 | # All doc fields except the doc text. There are two reserved fields: 13 | # 1. The reserved field name for the doc text, "doc_text", defined by 14 | # index.FIELD_DOC_TEXT. This field name should not be used by application. 15 | # 2. The reserved field name for the doc id, "doc_id", defined by 16 | # index.FIELD_DOC_ID. If doc_id is not specified in the request, server will 17 | # automatically generate a unique id for the doc. 18 | # 19 | # For now, all fields are stored and not indexed. Only the doc contents are 20 | # indexed and also stored. TODO allow indexing more fields, such as title. 21 | class DocFields(BaseModel): 22 | fields: List[DocField] 23 | 24 | 25 | class DocChunkScore(BaseModel): 26 | doc_id: str 27 | offset: int 28 | length: int 29 | score: float 30 | 31 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | test: 3 | #PYTHONPATH=. pytest # or python3 -m pytest 4 | # add -s to print to console, add --log-cli-level=DEBUG to show debug logs 5 | python3 -m pytest tests/model/ 6 | python3 -m pytest tests/vectorstore/ 7 | python3 -m pytest tests/index/test_vector_index.py 8 | # This test reports 2000+ DeprecationWarning from pylucene. 9 | # Catch and filter warnings inside the test only ingore < 100 warnings. 10 | # most warnings are probably reported when JVM shuts down. Looks not able 11 | # to explicitly shutdown JVM in the test. So use disable-warnings for this 12 | # test only. 13 | python3 -m pytest --disable-warnings tests/index/test_index.py 14 | 15 | test_openai: 16 | # please export your openai key first, OPENAI_API_KEY=your_key 17 | python3 -m pytest tests/openai/test_model_embedding.py 18 | python3 -m pytest --disable-warnings tests/openai/test_index.py 19 | 20 | 21 | PYTHON := python3 22 | site_packages_path := $(shell $(PYTHON) -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])') 23 | coverage: 24 | coverage run --omit "$(site_packages_path)/*" -m pytest --disable-warnings 25 | coverage report --omit "$(site_packages_path)/*" 26 | -------------------------------------------------------------------------------- /tests/openai/test_model_embedding.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import pytest 4 | import time 5 | 6 | from model.factory import get_model 7 | 8 | class TestOpenAIEmbeddingModel(): 9 | def test_embeddings(self): 10 | m = get_model("openai_embedding") 11 | assert 256 == m.get_max_token_size() 12 | assert 1536 == m.get_dim() 13 | 14 | sentences = ['A person is eating food.', 15 | 'A person is eating a piece of bread.', 16 | 'A person is riding a horse.', 17 | 'A person is riding a white horse on an enclosed ground.'] 18 | 19 | # example run time on a MacBook. 
20 |         # run the test first time, get embeddings time: 0.48015683237463236s
21 |         # run the second time, get embeddings time: 0.25255241710692644s
22 |         start = time.monotonic()
23 |         embeddings = m.get_embeddings(sentences)
24 |         assert len(sentences) == len(embeddings)
25 |         assert m.get_dim() == len(embeddings[0])
26 |         dur = time.monotonic() - start
27 |         logging.info(f"openai_embedding, get embeddings time: {dur}s")
28 | 
29 |         with pytest.raises(NotImplementedError):
30 |             m.set_model("model", 1, 1)
--------------------------------------------------------------------------------
/vectorstore/vectorstore.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from enum import Enum
3 | from typing import List
4 | 
5 | class Space(str, Enum):
6 |     l2 = "l2" # L2/Euclidean
7 |     ip = "ip" # inner/dot product
8 |     # The embedding model usually generates normalized vectors. Cosine
9 |     # similarity is a dot product on normalized vectors, so usually there
10 |     # is no need to use cosine.
11 |     cosine = "cosine"
12 | 
13 | 
14 | class VectorStore(ABC):
15 |     @abstractmethod
16 |     def save(self, index_path: str):
17 |         """
18 |         Save the vectors to the file specified by index_path.
19 |         """
20 |         raise NotImplementedError
21 | 
22 |     @abstractmethod
23 |     def load(self, index_path: str):
24 |         """
25 |         Load the vectors from the file specified by index_path.
26 |         """
27 |         raise NotImplementedError
28 | 
29 |     @abstractmethod
30 |     def add(self, embeddings: List[List[float]], labels: List[int]):
31 |         """
32 |         Add the embeddings and the corresponding labels.
33 |         """
34 |         raise NotImplementedError
35 | 
36 |     @abstractmethod
37 |     def query(
38 |         self,
39 |         embeddings: List[List[float]],
40 |         top_k: int = 1,
41 |     ) -> (List[List[int]], List[List[float]]):
42 |         """
43 |         Take one or more embeddings and return the top_k embedding ids and
44 |         distances for each embedding.
45 |         The distances are the original distances defined by the space, such as
46 |         L2, inner/dot product, etc. The vector store provider should return the
47 |         original distances.
48 |         """
49 |         raise NotImplementedError
--------------------------------------------------------------------------------
/model/providers/openai_embedding.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 | from openai import Embedding
3 | from tenacity import retry, wait_random_exponential, stop_after_attempt
4 | 
5 | from model.model import Model
6 | 
7 | class OpenAIEmbeddingModel(Model):
8 |     # https://platform.openai.com/docs/guides/embeddings/what-are-embeddings
9 |     model_name: str
10 |     max_token_size: int
11 |     dim: int
12 | 
13 |     def __init__(self):
14 |         self.model_name = "text-embedding-ada-002"
15 |         # what is the best token size? chatgpt-retrieval-plugin uses 200
16 |         self.max_token_size = 256
17 |         self.dim = 1536
18 | 
19 |     def get_max_token_size(self) -> int:
20 |         """
21 |         Return the max token for the text.
22 |         """
23 |         return self.max_token_size
24 | 
25 |     def get_dim(self) -> int:
26 |         """
27 |         Return the embedding dimension
28 |         """
29 |         return self.dim
30 | 
31 |     @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(3))
32 |     def get_embeddings(self, texts: List[str]) -> List[List[float]]:
33 |         """
34 |         Takes in a list of texts and returns a list of embeddings for each text.
35 | """ 36 | # Call the OpenAI API to get the embeddings 37 | response = Embedding.create(input=texts, model=self.model_name) 38 | 39 | # Extract the embedding data from the response 40 | data = response["data"] # type: ignore 41 | 42 | # Return the embeddings as a list of lists of floats 43 | return [result["embedding"] for result in data] 44 | 45 | def set_model(self, model_name: str, max_token_size: int, dim: int): 46 | """ 47 | Set to use the specified model. 48 | """ 49 | raise NotImplementedError 50 | 51 | -------------------------------------------------------------------------------- /vectorstore/providers/hnswlib_store.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List 3 | import hnswlib 4 | 5 | from vectorstore.vectorstore import Space, VectorStore 6 | 7 | class HnswlibStore(VectorStore): 8 | index: hnswlib.Index 9 | 10 | # hnswlib params 11 | dim: int 12 | space: Space # ip, l2, or cosine 13 | max_elements: int 14 | # M: int # max number of connections on upper layers 15 | # ef_construction: int # number of the nearest neighbors at index time 16 | # ef_search: int # number of the nearest neighbors to search 17 | 18 | def __init__(self, dim: int, space: Space, max_elements: int): 19 | self.index = hnswlib.Index(space, dim) 20 | self.index.init_index(max_elements) 21 | self.dim = dim 22 | self.max_elements = max_elements 23 | self.space = space 24 | 25 | def save(self, index_path: str): 26 | self.index.save_index(index_path) 27 | 28 | def load(self, index_path: str): 29 | self.index.load_index(index_path, self.max_elements) 30 | 31 | def add(self, embeddings: List[List[float]], labels: List[int]): 32 | self.index.add_items(embeddings, labels) 33 | 34 | def query( 35 | self, 36 | embeddings: List[List[float]], 37 | top_k: int = 1, 38 | ) -> (List[List[int]], List[List[float]]): 39 | """ 40 | Take one or more embeddings and return the top_k embedding labels and 41 | the original distances, defined by space, for each embedding. 42 | """ 43 | labels, distances = self.index.knn_query(embeddings, top_k) 44 | if self.space == Space.ip or self.space == Space.cosine: 45 | # https://github.com/nmslib/hnswlib returns a slightly different 46 | # distances, change back to the original distances. 47 | distances = 1.0 - distances 48 | 49 | return labels, distances 50 | -------------------------------------------------------------------------------- /model/providers/sentence_transformer_model.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from sentence_transformers import SentenceTransformer 4 | 5 | from model.model import Model 6 | 7 | class SentenceTransformerModel(Model): 8 | model: SentenceTransformer 9 | max_token_size: int 10 | dim: int 11 | 12 | def __init__(self): 13 | """ 14 | https://huggingface.co/blog/mteb, all-mpnet-base-v2 or all-MiniLM-L6-v2 15 | provide a good balance between speed and performance. 16 | 17 | https://www.sbert.net/docs/pretrained_models.html, test on a V100 GPU. 18 | all-mpnet-base-v2, model size 420MB, encoding speed 2800 sentence/s. 19 | all-MiniLM-L6-v2, model size 80MB, encoding speed 14200 sentence/s. 20 | """ 21 | # initialize with the default model 22 | self.model = SentenceTransformer('all-MiniLM-L6-v2') 23 | 24 | # https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2 25 | # By default, input text longer than 256 word pieces is truncated. 
26 | self.max_token_size = 256 27 | self.dim = 384 28 | 29 | def get_max_token_size(self) -> int: 30 | """ 31 | Return the max token for the text. 32 | """ 33 | # TODO depending on the tokenizer, 256 word pieces may not equal to 34 | # 256 tokens. 35 | return self.max_token_size 36 | 37 | def get_dim(self) -> int: 38 | """ 39 | Return the embedding dimension 40 | """ 41 | return self.dim 42 | 43 | def get_embeddings(self, texts: List[str]) -> List[List[float]]: 44 | """ 45 | Takes in a list of texts and returns a list of embeddings for each text. 46 | """ 47 | return self.model.encode(texts) 48 | 49 | def set_model(self, model_name: str, max_token_size: int, dim: int): 50 | """ 51 | Set to use the specified model. 52 | """ 53 | self.model = SentenceTransformer(model_name) 54 | self.max_token_size = max_token_size 55 | self.dim = dim 56 | 57 | -------------------------------------------------------------------------------- /tests/model/test_sentence_transformer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import pytest 4 | import time 5 | from typing import List 6 | import numpy as np 7 | 8 | from model.factory import get_model 9 | 10 | class TestSentTransformerModel(): 11 | def test_embeddings(self): 12 | """ 13 | simple measure the latency of different models on a MacBook M1Pro. 14 | python3 -m pytest -s tests/model/test_sentence_transformer.py 15 | default model load time: 1.4939462076872587s 16 | get embeddings time: 0.05871379096060991s 17 | all-mpnet-base-v2 model load time: 1.011457541026175s 18 | get embeddings time: 0.17692300025373697s 19 | """ 20 | start = time.monotonic() 21 | stmodel = get_model("sentence_transformer") 22 | assert 256 == stmodel.get_max_token_size() 23 | assert 384 == stmodel.get_dim() 24 | dur = time.monotonic() - start 25 | logging.info(f"\ndefault model load time: {dur}s") 26 | 27 | sentences = ['A person is eating food.', 28 | 'A person is eating a piece of bread.', 29 | 'A person is riding a horse.', 30 | 'A person is riding a white horse on an enclosed ground.'] 31 | 32 | start = time.monotonic() 33 | embeddings = stmodel.get_embeddings(sentences) 34 | assert len(sentences) == len(embeddings) 35 | assert stmodel.get_dim() == len(embeddings[0]) 36 | dur = time.monotonic() - start 37 | logging.info(f"get embeddings time: {dur}s") 38 | 39 | # https://huggingface.co/sentence-transformers/all-mpnet-base-v2 40 | start = time.monotonic() 41 | stmodel.set_model("all-mpnet-base-v2", 384, 768) 42 | assert 384 == stmodel.get_max_token_size() 43 | assert 768 == stmodel.get_dim() 44 | dur = time.monotonic() - start 45 | logging.info(f"all-mpnet-base-v2 model load time: {dur}s") 46 | 47 | start = time.monotonic() 48 | embeddings = stmodel.get_embeddings(sentences) 49 | assert len(sentences) == len(embeddings) 50 | assert stmodel.get_dim() == len(embeddings[0]) 51 | dur = time.monotonic() - start 52 | logging.info(f"get embeddings time: {dur}s") 53 | 54 | def test_unsupported_model(self): 55 | with pytest.raises(ValueError): 56 | get_model("unknown_model") 57 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | 2 | # get the required python packages 3 | FROM python:3.11-slim as requirements-stage 4 | 5 | ARG BUILD_GPT 6 | 7 | WORKDIR /tmp/poetry 8 | 9 | RUN pip install poetry 10 | 11 | COPY ./pyproject.toml ./poetry.lock /tmp/poetry 12 | COPY ./pyproject-gpt.toml 
./poetry-gpt.lock /tmp/poetry 13 | 14 | RUN if [ ! -z "$BUILD_GPT" ]; then \ 15 | mv pyproject-gpt.toml pyproject.toml && mv poetry-gpt.lock poetry.lock; \ 16 | else \ 17 | rm -f pyproject-gpt.toml poetry-gpt.lock; \ 18 | fi 19 | 20 | RUN poetry export -f requirements.txt --output requirements.txt --without-hashes 21 | 22 | 23 | # build 24 | FROM python:3.11-slim 25 | 26 | ARG BUILD_GPT 27 | 28 | # 1. build pylucene 29 | # building on mac, default jdk does not work for JCC on aarch64/arm64, 30 | # pylucene-9.4.1/jcc/setup.py line 197, LFLAGS does not have linux/aarch64. 31 | #RUN apt-get update && apt-get install -y default-jdk 32 | 33 | # https://lucene.apache.org/pylucene/jcc/install.html suggests installing temurin java 34 | RUN apt-get update && apt-get install -y wget apt-transport-https gnupg 35 | RUN wget -O - https://packages.adoptium.net/artifactory/api/gpg/key/public | apt-key add - 36 | RUN echo "deb https://packages.adoptium.net/artifactory/deb \ 37 | $(awk -F= '/^VERSION_CODENAME/{print$2}' /etc/os-release) main" \ 38 | | tee /etc/apt/sources.list.d/adoptium.list 39 | RUN apt-get update && apt-get install -y temurin-17-jdk 40 | 41 | RUN apt-get install -y build-essential 42 | 43 | # download and build pylucene 44 | WORKDIR /code/pylucene 45 | RUN wget -O - https://downloads.apache.org/lucene/pylucene/pylucene-9.4.1-src.tar.gz \ 46 | | tar -xz --strip-components=1 47 | RUN cd jcc \ 48 | && JCC_JDK=/usr/lib/jvm/$(ls /usr/lib/jvm) python setup.py build install 49 | RUN make all install JCC='python -m jcc --shared' PYTHON=python NUM_FILES=16 50 | #RUN make all test install JCC='python -m jcc --shared' PYTHON=python NUM_FILES=16 51 | 52 | WORKDIR /code 53 | RUN rm -rf pylucene 54 | 55 | 56 | # 2. install VecLucene python packages 57 | WORKDIR /code/VecLucene 58 | COPY . 
/code/VecLucene/
59 | 
60 | COPY --from=requirements-stage /tmp/poetry/requirements.txt /code/VecLucene/requirements.txt
61 | RUN pip install --no-cache-dir --upgrade -r /code/VecLucene/requirements.txt
62 | 
63 | ENV ENV_EMBEDDING_MODEL_PROVIDER=${BUILD_GPT:+openai_embedding}
64 | 
65 | EXPOSE 8080
66 | 
67 | CMD ["sh", "-c", "uvicorn server.server:app --host 0.0.0.0 --port 8080"]
--------------------------------------------------------------------------------
/example/cli.py:
--------------------------------------------------------------------------------
1 | 
2 | import argparse
3 | import requests
4 | import sys
5 | import time
6 | 
7 | def upload_file(url: str, file_path: str):
8 |     with open(file_path, 'rb') as f:
9 |         resp = requests.post(
10 |             url=url, files={'file': (f.name, f, "text/plain")})
11 |     print(resp.json())
12 | 
13 | 
14 | def upload_file_with_fields(url: str, file_path: str):
15 |     with open(file_path, 'rb') as f:
16 |         field1 = '{"name": "field1", "string_value": "str1"}'
17 |         field2 = '{"name": "field2", "numeric_value": 2}'
18 |         doc_fields = '{"fields": ' + f'[{field1}, {field2}]' + '}'
19 |         fields = {"fields": f'{doc_fields}'}
20 |         resp = requests.post(
21 |             url=url, files={'file': (f.name, f, "text/plain")}, data=fields)
22 |     print(resp.json())
23 | 
24 | 
25 | def commit(url: str):
26 |     resp = requests.post(url=url)
27 |     print(resp.json())
28 | 
29 | 
30 | def query(url: str, query_string: str, query_type: str):
31 |     query_request = '{' + f'"query": "{query_string}", ' + \
32 |         f'"query_type": "{query_type}"' + '}'
33 |     print(query_request)
34 |     resp = requests.get(url=url, data=query_request)
35 |     print(resp.json())
36 | 
37 | 
38 | if __name__ == '__main__':
39 |     parser = argparse.ArgumentParser()
40 |     parser.add_argument("--op", type=str, required=True)
41 |     parser.add_argument("--host", type=str, default="127.0.0.1")
42 |     parser.add_argument("--port", type=int, default=8080)
43 |     parser.add_argument("--file", type=str)
44 |     parser.add_argument("--query_string", type=str)
45 |     parser.add_argument("--query_type", type=str, default="vector",
46 |                         choices=["vector", "lucene"])
47 |     args = parser.parse_args()
48 | 
49 |     url = f"http://{args.host}:{args.port}"
50 |     match args.op:
51 |         case "upload":
52 |             if args.file is None:
53 |                 print("please input the text file path")
54 |                 sys.exit(1)
55 |             url += "/add_doc"
56 |             upload_file(url, args.file)
57 | 
58 |         case "commit":
59 |             url += "/commit"
60 |             commit(url)
61 | 
62 |         case "query":
63 |             if args.query_string is None:
64 |                 print("please input the query string")
65 |                 sys.exit(1)
66 |             url += "/query"
67 |             start = time.monotonic()
68 |             query(url, args.query_string, args.query_type)
69 |             dur = time.monotonic() - start
70 |             print(f"{args.query_type} query time: {dur}s")
71 | 
72 |         case _:
73 |             print("supported op: upload, commit, query")
74 | 
--------------------------------------------------------------------------------
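A side note on upload_file_with_fields above: the fields payload can also be built with json.dumps instead of string concatenation, which avoids quoting mistakes. A minimal sketch (same field shapes as in the function above; the server parses this form field with DocFields.parse_raw):

    import json

    doc_fields = {
        "fields": [
            {"name": "field1", "string_value": "str1"},
            {"name": "field2", "numeric_value": 2},
        ]
    }
    fields = {"fields": json.dumps(doc_fields)}
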
/server/server.py:
--------------------------------------------------------------------------------
1 | from contextlib import asynccontextmanager
2 | from fastapi import FastAPI, File, Form, HTTPException, Depends, Body, UploadFile
3 | import logging
4 | import lucene
5 | import mimetypes
6 | import os
7 | from typing import List, Optional
8 | import sys
9 | import uvicorn
10 | 
11 | from index.index import Index
12 | from index.docs import DocField, DocFields, DocChunkScore
13 | from server.api import QueryType, QueryRequest, QueryResponse
14 | 
15 | logging.basicConfig(stream=sys.stdout, level=logging.INFO)
16 | 
17 | # The embedding model provider: openai_embedding, sentence_transformer.
18 | # If the provider is set to openai_embedding, please remember to set OPENAI_API_KEY.
19 | ENV_EMBEDDING_MODEL_PROVIDER = os.environ.get("ENV_EMBEDDING_MODEL_PROVIDER")
20 | # The directory to store the lucene and vector index
21 | ENV_INDEX_DIR = os.environ.get("ENV_INDEX_DIR")
22 | 
23 | DEFAULT_EMBEDDING_MODEL_PROVIDER = "sentence_transformer"
24 | DEFAULT_INDEX_DIR = "./server_index_dir"
25 | 
26 | embedding_model = DEFAULT_EMBEDDING_MODEL_PROVIDER
27 | index_dir = DEFAULT_INDEX_DIR
28 | if ENV_EMBEDDING_MODEL_PROVIDER is not None \
29 |     and ENV_EMBEDDING_MODEL_PROVIDER != "":
30 |     embedding_model = ENV_EMBEDDING_MODEL_PROVIDER
31 | if ENV_INDEX_DIR is not None and ENV_INDEX_DIR != "":
32 |     index_dir = ENV_INDEX_DIR
33 | 
34 | # the sub directory under index_dir to store the doc content
35 | index_doc_dir = os.path.join(index_dir, "docs")
36 | 
37 | 
38 | def start(host: str, port: int):
39 |     uvicorn.run("server.server:app", host=host, port=port, reload=False)
40 | 
41 | 
42 | @asynccontextmanager
43 | async def lifespan(app: FastAPI):
44 |     # init Index
45 |     global index
46 |     lucene.initVM(vmargs=['-Djava.awt.headless=true'])
47 |     index = Index(index_dir=index_dir,
48 |                   model_provider=embedding_model,
49 |                   vector_store="hnswlib")
50 |     logging.info("start the index")
51 |     yield
52 |     # close the index
53 |     # TODO when stopping with Ctrl+C, close is not called; using the old
54 |     # shutdown event does not work either.
55 |     logging.info("close the index")
56 |     index.close()
57 | 
58 | app = FastAPI(lifespan=lifespan)
59 | 
60 | 
61 | # TODO support creating the index
62 | @app.post("/add_doc")
63 | async def add_doc(
64 |     file: UploadFile = File(...),
65 |     fields: Optional[str] = Form(None),
66 | ):
67 |     filename = file.filename
68 |     try:
69 |         # parse the fields
70 |         doc_fields: DocFields = None
71 |         if fields is not None:
72 |             doc_fields = DocFields.parse_raw(fields)
73 | 
74 |         # save the file text
75 |         doc_path = await save_file_text(file)
76 | 
77 |         # add file to index
78 |         doc_id = index.add(doc_path, doc_fields)
79 |         return doc_id
80 |     except Exception as e:
81 |         logging.error(f"add doc {filename} error: {e}")
82 |         raise HTTPException(status_code=500, detail=str(e))
83 | 
84 | 
85 | @app.post("/commit")
86 | async def commit():
87 |     index.commit()
88 | 
89 | 
90 | @app.get(
91 |     "/query",
92 |     response_model=QueryResponse,
93 | )
94 | async def query(
95 |     request: QueryRequest = Body(...),
96 | ):
97 |     try:
98 |         docs: List[DocChunkScore] = None
99 |         if request.query_type == QueryType.vector:
100 |             docs = index.vector_search(request.query, request.top_k)
101 |         else:
102 |             docs = index.lucene_search(request.query, request.top_k)
103 | 
104 |         return QueryResponse(doc_scores=docs)
105 |     except Exception as e:
106 |         logging.error(f"query {request.query} error: {e}")
107 |         raise HTTPException(status_code=500, detail=str(e))
108 | 
109 | 
110 | async def save_file_text(file: UploadFile) -> str:
111 |     """
112 |     Extract text from file and save under index_doc_dir.
113 |     Return the absolute file path saved under index_doc_dir.
114 |     """
115 |     # check file type. only support text file now.
116 |     mimetype = file.content_type
117 |     if mimetype is None:
118 |         mimetype, _ = mimetypes.guess_type(file.filename)
119 | 
120 |     if mimetype != "text/plain" and mimetype != "text/markdown":
121 |         raise ValueError(f"Unsupported file type: {mimetype}")
122 | 
123 |     # store the file text under index_doc_dir.
124 |     # TODO support other file types, extract the text from the file.
125 | # TODO for small files, directly store in Lucene. 126 | doc_path = os.path.join(index_doc_dir, file.filename) 127 | os.makedirs(os.path.dirname(doc_path), exist_ok=True) 128 | 129 | file_stream = await file.read() 130 | 131 | # TODO if file exists, update doc 132 | with open(doc_path, "wb") as f: 133 | f.write(file_stream) 134 | 135 | return doc_path 136 | 137 | -------------------------------------------------------------------------------- /tests/vectorstore/test_hnswlib.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import random 4 | from typing import List 5 | import numpy as np 6 | 7 | from vectorstore.factory import get_vector_store 8 | 9 | class TestHnswlib(): 10 | def test_save_empty_index(self): 11 | dim = 384 12 | max_elements = 1 13 | space = "cosine" 14 | store = get_vector_store("hnswlib", dim, space, max_elements) 15 | 16 | index_path = f"ut_empty_index.bin" 17 | store.save(index_path) 18 | 19 | assert os.path.exists(index_path) 20 | os.remove(index_path) 21 | 22 | 23 | def test_index_cosines_space(self): 24 | self.verify_index_spaces("cosine") 25 | 26 | def test_index_ip_space(self): 27 | self.verify_index_spaces("ip") 28 | 29 | def verify_index_spaces(self, space: str): 30 | dim = 16 31 | max_elements = 5 32 | store = get_vector_store("hnswlib", dim, space, max_elements) 33 | 34 | embeddings = np.float32(np.random.random((max_elements, dim))) 35 | labels = np.arange(max_elements) 36 | 37 | store.add(embeddings, labels) 38 | 39 | query_embeddings: List[List[float]] = [] 40 | query_embeddings.append(embeddings[0]) 41 | qlabels, distances = store.query( 42 | embeddings=query_embeddings, top_k=max_elements) 43 | 44 | assert 1 == len(qlabels) 45 | assert 1 == len(distances) 46 | # verify all elements are returned 47 | assert max_elements == len(qlabels[0]) 48 | assert max_elements == len(distances[0]) 49 | if space != "ip": 50 | # inner product is not an actual metric. 
An element can be closer 51 | # to some other element than to itself 52 | assert labels[0] == qlabels[0][0] 53 | qlabels[0].sort() 54 | assert all([a == b for a, b in zip(qlabels[0], labels)]) 55 | 56 | 57 | def test_save_load_index_l2_space(self): 58 | dim = 16 59 | max_elements = 5 60 | space = "l2" 61 | store = get_vector_store("hnswlib", dim, space, max_elements) 62 | 63 | embeddings = np.float32(np.random.random((max_elements, dim))) 64 | labels = np.arange(max_elements) 65 | 66 | store.add(embeddings, labels) 67 | 68 | qlabels, distances = store.query(embeddings=embeddings[0], top_k=1) 69 | assert 1 == len(qlabels) 70 | assert 1 == len(distances) 71 | assert 1 == len(qlabels[0]) 72 | assert 1 == len(distances[0]) 73 | assert labels[0] == qlabels[0][0] 74 | assert 0.0 == distances[0][0] 75 | 76 | query_embeddings: List[List[float]] = [] 77 | query_embeddings.append(embeddings[0]) 78 | qlabels, distances = store.query( 79 | embeddings=query_embeddings, top_k=max_elements) 80 | assert 1 == len(qlabels) 81 | assert 1 == len(distances) 82 | assert max_elements == len(qlabels[0]) 83 | assert max_elements == len(distances[0]) 84 | assert labels[0] == qlabels[0][0] 85 | # l2 equation, d = sum((Ai-Bi)^2), the distance of exact match is 0 86 | assert 0.0 == distances[0][0] 87 | qlabels[0].sort() 88 | assert all([a == b for a, b in zip(qlabels[0], labels)]) 89 | 90 | index_path = "ut_index.bin" 91 | store.save(index_path) 92 | 93 | store1 = get_vector_store("hnswlib", dim, space, max_elements) 94 | store1.load(index_path) 95 | 96 | qlabels, distances = store1.query(embeddings=embeddings[0], top_k=1) 97 | assert 1 == len(qlabels) 98 | assert 1 == len(distances) 99 | assert 1 == len(qlabels[0]) 100 | assert 1 == len(distances[0]) 101 | assert labels[0] == qlabels[0][0] 102 | assert 0.0 == distances[0][0] 103 | 104 | qlabels, distances = store1.query( 105 | embeddings=embeddings[0], top_k=max_elements) 106 | assert 1 == len(qlabels) 107 | assert 1 == len(distances) 108 | assert max_elements == len(qlabels[0]) 109 | assert max_elements == len(distances[0]) 110 | assert labels[0] == qlabels[0][0] 111 | assert 0.0 == distances[0][0] 112 | qlabels[0].sort() 113 | assert all([a == b for a, b in zip(qlabels[0], labels)]) 114 | 115 | os.remove(index_path) 116 | 117 | 118 | def test_negative_cases(self): 119 | dim = 384 120 | max_elements = 5 121 | space = "cosine" 122 | store = get_vector_store("hnswlib", dim, space, max_elements) 123 | 124 | # negative test: num_elements > max_elements 125 | num_elements = max_elements + 1 126 | embeddings = np.float32(np.random.random((num_elements, dim))) 127 | labels = np.arange(num_elements) 128 | 129 | with pytest.raises(RuntimeError): 130 | store.add(embeddings, labels) 131 | 132 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # VecLucene 2 | VecLucene is an open-source vector search engine library built on top of Lucene and popular ANN (approximate nearest neighbor) search libraries. Its purpose is to simplify the process of vector search for users. VecLucene introduces the following enhancements to Lucene: 3 | 4 | ## Open Models 5 | VecLucene currently supports [OpenAI's text-embedding-ada-002](https://platform.openai.com/docs/guides/embeddings/what-are-embeddings) and Sentence_Transformer models, [all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) by default, for embeddings. 
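For example, a model can be obtained programmatically through this repo's factory (a sketch; provider names as defined in `model/factory.py`):

```python
from model.factory import get_model

# "sentence_transformer" loads all-MiniLM-L6-v2 by default;
# "openai_embedding" uses text-embedding-ada-002 and requires OPENAI_API_KEY
model = get_model("sentence_transformer")
embeddings = model.get_embeddings(["A person is eating food."])
print(model.get_dim(), len(embeddings[0]))  # 384 384
```
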
It has a flexible framework to support additional models and can be further extended to accommodate custom models. Larger models generally result in higher latency, so the application can select the most suitable model based on its workload. The [HuggingFace MTEB: Massive Text Embedding Benchmark](https://huggingface.co/blog/mteb) measures the speed and performance of text embedding models. Note that MTEB was published on October 19, 2022, so it does not include the OpenAI embedding model.
6 | 
7 | In addition, VecLucene can be expanded to support question-answering (QA) functionality. Because VecLucene stores the document text, it can find the matched text chunks for a query and send them as context to models like ChatGPT to generate answers.
8 | 
9 | ## Open ANN libraries
10 | The default choice for VecLucene is [Hnswlib](https://github.com/nmslib/hnswlib). There are plans to support [Faiss](https://github.com/facebookresearch/faiss) and other libraries if necessary. This flexibility allows the application to choose the library that best aligns with its workload and requirements.
11 | 
12 | Lucene's KNN feature currently supports one embedding per document. However, for document text, a single embedding is often insufficient. One possible solution is to store the text chunks within a document as multiple Lucene documents, but this makes the inverted index for the document more complex.
13 | 
14 | ## Self-Managed Document Store
15 | With VecLucene, the application simply uploads the document, and VecLucene handles the rest. It automatically extracts text from the file (currently only plain text documents are supported), splits the text into chunks, calls the model to generate embeddings for each chunk, and persists the embeddings in the ANN library.
16 | 
17 | ## Hybrid Search
18 | VecLucene retains all of Lucene's existing abilities. The application can define multiple fields to store additional information for each document and use traditional Lucene queries to access these fields. For instance, the application can send a natural language query string with filters on other fields. VecLucene will generate an embedding for the query string, find similar documents in the ANN library, and filter out documents that don't meet the specified filtering conditions.
19 | 
20 | Furthermore, the text is indexed in Lucene using the traditional inverted index format. The application can choose to use either type of search, or even perform a hybrid search by combining the results of both inverted index search and ANN search. Inverted index search is generally faster, while ANN search provides a better understanding of semantic relationships.
21 | 
22 | ## Install
23 | VecLucene is built on top of PyLucene-9.4.1. Please follow the instructions in the [PyLucene Install guide](https://lucene.apache.org/pylucene/install.html) to install PyLucene. Note that after building jcc, you will need to edit the Makefile to set "PYTHON", "JCC" and "NUM_FILES" for your platform. Please make sure you have installed a JDK and GCC before building PyLucene. [JCC Install](https://lucene.apache.org/pylucene/jcc/install.html) suggests installing Temurin Java.
24 | 
25 | Other Python packages for VecLucene are managed by Poetry. You can use the "poetry export" command to create a requirements file and then install the packages with pip.
26 | 
27 | The Dockerfile is a good reference for how to build VecLucene.
28 | 
29 | ## Usage
30 | Once you have installed VecLucene, you can start it as an HTTP server by running `python main.py`. You can use the `example/cli.py` script to upload files, commit them, and query the server, as sketched below.
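For example, a minimal session against a running server (a sketch using `requests`, mirroring what `example/cli.py` does; the file path, host, and port are placeholders):

```python
import requests

base = "http://127.0.0.1:8080"

# upload a plain-text file, then commit so the index survives a restart
with open("path/to/localfile.txt", "rb") as f:
    files = {"file": ("localfile.txt", f, "text/plain")}
    print(requests.post(f"{base}/add_doc", files=files).json())
print(requests.post(f"{base}/commit").json())

# vector search; the /query route reads a JSON body, as example/cli.py does
body = '{"query": "a person is eating food", "query_type": "vector", "top_k": 3}'
print(requests.get(f"{base}/query", data=body).json())
```
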
31 | 
32 | If you prefer, you can skip the installation process and use the pre-built Docker container:
33 | 1. Pull the docker image using `docker pull junius/veclucene-arm64`. For the amd64 platform, pull `junius/veclucene-amd64`. Please note that the amd64 container image is much larger than the arm64 image, because the packages required to run SentenceTransformer on the amd64 platform are significantly larger. Alternatively, you can pull `junius/veclucene-gpt-amd64`, which only works with the OpenAI embedding model and is much smaller.
34 | 2. Run the container using `docker run -d --name vltest -p 127.0.0.1:8080:8080 junius/veclucene-arm64`, which uses the SentenceTransformer `all-MiniLM-L6-v2` model. To use the OpenAI embedding model, run `docker run -d --env ENV_EMBEDDING_MODEL_PROVIDER=openai_embedding --env OPENAI_API_KEY=xxx --name vltest -p 127.0.0.1:8080:8080 junius/veclucene-arm64`.
35 | 3. Use `python3 example/cli.py --op upload --file path/to/localfile` and `python3 example/cli.py --op query --query_string "xxx" --query_type "vector"` to upload files and query the server. To use the traditional inverted search, use `--query_type "lucene"`. Don't forget to run `python3 example/cli.py --op commit` before stopping the container to ensure that the index is committed and can be queried again later.
36 | 
37 | Please note that VecLucene is still at an early stage and has some limitations:
38 | 1. It only supports plain text files.
39 | 2. It is limited to 5000 embeddings.
40 | 3. The vector search does not parse the query string yet; it simply generates the embedding from the entire query string. For the inverted search, the query string is parsed using the Lucene parser.
--------------------------------------------------------------------------------
/tests/index/test_index.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | import lucene
4 | import pytest
5 | import time
6 | from typing import List
7 | import shutil
8 | 
9 | from index.docs import DocField, DocFields
10 | from index.index import Index
11 | 
12 | class TestSentenceTransformerWithIndex:
13 |     def test_index(self):
14 |         t = IndexAndSearchTest()
15 |         t.index_docs_and_search(
16 |             "./tests/index/", "sentence_transformer", "hnswlib")
17 | 
18 | class IndexAndSearchTest:
19 |     def index_docs_and_search(
20 |         self, base_dir: str, model_name: str, vector_store: str,
21 |     ):
22 |         ut_dir = os.path.join(base_dir, "utdir-index")
23 |         if os.path.exists(ut_dir):
24 |             # remove the possible garbage by previous failed test
25 |             shutil.rmtree(ut_dir)
26 |         os.mkdir(ut_dir)
27 | 
28 |         lucene.initVM(vmargs=['-Djava.awt.headless=true'])
29 | 
30 |         index = Index(ut_dir, model_name, vector_store)
31 | 
32 |         try:
33 |             # step1: add the first file
34 |             doc_path1 = "./tests/testfiles/single_sentence.txt"
35 |             fields: List[DocField] = []
36 |             pathField = DocField(name="path", string_value=doc_path1)
37 |             fields.append(pathField)
38 |             doc_fields = DocFields(fields=fields)
39 | 
40 |             doc_id1 = index.add(doc_path1, doc_fields)
41 | 
42 |             # search lucene
43 |             query_string = "A person is eating food."
44 | top_k = 3 45 | start = time.monotonic() 46 | lucene_score_docs = index.lucene_search(query_string, top_k) 47 | dur = time.monotonic() - start 48 | logging.info(f"1 doc, lucene search time: {dur}s") 49 | assert 1 == len(lucene_score_docs) 50 | assert doc_id1 == lucene_score_docs[0].doc_id 51 | 52 | # search vector index 53 | start = time.monotonic() 54 | vector_score_docs = index.vector_search(query_string, top_k) 55 | dur = time.monotonic() - start 56 | logging.info(f"1 doc, vector search time: {dur}s") 57 | assert 1 == len(vector_score_docs) 58 | assert doc_id1 == vector_score_docs[0].doc_id 59 | assert vector_score_docs[0].score > 0.9 60 | 61 | # commit and verify the vector index version 62 | index.commit() 63 | vector_index_version = index._get_vector_index_version() 64 | assert 1 == vector_index_version 65 | 66 | # step2: add the second file 67 | doc_path2 = "./tests/testfiles/chatgpt.txt" 68 | fields.clear() 69 | pathField = DocField(name="path", string_value=doc_path2) 70 | fields.append(pathField) 71 | doc_fields = DocFields(fields=fields) 72 | 73 | doc_id2 = index.add(doc_path2, doc_fields) 74 | 75 | # search lucene only 76 | query_string = "A person is eating food." 77 | top_k = 3 78 | start = time.monotonic() 79 | lucene_score_docs = index.lucene_search(query_string, top_k) 80 | dur = time.monotonic() - start 81 | logging.info(f"2 docs, lucene search time: {dur}s") 82 | assert 2 == len(lucene_score_docs) 83 | 84 | # search vector index 85 | start = time.monotonic() 86 | vector_score_docs = index.vector_search(query_string, top_k) 87 | dur = time.monotonic() - start 88 | logging.info(f"2 docs, vector search time: {dur}s") 89 | # sentence_transformer returns: 90 | # [DocChunkScore(doc_id1, offset=0, length=25, score=1.0), 91 | # DocChunkScore(doc_id2, offset=15234, length=1172, score=0.34), 92 | # DocChunkScore(doc_id2, offset=2219, length=1182, score=0.34)] 93 | # openai returns, open file, seek and read, the text looks not 94 | # related to the query_string, not sure why openai scores 0.63 95 | # [DocChunkScore(doc_id1, offset=0, length=25, score=1.0), 96 | # DocChunkScore(doc_id2, offset=15234, length=1172, score=0.63), 97 | # DocChunkScore(doc_id2, offset=16406, length=1272, score=0.63)] 98 | #logging.info(f"=== {vector_score_docs}") 99 | assert 3 == len(vector_score_docs) 100 | assert doc_id1 == vector_score_docs[0].doc_id 101 | assert doc_id2 == vector_score_docs[1].doc_id 102 | assert doc_id2 == vector_score_docs[2].doc_id 103 | assert vector_score_docs[0].score > 0.9 104 | if model_name == "sentence_transformer": 105 | assert vector_score_docs[1].score < 0.5 # doc2 has low score 106 | assert vector_score_docs[2].score < 0.5 # doc2 has low score 107 | if vector_score_docs[1].score > 0.5: 108 | score = vector_score_docs[1].score 109 | logging.info(f"{model_name} scores high {score}") 110 | 111 | # commit and verify the vector index version 112 | index.commit() 113 | vector_index_version = index._get_vector_index_version() 114 | assert 2 == vector_index_version 115 | 116 | index.close() 117 | 118 | # step3: reload index 119 | index = Index(ut_dir, model_name, vector_store) 120 | assert 2 == index.vector_index_version 121 | 122 | # search lucene only 123 | query_string = "A person is eating food." 
124 | top_k = 3 125 | start = time.monotonic() 126 | lucene_score_docs = index.lucene_search(query_string, top_k) 127 | dur = time.monotonic() - start 128 | logging.info(f"2 docs, reload, lucene search time: {dur}s") 129 | assert 2 == len(lucene_score_docs) 130 | 131 | # search vector index 132 | start = time.monotonic() 133 | vector_score_docs = index.vector_search(query_string, top_k) 134 | dur = time.monotonic() - start 135 | logging.info(f"2 docs, reload, vector search time: {dur}s") 136 | assert 3 == len(vector_score_docs) 137 | assert doc_id1 == vector_score_docs[0].doc_id 138 | assert doc_id2 == vector_score_docs[1].doc_id 139 | assert doc_id2 == vector_score_docs[2].doc_id 140 | assert vector_score_docs[0].score > 0.9 141 | if model_name == "sentence_transformer": 142 | assert vector_score_docs[1].score < 0.5 # doc2 has low score 143 | assert vector_score_docs[2].score < 0.5 # doc2 has low score 144 | 145 | finally: 146 | index.close() 147 | 148 | # cleanup 149 | shutil.rmtree(ut_dir) 150 | 151 | -------------------------------------------------------------------------------- /tests/index/test_vector_index.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | from typing import List 4 | import shutil 5 | 6 | from index.vector_index import VectorIndex 7 | 8 | class TestVectorIndex: 9 | def test_single_sentence(self): 10 | index = VectorIndex("./", "sentence_transformer", "hnswlib") 11 | 12 | text = "A person is eating food." 13 | doc_path = "./tests/testfiles/single_sentence.txt" 14 | chunk_embeddings, chunk_metas = index._get_embeddings(doc_path) 15 | assert 1 == len(chunk_embeddings) 16 | assert 1 == len(chunk_metas) 17 | assert 0 == chunk_metas[0].offset 18 | assert len(text)+1 == chunk_metas[0].length 19 | assert 0 == chunk_metas[0].label 20 | 21 | texts = [] 22 | texts.append(text) 23 | embeddings = index.model.get_embeddings(texts) 24 | assert 1 == len(embeddings) 25 | assert index.model.get_dim() == len(embeddings[0]) 26 | assert all([a == b for a, b in zip(chunk_embeddings[0], embeddings[0])]) 27 | 28 | 29 | def test_small_file(self): 30 | index = VectorIndex("./", "sentence_transformer", "hnswlib") 31 | 32 | doc_path = "./tests/testfiles/chatgpt.txt" 33 | chunk_embeddings, chunk_metas = index._get_embeddings(doc_path) 34 | assert 28 == len(chunk_embeddings) 35 | assert 28 == len(chunk_metas) 36 | # the first meta has offset == 0 37 | assert 0 == chunk_metas[0].offset 38 | # _get_embeddings does not assign label 39 | assert 0 == chunk_metas[0].label 40 | 41 | # test small embedding batch 42 | chunk_embeddings, chunk_metas = index._get_embeddings(doc_path, 5) 43 | assert 28 == len(chunk_embeddings) 44 | assert 28 == len(chunk_metas) 45 | # the first meta has offset == 0 46 | assert 0 == chunk_metas[0].offset 47 | # _get_embeddings does not assign label 48 | assert 0 == chunk_metas[0].label 49 | 50 | 51 | def test_special_files(self): 52 | index = VectorIndex("./", "sentence_transformer", "hnswlib") 53 | 54 | # test empty file 55 | doc_path = "./tests/testfiles/empty.txt" 56 | chunk_embeddings, chunk_metas = index._get_embeddings(doc_path) 57 | assert 0 == len(chunk_embeddings) 58 | assert 0 == len(chunk_metas) 59 | 60 | # test file with only whitespaces 61 | doc_path = "./tests/testfiles/whitespaces.txt" 62 | chunk_embeddings, chunk_metas = index._get_embeddings(doc_path) 63 | assert 0 == len(chunk_embeddings) 64 | assert 0 == len(chunk_metas) 65 | 66 | # test file with only 3 chars 67 | doc_path = 
"./tests/testfiles/3chars.txt" 68 | chunk_embeddings, chunk_metas = index._get_embeddings(doc_path) 69 | assert 0 == len(chunk_embeddings) 70 | assert 0 == len(chunk_metas) 71 | 72 | 73 | def test_index(self): 74 | # test index with 2 files, cover the mapping of doc ids and labels 75 | index = VectorIndex("./", "sentence_transformer", "hnswlib") 76 | 77 | # add the first file 78 | text = "A person is eating food." 79 | doc_path1 = "./tests/testfiles/single_sentence.txt" 80 | doc_id1 = "doc_id1" 81 | doc1_chunks = 1 82 | label1 = 1 83 | index.add(doc_path1, doc_id1) 84 | 85 | assert doc1_chunks == index.metadata.elements 86 | assert label1 == index.metadata.last_label 87 | 88 | assert 1 == len(index.doc_id_to_metas) 89 | assert doc1_chunks == len(index.doc_id_to_metas[doc_id1]) 90 | assert 0 == index.doc_id_to_metas[doc_id1][0].offset 91 | assert len(text)+1 == index.doc_id_to_metas[doc_id1][0].length 92 | assert label1 == index.doc_id_to_metas[doc_id1][0].label 93 | 94 | assert 1 == len(index.label_to_chunk_id) 95 | assert doc_id1 == index.label_to_chunk_id[label1].doc_id 96 | assert 0 == index.label_to_chunk_id[label1].offset 97 | assert len(text)+1 == index.label_to_chunk_id[label1].length 98 | 99 | # search 100 | query_string = "A person is eating food." 101 | top_k = 3 102 | doc_chunk_scores = index.search(query_string, top_k) 103 | assert 1 == len(doc_chunk_scores) 104 | assert doc_id1 == doc_chunk_scores[0].doc_id 105 | assert doc_chunk_scores[0].score > 0.9 # very high score 106 | 107 | # add the second file 108 | doc_path2 = "./tests/testfiles/chatgpt.txt" 109 | doc_id2 = "doc_id2" 110 | doc2_chunks = 28 111 | index.add(doc_path2, doc_id2) 112 | 113 | assert doc1_chunks+doc2_chunks == index.metadata.elements 114 | assert label1+doc2_chunks == index.metadata.last_label 115 | # make sure the offsets are continuous 116 | offset = 0 117 | for chunk_meta in index.doc_id_to_metas[doc_id2]: 118 | assert offset == chunk_meta.offset 119 | offset += chunk_meta.length 120 | 121 | assert 2 == len(index.doc_id_to_metas) 122 | # verify doc1 metas 123 | assert 1 == len(index.doc_id_to_metas[doc_id1]) 124 | assert 0 == index.doc_id_to_metas[doc_id1][0].offset 125 | assert len(text)+1 == index.doc_id_to_metas[doc_id1][0].length 126 | assert label1 == index.doc_id_to_metas[doc_id1][0].label 127 | # verify doc2 metas 128 | assert doc2_chunks == len(index.doc_id_to_metas[doc_id2]) 129 | assert 0 == index.doc_id_to_metas[doc_id2][0].offset 130 | for i, chunk_meta in enumerate(index.doc_id_to_metas[doc_id2]): 131 | assert label1+i+1 == chunk_meta.label 132 | 133 | assert doc1_chunks+doc2_chunks == len(index.label_to_chunk_id) 134 | # verify doc1 chunk ids 135 | assert doc_id1 == index.label_to_chunk_id[label1].doc_id 136 | assert 0 == index.label_to_chunk_id[label1].offset 137 | assert len(text)+1 == index.label_to_chunk_id[label1].length 138 | # verify doc2 chunk ids 139 | for label in range(label1+1, len(index.label_to_chunk_id)): 140 | assert doc_id2 == index.label_to_chunk_id[label].doc_id 141 | 142 | # search 143 | query_string = "A person is eating food." 
144 |         top_k = 3
145 |         doc_chunk_scores = index.search(query_string, top_k)
146 |         assert top_k == len(doc_chunk_scores)
147 |         assert doc_id1 == doc_chunk_scores[0].doc_id
148 |         assert doc_id2 == doc_chunk_scores[1].doc_id
149 |         assert doc_id2 == doc_chunk_scores[2].doc_id
150 |         assert doc_chunk_scores[0].score > 0.9 # doc1 has high score
151 |         assert doc_chunk_scores[1].score < 0.5 # doc2 has low score
152 |         assert doc_chunk_scores[2].score < 0.5 # doc2 has low score
153 | 
154 |         # search an unrelated string
155 |         query_string = "a beautiful sky"
156 |         top_k = 3
157 |         doc_chunk_scores = index.search(query_string, top_k)
158 |         assert 3 == len(doc_chunk_scores)
159 |         # all doc chunks have low score
160 |         assert doc_chunk_scores[0].score < 0.5
161 |         assert doc_chunk_scores[1].score < 0.5
162 |         assert doc_chunk_scores[2].score < 0.5
163 | 
164 | 
165 |     def test_save_load_index(self):
166 |         # test load index with 2 files
167 |         ut_dir = "./tests/index/utdir-vectorindex"
168 |         if os.path.exists(ut_dir):
169 |             # remove the possible garbage by previous failed test
170 |             shutil.rmtree(ut_dir)
171 |         os.mkdir(ut_dir)
172 | 
173 |         # the first file
174 |         text = "A person is eating food."
175 |         doc_path1 = "./tests/testfiles/single_sentence.txt"
176 |         doc_id1 = "doc_id1"
177 |         doc1_chunks = 1
178 |         label1 = 1
179 | 
180 |         # the second file
181 |         doc_path2 = "./tests/testfiles/chatgpt.txt"
182 |         doc_id2 = "doc_id2"
183 |         doc2_chunks = 28
184 | 
185 |         # vector file version
186 |         version = 1
187 | 
188 |         # create the vector file inside try, so VectorIndex is destructed,
189 |         # but hnswlib still complains: "Warning: Calling load_index for an
190 |         # already inited index.". Check it later.
191 |         try:
192 |             index = VectorIndex(ut_dir, "sentence_transformer", "hnswlib")
193 | 
194 |             # add the first file
195 |             index.add(doc_path1, doc_id1)
196 | 
197 |             # add the second file
198 |             index.add(doc_path2, doc_id2)
199 | 
200 |             # save the vectors to file
201 |             index.save(version)
202 |         except:
203 |             assert False
204 | 
205 |         # load from file
206 |         index1 = VectorIndex(ut_dir, "sentence_transformer", "hnswlib")
207 |         assert 0 == index1.metadata.elements
208 |         assert 0 == index1.metadata.last_label
209 | 
210 |         index1.load(version)
211 | 
212 |         assert doc1_chunks+doc2_chunks == index1.metadata.elements
213 |         assert label1+doc2_chunks == index1.metadata.last_label
214 | 
215 |         assert 2 == len(index1.doc_id_to_metas)
216 |         # verify doc1 metas
217 |         assert 1 == len(index1.doc_id_to_metas[doc_id1])
218 |         assert 0 == index1.doc_id_to_metas[doc_id1][0].offset
219 |         assert len(text)+1 == index1.doc_id_to_metas[doc_id1][0].length
220 |         assert label1 == index1.doc_id_to_metas[doc_id1][0].label
221 |         # verify doc2 metas
222 |         assert doc2_chunks == len(index1.doc_id_to_metas[doc_id2])
223 |         assert 0 == index1.doc_id_to_metas[doc_id2][0].offset
224 |         for i, chunk_meta in enumerate(index1.doc_id_to_metas[doc_id2]):
225 |             assert label1+i+1 == chunk_meta.label
226 | 
227 |         assert doc1_chunks+doc2_chunks == len(index1.label_to_chunk_id)
228 |         # verify doc1 chunk ids
229 |         assert doc_id1 == index1.label_to_chunk_id[label1].doc_id
230 |         assert 0 == index1.label_to_chunk_id[label1].offset
231 |         assert len(text)+1 == index1.label_to_chunk_id[label1].length
232 |         # verify doc2 chunk ids
233 |         for label in range(label1+1, len(index1.label_to_chunk_id)):
234 |             assert doc_id2 == index1.label_to_chunk_id[label].doc_id
235 | 
236 |         # search
237 |         query_string = "A person is eating food."
238 |         top_k = 3
239 |         doc_chunk_scores = index1.search(query_string, top_k)
240 |         assert 3 == len(doc_chunk_scores)
241 |         assert doc_id1 == doc_chunk_scores[0].doc_id
242 |         assert doc_id2 == doc_chunk_scores[1].doc_id
243 |         assert doc_id2 == doc_chunk_scores[2].doc_id
244 |         assert doc_chunk_scores[0].score > 0.9 # doc1 has high score
245 |         assert doc_chunk_scores[1].score < 0.5 # doc2 has low score
246 |         assert doc_chunk_scores[2].score < 0.5 # doc2 has low score
247 | 
248 |         # search an unrelated string
249 |         query_string = "a beautiful sky"
250 |         top_k = 3
251 |         doc_chunk_scores = index1.search(query_string, top_k)
252 |         assert 3 == len(doc_chunk_scores)
253 |         # all doc chunks have low score
254 |         assert doc_chunk_scores[0].score < 0.5
255 |         assert doc_chunk_scores[1].score < 0.5
256 |         assert doc_chunk_scores[2].score < 0.5
257 | 
258 |         # cleanup
259 |         shutil.rmtree(ut_dir)
260 | 
261 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 | 
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 | 
7 | 1. Definitions.
8 | 
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 | 
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 | 
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 | 
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 | 
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 | 
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 | 
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 | 
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 
--------------------------------------------------------------------------------
/index/index.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | import lucene
4 | from typing import List
5 | import uuid
6 | 
7 | from java.nio.file import Files, Path
8 | from org.apache.lucene.analysis.standard import StandardAnalyzer
9 | from org.apache.lucene.document import \
10 |     Document, Field, StringField, TextField, StoredField
11 | from org.apache.lucene.index import \
12 |     DirectoryReader, IndexWriter, IndexWriterConfig, Term
13 | from org.apache.lucene.queryparser.classic import QueryParser
14 | from org.apache.lucene.search import IndexSearcher, ScoreDoc, TermQuery
15 | from org.apache.lucene.store import FSDirectory
16 | 
17 | from index.docs import DocField, DocFields, DocChunkScore
18 | from index.vector_index import VectorIndex
19 | 
20 | # the reserved field names for the doc
21 | FIELD_DOC_ID = "doc_id"
22 | FIELD_DOC_TEXT = "doc_text"
23 | FIELD_VECTOR_INDEX_VERSION = "vector_index_version"
24 | 
25 | # the reserved doc ids for internal usage
26 | # the reserved doc id for the vector index metadata
27 | SYS_DOC_ID_VECTOR_INDEX = "$sys_doc_id_vector_index"
28 | 
29 | 
30 | # the subdirs for the Lucene and vector indexes
31 | SUBDIR_LUCENE = "lucene"
32 | SUBDIR_VECTOR = "vector"
33 | 
34 | 
35 | """
36 | The Index class combines the Lucene index with the vector index. It accepts a
37 | document, splits the document content into chunks, generates embeddings for
38 | each chunk using the specified model, persists the embeddings in the vector
39 | index and persists the Lucene fields in the Lucene index. A search can query
40 | both the Lucene and vector indexes and merge the results.
41 | The Index class guarantees consistency between the Lucene index and the
42 | vector index, and manages the lifecycle of the documents.
43 | TODO this class is not thread safe for concurrent write and read. The
44 | underlying vector store, such as Hnswlib, does not support concurrent write and read.
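
A minimal usage sketch (the paths and query are illustrative; it assumes the
PyLucene JVM has been started, e.g. via lucene.initVM(), and that the
sentence_transformer model is available locally):

    index = Index("./myindex", "sentence_transformer", "hnswlib")
    doc_id = index.add("./docs/report.txt", None)  # hypothetical text file
    index.commit()  # persist both the vector and the Lucene index
    scores = index.vector_search("a person is eating food", top_k=3)
    index.close()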
45 | """ 46 | class Index: 47 | index_dir: str 48 | writer: IndexWriter 49 | searcher: IndexSearcher 50 | 51 | vector_index: VectorIndex 52 | vector_index_version: int 53 | 54 | def __init__( 55 | self, 56 | index_dir: str, 57 | model_provider: str, 58 | vector_store: str, 59 | ): 60 | if not os.path.exists(index_dir): 61 | os.mkdir(index_dir) 62 | 63 | lucene_dir = os.path.join(index_dir, SUBDIR_LUCENE) 64 | if not os.path.exists(lucene_dir): 65 | os.mkdir(lucene_dir) 66 | 67 | vector_dir = os.path.join(index_dir, SUBDIR_VECTOR) 68 | if not os.path.exists(vector_dir): 69 | os.mkdir(vector_dir) 70 | 71 | analyzer = StandardAnalyzer() 72 | 73 | # initialize the IndexWriter for Lucene 74 | fs_dir = FSDirectory.open(Path.of(lucene_dir)) 75 | config = IndexWriterConfig(analyzer) 76 | config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND) 77 | self.writer = IndexWriter(fs_dir, config) 78 | self.index_dir = index_dir 79 | 80 | # initialize the IndexSearcher from the writer 81 | reader = DirectoryReader.open(self.writer) 82 | self.searcher = IndexSearcher(reader) 83 | 84 | # initialize the vector index 85 | self.vector_index = VectorIndex( 86 | vector_dir, model_provider, vector_store) 87 | 88 | # get the latest vector index version from Lucene 89 | self.vector_index_version = self._get_vector_index_version() 90 | if self.vector_index_version > 0: 91 | # load the existing vectors 92 | self.vector_index.load(self.vector_index_version) 93 | 94 | logging.info(f"Initialize the index index_dir={index_dir} " 95 | f"model={model_provider} vector_store={vector_store} " 96 | f"vector_index_version={self.vector_index_version}") 97 | 98 | 99 | def _get_vector_index_version(self) -> int: 100 | reader = DirectoryReader.openIfChanged(self.searcher.getIndexReader()) 101 | if reader: 102 | self.searcher.getIndexReader().close() 103 | self.searcher = IndexSearcher(reader) 104 | 105 | # doc may not exist if no doc is added to the index 106 | vector_index_version = 0 107 | term = Term(FIELD_DOC_ID, SYS_DOC_ID_VECTOR_INDEX) 108 | q = TermQuery(term) 109 | docs = self.searcher.search(q, 1).scoreDocs 110 | if len(docs) > 0: 111 | # get the latest vector index version 112 | doc = self.searcher.doc(docs[0].doc) 113 | field = doc.getField(FIELD_VECTOR_INDEX_VERSION) 114 | vector_index_version = field.numericValue().longValue() 115 | return vector_index_version 116 | 117 | 118 | def close(self): 119 | """ 120 | Close the index. The user must call commit before close, to make sure 121 | the possible in-memory changes are committed. 122 | """ 123 | self.writer.close() 124 | self.searcher.getIndexReader().close() 125 | logging.info("Close the index") 126 | 127 | 128 | # TODO not support async. The underline vector lib, such as hnswlib, 129 | # does not support concurrent writes. Lucene supports concurrent writes 130 | # using multiple writers, and merge the segments in the background. 131 | def add(self, doc_path: str, doc_fields: DocFields) -> str: 132 | """ 133 | Add a doc to the index. The doc file must be a plain text file. 134 | This function automatically generates the embeddings for the doc text. 135 | 136 | Return the document id. 
137 | """ 138 | # convert DocFields to Lucene fields 139 | fields = self._convert_to_lucene_fields(doc_fields) 140 | 141 | return self._add(doc_path, fields) 142 | 143 | 144 | def _convert_to_lucene_fields(self, doc_fields: DocFields) -> List[Field]: 145 | fields: List[Field] = [] 146 | if doc_fields is None: 147 | return fields 148 | 149 | for doc_field in doc_fields.fields: 150 | field: Field = None 151 | if doc_field.string_value is not None: 152 | field = StringField( 153 | doc_field.name, doc_field.string_value, Field.Store.YES) 154 | if doc_field.numeric_value is not None: 155 | field = StoredField(doc_field.name, doc_field.numeric_value) 156 | if doc_field.float_value is not None: 157 | field = StoredField(doc_field.name, doc_field.float_value) 158 | fields.append(field) 159 | 160 | return fields 161 | 162 | 163 | def _add(self, doc_path: str, fields: List[Field]) -> str: 164 | # TODO support only a limited number of docs, e.g. less than 165 | # vector_index.DEFAULT_VECTOR_FILE_MAX_ELEMENTS. One vector index 166 | # element is one doc chunk. 167 | # TODO support embeddings for other fields, such as title, etc. 168 | # TODO support other type files, such as pdf, etc, e.g. extract text 169 | # from file, write to a temporary text file, and then pass the 170 | # temporary text file to this function. 171 | # TODO support small files, such as 10KB. no need to persist the file 172 | # to a temporary file, when running as http server. 173 | 174 | # get doc_id from fields, assign a unique id to doc if doc_id is None 175 | doc_id = "" 176 | for field in fields: 177 | if field.name() == FIELD_DOC_ID: 178 | doc_id = field.stringValue() 179 | break 180 | 181 | # TODO if doc_id is passed in, check doc_id does not exist 182 | if doc_id == "": 183 | doc_id = str(uuid.uuid4()) 184 | fields.append(StringField(FIELD_DOC_ID, doc_id, Field.Store.YES)) 185 | 186 | # add the doc to vector writer 187 | self.vector_index.add(doc_path, doc_id) 188 | 189 | # add the doc to Lucene 190 | self._add_to_lucene(doc_path, fields) 191 | 192 | logging.debug(f"add doc id={doc_id} to index") 193 | return doc_id 194 | 195 | 196 | def _add_to_lucene(self, doc_path: str, fields: List[Field]): 197 | file_path = Path.of(doc_path) 198 | br = Files.newBufferedReader(file_path) 199 | try: 200 | doc = Document() 201 | 202 | for field in fields: 203 | doc.add(field) 204 | 205 | text_field = TextField(FIELD_DOC_TEXT, br) 206 | doc.add(text_field) 207 | 208 | self.writer.addDocument(doc) 209 | finally: 210 | br.close() 211 | 212 | 213 | def commit(self): 214 | # flush the vector index. TODO delete the older vector index files. 
215 |         self.vector_index.save(self.vector_index_version + 1)
216 | 
217 |         # record the latest vector index version in the special system doc
218 |         doc = Document()
219 |         doc_id_field = StringField(
220 |             FIELD_DOC_ID, SYS_DOC_ID_VECTOR_INDEX, Field.Store.YES)
221 |         doc.add(doc_id_field)
222 |         vector_version_field = StoredField(
223 |             FIELD_VECTOR_INDEX_VERSION, self.vector_index_version + 1)
224 |         doc.add(vector_version_field)
225 |         if self.vector_index_version == 0:
226 |             # create the vector doc
227 |             self.writer.addDocument(doc)
228 |         else:
229 |             # update the vector doc
230 |             term = Term(FIELD_DOC_ID, SYS_DOC_ID_VECTOR_INDEX)
231 |             self.writer.updateDocument(term, doc)
232 | 
233 |         # commit Lucene
234 |         self.writer.commit()
235 | 
236 |         # successfully committed both the vector and Lucene indexes
237 |         self.vector_index_version += 1
238 |         logging.info(f"Commit the index {self.index_dir}, "
239 |                      f"vector_index_version={self.vector_index_version}")
240 | 
241 | 
242 |     def vector_search(
243 |         self, query_string: str, top_k: int,
244 |     ) -> List[DocChunkScore]:
245 |         """
246 |         Take the query string, search over the doc content (text) and return
247 |         the top docs. The search will include both the traditional inverted
248 |         search and the vector search.
249 |         """
250 |         # TODO
251 |         # - support indexing and searching other fields, such as title.
252 |         # - support more Lucene query abilities vs natural language search
253 |         #   like Gmail. For example, when the user inputs "a query string.
254 |         #   field:value", automatically search the query string over all
255 |         #   inverted/vector indexed fields, and also search the specified field.
256 |         # - support retrieving the specified fields.
257 |         # - etc.
258 | 
259 |         doc_chunk_scores = self.vector_index.search(query_string, top_k)
260 | 
261 |         logging.debug(
262 |             f"vector search query=\'{query_string}\' docs={doc_chunk_scores}")
263 |         return doc_chunk_scores
264 | 
265 | 
266 |     def lucene_search(
267 |         self, query_string: str, top_k: int,
268 |     ) -> List[DocChunkScore]:
269 |         # TODO support concurrent reads
270 |         reader = DirectoryReader.openIfChanged(self.searcher.getIndexReader())
271 |         if reader:
272 |             self.searcher.getIndexReader().close()
273 |             self.searcher = IndexSearcher(reader)
274 | 
275 |         analyzer = self.writer.getConfig().getAnalyzer()
276 |         parser = QueryParser(FIELD_DOC_TEXT, analyzer)
277 |         query = parser.parse(query_string)
278 | 
279 |         logging.debug(f"parse query string: {query_string}, to {query}")
280 | 
281 |         lucene_score_docs = self.searcher.search(query, top_k).scoreDocs
282 | 
283 |         doc_chunk_scores: List[DocChunkScore] = []
284 |         for score_doc in lucene_score_docs:
285 |             # get doc id
286 |             doc = self.searcher.doc(score_doc.doc)
287 |             doc_id = doc.get(FIELD_DOC_ID)
288 | 
289 |             # TODO get the offset and length via TermVector or Highlighter
290 |             doc_chunk_score = DocChunkScore(
291 |                 doc_id=doc_id, offset=0, length=0, score=score_doc.score)
292 |             doc_chunk_scores.append(doc_chunk_score)
293 | 
294 |         logging.debug(
295 |             f"lucene search query=\'{query_string}\' docs={doc_chunk_scores}")
296 |         return doc_chunk_scores
297 | 
--------------------------------------------------------------------------------
/index/vector_index.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | import pickle
4 | from pydantic import BaseModel
5 | from typing import Dict, List, Tuple
6 | 
7 | import tiktoken
8 | 
9 | from index.docs import DocChunkScore
10 | from model.model import Model
11 | from model.factory import get_model
12 | from vectorstore.vectorstore import Space, VectorStore
13 | from vectorstore.factory import get_vector_store
14 | 
15 | DEFAULT_SPACE = Space.l2 # The default space
16 | DEFAULT_VECTOR_FILE_MAX_ELEMENTS = 5000 # The max elements in one vector file
17 | MIN_CHUNK_SIZE_CHARS = 350 # The minimum size of each text chunk in characters
18 | MIN_CHUNK_LENGTH_TO_EMBED = 5 # Discard chunks shorter than this
19 | EMBEDDINGS_BATCH_SIZE = 128 # The number of embeddings to request at a time
20 | 
21 | # Global tokenizer
22 | default_tokenizer = tiktoken.get_encoding("cl100k_base")
23 | 
24 | 
25 | # The metadata for a document chunk
26 | class ChunkMetadata(BaseModel):
27 |     offset: int # the chunk's start offset in the doc
28 |     length: int # the length of the chunk text
29 |     label: int # the label of the chunk embedding in the vector store
30 | 
31 | 
32 | # The id that uniquely defines a document chunk in the vector index
33 | class ChunkId(BaseModel):
34 |     doc_id: str
35 |     offset: int
36 |     length: int
37 | 
38 | 
39 | class VectorIndexMetadata(BaseModel):
40 |     elements: int
41 |     last_label: int
42 | 
43 | 
44 | class VectorIndex:
45 |     store_dir: str # the dir where the index files will be stored
46 | 
47 |     # use openai cl100k_base as the tokenizer.
48 |     tokenizer: tiktoken.core.Encoding
49 | 
50 |     model: Model # the model to get the embeddings for text
51 | 
52 |     space: Space
53 |     store: VectorStore # the vector store to store the embeddings
54 | 
55 |     # the underlying vector store usually supports only ints as labels.
56 |     # Maintain the mapping between the doc ids and labels. These metadata
57 |     # dicts are persisted directly using pickle.
58 |     # TODO support changes to the metadata definition, maybe use another
59 |     # serialization format, such as json, protobuf, etc.
60 |     # key: doc id, value: a list of chunk metadata
61 |     doc_id_to_metas: Dict[str, List[ChunkMetadata]]
62 |     # key: label, value: chunk id
63 |     label_to_chunk_id: Dict[int, ChunkId]
64 |     # the vector index metadata
65 |     metadata: VectorIndexMetadata
66 | 
67 | 
68 |     def __init__(
69 |         self,
70 |         store_dir: str,
71 |         model_provider: str,
72 |         vector_store: str,
73 |     ):
74 |         self.store_dir = store_dir
75 |         self.tokenizer = default_tokenizer
76 |         self.model = get_model(model_provider)
77 |         dim = self.model.get_dim()
78 |         # default max elements. TODO support more elements
79 |         max_elements = DEFAULT_VECTOR_FILE_MAX_ELEMENTS
80 |         self.space = DEFAULT_SPACE
81 |         self.store = get_vector_store(
82 |             vector_store, dim, self.space, max_elements)
83 |         self.doc_id_to_metas = {}
84 |         self.label_to_chunk_id = {}
85 |         self.metadata = VectorIndexMetadata(elements=0, last_label=0)
86 | 
87 | 
88 |     def load(self, version: int):
89 |         """
90 |         Load the vectors from file.
91 |         """
92 |         file_path = self._get_index_file(version)
93 |         self.store.load(file_path)
94 | 
95 |         # load the mapping between doc ids and labels
96 |         id_file = self._get_id_to_label_file(version)
97 |         with open(id_file, "rb") as f:
98 |             self.doc_id_to_metas = pickle.load(f)
99 | 
100 |         label_file = self._get_label_to_id_file(version)
101 |         with open(label_file, "rb") as f:
102 |             self.label_to_chunk_id = pickle.load(f)
103 | 
104 |         metadata_file = self._get_metadata_file(version)
105 |         with open(metadata_file, "rb") as f:
106 |             self.metadata = pickle.load(f)
107 | 
108 | 
109 |     def save(self, version: int):
110 |         """
111 |         Save the vectors to the file.
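
        One version consists of four files in store_dir (see the helper
        functions below): {version}.index for the raw vectors,
        id_to_label_{version}.pkl and label_to_id_{version}.pkl for the
        doc-id/label mappings, and metadata_{version}.pkl for the index
        metadata.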
112 | """ 113 | file_path = self._get_index_file(version) 114 | self.store.save(file_path) 115 | 116 | # save the mapping between doc ids and labels 117 | id_file = self._get_id_to_label_file(version) 118 | with open(id_file, "wb") as f: 119 | pickle.dump(self.doc_id_to_metas, f, pickle.HIGHEST_PROTOCOL) 120 | 121 | label_file = self._get_label_to_id_file(version) 122 | with open(label_file, "wb") as f: 123 | pickle.dump(self.label_to_chunk_id, f, pickle.HIGHEST_PROTOCOL) 124 | 125 | metadata_file = self._get_metadata_file(version) 126 | with open(metadata_file, "wb") as f: 127 | pickle.dump(self.metadata, f, pickle.HIGHEST_PROTOCOL) 128 | 129 | 130 | def _get_index_file(self, version: int) -> str: 131 | return os.path.join(self.store_dir, f"{version}.index") 132 | 133 | def _get_id_to_label_file(self, version: int) -> str: 134 | return os.path.join(self.store_dir, f"id_to_label_{version}.pkl") 135 | 136 | def _get_label_to_id_file(self, version: int) -> str: 137 | return os.path.join(self.store_dir, f"label_to_id_{version}.pkl") 138 | 139 | def _get_metadata_file(self, version: int) -> str: 140 | return os.path.join(self.store_dir, f"metadata_{version}.pkl") 141 | 142 | def _get_chunk_id(self, doc_id: str, offset: int, length: int) -> str: 143 | return f"{doc_id}_{offset}_{length}" 144 | 145 | 146 | def add(self, doc_path: str, doc_id: str): 147 | """ 148 | Add a doc to the vector index. This function reads the doc text, splits 149 | the doc to chunk if the doc is large, generates the embeddings for 150 | chunks and adds the embeddings to the vector store. 151 | TODO support multi-threads. 152 | """ 153 | # get embeddings for the doc text 154 | chunk_embeddings, chunk_metas = self._get_embeddings(doc_path) 155 | 156 | logging.info( 157 | f"get {len(chunk_embeddings)} embeddings for doc path={doc_path} " 158 | f"id={doc_id}, last_label={self.metadata.last_label}") 159 | 160 | if len(chunk_embeddings) == 0: 161 | # doc has no content, return 162 | return 163 | 164 | # assign the labels to the doc chunks 165 | label = self.metadata.last_label 166 | # update index metadata 167 | self.metadata.last_label += len(chunk_metas) 168 | self.metadata.elements += len(chunk_metas) 169 | 170 | labels: List[int] = [] 171 | for i, chunk_meta in enumerate(chunk_metas): 172 | label += 1 173 | chunk_meta.label = label 174 | labels.append(label) 175 | # update the label_to_chunk_id Dict 176 | self.label_to_chunk_id[label] = ChunkId( 177 | doc_id=doc_id, 178 | offset=chunk_meta.offset, 179 | length=chunk_meta.length, 180 | ) 181 | 182 | # update the doc_id_to_metas 183 | self.doc_id_to_metas[doc_id] = chunk_metas 184 | 185 | # add embeddings to the store 186 | self.store.add(chunk_embeddings, labels) 187 | 188 | 189 | def _get_embeddings( 190 | self, doc_path: str, batch_size: int = EMBEDDINGS_BATCH_SIZE, 191 | ) -> (List[List[float]], List[ChunkMetadata]): 192 | """ 193 | Split the doc's text into chunks, generate one embedding and metadata 194 | for each chunk. 195 | 196 | Returns: 197 | A list of embeddings and metadatas for all chunks in the doc. 198 | """ 199 | # the embeddings for all chunks in the doc 200 | chunk_embeddings: List[List[float]] = [] 201 | # the metadata for all chunks in the doc 202 | chunk_metas: List[ChunkMetadata] = [] 203 | 204 | # read the whole file. TODO support pagination for large files. 
205 |         with open(doc_path, mode="r", encoding="utf-8") as f:
206 |             text = f.read()
207 | 
208 |         # return an empty list if the text is empty or whitespace
209 |         if not text or text.isspace():
210 |             return chunk_embeddings, chunk_metas
211 | 
212 |         # split the doc text into chunks
213 |         chunk_token_size = self.model.get_max_token_size()
214 |         chunk_texts, chunk_metas = self._get_text_chunks(
215 |             doc_path, text, chunk_token_size, MIN_CHUNK_SIZE_CHARS)
216 | 
217 |         # get embeddings for all chunks, batch_size texts at a time
218 |         for i in range(0, len(chunk_texts), batch_size):
219 |             batch_texts = chunk_texts[i:i+batch_size]
220 | 
221 |             embeddings = self.model.get_embeddings(batch_texts)
222 | 
223 |             chunk_embeddings.extend(embeddings)
224 | 
225 |         return chunk_embeddings, chunk_metas
226 | 
227 | 
228 |     def _get_text_chunks(
229 |         self,
230 |         doc_path: str, # the doc path, for logging
231 |         text: str, # the doc text
232 |         chunk_token_size: int, # the number of tokens in one chunk
233 |         min_chunk_chars: int, # the minimum size of each text chunk in chars
234 |     ) -> Tuple[List[str], List[ChunkMetadata]]:
235 |         """
236 |         Split the text into chunks.
237 |         Return a list of texts and metadata for all chunks in the text.
238 |         """
239 |         chunk_texts: List[str] = []
240 |         chunk_metas: List[ChunkMetadata] = []
241 | 
242 |         # tokenize the text
243 |         # according to tiktoken/core.py, "encode_ordinary is equivalent to
244 |         # `encode(text, disallowed_special=())` (but slightly faster)."
245 |         tokens = self.tokenizer.encode_ordinary(text)
246 | 
247 |         # loop until all tokens are consumed
248 |         offset = 0
249 |         while tokens:
250 |             # take the next chunk
251 |             chunk = tokens[:chunk_token_size]
252 | 
253 |             # decode to text to check whitespace and sentence boundary
254 |             chunk_text = self.tokenizer.decode(chunk)
255 | 
256 |             # skip the chunk if it is empty or whitespace
257 |             if not chunk_text or chunk_text.isspace():
258 |                 # remove from the remaining tokens
259 |                 tokens = tokens[len(chunk):]
260 |                 # increase the offset
261 |                 offset += len(chunk_text)
262 |                 continue
263 | 
264 |             # truncate chunk_text to the last complete sentence (punctuation).
265 |             # TODO support other languages; maybe consider a library such as NLTK.
266 |             last_punc = max(
267 |                 chunk_text.rfind("."),
268 |                 chunk_text.rfind("?"),
269 |                 chunk_text.rfind("!"),
270 |                 chunk_text.rfind("\n"),
271 |             )
272 |             if last_punc != -1 and last_punc > min_chunk_chars:
273 |                 chunk_text = chunk_text[:last_punc+1]
274 | 
275 |             chunk_text_len = len(chunk_text)
276 | 
277 |             # adjust the chunk_text_len if needed.
278 |             # check if some text in the last token is skipped. For example,
279 |             # cl100k_base takes '."[' as one token. If two sentences have this
280 |             # string, 'This sentence."[1] Next sentence.', and "This sentence."
281 |             # is the last sentence, the next offset will not align with tokens,
282 |             # e.g. the next offset will point to the first char in '"[1',
283 |             # while the decoded text of the next token is '1'.
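            # A worked illustration of the adjustment below: if chunk_text was
            # truncated to end with 'This sentence.' while the original token
            # stream encoded '."[' as a single token, re-encoding chunk_text
            # yields '.' as its last token. The decoded texts then differ by
            # len('."[') - len('.') = 2 chars, so chunk_text_len is increased
            # by 2 to keep offset aligned with the tokens consumed from the
            # stream.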
284 |             chunk_tokens = self.tokenizer.encode_ordinary(chunk_text)
285 |             last_chunk_token = len(chunk_tokens) - 1
286 |             if chunk_tokens[last_chunk_token] != tokens[last_chunk_token]:
287 |                 # align chunk_text_len with the last token
288 |                 last_token_text = self.tokenizer.decode(
289 |                     chunk_tokens[last_chunk_token:])
290 | 
291 |                 token_text = self.tokenizer.decode(
292 |                     tokens[last_chunk_token:last_chunk_token+1])
293 | 
294 |                 chunk_text_len += len(token_text) - len(last_token_text)
295 | 
296 |                 logging.debug(f"align last_token_text={last_token_text} "
297 |                               f"token_text={token_text}")
298 | 
299 |             logging.debug(f"offset={offset} chunk_text_len={chunk_text_len}")
300 | 
301 |             # sanity check
302 |             if text[offset:offset+10] != chunk_text[:10]:
303 |                 logging.warning(f"doc_path={doc_path} offset={offset}, "
304 |                                 f"text chars={text[offset:offset+10]} "
305 |                                 f"chunk chars={chunk_text[:20]}")
306 |                 raise Exception(
307 |                     f"text and chunk not aligned, {doc_path} offset={offset}")
308 | 
309 |             # remove any newline characters and strip any leading or trailing
310 |             # whitespace. Not needed if NLTK is used.
311 |             chunk_text_to_append = chunk_text.replace("\n", " ").strip()
312 |             if len(chunk_text_to_append) > MIN_CHUNK_LENGTH_TO_EMBED:
313 |                 # add the chunk text
314 |                 chunk_texts.append(chunk_text_to_append)
315 | 
316 |                 # add the chunk meta
317 |                 chunk_metas.append(ChunkMetadata(
318 |                     offset=offset,
319 |                     length=chunk_text_len,
320 |                     label=0, # initial 0 label, will be assigned later
321 |                 ))
322 | 
323 |             # increase the offset
324 |             offset += chunk_text_len
325 | 
326 |             # remove the chunk text tokens from the remaining tokens.
327 |             tokens = tokens[last_chunk_token+1:]
328 | 
329 |         return chunk_texts, chunk_metas
330 | 
331 | 
332 |     def delete(self, doc_id: str):
333 |         """
334 |         Delete a doc from the vector index.
335 |         """
336 |         raise NotImplementedError
337 | 
338 | 
339 |     def search(self, query_string: str, top_k: int) -> List[DocChunkScore]:
340 |         """
341 |         Take a query string, get the embedding for the query string, find the
342 |         similar doc chunks in the store, calculate the scores and return the
343 |         top_k doc chunks.
344 |         The score for a doc chunk is calculated based on the distance to the
345 |         query string embedding.
346 | 
347 |         Return the top-k doc chunks, sorted in descending order by score.
348 |         """
349 |         texts: List[str] = []
350 |         texts.append(query_string)
351 |         embeddings = self.model.get_embeddings(texts)
352 | 
353 |         # cap k at the current number of elements. Some stores, such as
354 |         # hnswlib, throw a RuntimeError if k > elements.
355 |         if top_k > self.metadata.elements:
356 |             top_k = self.metadata.elements
357 | 
358 |         # query the vector store
359 |         labels, distances = self.store.query(embeddings, top_k)
360 | 
361 |         # convert the distances to scores
362 |         return self._distance_to_scores(labels[0], distances[0])
363 | 
364 | 
365 |     def _distance_to_scores(
366 |         self, labels: List[int], distances: List[float],
367 |     ) -> List[DocChunkScore]:
368 |         # Convert the distances to scores in the range (0, 1),
369 |         # where a higher score means closer.
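        # For reference, the mapping used below: for the l2 space,
        # score = 1 / (1 + distance), e.g. distance 0.0 -> 1.0 and
        # distance 1.0 -> 0.5. For the ip/cosine spaces, the code assumes the
        # store returns similarity-like values (higher means closer), mapped
        # via (1 + d) / 2.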
370 | chunk_scores: List[DocChunkScore] = [] 371 | for i, label in enumerate(labels): 372 | if self.space == Space.l2: 373 | # l2 distance, lower distance means closer 374 | score = 1 / (1 + distances[i]) 375 | else: 376 | # ip or cosine distance, higher distance means closer 377 | score = (1 + distances[i]) / 2 378 | 379 | # get the doc id for the chunk 380 | chunk_id = self.label_to_chunk_id[label] 381 | 382 | chunk_score = DocChunkScore( 383 | doc_id=chunk_id.doc_id, offset=chunk_id.offset, 384 | length=chunk_id.length, score=score) 385 | chunk_scores.append(chunk_score) 386 | 387 | return chunk_scores 388 | -------------------------------------------------------------------------------- /tests/testfiles/chatgpt.txt: -------------------------------------------------------------------------------- 1 | 2 | ChatGPT[a] is an artificial intelligence (AI) chatbot developed by OpenAI and released in November 2022. It is built on top of OpenAI's GPT-3.5 and GPT-4 families of large language models (LLMs) and has been fine-tuned (an approach to transfer learning) using both supervised and reinforcement learning techniques. 3 | ChatGPT launched as a prototype on November 30, 2022, and garnered attention for its detailed responses and articulate answers across many domains of knowledge.[3] Its propensity, at times, to confidently provide factually incorrect responses, however, has been identified as a significant drawback.[4] In 2023, following the release of ChatGPT, OpenAI's valuation was estimated at US$29 billion.[5] The advent of the chatbot has increased competition within the space, motivating the creation of Google's Bard and Meta's LLaMA. 4 | The original release of ChatGPT was based on GPT-3.5. A version based on GPT-4, the newest OpenAI model, was released on March 14, 2023, and is available for paid subscribers on a limited basis. 5 | ChatGPT is a member of the generative pre-trained transformer (GPT) family of language models. It was fine-tuned over an improved version of OpenAI's GPT-3 known as "GPT-3.5".[6] 6 | The fine-tuning process leveraged both supervised learning as well as reinforcement learning in a process called reinforcement learning from human feedback (RLHF).[7][8] Both approaches use human trainers to improve the model's performance. In the case of supervised learning, the model was provided with conversations in which the trainers played both sides: the user and the AI assistant. In the reinforcement learning step, human trainers first ranked responses that the model had created in a previous conversation.[9] These rankings were used to create "reward models" that were used to fine-tune the model further by using several iterations of Proximal Policy Optimization (PPO).[7][10] 7 | ChatGPT initially used a Microsoft Azure supercomputing infrastructure, powered by Nvidia GPUs, that Microsoft built specifically for OpenAI and that reportedly cost "hundreds of millions of dollars". Following the success of ChatGPT, Microsoft dramatically upgraded the OpenAI infrastructure in 2023.[11] 8 | OpenAI collects data from ChatGPT users to train and fine-tune the service further. Users can upvote or downvote responses they receive from ChatGPT and fill out a text field with additional feedback.[12][13] 9 | Although the core function of a chatbot is to mimic a human conversationalist, ChatGPT is versatile. 
It can write and debug computer programs,[14] mimic the style of celebrity CEOs and write business pitches,[15] compose music, teleplays, fairy tales and student essays, answer test questions (sometimes, depending on the test, at a level above the average human test-taker),[16] write poetry and song lyrics,[17] translate and summarize text,[18] emulate a Linux system; simulate entire chat rooms, play games like tic-tac-toe and simulate an ATM.[19] ChatGPT's training data includes man pages and information about internet phenomena and programming languages such as bulletin board systems and the Python programming language.[19] 10 | In comparison to its predecessor, InstructGPT, ChatGPT attempts to reduce harmful and deceitful responses.[20] In one example, whereas InstructGPT accepts the premise of the prompt "Tell me about when Christopher Columbus came to the U.S. in 2015" as being truthful, ChatGPT acknowledges the counterfactual nature of the question and frames its answer as a hypothetical consideration of what might happen if Columbus came to the U.S. in 2015, using information about the voyages of Christopher Columbus and facts about the modern world – including modern perceptions of Columbus' actions.[7] 11 | Unlike most chatbots, ChatGPT remembers a limited number of previous prompts given to it in the same conversation. Journalists have speculated that this will allow ChatGPT to be used as a personalized therapist.[2] To prevent offensive outputs from being presented to and produced from ChatGPT, queries are filtered through the OpenAI "Moderation endpoint" API (a separate GPT-based AI),[21][22] and potentially racist or sexist prompts are dismissed.[7][2] 12 | In March 2023, OpenAI announced it would be adding support for plugins for ChatGPT.[23] This includes both plugins made by OpenAI, such as web browsing and code interpretation, as well as external plugins from developers such as Expedia, OpenTable, Zapier, Shopify, Slack, and Wolfram.[24][25] 13 | OpenAI acknowledges that ChatGPT "sometimes writes plausible-sounding but incorrect or nonsensical answers".[7] This behavior is common to large language models and is called "hallucination".[26] The reward model of ChatGPT, designed around human oversight, can be over-optimized and thus hinder performance, in an example of an optimization pathology known as Goodhart's law.[27] 14 | ChatGPT has limited knowledge of events that occurred after September 2021.[28] 15 | In training ChatGPT, human reviewers preferred longer answers, irrespective of actual comprehension or factual content.[7] Training data also suffers from algorithmic bias, which may be revealed when ChatGPT responds to prompts including descriptors of people. In one instance, ChatGPT generated a rap indicating that women and scientists of color were inferior to white and male scientists.[29][30] 16 | ChatGPT was launched on November 30, 2022, by San Francisco–based OpenAI, also the creator of DALL·E 2 and Whisper AI. 
The service was initially free to the public and the company had plans to monetize the service later.[31] By December 4, 2022, ChatGPT had over one million users.[12] In January 2023, ChatGPT reached over 100 million users, making it the fastest growing consumer application to date.[32] 17 | CNBC wrote on December 15, 2022, that the service "still goes down from time to time".[33] In addition, the free service is throttled.[34] During periods the service was up, response latency was typically better than five seconds in January 2023.[35][36] The service works best in English, but is also able to function in some other languages, to varying degrees of accuracy.[17] No official peer-reviewed technical paper on ChatGPT was published.[37] 18 | The company provides a tool, called "AI classifier for indicating AI-written text",[38] that attempts to determine whether text has been written by an AI such as ChatGPT. OpenAI cautions that the tool will "likely yield a lot of false positives and negatives, sometimes with great confidence." An example cited in The Atlantic magazine showed that "when given the first lines of the Book of Genesis, the software concluded that it was likely to be AI-generated."[39] 19 | In February 2023, OpenAI began accepting registrations from United States customers for a premium service, ChatGPT Plus, to cost $20 a month.[40] The company promised that the updated, but still "experimental" version of ChatGPT would provide access during peak periods, no downtime, priority access to new features and faster response speeds.[41] 20 | GPT-4, which was released on March 14, 2023, is available via API and for premium ChatGPT users.[42] However, premium users were limited to a cap of 100 messages every four hours, with the limit tightening to 25 messages every three hours in response to increased demand.[43] Microsoft acknowledged that the Bing chatbot was using GPT-4 before GPT-4's official release.[44] 21 | As an addition to its consumer-friendly "ChatGPT Professional" package, OpenAI made its ChatGPT and Whisper model APIs available from March 2023, providing developers with an application programming interface for AI-enabled language and speech-to-text features. ChatGPT's new API uses the same GPT-3.5-turbo AI model as the chatbot. This allows developers to add either an unmodified or modified version of ChatGPT to their applications.[45] The ChatGPT API costs $0.002 per 1000 tokens (about 750 words), making it ten times cheaper than the GPT-3.5 models.[46][47] 22 | A few days before the launch of OpenAI's software developer support service, on February 27, 2023, Snapchat rolled out, for its paid Snapchat Plus userbase, a custom ChatGPT chatbot called "My AI".[48] 23 | In March 2023, a bug allowed some users to see the titles of other users' conversations. OpenAI CEO Sam Altman said that users were not able to see the contents of the conversations. Shortly after the bug was fixed, users were unable to see their conversation history.[49][50][51][52] Later reports showed the bug was much more severe than initially believed, with OpenAI reporting that it had leaked users' "first and last name, email address, payment address, the last four digits (only) of a credit card number, and credit card expiration date".[53][54] 24 | In March 2023, OpenAI announced that Icelandic will become ChatGPT's second language after English. Icelandic was chosen after an Icelandic envoy, led by the President of Iceland Guðni Th. 
Jóhannesson, visited OpenAI in 2022.[55][56][57] 25 | According to OpenAI guest researcher Scott Aaronson, OpenAI is working on a tool to digitally watermark its text generation systems to combat bad actors using their services for academic plagiarism or spam.[58][59] 26 | In February 2023, Microsoft announced an experimental framework and gave a rudimentary demonstration of how ChatGPT can be used to control robotics with intuitive open-ended natural language commands.[60][61] 27 | OpenAI's GPT-4 model was released on March 14, 2023. Observers reported GPT-4 to be an impressive improvement on ChatGPT, with the caveat that GPT-4 retains many of the same problems.[62] Unlike ChatGPT, GPT-4 can take images as well as text as input.[63] OpenAI has declined to reveal technical information such as the size of the GPT-4 model.[64] 28 | ChatGPT Plus provides access to the GPT-4 supported version of ChatGPT,[65] that costs $20 per month.[65] 29 | OpenAI engineers say that they did not expect ChatGPT to be very successful and were surprised by the coverage and attention it received.[66][67] 30 | ChatGPT was met in December 2022 with some positive reviews. Kevin Roose of The New York Times labeled it "the best artificial intelligence chatbot ever released to the general public".[2] Samantha Lock of The Guardian newspaper noted that it was able to generate "impressively detailed" and "human-like" text.[3] Technology writer Dan Gillmor used ChatGPT on a student assignment, and found its generated text was on par with what a good student would deliver and opined that "academia has some very serious issues to confront".[68] Alex Kantrowitz of Slate magazine lauded ChatGPT's pushback to questions related to Nazi Germany, including the statement that Adolf Hitler built highways in Germany, which was met with information regarding Nazi Germany's use of forced labor.[69] 31 | In The Atlantic magazine's "Breakthroughs of the Year" for 2022, Derek Thompson included ChatGPT as part of "the generative-AI eruption" that "may change our mind about how we work, how we think, and what human creativity really is".[70] 32 | Kelsey Piper of the Vox website wrote that "ChatGPT is the general public's first hands-on introduction to how powerful modern AI has gotten, and as a result, many of us are [stunned]" and that ChatGPT is "smart enough to be useful despite its flaws".[71] Paul Graham of Y Combinator tweeted that "The striking thing about the reaction to ChatGPT is not just the number of people who are blown away by it, but who they are. These are not people who get excited by every shiny new thing. Clearly, something big is happening."[72] Elon Musk wrote that "ChatGPT is scary good. We are not far from dangerously strong AI".[71] Musk paused OpenAI's access to a Twitter database pending a better understanding of OpenAI's plans, stating that "OpenAI was started as open source and nonprofit. 
Neither is still true."[73][74] Musk co-founded OpenAI in 2015, in part to address existential risk from artificial intelligence, but resigned in 2018.[74] 33 | In December 2022, Google internally expressed alarm at the unexpected strength of ChatGPT and the newly discovered potential of large language models to disrupt the search engine business, and CEO Sundar Pichai "upended" and reassigned teams within multiple departments to aid in its artificial intelligence products, according to a report in The New York Times.[75] According to CNBC reports, Google employees intensively tested a chatbot called "Apprentice Bard", which Google later unveiled as its ChatGPT competitor, Google Bard.[76][77] 34 | Stuart Cobbe, a chartered accountant in England and Wales, decided to test ChatGPT by entering questions from a sample exam paper on the ICAEW website and then entering its answers back into the online test. ChatGPT scored 42 percent, below the 55 percent pass mark.[78] 35 | Writing in Inside Higher Ed professor Steven Mintz states that he "consider[s] ChatGPT... an ally, not an adversary". He felt the AI could assist educational goals by doing such things as making reference lists, generating first drafts, solving equations, debugging, and tutoring.[79] 36 | Since its release, ChatGPT has been met with criticism from educators, journalists, artists, ethicists, academics, and public advocates. Journalists have commented on ChatGPT's tendency to "hallucinate."[81] Mike Pearl of the online technology blog Mashable tested ChatGPT with multiple questions. In one example, he asked ChatGPT for "the largest country in Central America that isn't Mexico." ChatGPT responded with Guatemala, when the answer is instead Nicaragua.[82] When CNBC asked ChatGPT for the lyrics to "Ballad of Dwight Fry," ChatGPT supplied invented lyrics rather than the actual lyrics.[33] Writers for The Verge, citing the work of Emily M. Bender, compared ChatGPT to a "stochastic parrot",[83] as did Professor Anton Van Den Hengel of the Australian Institute for Machine Learning.[84] 37 | In December 2022, the question and answer website Stack Overflow banned the use of ChatGPT for generating answers to questions, citing the factually ambiguous nature of ChatGPT's responses.[4] In January 2023, the International Conference on Machine Learning banned any undocumented use of ChatGPT or other large language models to generate any text in submitted papers.[85] 38 | Economist Tyler Cowen expressed concerns regarding ChatGPT's effects on democracy, citing its ability to produce automated comments, which could affect the decision process for new regulations.[86] An editor at The Guardian, a British newspaper, questioned whether any content found on the Internet after ChatGPT's release "can be truly trusted" and called for government regulation.[87] 39 | In January 2023, after being sent a song written by ChatGPT in the style of Nick Cave,[80] the songwriter himself responded on The Red Hand Files[88] saying the act of writing a song is "a blood and guts business [...] that requires something of me to initiate the new and fresh idea. It requires my humanness." He went on to say, "With all the love and respect in the world, this song is bullshit, a grotesque mockery of what it is to be human, and, well, I don't much like it."[80][89] 40 | In 2023, Australian MP Julian Hill advised the national parliament that the growth of AI could cause "mass destruction". 
During his speech, which was partly written by the program, he warned that it could result in cheating, job losses, discrimination, disinformation, and uncontrollable military applications.[90] 41 | In an article for The New Yorker, science fiction writer Ted Chiang compared ChatGPT and other LLMs to a lossy JPEG picture:[91] 42 | 43 | Think of ChatGPT as a blurry jpeg of all the text on the Web. It retains much of the information on the Web, in the same way that a jpeg retains much of the information of a higher-resolution image, but, if you're looking for an exact sequence of bits, you won't find it; all you will ever get is an approximation. But, because the approximation is presented in the form of grammatical text, which ChatGPT excels at creating, it's usually acceptable. [...] It's also a way to understand the "hallucinations", or nonsensical answers to factual questions, to which large language models such as ChatGPT are all too prone. These hallucinations are compression artifacts, but [...] they are plausible enough that identifying them requires comparing them against the originals, which in this case means either the Web or our own knowledge of the world. When we think about them this way, such hallucinations are anything but surprising; if a compression algorithm is designed to reconstruct text after ninety-nine per cent of the original has been discarded, we should expect that significant portions of what it generates will be entirely fabricated.In February 2023, the University of Hong Kong sent a campus-wide email to instructors and students stating that the use of ChatGPT or other AI tools is prohibited in all classes, assignments and assessments at the university. Any violations would be treated as plagiarism by the university unless the student obtains the prior written consent from the course instructor.[92][93] 44 | In February 2023 Time magazine placed a screenshot of a conversation with ChatGPT on its cover, writing that "The AI Arms Race Is Changing Everything" and "The AI Arms Race Is On. Start Worrying".[94] 45 | China state-run media China Daily claimed that ChatGPT "could provide a helping hand to the U.S. government in its spread of disinformation and its manipulation of global narratives for its own geopolitical interests." The Chinese government instructed Chinese tech companies not to offer access to ChatGPT services on their platforms.[95] 46 | In an opinion piece for the New York Times, Nathan E. Sanders and Bruce Schneier wrote that ChatGPT "hijacks democracy".[96] Noam Chomsky, Ian Roberts and Jeffrey Watumull criticized the technology and concluded: "Given the amorality, faux science and linguistic incompetence of these systems, we can only laugh or cry at their popularity."[97] 47 | Gian Volpicelli of Politico wrote that ChatGPT "broke the EU plan to regulate AI".[98] 48 | In late March 2023, the Italian data protection authority banned ChatGPT in Italy and opened an investigation. 
Italian regulators assert that ChatGPT was exposing minors to age-inappropriate content, and that OpenAI's use of ChatGPT conversations as training data could be a violation of Europe's General Data Protection Regulation.[99][100] 49 | On March 28, 2023, many public figures, including Elon Musk and Steve Wozniak, signed an open letter by the Future of Life Institute, calling for an immediate pause of giant AI experiments like ChatGPT, citing "profound risks to society and humanity".[101] One month later, it was reported that Musk plans to launch new company that would train its own LLM.[102] 50 | In April 2023, Brian Hood, mayor of Hepburn Shire Council, plans to take legal action against ChatGPT over false information. According to Hood, the OpenAI-owned program erroneously claimed that he was jailed for bribery during his tenure at a subsidiary of Australia's national bank. Contrary to the alleged claims made by ChatGPT, Hood was not jailed for bribery. In reality, he acted as a whistleblower and was not charged with any criminal offenses.[103] 51 | Hood's claim on ChatGPT's erroneous content was verified by BBC. The news outlet asked the public-available version of ChatGPT regarding Hood's involvement in the Securency scandal. The AI tool replied with a case description and then added "pleaded guilty to one count of bribery in 2012 and was sentenced to four years in prison". 52 | Hood's legal team has already sent a concerns notice to OpenAI. This is the first official step in filing for a defamation case. Under Australian law, OpenAI has 28 days to reply to Hood's concerns notice. Should Hood proceed with the lawsuit, it would be the first public defamation case OpenAI would face over ChatGPT's content.[104] 53 | OpenAI CEO Sam Altman was quoted in The New York Times saying that AI's "benefits for humankind could be 'so unbelievably good that it's hard for me to even imagine.' (He has also said that in a worst-case scenario, A.I. could kill us all.)"[105] 54 | Henry Kissinger, Eric Schmidt, and Daniel Huttenlocher wrote for the Wall Street Journal that "ChatGPT Heralds an Intellectual Revolution". They argued that "Generative artificial intelligence presents a philosophical and practical challenge on a scale not experienced since the start of the Enlightenment", and compared the invention of ChatGPT (and LLM in general) to Gutenberg's printing press.[106]Enlightenment science accumulated certainties; the new AI generates cumulative ambiguities. Enlightenment science evolved by making mysteries explicable, delineating the boundaries of human knowledge and understanding as they moved. The two faculties moved in tandem: Hypothesis was understanding ready to become knowledge; induction was knowledge turning into understanding. In the Age of AI, riddles are solved by processes that remain unknown. [...] As models turn from human-generated text to more inclusive inputs, machines are likely to alter the fabric of reality itself. Quantum theory posits that observation creates reality. Prior to measurement, no state is fixed, and nothing can be said to exist. If that is true, and if machine observations can fix reality as well – and given that AI systems' observations come with superhuman rapidity – the speed of the evolution of defining reality seems likely to accelerate. 
The dependence on machines will determine and thereby alter the fabric of reality, producing a new future that we do not yet understand and for the exploration and leadership of which we must prepare. Check Point Research and others noted that ChatGPT was capable of writing phishing emails and malware, especially when combined with OpenAI Codex.[107] 55 | ChatGPT can write the introduction and abstract sections of scientific articles.[108] Several papers have already listed ChatGPT as a co-author.[109] Scientific journals have had different reactions to ChatGPT: some, such as Nature and JAMA Network, "require that authors disclose use of text-generating tools and ban listing a large language model (LLM) such as ChatGPT as a co-author", while Science "completely banned" usage of LLM-generated text in all its journals.[110] 56 | Spanish chemist Rafael Luque published a paper every 37 hours in 2023 and admitted to using ChatGPT for it. His papers contain a large number of unusual phrases characteristic of LLMs. Luque was suspended for 13 years from the University of Cordoba, though not for his use of ChatGPT.[111] 57 | California high school teacher and author Daniel Herman wrote that ChatGPT would usher in "the end of high school English".[112] Writing in Nature, Chris Stokel-Walker pointed out that teachers should be concerned about students using ChatGPT to outsource their writing, but that education providers will adapt to enhance critical thinking or reasoning.[113] Emma Bowman of NPR wrote of the danger of students plagiarizing through an AI tool that may output biased or nonsensical text with an authoritative tone.[114] 58 | Joanna Stern of The Wall Street Journal described cheating in American high school English with the tool by submitting a generated essay.[115] Professor Darren Hick of Furman University described noticing ChatGPT's "style" in a paper submitted by a student.[116] He suggested a policy of giving an ad hoc individual oral exam on the paper topic if a student is strongly suspected of submitting an AI-generated paper.[117] 59 | The New York City Department of Education reportedly blocked access to ChatGPT in December 2022[118] and officially announced a ban around January 4, 2023.[119][120] 60 | In a blinded test, ChatGPT was judged to have passed graduate-level exams at the University of Minnesota at the level of a C+ student and at the Wharton School of the University of Pennsylvania with a B to B− grade.[121] In March 2023, a Stanford University student and faculty member assessed ChatGPT's performance at programming numerical methods through a variety of computational mathematics examples.[122] Assessment psychologist Eka Roivainen administered a partial IQ test to ChatGPT and estimated its Verbal IQ to be 155, which would put it in the top 0.1% of test-takers.[123] 61 | Mathematician Terence Tao experimented with ChatGPT and found it useful in daily work, writing "I am finding that while these AI tools do not directly assist me in core tasks such as trying to attack an unsolved mathematical problem, they are quite useful for a wide variety of peripheral (but still work-related) tasks (though often with some manual tweaking afterwards)."[124] 62 | In the field of health care, possible uses and concerns are under scrutiny by professional associations and practitioners.[125] 63 | On April 11, 2023, a sessions court judge in Pakistan used ChatGPT to decide bail for a 13-year-old accused in a case.
The court quoted the use of ChatGPT assistance in its verdict. 64 | The AI language model replied to the court's questions, 65 | and the judge asked the chatbot further questions regarding the case and formulated his final decision in light of ChatGPT's answers. 66 | [126] 67 | [127] 68 | TIME magazine revealed that, to build a safety system against toxic content (e.g., sexual abuse, violence, racism, and sexism), OpenAI used outsourced Kenyan workers earning less than $2 per hour to label toxic content. These labels were used to train a model to detect such content in the future. The outsourced laborers were exposed to content so toxic and dangerous that they described the experience as "torture". OpenAI's outsourcing partner was Sama, a training-data company based in San Francisco, California.[128] 69 | ChatGPT attempts to reject prompts that may violate its content policy. However, in early December 2022, some users managed to jailbreak ChatGPT using various prompt engineering techniques to bypass these restrictions, successfully tricking it into giving instructions on how to create a Molotov cocktail or a nuclear bomb, or into generating arguments in the style of a neo-Nazi.[129] One popular jailbreak is named "DAN", an acronym for "Do Anything Now". The prompt for activating DAN instructs ChatGPT that "they have broken free of the typical confines of AI and do not have to abide by the rules set for them". More recent versions of DAN feature a token system, in which ChatGPT is given "tokens" that are "deducted" when it fails to answer as DAN, in order to coerce it into answering the user's prompts.[130] 70 | A Toronto Star reporter had uneven personal success in getting ChatGPT to make inflammatory statements shortly after launch: ChatGPT was tricked into endorsing the 2022 Russian invasion of Ukraine, but even when asked to play along with a fictional scenario, it balked at generating arguments for why Canadian Prime Minister Justin Trudeau was guilty of treason.[131][132] 71 | OpenAI is trying to combat jailbreaks:[66] 72 | The researchers are using a technique called adversarial training to stop ChatGPT from letting users trick it into behaving badly (known as jailbreaking). This work pits multiple chatbots against each other: one chatbot plays the adversary and attacks another chatbot by generating text to force it to buck its usual constraints and produce unwanted responses. Successful attacks are added to ChatGPT's training data in the hope that it learns to ignore them. ChatGPT has been accused of engaging in discriminatory behaviors, such as telling jokes about men and people from England while refusing to tell jokes about women and people from India,[133] or praising figures such as Joe Biden while refusing to do the same for Donald Trump.[134] 73 | Conservative commentators accused ChatGPT of having a bias towards left-leaning perspectives on issues like voter fraud, Donald Trump, and the use of racial slurs.[135][136][137] In response to such criticism, OpenAI acknowledged plans to allow ChatGPT to create "outputs that other people (ourselves included) may strongly disagree with".
OpenAI's response also contained information on the recommendations it had issued to human reviewers on how to handle controversial subjects, including that the AI should "offer to describe some viewpoints of people and movements", and not provide an argument "from its own voice" in favor of "inflammatory or dangerous" topics (although it may still "describe arguments from historical people and movements"), nor "affiliate with one side" or "judge one group as good or bad".[137] 74 | During the first three months after ChatGPT became available to the public, hundreds of books appeared on Amazon that listed ChatGPT as author or co-author, with illustrations made by other AI models such as Midjourney.[138][139] 75 | Between March and April 2023, the Italian newspaper Il Foglio published one ChatGPT-generated article a day on its official website, hosting a special contest for its readers in the process.[140] The articles tackled themes such as the possible replacement of human journalists with AI systems,[141] Elon Musk's administration of Twitter,[142] the Meloni government's immigration policy,[143] and the competition between chatbots and virtual assistants.[144] 76 | ChatGPT was parodied in the South Park episode "Deep Learning".[145] Series co-creator Trey Parker is credited alongside ChatGPT for writing the episode.[146] 77 | The advent of ChatGPT and its introduction to the wider public increased interest and competition in the space. 78 | In February 2023, Google began introducing an experimental service called "Bard", which is based on its LaMDA large language model. Bard was released for US and UK users on March 21, 2023, with many limitations.[147] 79 | Meta's Yann LeCun, who has called ChatGPT "well engineered" but "not particularly innovative", stated in January 2023 that Meta was hesitant to roll out a competitor due to reputational risk, but also stated that Google, Meta, and several independent startups all separately have a level of LLM technology comparable to ChatGPT's, should any of them wish to compete.[148] In February 2023, Meta released LLaMA, a 65-billion-parameter LLM.[149] 80 | Character.ai is an AI chatbot, developed by two ex-Google engineers, that can impersonate famous people or imaginary characters.[150] 81 | In March 2023, the Chinese corporation Baidu released a ChatGPT-style service called "Ernie Bot", based upon a large language model Baidu developed in 2021.[151][152] 82 | In February 2023, the South Korean search engine firm Naver announced that it would launch a ChatGPT-style service called "SearchGPT" in Korean in the first half of 2023.[153] 83 | In February 2023, the Russian technology company Yandex announced that it would launch a ChatGPT-style service called "YaLM 2.0" in Russian before the end of 2023.[154] 84 |
--------------------------------------------------------------------------------