├── .python-version
├── .github
  ├── ISSUE_TEMPLATE
  │ ├── config.yml
  │ ├── 03_others.yml
  │ ├── 01_bug_report.yml
  │ └── 02_feature_request.yml
  ├── pull_request_template.md
  ├── FUNDING.yml
  ├── branch-convention.md
  ├── semantic.yml
  └── commit-convention.md
├── img
  └── angryface.png
├── .gitignore
├── constants.py
├── main.py
├── LICENSE
├── pyproject.toml
├── llama2gptq
  ├── qa.py
  ├── ingest.py
  ├── quantize.py
  └── generate.py
├── chat.py
├── README.md
├── requirements.lock
└── requirements-dev.lock

/.python-version:
--------------------------------------------------------------------------------
1 | 3.10.0
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: false
2 |
--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
1 | ## Why did you do this?
2 |
3 | ## How did you do it?
4 |
--------------------------------------------------------------------------------
/img/angryface.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/seonglae/llama2gptq/HEAD/img/angryface.png
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github:
2 |   - seonglae
3 | custom:
4 |   - 'https://paypal.me/seonglae'
5 |   - 'https://www.buymeacoffee.com/seongland'
6 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | knowledge
2 | .env
3 | __pycache__
4 | # mypy
5 | .mypy_cache/
6 | .dmypy.json
7 | dmypy.json
8 | .venv
9 | .chroma
10 | db*
11 |
--------------------------------------------------------------------------------
/.github/branch-convention.md:
--------------------------------------------------------------------------------
1 | ## Git Branch Name Convention
2 |
3 | #### TL;DR:
4 |
5 | Branch names must match the following regex:
6 |
7 | ```re
8 | ^(feature|bug|document|style|refactor|test|deps)\/\#[0-9]{1,5}-[a-z|A-Z|\-|0-9]{1,20}
9 | ```
10 |
--------------------------------------------------------------------------------
/.github/semantic.yml:
--------------------------------------------------------------------------------
1 | titleOnly: true
2 | types:
3 |   - feat
4 |   - fix
5 |   - docs
6 |   - style
7 |   - refactor
8 |   - test
9 |   - ci
10 |   - cd
11 |   - build
12 |   - lint
13 |   - merge
14 |   - typing
15 |   - perf
16 |   - meta
17 |   - deps
18 |   - pr
19 |   - chore
20 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/03_others.yml:
--------------------------------------------------------------------------------
1 | ---
2 | name: Other Issue
3 | description: Any other kind of issue
4 | body:
5 |   - type: textarea
6 |     id: summary
7 |     attributes:
8 |       label: Summary
9 |       description: A precise description of the issue
10 |     validations:
11 |       required: true
12 |
--------------------------------------------------------------------------------
/constants.py:
--------------------------------------------------------------------------------
1 | from os.path import realpath, join, dirname
2 |
3 | from chromadb.config import Settings
4 | from langchain.document_loaders.base import BaseLoader
5 |
6 |
7 | ROOT_DIRECTORY = dirname(realpath(__file__))
8 |
9 | SOURCE_DIRECTORY = join(ROOT_DIRECTORY, 'knowledge')
10 |
11 | PERSIST_DIRECTORY = join(ROOT_DIRECTORY, 'db')
12 |
13 | CHROMA_SETTINGS = Settings(
14 |     chroma_db_impl='duckdb+parquet',
15 |     persist_directory=PERSIST_DIRECTORY,
16 |     anonymized_telemetry=False
17 | )
18 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/01_bug_report.yml:
--------------------------------------------------------------------------------
1 | ---
2 | name: 🐛 Bug Report
3 | description: Something isn't working as expected
4 | labels:
5 |   - bug
6 | body:
7 |   - type: input
8 |     id: testcase
9 |     attributes:
10 |       label: Reproducible test case
11 |       description:
12 |         If possible, please create a minimal test case that reproduces your
13 |         problem.
14 |     validations:
15 |       required: true
16 |   - type: textarea
17 |     id: summary
18 |     attributes:
19 |       label: Additional information
20 |       description:
21 |         Please share any other relevant information not mentioned above. What
22 |         did you expect to happen? What do you think the problem might be?
23 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/02_feature_request.yml:
--------------------------------------------------------------------------------
1 | ---
2 | name: 🟢 Feature Request
3 | description: Wouldn’t it be nice if
4 | labels:
5 |   - feature
6 | body:
7 |   - type: textarea
8 |     id: summary
9 |     attributes:
10 |       label: What?
11 |       description: Describe your feature idea
12 |     validations:
13 |       required: true
14 |   - type: textarea
15 |     id: why
16 |     attributes:
17 |       label: Why?
18 |       description: Describe the problem you are facing
19 |     validations:
20 |       required: true
21 |   - type: textarea
22 |     id: alternatives
23 |     attributes:
24 |       label: How?
25 |       description:
26 |         Describe what you tried or your ideas for implementing the feature
27 |     validations:
28 |       required: false
29 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | import fire
2 |
3 | from llama2gptq.ingest import ingest
4 | from llama2gptq.qa import chat_cli
5 | from llama2gptq.quantize import quantization
6 | from constants import (SOURCE_DIRECTORY, PERSIST_DIRECTORY)
7 |
8 |
9 | def chat(device: str = "cuda") -> str:
10 |     stats = chat_cli(device)
11 |     return stats
12 |
13 |
14 | def process(src_dir: str = SOURCE_DIRECTORY, dst_dir: str = PERSIST_DIRECTORY, device: str = "cuda") -> str:
15 |     return ingest(src_dir, dst_dir, device)
16 |
17 |
18 | def quantize(model: str = "meta-llama/Llama-2-13b-chat-hf",
19 |              output: str = "llama-2-13b-chat-hf-gptq",
20 |              push: bool = False, owner: str = 'seonglae',
21 |              safetensor: bool = False, inference_only: bool = False) -> str:
22 |     quantization(model, output, push, owner, safetensor, inference_only)
23 |     return 'complete'
24 |
25 |
26 | if __name__ == '__main__':
27 |     fire.Fire()
28 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Alan Jo
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | authors = [
3 |   {name = "seonglae", email = "sungle3737@gmail.com"},
4 | ]
5 | dependencies = [
6 |   "langchain~=0.0.225",
7 |   "chromadb~=0.3.26",
8 |   "transformers~=4.30.2",
9 |   "InstructorEmbedding~=1.0.1",
10 |   "sentence_transformers~=2.2.2",
11 |   "unstructured~=0.7.12",
12 |   "torch>=2.0.1",
13 |   "auto_gptq~=0.2.2",
14 |   "einops~=0.6.1",
15 |   "fire~=0.5.0",
16 |   "streamlit-chat~=0.1.1",
17 |   "protobuf<=3.20.0"
18 | ]
19 | description = "Chat AI that provides responses with reference documents via prompt engineering over a vector database."
20 | license = {text = "MIT"}
21 | name = "llama2gptq"
22 | readme = "README.md"
23 | requires-python = ">= 3.8"
24 | version = "0.1.0"
25 |
26 | [build-system]
27 | build-backend = "hatchling.build"
28 | requires = ["hatchling"]
29 |
30 | [tool.rye]
31 | dev-dependencies = [
32 |   "autopep8~=2.0.2",
33 |   "pip~=23.1.2",
34 |   "mypy~=1.3.0",
35 |   "setuptools~=68.0.0",
36 | ]
37 | managed = true
38 |
39 | [[tool.rye.sources]]
40 | name = "cuda"
41 | url = "https://download.pytorch.org/whl/cu118"
42 | type = "index"
43 |
--------------------------------------------------------------------------------
/llama2gptq/qa.py:
--------------------------------------------------------------------------------
1 | from time import time
2 | from typing import Tuple, List
3 |
4 | import torch
5 | from transformers import Pipeline
6 |
7 | from llama2gptq.ingest import extract_ref
8 | from llama2gptq.generate import load_embeddings, load_db, load_model, TokenStoppingCriteria
9 |
10 |
11 | @torch.no_grad()
12 | def qa(query, device, db, transformer: Pipeline, history: List[List[str]],
13 |        user_token="USER: ",
14 |        bot_token="ASSISTANT: ",
15 |        sys_token="",
16 |        system="",
17 |        extract_ref=extract_ref) -> Tuple:
18 |     start = time()
19 |
20 |     if db is None:
21 |         embeddings = load_embeddings(device)
22 |         db = load_db(device, embeddings)
23 |     if transformer is None:
24 |         transformer = load_model(device)
25 |
26 |     # input similarity
27 |     conversation = [f"{user_token}{q}\n{bot_token}{a}\n" for [q, a] in history]
28 |     prompt = f"{sys_token}{system}" + \
29 |         "".join(conversation) + f'{user_token}{query}\n{bot_token}'
30 |     print(prompt)
31 |
32 |     # Inference
33 |     criteria = TokenStoppingCriteria(
34 |         user_token.strip(), prompt, transformer.tokenizer)
35 |     response = transformer(prompt, stopping_criteria=criteria)[
36 |         0]["generated_text"]
37 |     answer = response.replace(prompt, "").strip()
38 |
39 |     # output similarity
40 |     refs = db.search(
41 |         f'{user_token}{query}\n{bot_token}', search_type="similarity")
42 |
43 |     # Print the result
44 |     print('\nHelpful links\n')
45 |     for ref in refs:
46 |         ref_info = extract_ref(ref)
47 |         print(f"{ref_info['title']}: {ref_info['link']}")
48 |
49 |     print(f"\nTime taken: {time() - start} seconds\n")
50 |     print(prompt + answer + '\n')
51 |
52 |     return (answer, refs)
53 |
54 |
55 | def qa_cli(device, db, llm, history) -> Tuple:
56 |     query = input("\nQuestion: ")
57 |     if query == "exit":
58 |         return ()
59 |     return (query, *qa(query, device, db, llm, history))
60 |
61 |
62 | def chat_cli(device='cuda'):
63 |     embeddings = load_embeddings(device)
64 |     db = load_db(device, embeddings)
65 |     transformer = load_model(device)
66 |
67 |     pingpongs = []
68 |     while True:
69 |         history = [[pingpong[0], pingpong[1]] for pingpong in pingpongs]
70 |         pingpong = qa_cli(device, db, transformer, history)
71 |         if len(pingpong) == 0:
72 |             break
73 |         pingpongs.append(pingpong)
74 |     return pingpongs
75 |
--------------------------------------------------------------------------------
/llama2gptq/ingest.py:
--------------------------------------------------------------------------------
1 | import os
2 | from re import split
3 | from typing import List, Type, Dict
4 | from pathlib import Path
5 |
6 | from langchain.docstore.document import Document
7 | from langchain.embeddings import HuggingFaceInstructEmbeddings
8 | from langchain.text_splitter import RecursiveCharacterTextSplitter
9 | from langchain.document_loaders.base import BaseLoader
10 | from langchain.vectorstores import Chroma
11 | from langchain.document_loaders import (
12 |     CSVLoader,
13 |     PDFMinerLoader,
14 |     TextLoader,
15 |     UnstructuredMarkdownLoader,
16 |     UnstructuredExcelLoader,
17 | )
18 |
19 | from constants import (CHROMA_SETTINGS)
20 |
21 |
22 | DOCUMENT_MAP = {
23 |     ".txt": TextLoader,
24 |     ".pdf": PDFMinerLoader,
25 |     ".csv": CSVLoader,
26 |     ".xls": UnstructuredExcelLoader,
27 |     ".xlsx": UnstructuredExcelLoader,
28 |     ".md": TextLoader,
29 | }
30 |
31 |
32 | def load_documents(folder_path: str) -> List[Document]:
33 |     glob = Path(folder_path).glob
34 |     ps = list(glob("**/*.md"))
35 |     documents = []
36 |     for p in ps:
37 |         file_extension = os.path.splitext(p)[1]
38 |         loader_class = DOCUMENT_MAP.get(file_extension)
39 |         if loader_class:
40 |             loader = loader_class(p, encoding="utf-8")
41 |             document = loader.load()[0]
42 |             document.metadata["source"] = str(p)
43 |             documents.append(document)
44 |         else:
45 |             continue
46 |     return documents
47 |
48 |
49 | def extract_ref(ref: Document) -> Dict[str, str]:
50 |     source = split(r"\\|/", ref.metadata["source"])[-1]
51 |     slug = split(r" |.md", source)[-2]
52 |     title = ' '.join(slug.split('-')[:-1])
53 |     link = f"https://texonom.com/{slug}"
54 |     return {"title": title, "link": link}
55 |
56 |
57 | def ingest(source: str, output: str, device='cuda'):
58 |     print(f"Loading documents from {source}")
59 |     documents = load_documents(source)
60 |     for doc in documents:
61 |         doc.metadata["source"] = extract_ref(doc)['link']
62 |     text_splitter = RecursiveCharacterTextSplitter(
63 |         chunk_size=1000, chunk_overlap=200)
64 |     texts = text_splitter.split_documents(documents)
65 |     print(f"Loaded {len(documents)} documents from {source}")
66 |     print(f"Split into {len(texts)} chunks of text")
67 |
68 |     embeddings = HuggingFaceInstructEmbeddings(
69 |         model_name="intfloat/multilingual-e5-large",
70 |         model_kwargs={"device": device},
71 |     )
72 |     db = Chroma.from_documents(
73 |         texts,
74 |         embeddings,
75 |         persist_directory=output,
76 |         client_settings=CHROMA_SETTINGS,
77 |     )
78 |     db.persist()
79 |
--------------------------------------------------------------------------------
/llama2gptq/quantize.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from transformers import AutoTokenizer, TextGenerationPipeline, GenerationConfig, LlamaTokenizer, LlamaTokenizerFast
3 | from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
4 |
5 |
6 | def quantization(source_model: str, output: str, push: bool, owner: str,
7 |                  safetensor=False, inference_only=False):
8 |     logging.basicConfig(
9 |         format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
10 |     )
11 |
12 |     tokenizer = AutoTokenizer.from_pretrained(
13 |         source_model, use_fast=True, use_auth_token=True)
14 |     examples = [
15 |         tokenizer(
16 |             "Texonom is a knowledge system that can help you with your daily tasks using an AI chatbot."
17 |         )
18 |     ]
19 |
20 |     quantize_config = BaseQuantizeConfig(
21 |         bits=4,  # quantize model to 4-bit
22 |         group_size=128,  # it is recommended to set the value to 128
23 |         desc_act=False,  # disabling act-order can significantly speed up inference, but perplexity may be slightly worse
24 |     )
25 |
26 |     # quantize the model; the examples should be a list of dicts whose keys can only be "input_ids" and "attention_mask"
27 |     if not inference_only:
28 |         # load the un-quantized model; by default the model is loaded into CPU memory
29 |         model = AutoGPTQForCausalLM.from_pretrained(
30 |             source_model, quantize_config, use_safetensors=safetensor)
31 |         model.quantize(examples)
32 |         model.save_quantized(output, use_safetensors=safetensor)
33 |
34 |     # load the quantized model to the first GPU
35 |     quantized = AutoGPTQForCausalLM.from_quantized(
36 |         output,
37 |         device="cuda:0",
38 |         use_safetensors=safetensor
39 |     )
40 |
41 |     # inference with model.generate
42 |     query = "USER: Are you AI? Say yes or no.\n ASSISTANT:"
43 |
44 |     # or you can also use the pipeline
45 |     pipeline = TextGenerationPipeline(model=quantized, tokenizer=tokenizer)
46 |     print(pipeline(query)[0]["generated_text"])
47 |
48 |     # push the quantized model to the Hugging Face Hub.
49 |     # to use use_auth_token=True, log in first via `huggingface-cli login`.
50 |     if push and not inference_only:
51 |         commit_message = f"build: AutoGPTQ for {source_model}" + \
52 |             f": {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
53 |         generation_config = GenerationConfig.from_pretrained(source_model)
54 |         generation_config.push_to_hub(
55 |             output, use_auth_token=True, commit_message=commit_message)
56 |         tokenizer.push_to_hub(output, use_auth_token=True,
57 |                               commit_message=commit_message)
58 |         repo_id = f"{owner}/{output}"
59 |         quantized.push_to_hub(repo_id, use_safetensors=safetensor,
60 |                               commit_message=commit_message, use_auth_token=True)
61 |
--------------------------------------------------------------------------------
/chat.py:
--------------------------------------------------------------------------------
1 | from re import split
2 |
3 | import torch
4 | import streamlit as st
5 | from streamlit_chat import message
6 |
7 | from llama2gptq.qa import qa, load_model, load_db
8 | from llama2gptq.ingest import extract_ref
9 |
10 | DEVICE = 'cuda'
11 | TITLE = 'LLaMa2 GPTQ'
12 | HUG = 'https://em-content.zobj.net/source/microsoft-teams/363/hugging-face_1f917.png'
13 | ANGRY = 'https://em-content.zobj.net/source/microsoft-teams/363/pouting-face_1f621.png'
14 |
15 |
16 | st.set_page_config(page_title=TITLE)
17 | st.header(TITLE)
18 | st.markdown('''
19 | ### Ask anything about [Texonom](https://texonom.com).
20 | Ask questions about recently learned knowledge
21 | ''', unsafe_allow_html=True)
22 |
23 |
24 | @st.cache_resource
25 | def load_transformer():
26 |     return (load_model(DEVICE), load_db(DEVICE))
27 |
28 |
29 | transformer, db = load_transformer()
30 |
31 | styl = """
32 |
48 | """
49 |
50 | BTN_STYLE = """
51 | color: #aaa;
52 | padding-right: 0.5rem;
53 | """
54 |
55 |
56 | st.markdown(styl, unsafe_allow_html=True)
57 |
58 | if 'generated' not in st.session_state:
59 |     st.session_state['generated'] = []
60 |
61 | if 'past' not in st.session_state:
62 |     st.session_state['past'] = []
63 |
64 | if 'answers' not in st.session_state:
65 |     st.session_state['answers'] = []
66 |
67 |
68 | def query(query):
69 |     st.session_state.past.append(query)
70 |     history = []
71 |     for i, _ in enumerate(st.session_state['generated']):
72 |         history.append([st.session_state['past'][i],
73 |                         st.session_state["generated"][i]])
74 |
75 |     answer, refs = qa(query, DEVICE, db, transformer, history)
76 |
77 |     # Append references
78 |     st.session_state.generated.append(answer)
79 |
80 |     # Generate HTML
81 |     answer += '<br/><br/>References: '
82 |     for ref in refs:
83 |         ref_info = extract_ref(ref)
84 |         btn = f"<a style='{BTN_STYLE}' href='{ref_info['link']}'>{ref_info['title']}</a>"
85 |         answer += btn
86 |
87 |     st.session_state.answers.append(answer)
88 |     return answer
89 |
90 |
91 | def get_text():
92 |     input_text = st.text_input("You: ", key="input")
93 |     return input_text
94 |
95 |
96 | user_input = get_text()
97 |
98 |
99 | if user_input:
100 |     query(user_input)
101 |
102 |
103 | if st.session_state['generated']:
104 |     for i, _ in enumerate(st.session_state['generated']):
105 |         message(st.session_state['past'][i], is_user=True,
106 |                 key=str(i) + '_user', logo=HUG)
107 |         message(st.session_state["answers"][i],
108 |                 key=str(i), logo=ANGRY, allow_html=True)
109 |
--------------------------------------------------------------------------------
/.github/commit-convention.md:
--------------------------------------------------------------------------------
1 | ## Git Commit Message Convention
2 |
3 | > This is adapted from [Vite's commit convention](https://github.com/vitejs/vite/blob/main/.github/commit-convention.md).
4 |
5 | #### TL;DR:
6 |
7 | Messages must be matched by the following regex:
8 |
9 |
10 | ```re
11 | ^(revert: )?(feat|fix|docs|style|refactor|test|ci|cd|build|meta|pr|lint|typing|perf|deps|merge)(\(.+\))?: .{1,50}
12 | ```
13 |
14 | #### Examples
15 |
16 | Appears under "Features" header, `dev` subheader:
17 |
18 | ```
19 | feat(dev): add 'comments' option
20 | ```
21 |
22 | Appears under "Bug Fixes" header, `dev` subheader, with a link to issue #28:
23 |
24 | ```
25 | fix(dev): fix dev error
26 |
27 | close #28
28 | ```
29 |
30 | Appears under "Performance Improvements" header, and under "Breaking Changes" with the breaking change explanation:
31 |
32 | ```
33 | perf(build): remove 'foo' option
34 |
35 | BREAKING CHANGE: The 'foo' option has been removed.
36 | ```
37 |
38 | The following commit and commit `667ecc1` do not appear in the changelog if they are under the same release. If not, the revert commit appears under the "Reverts" header.
39 |
40 | ```
41 | revert: feat(compiler): add 'comments' option
42 |
43 | This reverts commit 667ecc1654a317a13331b17617d973392f415f02.
44 | ```
45 |
46 | ### Full Message Format
47 |
48 | A commit message consists of a **header**, **body** and **footer**. The header has a **type**, **scope** and **subject**:
49 |
50 | ```
51 | <type>(<scope>): <subject>
52 | <BLANK LINE>
53 | <body>
54 | <BLANK LINE>
55 | <footer>