├── .cache
│   └── web
│       ├── Alpaca lora.pickle
│       ├── End of FTX.pickle
│       ├── Hoe maak ik pasta.pickle
│       ├── Why Llama LLM model is so popular.pickle
│       ├── Why did SVB collapsed.pickle
│       ├── digital twin有哪些用处.pickle
│       ├── what is new for gpt4.pickle
│       ├── when is End of FTX.pickle
│       ├── 日本国憲法は誰が作ったのか.pickle
│       └── 아가동산사건의 문제가 뭐야.pickle
├── .gitignore
├── .idea
│   └── .gitignore
├── CONTRIBUTING.md
├── LICENSE
├── Procfile
├── README.md
├── app.py
├── img
│   ├── architecture_roadmap.png
│   ├── explainability.png
│   └── webui.png
├── playground
│   ├── test_OpenAIAPI.py
│   ├── test_OpenAI_Embedding.py
│   ├── test_langchain_faiss.py
│   ├── test_nltk.py
│   └── test_pyterrier.py
├── requirements.txt
├── runtime.txt
└── src
    ├── BingService.py
    ├── FrontendService.py
    ├── LLMService.py
    ├── NLPUtil.py
    ├── SearchGPTService.py
    ├── SemanticSearchService.py
    ├── SourceService.py
    ├── Util.py
    ├── config
    │   └── config.yaml
    ├── flask_app.py
    ├── gradio_app.py
    ├── main.py
    ├── text_extract
    │   ├── __init__.py
    │   ├── doc
    │   │   ├── __init__.py
    │   │   ├── abc_doc_extract.py
    │   │   ├── docx_svc.py
    │   │   └── ppt_svc.py
    │   └── html
    │       ├── __init__.py
    │       ├── abc_html_extract.py
    │       ├── beautiful_soup.py
    │       └── trafilatura.py
    └── website
        ├── __init__.py
        ├── sender.py
        ├── static
        │   └── index.js
        ├── templates
        │   ├── alert_box.html
        │   ├── base.html
        │   ├── explain_result.html
        │   ├── index.html
        │   ├── index_static.html
        │   ├── prompt_examples.html
        │   ├── request_id_status_html.html
        │   ├── search_result.html
        │   └── search_result_step.html
        └── views.py

/.cache/web/Alpaca lora.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelthwan/searchGPT/f0adbd2eb5456c560a15883cd9befc82d385f3be/.cache/web/Alpaca lora.pickle
--------------------------------------------------------------------------------
/.cache/web/End of FTX.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelthwan/searchGPT/f0adbd2eb5456c560a15883cd9befc82d385f3be/.cache/web/End of FTX.pickle
--------------------------------------------------------------------------------
/.cache/web/Hoe maak ik pasta.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelthwan/searchGPT/f0adbd2eb5456c560a15883cd9befc82d385f3be/.cache/web/Hoe maak ik pasta.pickle
--------------------------------------------------------------------------------
/.cache/web/Why Llama LLM model is so popular.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelthwan/searchGPT/f0adbd2eb5456c560a15883cd9befc82d385f3be/.cache/web/Why Llama LLM model is so popular.pickle
--------------------------------------------------------------------------------
/.cache/web/Why did SVB collapsed.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelthwan/searchGPT/f0adbd2eb5456c560a15883cd9befc82d385f3be/.cache/web/Why did SVB collapsed.pickle
--------------------------------------------------------------------------------
/.cache/web/digital twin有哪些用处.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelthwan/searchGPT/f0adbd2eb5456c560a15883cd9befc82d385f3be/.cache/web/digital twin有哪些用处.pickle
--------------------------------------------------------------------------------
/.cache/web/what is new for gpt4.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelthwan/searchGPT/f0adbd2eb5456c560a15883cd9befc82d385f3be/.cache/web/what is new for gpt4.pickle
--------------------------------------------------------------------------------
/.cache/web/when is End of FTX.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelthwan/searchGPT/f0adbd2eb5456c560a15883cd9befc82d385f3be/.cache/web/when is End of FTX.pickle
--------------------------------------------------------------------------------
/.cache/web/日本国憲法は誰が作ったのか.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelthwan/searchGPT/f0adbd2eb5456c560a15883cd9befc82d385f3be/.cache/web/日本国憲法は誰が作ったのか.pickle
--------------------------------------------------------------------------------
/.cache/web/아가동산사건의 문제가 뭐야.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelthwan/searchGPT/f0adbd2eb5456c560a15883cd9befc82d385f3be/.cache/web/아가동산사건의 문제가 뭐야.pickle
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.docx
2 | *.doc
3 | *.pptx
4 | *.ppt
5 | *.pdf
6 | *.bf
7 | *.fsarrayfile
8 | *.fsomapfile
9 | *.fsomaphash
10 | *.fsomapid
11 | *.fsomapfile
12 | *.fsomapfile.0
13 | *.idx
14 | *.zdata
15 | *.pyc
16 | data*.properties
17 | /.idea/*
18 | /venv/*
19 | cache
20 | *.pickle
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | # Editor-based HTTP Client requests
5 | /httpRequests/
6 | # Datasource local storage ignored files
7 | /dataSources/
8 | /dataSources.local.xml
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # I would like to make it better..
2 | 
3 | Thanks for your contribution!
4 | 
5 | ### Determine what to do
6 | 
7 | There are so many possibilities to start with. For example:
8 | 
9 | - Studies on API calls, like Toolformer
10 | - Studies on uncertainty in the search results. If the response is unsure or the sources don't contain useful info, it shouldn't answer confidently.
11 | - Studies on open-source model integration so that it does not rely on an external API. Better yet, using an RLHF-trained model
12 | - Studies on better footnote generation: word-level footnotes, or footnotes generated intrinsically by the model.
13 | - Prompt engineering
14 | - Support for markdown, code results, point-form results, etc.
15 | - Use it to generate training data for another LLM
16 | - Make a better/more robust UI
17 | - ...
18 | 
19 | ### Taking on Tasks
20 | 
21 | Please create a GitHub issue for a problem that appeals to you.
22 | If there are any issues/features you want to address, mention them in your comment along with a brief explanation of
23 | how you'll resolve the issue. As soon as a project coordinator assigns you the problem, you can start working on it. (But you can still start first!)
24 | 
25 | ### Submitting a Pull Request
26 | (We are not deeply familiar with the GitHub process; this section references LAION-AI/Open-Assistant.)
27 | 
28 | 1. 
[Fork this project repository](https://docs.github.com/en/get-started/quickstart/fork-a-repo) 29 | and clone it to your local machine. (Read more 30 | [About Forks](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/about-forks)) 31 | 1. Before working on any changes, try to 32 | [sync the forked repository](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork) 33 | to keep it up-to-date with the upstream repository. 34 | 1. On a 35 | [new branch](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-and-deleting-branches-within-your-repository) 36 | in your fork (aka a "feature branch" and not `master`) work on a small focused change that only touches on a few files. 37 | 1. Package up a small bit of work that solves part of the problem 38 | [into a Pull Request](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request-from-a-fork) 39 | and 40 | [send it out for review](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/requesting-a-pull-request-review). 41 | [Here](https://github.com/michaelthwan/searchGPT/pull/42) is an example PR 42 | for this project to illustrate this flow. 43 | 1. If you're lucky, we can merge your change into `master` without any problems. 44 | If there are changes to files you're working on, resolve them by: 45 | 1. First try to rebase as suggested 46 | [in these instructions](https://timwise.co.uk/2019/10/14/merge-vs-rebase/#should-you-rebase). 47 | 1. If rebasing feels too painful, merge as suggested 48 | [in these instructions](https://timwise.co.uk/2019/10/14/merge-vs-rebase/#should-you-merge). 49 | 1. Once you've resolved conflicts (if any), finish the review and 50 | [squash and merge](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/incorporating-changes-from-a-pull-request/about-pull-request-merges#squash-and-merge-your-commits) 51 | your PR (when squashing try to clean up or update the individual commit 52 | messages to be one sensible single one). 53 | 1. Merge in your change and move on to a new issue or the second step of your current issue. 54 | 55 | Additionally, if someone is working on an issue that interests you, ask if they 56 | need help on it or would like suggestions on how to approach the issue. If so, 57 | share wildly. If they seem to have a good handle on it, let them work on their 58 | solution until a challenge comes up. 59 | 60 | # Thank you for your contribution! -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Michael Wan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/Procfile:
--------------------------------------------------------------------------------
1 | web: gunicorn --workers 1 --threads 2 app:app
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | searchGPT - An Open-Source RAG-based LLM Search Engine
2 | ==================================================
3 | 
4 | **searchGPT** is an open-source project to build a search engine based on Large Language Model (LLM) technology that gives natural language answers.
5 | 
6 | You may take this as a **minimal implementation of the new Bing, mainly for search and question answering**.
7 | 
8 | It supports answers based on web search content or file content.
9 | 
10 | Please give me a star if you like it! 🌟
11 | 
12 | ### **(Demo page link is available below!)**
13 | 
14 | ![webui](/img/webui.png)
15 | ![explainability](/img/explainability.png)
16 | 
17 | Features
18 | --------
19 | 
20 | * Source:
21 |   * Web search with real-time results
22 |   * File content search (PPT/DOC/PDF, etc.)
23 | * Semantic search over sources ([FAISS](https://github.com/facebookresearch/faiss) / [pyterrier](https://github.com/terrier-org/pyterrier))
24 | * LLM integration ([OpenAI](https://platform.openai.com/docs/api-reference?lang=python) / [GooseAI](https://goose.ai/), etc.)
25 | * Frontend: easy-to-use and intuitive user interface
26 | 
27 | Demo page
28 | ---------------
29 | https://searchgpt-demo.herokuapp.com/index
30 | - Please do not abuse it with any programs.
31 | - Please be patient with the loading time, which usually takes ~10 seconds.
32 | 
33 | Architecture and roadmap
34 | ------------------------
35 | ![architecture_roadmap](/img/architecture_roadmap.png)
36 | 
37 | Why RAG?
38 | ---------------
39 | Because an LLM cannot learn everything during training, real-time factual information is needed for reference.
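
The flow is therefore "retrieve-then-read": fetch real sources first, then let the LLM answer from them with citations. A minimal sketch of the idea is below (the function names are illustrative only, not this project's actual API; the real pipeline lives under `src/`):

```python
# Retrieve-then-read in miniature. The caller supplies the three capabilities
# (web search, semantic ranking, LLM call) as plain functions.
def answer_with_rag(query, web_search, semantic_search, llm):
    websites = web_search(query)                 # e.g. Bing Web Search API
    passages = semantic_search(query, websites)  # e.g. FAISS / pyterrier ranking
    context = "\n".join(f"Source [{i + 1}]: {p}" for i, p in enumerate(passages))
    prompt = (f"Web search result:\n{context}\n\n"
              f"Instructions: write a reply to the query, citing sources "
              f"with [number] notation.\nQuery: {query}")
    return llm(prompt)
```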
40 | 41 | Example of ungrounded false answers (both using `text-curie-001` model): 42 | 43 | **Prompt: what is ghost kitchen** 44 | 45 | | Original answer | RAG-based answer (SearchGPT) | 46 | |---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 47 | | The Ghost Kitchen is a place where the departed spirits of cooks and dishwashers can cook and wash dishes with ease. The kitchen is said to be especially busy on Full Moon nights | A ghost kitchen is a physical space for operators to create food for off-premises consumption. [1] This type of kitchen doesn't have waiters, no dining room, and no parking lot – really, no public presence whatsoever. [1] This kitchen is designed to cut unnecessary costs, allowing a restaurant to expand easily with low capital. [2] With apps like GrubHub and DoorDash, restaurant owners can easily find orders and exceed their expected revenue. [3] | 48 | | Ghost Kitchen is a pop-up restaurant in London that is completely without food. Instead, the restaurant is filled with spirits, who banish negative energy and help to make the guests' dining experience more enjoyable. | A ghost kitchen is a professional kitchen that is set up to cook food for delivery or take-out but does not have a dine-in option. [1] Ghost kitchens are often located in areas with high foot traffic and are used by multiple restaurants. [1] This allows restaurant chains to test new menu items without the investment of a brick-and-mortar location. [2] By having all of your orders in one place, a ghost kitchen can minimize the chances of errors and increase efficiency. [3] Additionally, by being located close to customers, a ghost kitchen can increase revenue. [4] While ghost kitchens are here to stay, they will likely face competition from other restaurants in the near future. [1] | 49 | 50 | **Prompt: what is the meaning of wwww in Japanese** 51 | 52 | | Original answer | RAG-based answer (SearchGPT) | 53 | |---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 54 | | Japanese "www" is typically used as a placeholder for "www." For example, "This is www." is typically translated to "Kore wa www.". Japanese "www" is also used to indicate that a webpage is not currently being viewed. 
| The meaning of "www" in Japanese is typically used to show amusement or to convey sarcasm. [1] It can also be used as a casual way to say "yes" or "okay." Additionally, speakers of Japanese may use "w" to represent the kana "笑" in online chat because it looks similar to the character for "laugh." [2] |
55 | 
56 | 
57 | Getting Started
58 | ---------------
59 | 
60 | ### Prerequisites
61 | 
62 | To run `searchGPT`, you'll need:
63 | 
64 | * [Python 3.10.8](https://www.python.org/downloads/)
65 | * [OpenAI API Key](https://beta.openai.com/signup) or [GooseAI API Key](https://goose.ai/)
66 |   * OpenAI: first $18 is free (enough for 3000+ searches)
67 |   * GooseAI: first $10 is free
68 | * [Azure Bing Search Subscription Key](https://www.microsoft.com/en-us/bing/apis/bing-web-search-api/)
69 |   * A free tier is available (3 searches per second, 1000 searches per month)
70 | 
71 | ### Installation
72 | 
73 | 1. Create your Python or Anaconda environment and install the Python packages
74 | 
75 | Native
76 | ```
77 | # using python=3.10.8
78 | pip install -r requirements.txt
79 | ```
80 | 
81 | Anaconda
82 | ```
83 | conda create --name searchgpt python=3.10.8
84 | conda activate searchgpt
85 | pip install -r requirements.txt
86 | ```
87 | 
88 | 2. Input API keys (OpenAI/Azure Bing Search) in `src/config/config.yaml`, or via the UI if the web app is used (a sketch of the expected keys appears at the end of this README)
89 | 3. Run `app.py` (or `flask_app.py`) to launch the frontend web app.
90 | 4. For quick testing, run `main.py` (stdout output only).
91 | 
92 | Contributing
93 | ------------
94 | 
95 | We welcome contributions to **searchGPT**! (Especially frontend developers.)
96 | 
97 | If you're interested in contributing, please take a look at our [contributing guidelines](./CONTRIBUTING.md) for more information.
98 | 
99 | License
100 | -------
101 | 
102 | `searchGPT` is licensed under the [MIT License](./LICENSE).
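
Appendix: config sketch
-----------------------

For reference, here is a sketch of the main configuration keys that the code under `src/` reads. The key names follow the code; every value below is a placeholder or an assumed default that you must replace with your own:

```yaml
general:
  language: en-US                  # also used as the Bing `mkt` market parameter
source_service:
  is_use_source: true
  bing_search:
    subscription_key: YOUR_BING_KEY
    end_point: https://api.bing.microsoft.com   # "/v7.0/search" is appended by the code
    text_extract: trafilatura      # or: beautifulsoup
    result_count: 8                # assumed value
    sentence_count_per_site: 30    # assumed value
llm_service:
  provider: openai                 # or: goose_ai
  openai_api:
    api_key: YOUR_OPENAI_KEY
    model: gpt-3.5-turbo
    stream: true
    prompt:
      prompt_length_limit: 3000    # assumed value
```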
103 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | from src.website import create_app 2 | import os, sys 3 | 4 | sys.path.append(os.path.join(os.path.dirname(__file__), "src")) 5 | 6 | app = create_app() 7 | 8 | if __name__ == '__main__': 9 | app.run(debug=True) 10 | -------------------------------------------------------------------------------- /img/architecture_roadmap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelthwan/searchGPT/f0adbd2eb5456c560a15883cd9befc82d385f3be/img/architecture_roadmap.png -------------------------------------------------------------------------------- /img/explainability.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelthwan/searchGPT/f0adbd2eb5456c560a15883cd9befc82d385f3be/img/explainability.png -------------------------------------------------------------------------------- /img/webui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelthwan/searchGPT/f0adbd2eb5456c560a15883cd9befc82d385f3be/img/webui.png -------------------------------------------------------------------------------- /playground/test_OpenAIAPI.py: -------------------------------------------------------------------------------- 1 | import openai 2 | 3 | # openai.organization = "" 4 | openai.api_key = "" 5 | # print(openai.Model.list()) 6 | 7 | # GPT-3 8 | # text-davinci-003 $0.0200 /1K tokens 9 | # text-curie-001 $0.0020 /1K tokens 10 | # text-babbage-001 $0.0005 /1K tokens 11 | # text-ada-001 $0.0004 /1K tokens 12 | 13 | # Codex 14 | # code-davinci-002 15 | # code-cushman-001 16 | 17 | response = openai.Completion.create( 18 | # model="text-curie-001", 19 | model="text-babbage-001", 20 | prompt="Say this is a test", 21 | max_tokens=16, 22 | temperature=0 # default=1 23 | ) 24 | 25 | for r in response.choices: 26 | print(r.text) 27 | -------------------------------------------------------------------------------- /playground/test_OpenAI_Embedding.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import openai 4 | import pandas as pd 5 | import yaml 6 | from openai.embeddings_utils import get_embedding, cosine_similarity 7 | 8 | from Util import get_project_root 9 | 10 | BASE_MODEL = "text-embedding-ada-002" # default embedding of faiss-openai 11 | 12 | 13 | def search_using_cosine_similarity(df, query): 14 | query_embedding = get_embedding(query, engine=BASE_MODEL) 15 | df["similarity"] = df['embeddings'].apply(lambda x: cosine_similarity(x, query_embedding)) 16 | 17 | results = df.sort_values("similarity", ascending=False, ignore_index=True) 18 | 19 | k = 5 20 | results = results.head(k) 21 | global sources 22 | sources = [] 23 | for i in range(k): 24 | sources.append({'Page ' + str(results.iloc[i]['page']): results.iloc[i]['text'][:150] + '...'}) 25 | print(sources) 26 | return results.head(k) 27 | 28 | 29 | def compute_embeddings(text, model="text-embedding-ada-002"): 30 | print(f'compute_embeddings() text: {text}') 31 | text = text.replace("\n", " ") 32 | return openai.Embedding.create(input=[text], model=model)['data'][0]['embedding'] 33 | 34 | 35 | def search_similar(df: pd.DataFrame, target_text, n=3, pprint=True): 36 | print(f'search_similar() text: {target_text}') 37 | 
embedding = compute_embeddings(target_text, model=BASE_MODEL)
38 |     df['similarities'] = df['embedding'].apply(lambda x: cosine_similarity(x, embedding))
39 |     res = df.sort_values('similarities', ascending=False).head(n)
40 |     return res, df
41 | 
42 | 
43 | def compute_embeddings_2(text_df, model=BASE_MODEL, chunk_size=1000):
44 |     print(f'compute_embeddings_2() len(text_df): {len(text_df)}')
45 |     text_df['text'] = text_df['text'].apply(lambda x: x.replace("\n", " "))
46 |     embeddings = []
47 |     for i in range(0, len(text_df), chunk_size):
48 |         response = openai.Embedding.create(
49 |             input=text_df['text'].tolist()[i: i + chunk_size], engine=model
50 |         )
51 |         embeddings += [r["embedding"] for r in response["data"]]
52 |     text_df['embedding'] = embeddings
53 |     return text_df
54 | 
55 | 
56 | if __name__ == '__main__':
57 |     # text_df = pd.read_csv(os.path.join(get_project_root(), 'src/text_df.csv'))
58 |     texts = [
59 |         "Discover the world of delicious beans with our premium selection.",
60 |         "Try our savory bean soup recipe for a delicious and nutritious meal.",
61 |         "Our roasted coffee beans are carefully selected for their rich and delicious flavor.",
62 |         "Beans are not only delicious, but also a great source of protein and dietary fiber.",
63 |         "Looking for a delicious vegan meal? Try our spicy black bean burger recipe.",
64 | 
65 |         "The sky is blue and the sun is shining today.",
66 |         "I need to go grocery shopping after work to pick up some milk and bread.",
67 |         "Did you hear about the new movie that just came out? It's supposed to be really good.",
68 |         "I'm planning a trip to Europe next summer and I'm so excited.",
69 |         "My cat keeps meowing at me for no reason and it's driving me crazy.",
70 |     ]
71 |     text_df = pd.DataFrame({'text': texts, 'docno': range(len(texts))})
72 |     print(text_df.shape)
73 | 
74 |     with open(os.path.join(get_project_root(), 'src/config/config.yaml')) as f:
75 |         config = yaml.load(f, Loader=yaml.FullLoader)
76 |     openai.api_key = config.get('llm_service').get('openai_api').get('api_key')
77 | 
78 |     # text_df = compute_embeddings(text_df)
79 |     # result_df = search_using_cosine_similarity(text_df, 'what is chatgpt?')
80 |     # print(result_df)
81 | 
82 |     search_text = 'delicious beans'
83 |     search_text = 'Discover the world of delicious beans with our premium selection ' * 100  # repeated on purpose: a long input to profile embedding latency
84 | 
85 |     from pyinstrument import Profiler
86 | 
87 |     profiler = Profiler()
88 |     profiler.start()
89 |     print("Sequential call mode:")
90 |     text_df['embedding'] = text_df['text'].apply(lambda x: compute_embeddings(x, model=BASE_MODEL))
91 |     res, text_df = search_similar(text_df, search_text, n=3)
92 |     print(res)
93 |     profiler.stop()
94 |     profiler.print()
95 | 
96 |     profiler = Profiler()
97 |     profiler.start()
98 |     print("Batch call mode:")
99 |     text_df = compute_embeddings_2(text_df)
100 |     res, text_df = search_similar(text_df, search_text, n=3)
101 |     print(res)
102 |     profiler.stop()
103 |     profiler.print()
104 | 
--------------------------------------------------------------------------------
/playground/test_langchain_faiss.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | # from langchain.embeddings.openai import OpenAIEmbeddings
3 | from langchain.embeddings import HuggingFaceEmbeddings
4 | from langchain.vectorstores import FAISS
5 | 
6 | if __name__ == '__main__':
7 |     search_text = 'the source of dark energy'
8 |     index_path = r"C:\xxx\searchGPT\backend\src\.index"
9 |     text_df = pd.read_csv(rf'C:\xxx\text_df.csv')
10 |     text_df['docno'] = text_df.index.tolist()
11 |     print(text_df.shape)
12 |     print(text_df)
13 |     texts, docno_list = text_df['text'].tolist(), text_df['docno'].tolist()
14 |     docno_dict = [{'docno': docno} for docno in docno_list]
15 |     embeddings = HuggingFaceEmbeddings()  # OpenAIEmbeddings() costs money (needs OPENAI_API_KEY)
16 |     faiss_index = FAISS.from_texts(texts, embeddings, metadatas=docno_dict)
17 | 
18 |     # k: Number of Documents to return. Defaults to 4.
19 |     # fetch_k: Number of Documents to fetch to pass to the MMR algorithm.
20 | k, fetch_k = 10, 999 21 | # docs = faiss_index.max_marginal_relevance_search(search_text, k=k, fetch_k=fetch_k) 22 | docs = faiss_index.similarity_search_with_score(search_text, k=k) 23 | text_list, docno_list, score_list = [], [], [] 24 | for t in docs: 25 | doc, score = t 26 | print(doc) 27 | text_list.append(doc.page_content) 28 | docno_list.append(doc.metadata['docno']) 29 | score_list.append(score) 30 | gpt_df = pd.DataFrame({'text': text_list, 'docno': docno_list, 'score': score_list}) 31 | print("=====gpt_df====") 32 | print(gpt_df.shape) 33 | print(gpt_df) 34 | -------------------------------------------------------------------------------- /playground/test_nltk.py: -------------------------------------------------------------------------------- 1 | # import nltk 2 | # 3 | # if __name__ == '__main__': 4 | # text = "There are many things you can do to learn how to run faster, such as incorporating speed workouts into your running schedule, running hills, counting your strides, and adjusting your running form. Lean forward when you run and push off firmly with each foot. Pump your arms actively and keep your elbows bent at a 90-degree angle. Try to run every day, and gradually increase the distance you run for long-distance runs. Make sure you rest at least one day per week to allow your body to recover. Avoid running with excess gear that could slow you down." 5 | # nltk.download('punkt') 6 | # sentences = nltk.sent_tokenize(text) 7 | # for sentence in sentences: 8 | # print(sentence) 9 | -------------------------------------------------------------------------------- /playground/test_pyterrier.py: -------------------------------------------------------------------------------- 1 | import pyterrier as pt 2 | import pandas as pd 3 | import os 4 | 5 | 6 | def pd_indexer(): 7 | df = pd.DataFrame({ 8 | 'docno': 9 | ['1', '2', '3'], 10 | 'url': 11 | ['url1', 'url2', 'url3'], 12 | 'text': 13 | ['He ran out of money, so he had to stop playing', 14 | 'The waves were crashing on the shore; it was a', 15 | 'The body may perhaps compensates for the loss'] 16 | }) 17 | files = pt.io.find_files("./var/files") 18 | 19 | indexref_file = pt.FilesIndexer("./file_index", overwrite=True).index(files) 20 | 21 | # pd_indexer = pt.DFIndexer("./var") 22 | # indexref2 = pd_indexer.index(df["text"], df["docno"]) 23 | index = pt.IndexFactory.of(indexref_file) 24 | print(index.getCollectionStatistics().toString()) 25 | # pt.BatchRetrieve(indexref2).search("waves") 26 | return 27 | 28 | 29 | if __name__ == '__main__': 30 | 31 | if not pt.started(): 32 | pt.init() 33 | # dataset = pt.datasets.get_dataset("vaswani") 34 | # index_path = "./index" 35 | print(os.getcwd()) 36 | 37 | # files = pt.io.find_files("./var/files") 38 | files = pt.io.find_files(os.path.join(os.getcwd(), "var/files")) 39 | indexref_file = pt.FilesIndexer(os.path.join(os.getcwd(), "var/file_index"), overwrite=True).index(files) 40 | # indexref_file = pt.IndexFactory.of(os.path.join(os.getcwd(), "var/file_index/data.properties")) 41 | print(type(indexref_file)) 42 | result_df = pt.BatchRetrieve(indexref_file).search("NSSO") # Can feed both jnius.reflect.org.terrier.querying.IndexRef / jnius.reflect.org.terrier.querying.Index 43 | print(result_df) 44 | 45 | # index = pt.IndexFactory.of(indexref_file) 46 | # print(index.getCollectionStatistics().toString()) 47 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 
1 | ## python 3.10.8
2 | # basic
3 | openai==0.27.0
4 | pandas==1.5.3
5 | PyYAML==6.0
6 | 
7 | # frontend
8 | Flask==2.2.3
9 | Werkzeug==2.2.2
10 | requests==2.28.2
11 | gunicorn==20.1.0
12 | 
13 | # embedding
14 | tiktoken==0.3.2
15 | matplotlib==3.7.1
16 | plotly==5.13.1
17 | scipy==1.10.1
18 | scikit-learn==1.2.1
19 | 
20 | # doc extraction
21 | python-docx==0.8.11
22 | python-pptx==0.6.21
23 | 
24 | # html extraction
25 | beautifulsoup4==4.11.2
26 | trafilatura==1.4.1
27 | 
28 | # misc
29 | psutil==5.9.4
--------------------------------------------------------------------------------
/runtime.txt:
--------------------------------------------------------------------------------
1 | python-3.10.8
--------------------------------------------------------------------------------
/src/BingService.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import concurrent.futures
4 | import pandas as pd
5 | import requests
6 | import yaml
7 | 
8 | from Util import setup_logger, get_project_root, storage_cached
9 | from text_extract.html.beautiful_soup import BeautifulSoupSvc
10 | from text_extract.html.trafilatura import TrafilaturaSvc
11 | 
12 | logger = setup_logger('BingService')
13 | 
14 | 
15 | class BingService:
16 |     def __init__(self, config):
17 |         self.config = config
18 |         extract_svc = self.config.get('source_service').get('bing_search').get('text_extract')
19 |         if extract_svc == 'trafilatura':
20 |             self.txt_extract_svc = TrafilaturaSvc()
21 |         elif extract_svc == 'beautifulsoup':
22 |             self.txt_extract_svc = BeautifulSoupSvc()
23 | 
24 |     @storage_cached('bing_search_website', 'search_text')
25 |     def call_bing_search_api(self, search_text: str) -> pd.DataFrame:
26 |         logger.info("BingService.call_bing_search_api. query: " + search_text)
27 |         subscription_key = self.config.get('source_service').get('bing_search').get('subscription_key')
28 |         endpoint = self.config.get('source_service').get('bing_search').get('end_point') + "/v7.0/search"
29 |         mkt = self.config.get('general').get('language')
30 |         params = {'q': search_text, 'mkt': mkt}
31 |         headers = {'Ocp-Apim-Subscription-Key': subscription_key}
32 | 
33 |         try:
34 |             response = requests.get(endpoint, headers=headers, params=params)
35 |             response.raise_for_status()
36 | 
37 |             columns = ['name', 'url', 'snippet']
38 |             if response.json().get('webPages'):
39 |                 website_df = pd.DataFrame(response.json()['webPages']['value'])[columns]
40 |                 website_df['url_id'] = website_df.index + 1
41 |                 website_df = website_df[:self.config.get('source_service').get('bing_search').get('result_count')]
42 |             else:
43 |                 website_df = pd.DataFrame(columns=columns + ['url_id'])
44 |         except Exception as ex:
45 |             raise ex
46 |         return website_df
47 | 
48 |     def call_urls_and_extract_sentences(self, website_df) -> pd.DataFrame:
49 |         """
50 |         :param:
51 |             website_df: one row = one website with url
52 |                 name: website title name
53 |                 url: url
54 |                 snippet: snippet of the website given by BingAPI
55 |         :return:
56 |             text_df: one row = one website sentence
57 |                 columns:
58 |                 name: website title name
59 |                 url: url
60 |                 snippet: snippet of the website given by BingAPI
61 |                 text: sentences extracted from the website
62 |         """
63 |         logger.info(f"BingService.call_urls_and_extract_sentences. 
website_df.shape: {website_df.shape}")
64 |         name_list, url_list, url_id_list, snippet_list, text_list = [], [], [], [], []
65 |         for index, row in website_df.iterrows():
66 |             logger.info(f"Processing url: {row['url']}")
67 |             sentences = self.extract_sentences_from_url(row['url'])
68 |             for text in sentences:
69 |                 word_count = len(re.findall(r'\w+', text))  # approximate number of words
70 |                 if word_count < 8:
71 |                     continue
72 |                 name_list.append(row['name'])
73 |                 url_list.append(row['url'])
74 |                 url_id_list.append(row['url_id'])
75 |                 snippet_list.append(row['snippet'])
76 |                 text_list.append(text)
77 |         text_df = pd.DataFrame(data=zip(name_list, url_list, url_id_list, snippet_list, text_list),
78 |                                columns=['name', 'url', 'url_id', 'snippet', 'text'])
79 |         return text_df
80 | 
81 |     def call_one_url(self, website_tuple):
82 |         name, url, snippet, url_id = website_tuple
83 |         logger.info(f"Processing url: {url}")
84 |         sentences = self.extract_sentences_from_url(url)
85 |         logger.info(f"  receive sentences: {len(sentences)}")
86 |         return sentences, name, url, url_id, snippet
87 | 
88 |     @storage_cached('bing_search_website_content', 'website_df')
89 |     def call_urls_and_extract_sentences_concurrent(self, website_df):
90 |         logger.info(f"BingService.call_urls_and_extract_sentences_concurrent. website_df.shape: {website_df.shape}")
91 |         with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
92 |             results = list(executor.map(self.call_one_url, website_df.itertuples(index=False)))
93 |         name_list, url_list, url_id_list, snippet_list, text_list = [], [], [], [], []
94 |         for result in results:
95 |             sentences, name, url, url_id, snippet = result
96 |             sentences = sentences[:self.config['source_service']['bing_search']['sentence_count_per_site']]  # keep top N only for stability
97 |             for text in sentences:
98 |                 word_count = len(re.findall(r'\w+', text))  # approximate number of words
99 |                 if word_count < 8:
100 |                     continue
101 |                 name_list.append(name)
102 |                 url_list.append(url)
103 |                 url_id_list.append(url_id)
104 |                 snippet_list.append(snippet)
105 |                 text_list.append(text)
106 |         text_df = pd.DataFrame(data=zip(name_list, url_list, url_id_list, snippet_list, text_list),
107 |                                columns=['name', 'url', 'url_id', 'snippet', 'text'])
108 |         return text_df
109 | 
110 |     def extract_sentences_from_url(self, url):
111 |         # Fetch the HTML content of the page
112 |         try:
113 |             response = requests.get(url, timeout=3)
114 |         except Exception:
115 |             logger.error(f"Failed to fetch url: {url}")
116 |             return []
117 |         html_content = response.text
118 | 
119 |         # Use the configured text extractor to parse the HTML and extract the text
120 |         extract_text = self.txt_extract_svc.extract_from_html(html_content)
121 |         return extract_text
122 | 
123 | 
124 | if __name__ == '__main__':
125 |     # Load config
126 |     with open(os.path.join(get_project_root(), 'src/config/config.yaml'), encoding='utf-8') as f:
127 |         config = yaml.load(f, Loader=yaml.FullLoader)
128 |     service = BingService(config)
129 |     website_df = service.call_bing_search_api('What is ChatGPT')
130 |     print("===========Website df:============")
131 |     print(website_df)
132 |     # text_df = service.call_urls_and_extract_sentences(website_df)
133 |     text_df = service.call_urls_and_extract_sentences_concurrent(website_df)
134 |     print("===========text df:============")
135 |     print(text_df)
--------------------------------------------------------------------------------
/src/FrontendService.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | from 
urllib.parse import urlparse 4 | 5 | import yaml 6 | 7 | from NLPUtil import split_with_delimiters, get_longest_common_word_sequences 8 | from Util import setup_logger, get_project_root 9 | 10 | logger = setup_logger('FootnoteService') 11 | 12 | 13 | class FrontendService: 14 | def __init__(self, config, response_text, gpt_input_text_df): 15 | self.config = config 16 | self.response_text = response_text 17 | used_columns = ['docno', 'name', 'url', 'url_id', 'text', 'len_text', 'in_scope'] # TODO: add url_id 18 | self.gpt_input_text_df = gpt_input_text_df[used_columns] 19 | 20 | @staticmethod 21 | def get_prompt_examples_json(): 22 | with open(os.path.join(get_project_root(), 'src/config/config.yaml'), encoding='utf-8') as f: 23 | config = yaml.load(f, Loader=yaml.FullLoader) 24 | col1_list = config['frontend_service']['prompt_examples']['col1_list'] 25 | col2_list = config['frontend_service']['prompt_examples']['col2_list'] 26 | prompt_examples_json = { 27 | 'col1_list': col1_list, 28 | 'col2_list': col2_list, 29 | } 30 | return prompt_examples_json 31 | 32 | def get_data_json(self, response_text, gpt_input_text_df): 33 | def create_response_json_object(text, type): 34 | return {"text": text, "type": type} 35 | 36 | def create_source_json_object(footnote, domain, url, title, text): 37 | return {"footnote": footnote, "domain": domain, "url": url, "title": title, "text": text} 38 | 39 | def reorder_url_id(response_text, gpt_input_text_df): 40 | # response_text: find reference in text & re-order 41 | url_id_list = [int(x) for x in dict.fromkeys(re.findall(r'\[([0-9]+)\]', response_text))] 42 | url_id_map = dict(zip(url_id_list, range(1, len(url_id_list) + 1))) 43 | 44 | response_text = re.sub(r'\[([0-9]+)\]', lambda x: f"[{url_id_map[int(x.group(1))]}]", response_text) 45 | # for multiple references in same sentence, sort as per url_id 46 | refs = set(re.findall(r'(\[[0-9\]\[]+\])', response_text)) 47 | for ref in refs: 48 | response_text = response_text.replace(ref, '[' + ']['.join(sorted(re.findall(r'\[([0-9]+)\]', ref))) + ']') 49 | 50 | # gpt_input_text_df: find reference in text & re-order 51 | in_scope_source_df = gpt_input_text_df[gpt_input_text_df['url_id'].isin(url_id_map.keys()) & gpt_input_text_df['in_scope']].copy() 52 | in_scope_source_df['url_id'] = in_scope_source_df['url_id'].map(url_id_map) 53 | return response_text, in_scope_source_df 54 | 55 | def get_response_json(response_text): 56 | def create_response_json_object(text, type): 57 | return {"text": text, "type": type} 58 | 59 | response_json = [] 60 | split_sentence = re.findall(r'\[[0-9]+\]|[^\[\]]+', response_text) 61 | 62 | components = [] 63 | for component in split_sentence: 64 | components.extend(split_with_delimiters(component, ['\n'])) 65 | for sentence in components: 66 | if sentence.startswith('[') and sentence.endswith(']'): 67 | response_json.append(create_response_json_object(sentence, "footnote")) 68 | elif sentence == '\n': 69 | response_json.append(create_response_json_object(sentence, "newline")) 70 | else: 71 | response_json.append(create_response_json_object(sentence, "response")) 72 | return response_json 73 | 74 | def get_source_json(in_scope_source_df): 75 | in_scope_source_df.loc[:, 'docno'] = in_scope_source_df['docno'].astype(int) 76 | in_scope_source_df.sort_values('docno', inplace=True) 77 | source_text_list = [] 78 | source_json = [] 79 | source_url_df = in_scope_source_df[['url_id', 'url', 'name', 'snippet']].drop_duplicates().sort_values('url_id').reset_index(drop=True) 80 | for index, row in 
source_url_df.iterrows(): 81 | url_text = '' 82 | url_text += f"[{row['url_id']}] {row['url']}\n" 83 | 84 | for index, row in in_scope_source_df[in_scope_source_df['url_id'] == row['url_id']].iterrows(): 85 | url_text += f" {row['text']}\n" 86 | 87 | source_text_list.append(url_text) 88 | 89 | domain_name = urlparse(row['url']).netloc.replace('www.', '') 90 | source_json.append(create_source_json_object(f"[{row['url_id']}]", domain_name, row['url'], row['name'], row['snippet'])) 91 | source_text = ''.join(sorted(source_text_list)) 92 | 93 | source_json = sorted(source_json, key=lambda x: x['footnote']) 94 | return source_json, source_text 95 | 96 | def get_explainability_json(response_text, source_text): 97 | def get_colors(): 98 | return ['#ffe3e8', '#f1e1ff', '#c5d5ff', '#c5efff', '#d6fffa', '#e7ffe7', '#f7ffa7', '#fff3b3', '#ffdfdf', '#ffcaca'] 99 | 100 | def create_response_json_object(text, type, color): 101 | return {"text": text, "type": type, "color": color} 102 | 103 | def get_explain_json(text, word_color_dict): 104 | common_word_sequences = list(word_color_dict.keys()) 105 | word_list = split_with_delimiters(text.lower(), common_word_sequences + ['\n']) 106 | explain_json = [] 107 | for word in word_list: 108 | if word == '\n': 109 | explain_json.append(create_response_json_object(word, "newline", "")) 110 | elif word.lower() in common_word_sequences: 111 | explain_json.append(create_response_json_object(word, "keyword", word_color_dict[word.lower()])) 112 | else: 113 | explain_json.append(create_response_json_object(word, "word", "")) 114 | return explain_json 115 | 116 | longest_common_word_sequences = get_longest_common_word_sequences(response_text, source_text, k=10) 117 | word_color_dict = {longest_common_word_sequences[i]: get_colors()[i] for i in range(min(len(longest_common_word_sequences), len(get_colors())))} 118 | 119 | response_explain_json = get_explain_json(response_text, word_color_dict) 120 | source_explain_json = get_explain_json(source_text, word_color_dict) 121 | return response_explain_json, source_explain_json 122 | 123 | response_text, in_scope_source_df = reorder_url_id(response_text, gpt_input_text_df) 124 | response_json = get_response_json(response_text) 125 | source_json, source_text = get_source_json(in_scope_source_df) 126 | response_explain_json, source_explain_json = get_explainability_json(response_text, source_text) 127 | prompt_examples_json = FrontendService.get_prompt_examples_json() 128 | 129 | return source_text, {'response_json': response_json, 130 | 'source_json': source_json, 131 | 'response_explain_json': response_explain_json, 132 | 'source_explain_json': source_explain_json, 133 | 'prompt_examples_json': prompt_examples_json, 134 | } 135 | 136 | 137 | if __name__ == '__main__': 138 | # str_list = ['Alpaca lora', 139 | # 'what is new for gpt4?', 140 | # 'Why Llama LLM model is so popular?', 141 | # 'Why did SVB collapsed?', 142 | # 'End of FTX', 143 | # 'digital twin有哪些用处', 144 | # '아가동산사건의 문제가 뭐야', 145 | # 'Hoe maak ik pasta', 146 | # '日本国憲法は誰が作ったのか?', 147 | # "Comment gagner de l'argent"] 148 | # for s in str_list: 149 | # print(path_safe_string_conversion(s)) 150 | 151 | import os 152 | import pickle 153 | 154 | folder_path = r"" 155 | pickle_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path)] 156 | for file_path in pickle_files: 157 | if not '8d' in file_path: 158 | continue 159 | with open(file_path, "rb") as f: 160 | obj = pickle.load(f) 161 | print(file_path) 162 | print(obj['result'][0]) 163 | 
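# A standalone illustration of the renumbering idea in reorder_url_id() above
# (sketch only, not part of the service): citations are renumbered by order of
# first appearance, so "[3] ... [1] ... [3]" becomes "[1] ... [2] ... [1]".
#
#     import re
#     text = "Cats purr [3]. Dogs bark [1]. Cats also meow [3]."
#     url_id_list = [int(x) for x in dict.fromkeys(re.findall(r'\[([0-9]+)\]', text))]
#     url_id_map = dict(zip(url_id_list, range(1, len(url_id_list) + 1)))  # {3: 1, 1: 2}
#     print(re.sub(r'\[([0-9]+)\]', lambda m: f"[{url_id_map[int(m.group(1))]}]", text))
#     # -> Cats purr [1]. Dogs bark [2]. Cats also meow [1].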
-------------------------------------------------------------------------------- /src/LLMService.py: -------------------------------------------------------------------------------- 1 | import os 2 | from abc import ABC, abstractmethod 3 | from urllib.parse import urlparse 4 | 5 | import openai 6 | import pandas as pd 7 | import yaml 8 | 9 | from Util import setup_logger, get_project_root, storage_cached 10 | from website.sender import Sender, MSG_TYPE_SEARCH_STEP, MSG_TYPE_OPEN_AI_STREAM 11 | 12 | logger = setup_logger('LLMService') 13 | 14 | 15 | class LLMService(ABC): 16 | def __init__(self, config): 17 | self.config = config 18 | 19 | def clean_response_text(self, response_text: str): 20 | return response_text.replace("\n", "") 21 | 22 | def get_prompt(self, search_text: str, gpt_input_text_df: pd.DataFrame): 23 | logger.info(f"OpenAIService.get_prompt. search_text: {search_text}, gpt_input_text_df.shape: {gpt_input_text_df.shape}") 24 | prompt_length_limit = 3000 # obsolete 25 | is_use_source = self.config.get('source_service').get('is_use_source') 26 | if is_use_source: 27 | prompt_engineering = f"\n\nAnswer the question '{search_text}' using above information with about 100 words:" 28 | prompt = "" 29 | for index, row in gpt_input_text_df.iterrows(): 30 | prompt += f"""{row['text']}\n""" 31 | # limit the prompt length 32 | prompt = prompt[:prompt_length_limit] 33 | return prompt + prompt_engineering 34 | else: 35 | return f"\n\nAnswer the question '{search_text}' with about 100 words:" 36 | 37 | def get_prompt_v2(self, search_text: str, gpt_input_text_df: pd.DataFrame): 38 | logger.info(f"OpenAIService.get_prompt_v2. search_text: {search_text}, gpt_input_text_df.shape: {gpt_input_text_df.shape}") 39 | context_str = "" 40 | gpt_input_text_df = gpt_input_text_df.sort_values('url_id') 41 | url_id_list = gpt_input_text_df['url_id'].unique() 42 | for url_id in url_id_list: 43 | context_str += f"Source ({url_id})\n" 44 | for index, row in gpt_input_text_df[gpt_input_text_df['url_id'] == url_id].iterrows(): 45 | context_str += f"{row['text']}\n" 46 | context_str += "\n" 47 | prompt_length_limit = 3000 # obsolete 48 | context_str = context_str[:prompt_length_limit] 49 | prompt = \ 50 | f""" 51 | Answer with 100 words for the question below based on the provided sources using a scientific tone. 52 | If the context is insufficient, reply "I cannot answer". 53 | Use Markdown for formatting code or text. 54 | Source: 55 | {context_str} 56 | Question: {search_text} 57 | Answer: 58 | """ 59 | return prompt 60 | 61 | def get_prompt_v3(self, search_text: str, gpt_input_text_df: pd.DataFrame): 62 | language = self.config.get('general').get('language') 63 | if not self.config.get('source_service').get('is_use_source'): 64 | prompt = \ 65 | f""" 66 | Instructions: Write a comprehensive reply to the given query. 67 | If the context is insufficient, reply "I cannot answer". 68 | Query: {search_text} 69 | """ 70 | return prompt 71 | 72 | logger.info(f"OpenAIService.get_prompt_v3. 
search_text: {search_text}, gpt_input_text_df.shape: {gpt_input_text_df.shape}") 73 | context_str = "" 74 | for _, row_url in gpt_input_text_df[['url_id', 'url']].drop_duplicates().iterrows(): 75 | domain = urlparse(row_url['url']).netloc.replace('www.', '') 76 | context_str += f"Source [{row_url['url_id']}] {domain}\n" 77 | for index, row in gpt_input_text_df[(gpt_input_text_df['url_id'] == row_url['url_id']) & gpt_input_text_df['in_scope']].iterrows(): 78 | context_str += f"{row['text']}\n" 79 | context_str += "\n\n" 80 | prompt_length_limit = self.config.get('llm_service').get('openai_api').get('prompt').get('prompt_length_limit') 81 | context_str = context_str[:prompt_length_limit] 82 | prompt = \ 83 | f""" 84 | Web search result: 85 | {context_str} 86 | 87 | Instructions: Using the provided web search results, write a comprehensive reply to the given query. 88 | Make sure to cite results using [number] notation after the reference. 89 | If the provided search results refer to multiple subjects with the same name, write separate answers for each subject. 90 | Answer in language: {language} 91 | If the context is insufficient, reply "I cannot answer because my reference sources don't have related info" in language {language}. 92 | Query: {search_text} 93 | """ 94 | return prompt 95 | 96 | @abstractmethod 97 | def call_api(self, prompt): 98 | pass 99 | 100 | 101 | class OpenAIService(LLMService): 102 | def __init__(self, config, sender: Sender = None): 103 | super().__init__(config) 104 | self.sender = sender 105 | open_api_key = config.get('llm_service').get('openai_api').get('api_key') 106 | if open_api_key is None: 107 | raise Exception("OpenAI API key is not set.") 108 | openai.api_key = open_api_key 109 | 110 | @storage_cached('openai', 'prompt') 111 | def call_api(self, prompt: str): 112 | if self.sender is not None: 113 | self.sender.send_message(msg_type=MSG_TYPE_SEARCH_STEP, msg='Calling OpenAI API ...') 114 | 115 | openai_api_config = self.config.get('llm_service').get('openai_api') 116 | model = openai_api_config.get('model') 117 | is_stream = openai_api_config.get('stream') 118 | logger.info(f"OpenAIService.call_api. 
model: {model}, len(prompt): {len(prompt)}") 119 | 120 | if model in ['gpt-3.5-turbo', 'gpt-4']: 121 | try: 122 | response = openai.ChatCompletion.create( 123 | model=model, 124 | messages=[ 125 | {"role": "system", "content": "You are a helpful search engine."}, 126 | {"role": "user", "content": prompt} 127 | ], 128 | stream=is_stream 129 | ) 130 | except Exception as ex: 131 | raise ex 132 | 133 | if is_stream: 134 | collected_messages = [] 135 | # iterate through the stream of events 136 | for chunk in response: 137 | chunk_message = chunk['choices'][0]['delta'].get("content", None) # extract the message 138 | if chunk_message is not None: 139 | if self.sender is not None: 140 | self.sender.send_message(msg_type=MSG_TYPE_OPEN_AI_STREAM, msg=chunk_message) 141 | collected_messages.append(chunk_message) # save the message 142 | 143 | full_reply_content = ''.join(collected_messages) 144 | return full_reply_content 145 | else: 146 | return response.choices[0].message.content 147 | else: 148 | try: 149 | response = openai.Completion.create( 150 | model=model, 151 | prompt=prompt, 152 | max_tokens=openai_api_config.get('max_tokens'), 153 | temperature=openai_api_config.get('temperature'), 154 | top_p=openai_api_config.get('top_p'), 155 | ) 156 | except Exception as ex: 157 | raise ex 158 | return self.clean_response_text(response.choices[0].text) 159 | 160 | 161 | class GooseAIService(LLMService): 162 | def __init__(self, config, sender: Sender = None): 163 | super().__init__(config) 164 | self.sender = sender 165 | goose_api_key = config.get('goose_ai_api').get('api_key') 166 | if goose_api_key is None: 167 | raise Exception("Goose API key is not set.") 168 | openai.api_key = goose_api_key 169 | openai.api_base = config.get('goose_ai_api').get('api_base') 170 | 171 | @storage_cached('gooseai', 'prompt') 172 | def call_api(self, prompt: str, sender: Sender = None): 173 | if self.sender is not None: 174 | self.sender.send_message(msg_type=MSG_TYPE_SEARCH_STEP, msg='Calling gooseAI API ...') 175 | logger.info(f"GooseAIService.call_openai_api. 
len(prompt): {len(prompt)}") 176 | goose_api_config = self.config.get('goose_ai_api') 177 | try: 178 | response = openai.Completion.create( 179 | engine=goose_api_config.get('model'), 180 | prompt=prompt, 181 | max_tokens=goose_api_config.get('max_tokens'), 182 | # stream=True 183 | ) 184 | except Exception as ex: 185 | raise ex 186 | return self.clean_response_text(response.choices[0].text) 187 | 188 | 189 | class LLMServiceFactory: 190 | @staticmethod 191 | def create_llm_service(config, sender: Sender = None) -> LLMService: 192 | provider = config.get('llm_service').get('provider') 193 | if provider == 'openai': 194 | return OpenAIService(config, sender) 195 | elif provider == 'goose_ai': 196 | return GooseAIService(config, sender) 197 | else: 198 | logger.error(f'LLM Service for {provider} is not yet implemented.') 199 | raise NotImplementedError(f'LLM Service - {provider} - not is supported') 200 | 201 | 202 | if __name__ == '__main__': 203 | # Load config 204 | with open(os.path.join(get_project_root(), 'src/config/config.yaml'), encoding='utf-8') as f: 205 | config = yaml.load(f, Loader=yaml.FullLoader) 206 | service_factory = LLMServiceFactory() 207 | service = service_factory.create_llm_service(config) 208 | prompt = """ 209 | """ 210 | # response_text = service.call_openai_api('What is ChatGPT') 211 | response_text = service.call_api(prompt) 212 | print(response_text) 213 | -------------------------------------------------------------------------------- /src/NLPUtil.py: -------------------------------------------------------------------------------- 1 | import tiktoken 2 | 3 | 4 | def remove_substrings(strings): 5 | """ 6 | Remove any string that is a substring of another string 7 | Input ["abc", "ab", "c"] 8 | Output ["abc", "c"] 9 | """ 10 | # Sort the strings by length in descending order 11 | strings_sorted = sorted(strings, key=len, reverse=False) 12 | 13 | # Remove any string that is a substring of another string 14 | result = [] 15 | for i in range(len(strings_sorted)): 16 | is_substring = False 17 | for j in range(i + 1, len(strings_sorted)): 18 | if strings_sorted[i] in strings_sorted[j]: 19 | is_substring = True 20 | break 21 | if not is_substring: 22 | result.append(strings_sorted[i]) 23 | 24 | return result 25 | 26 | 27 | def get_longest_common_word_sequences(paragraph1, paragraph2, k=10): 28 | """ 29 | Find the longest common subsequences of words between two paragraphs 30 | Input: p1: "The quick brown fox jumps over the lazy dog", p2: "The quick brown dog jumps over the lazy fox" 31 | Output: ["jumps over the lazy", "the quick brown"] 32 | """ 33 | # Tokenize the paragraphs into lists of words 34 | word_lists1 = [word.lower() for word in paragraph1.split()] 35 | word_lists2 = [word.lower() for word in paragraph2.split()] 36 | 37 | # Initialize a table to store the lengths of common subsequences 38 | table = [[0] * (len(word_lists2) + 1) for _ in range(len(word_lists1) + 1)] 39 | 40 | # Fill in the table by comparing each pair of words 41 | common_sequences = [] 42 | for i in range(1, len(word_lists1) + 1): 43 | for j in range(1, len(word_lists2) + 1): 44 | if word_lists1[i - 1] == word_lists2[j - 1]: 45 | table[i][j] = table[i - 1][j - 1] + 1 46 | sequence_len = table[i][j] 47 | # if sequence_len >= k: 48 | sequence = ' '.join(word_lists1[i - sequence_len:i]) 49 | if sequence not in common_sequences: 50 | common_sequences.append(sequence) 51 | else: 52 | table[i][j] = 0 53 | 54 | # Sort the common sequences by length in descending order and return the top k longest 
62 | def split_with_delimiters(string, delimiter_list):
63 | """
64 | Key point of this function is that it preserves the delimiters
65 | Input: ("is fine-tuned from a gpt-3.5 series", ["fine-tuned", "gpt-3.5"])
66 | Output: ['is ', 'fine-tuned', ' from a ', 'gpt-3.5', ' series']
67 | """
68 | result = []
69 | start = 0
70 | for i in range(len(string)):
71 | for delimiter in delimiter_list:
72 | delimiter_len = len(delimiter)
73 | if string[i:i + delimiter_len] == delimiter:
74 | if i > start:
75 | result.append(string[start:i])
76 | result.append(delimiter)
77 | start = i + delimiter_len
78 | break
79 | else:
80 | continue
81 | if start < len(string):
82 | result.append(string[start:])
83 | return result
84 |
85 |
86 | def num_tokens_from_string(string: str) -> int:
87 | """
88 | Returns the number of tokens in a text string.
89 | https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
90 | """
91 | encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
92 | num_tokens = len(encoding.encode(string))
93 | return num_tokens
94 |
95 |
96 | if __name__ == '__main__':
97 | paragraph1 = "ChatGPT is an AI chatbot that can understand and generate human-like answers to text prompts, as well as create code from natural speech [3]. It is built on a family of large language models collectively called GPT-3, which is trained on huge amounts of data [3][1]. The model is fine-tuned from a model in the GPT-3.5 series, which finished training in early 2022 and trained on an Azure AI supercomputing infrastructure [1]. ChatGPT is also sensitive to tweaks to the input phrasing or attempting the same prompt multiple times [1]. The objective of ChatGPT is to predict the next word in a sentence based on what it has learned [3]. The research release of ChatGPT in November 2022 is among OpenAI's iterative deployment of increasingly safe and useful AI systems [1]. ChatGPT Plus also exists, which brings a few benefits over the free tier [3]."
98 | paragraph2 = """
99 | Source (1)
100 | ChatGPT is a sibling model to InstructGPT, which is trained to follow an instruction in a prompt and provide a detailed response.
101 | - ChatGPT is sensitive to tweaks to the input phrasing or attempting the same prompt multiple times. For example, given one phrasing of a question, the model can claim to not know the answer, but given a slight rephrase, can answer correctly.
102 | ChatGPT is fine-tuned from a model in the GPT-3.5 series, which finished training in early 2022. You can learn more about the 3.5 series here. ChatGPT and GPT-3.5 were trained on an Azure AI supercomputing infrastructure.
103 | Today's research release of ChatGPT is the latest step in OpenAI iterative deployment of increasingly safe and useful AI systems. Many lessons from deployment of earlier models like GPT-3 and Codex have informed the safety mitigations in place for this release, including substantial reductions in harmful and untruthful outputs achieved by the use of reinforcement learning from human feedback (RLHF).
104 |
105 | Source (3)
106 | ChatGPT is an AI chatbot that's built on a family of large language models (LLMs) that are collectively called GPT-3.
These models can understand and generate human-like answers to text prompts, because they've been trained on huge amounts of data. 107 | But ChatGPT is also equally talented at coding and productivity tasks. For the former, its ability to create code from natural speech makes it a powerful ally for both new and experienced coders who either aren't familiar with a particular language or want to troubleshoot existing code. Unfortunately, there is also the potential for it to be misused to create malicious emails and malware. 108 | ChatGPT stands for "Chat Generative Pre-trained Transformer". Let's take a look at each of those words in turn. 109 | But the short answer? ChatGPT works thanks to a combination of deep learning algorithms, a dash of natural language processing, and a generous dollop of generative pre-training, which all combine to help it produce disarmingly human-like responses to text questions. Even if all it's ultimately been trained to do is fill in the next word, based on its experience of being the world's most voracious reader. 110 | ChatGPT has been created with one main objective to predict the next word in a sentence, based on what's typically happened in the gigabytes of text data that it's been trained on. 111 | ChatGPT was released as a "research preview" on November 30, 2022. A blog post (opens in new tab) casually introduced the AI chatbot to the world, with OpenAI stating that "we’ve trained a model called ChatGPT which interacts in a conversational way". 112 | ChatGPT Plus costs $20 p/month (around £17 / AU$30) and brings a few benefits over the free tier. It promises to give you full access to ChatGPT even during peak times, which is when you'll otherwise frequently see "ChatGPT is at capacity right now messages during down times. 
113 | ChatGPT has been trained on a vast amount of text covering a huge range of subjects, so its poss
114 | """
115 |
116 | # common_stems = FrontendService.longest_common_word_sequences(paragraph1, paragraph2)
117 | # # print(common_stems)
118 | # for common_stem in common_stems:
119 | # print(common_stem)
120 |
121 | # text_list = ["is fine-tuned from a model in the gpt-3.5 series, which finished training in early",
122 | # "sensitive to tweaks to the input phrasing or attempting the same prompt multiple",
123 | # "is fine-tuned from a model in the gpt-3.5 series, which finished training in",
124 | # "is fine-tuned from a model in the gpt-3.5 series, which finished training",
125 | # "sensitive to tweaks to the input phrasing or attempting the same prompt",
126 | # "is fine-tuned from a model in the gpt-3.5 series, which finished",
127 | # "sensitive to tweaks to the input phrasing or attempting the same",
128 | # "sensitive to tweaks to the input phrasing or attempting the",
129 | # "is fine-tuned from a model in the gpt-3.5 series, which"]
130 | # text_list = FrontendService.remove_substrings(text_list)
131 | # for text in text_list:
132 | # print(text)
133 |
134 | # response_text = "is fine-tuned from a gpt-3.5 series"
135 | # split_list = split_with_delimiters(response_text, ["fine-tuned", "gpt-3.5"])
136 | # print(split_list)
137 |
138 | s = "OpenAI 推出了一個新型聊天機器人模型ChatGPT"
139 | print(num_tokens_from_string(s))
140 |
-------------------------------------------------------------------------------- /src/SearchGPTService.py: --------------------------------------------------------------------------------
1 | import os
2 |
3 | import pandas as pd
4 | import yaml
5 |
6 | from FrontendService import FrontendService
7 | from LLMService import LLMServiceFactory
8 | from SemanticSearchService import BatchOpenAISemanticSearchService
9 | from SourceService import SourceService
10 | from Util import setup_logger, get_project_root, storage_cached
11 | from website.sender import Sender
12 |
13 | logger = setup_logger('SearchGPTService')
14 |
15 |
16 | class SearchGPTService:
17 | """
18 | SearchGPT app->service->child-service structure
19 | - (Convention: the app imports services, and child services inherit from their parent service)
20 |
21 | SearchGPT class
22 | - SourceService
23 | -- BingService
24 | -- Doc/PPT/PDF Service
25 | - SemanticSearchModule
26 | - LLMService
27 | -- OpenAIService
28 | -- GooseAIService
29 | - FrontendService
30 |
31 | """
32 |
33 | def __init__(self, ui_overriden_config=None, sender: Sender = None):
34 | with open(os.path.join(get_project_root(), 'src/config/config.yaml'), encoding='utf-8') as f:
35 | self.config = yaml.load(f, Loader=yaml.FullLoader)
36 | self.overide_config_by_query_string(ui_overriden_config)
37 | self.validate_config()
38 | self.sender = sender
39 |
40 | def overide_config_by_query_string(self, ui_overriden_config):
41 | if ui_overriden_config is None:
42 | return
43 | for key, value in ui_overriden_config.items():
44 | if value is not None and value != '':
45 | # query_string is flattened (one level) while config.yaml is nested (two+ levels)
46 | # Any better way to handle this?
47 | if key == 'bing_search_subscription_key':
48 | self.config['source_service']['bing_search']['subscription_key'] = value
49 | elif key == 'openai_api_key':
50 | self.config['llm_service']['openai_api']['api_key'] = value
51 | elif key == 'is_use_source':
52 | self.config['source_service']['is_use_source'] = False if value.lower() in ['false', '0'] else True
53 | elif key == 'llm_service_provider':
54 | self.config['llm_service']['provider'] = value
55 | elif key == 'llm_model':
56 | if self.config['llm_service']['provider'] == 'openai':
57 | self.config['llm_service']['openai_api']['model'] = value
58 | elif self.config['llm_service']['provider'] == 'goose_ai':
59 | self.config['llm_service']['goose_ai_api']['model'] = value
60 | else:
61 | raise Exception(f"llm_model is not supported for llm_service_provider: {self.config['llm_service']['provider']}")
62 | elif key == 'language':
63 | self.config['general']['language'] = value
64 | else:
65 | # invalid query_string key; ignore it for now rather than raising an exception
66 | pass
67 |
68 | def validate_config(self):
69 | if self.config['source_service']['is_enable_bing_search']:
70 | assert self.config['source_service']['bing_search']['subscription_key'], 'bing_search_subscription_key is required'
71 | if self.config['llm_service']['provider'] == 'openai':
72 | assert self.config['llm_service']['openai_api']['api_key'], 'openai_api_key is required'
73 |
74 | @storage_cached('web', 'search_text')
75 | def query_and_get_answer(self, search_text):
76 | source_module = SourceService(self.config, self.sender)
77 | bing_text_df = source_module.extract_bing_text_df(search_text)
78 | doc_text_df = source_module.extract_doc_text_df(bing_text_df)
79 | text_df = pd.concat([bing_text_df, doc_text_df], ignore_index=True)
80 |
81 | semantic_search_service = BatchOpenAISemanticSearchService(self.config, self.sender)
82 | gpt_input_text_df = semantic_search_service.search_related_source(text_df, search_text)
83 | gpt_input_text_df = BatchOpenAISemanticSearchService.post_process_gpt_input_text_df(gpt_input_text_df,
84 | self.config.get('llm_service').get('openai_api').get('prompt').get('prompt_token_limit'))
85 |
86 | llm_service = LLMServiceFactory.create_llm_service(self.config, self.sender)
87 | prompt = llm_service.get_prompt_v3(search_text, gpt_input_text_df)
88 | response_text = llm_service.call_api(prompt=prompt)
89 |
90 | frontend_service = FrontendService(self.config, response_text, gpt_input_text_df)
91 | source_text, data_json = frontend_service.get_data_json(response_text, gpt_input_text_df)
92 |
93 | print('===========Prompt:============')
94 | print(prompt)
95 | print('===========Search:============')
96 | print(search_text)
97 | print('===========Response text:============')
98 | print(response_text)
99 | print('===========Source text:============')
100 | print(source_text)
101 |
102 | return response_text, source_text, data_json
103 |
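# Illustrative usage sketch (added for exposition; not part of the original file).
# It mirrors how the web views invoke the service; assumes config.yaml already
# holds valid Bing and OpenAI keys.
if __name__ == '__main__':
    search_gpt_service = SearchGPTService()
    response_text, source_text, data_json = search_gpt_service.query_and_get_answer(search_text='What is ChatGPT')
    print(response_text)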
-------------------------------------------------------------------------------- /src/SemanticSearchService.py: --------------------------------------------------------------------------------
1 | import re
2 |
3 | import openai
4 | import pandas as pd
5 | from openai.embeddings_utils import cosine_similarity
6 |
7 | from NLPUtil import num_tokens_from_string
8 | from Util import setup_logger
9 | from website.sender import Sender, MSG_TYPE_SEARCH_STEP
10 | # from abc import ABC, abstractmethod
11 | # from langchain.embeddings import HuggingFaceEmbeddings
12 | # from langchain.vectorstores import FAISS
13 | BASE_MODEL =
"text-embedding-ada-002" # default embedding of faiss-openai 14 | logger = setup_logger('SemanticSearchService') 15 | 16 | 17 | # class SemanticSearchService(ABC): 18 | # def __init__(self, config): 19 | # self.cwd = os.getcwd() 20 | # self.config = config 21 | # self.index = None 22 | # self.provider = '' 23 | # 24 | # @abstractmethod 25 | # def index_text_df(self, text_df: pd.DataFrame, indexref_folder_name: str): 26 | # pass 27 | # 28 | # @abstractmethod 29 | # def retrieve_result_by_search_text_from_text_df(self, search_text, text_df) -> pd.DataFrame: 30 | # pass 31 | # 32 | # @staticmethod 33 | # def use_index_to_search(index, search_text): 34 | # pass 35 | # 36 | # def clean_sentence_to_avoid_lexical_error(self, text): 37 | # """ 38 | # Clean sentence. Pyterrier will throw error if the search query contains some special characters shown below 39 | # jnius.JavaException: JVM exception occurred: Failed to process qid 1 ' 40 | # ' -- Lexical error at line 3, column 90. Encountered: "\'" (39), after : "" org.terrier.querying.parser.QueryParserException 41 | # python-BaseException 42 | # :return: 43 | # """ 44 | # # TODO: good way to clean 45 | # return text.replace("'", "").replace("?", "").replace("!", "").replace(":", "").replace(";", "") 46 | # 47 | # 48 | # class PyTerrierService(SemanticSearchService): 49 | # def __init__(self, config): 50 | # super().__init__(config) 51 | # self.provider = 'pyterrier' 52 | # 53 | # def create_index_column_in_df(self, text_df: pd.DataFrame) -> pd.DataFrame: 54 | # """ 55 | # add a docno column (primary key / index column) to the dataframe 56 | # :param text_df: 57 | # :return: text_df with docno column 58 | # """ 59 | # text_df["docno"] = text_df.index + 1 60 | # text_df["docno"] = text_df["docno"].astype(str) 61 | # return text_df 62 | # 63 | # def index_text_df(self, text_df: pd.DataFrame, indexref_folder_name: str): 64 | # """ 65 | # index the text_df to get a indexref 66 | # :param text_df: 67 | # required columns: 68 | # docno: as primary key for later process to retrieve back the row 69 | # text: the text to be indexed 70 | # :return: 71 | # indexref: 72 | # """ 73 | # import pyterrier as pt 74 | # if not pt.started(): 75 | # pt.init() 76 | # datetime_str = datetime.now().strftime("%Y%m%d_%H%M%S") 77 | # df_indexer_path = os.path.join(self.cwd, f".index/{indexref_folder_name}_" + datetime_str) 78 | # if not os.path.exists(df_indexer_path): 79 | # os.makedirs(df_indexer_path) 80 | # 81 | # # TODO: using overwrite? 82 | # # Currently I cannot use overwrite=True to directly overwrite the existing index folder 83 | # # when I index for the second time, it will throw error. Therefore need to create a new folder 84 | # # I also cannot delete it in the last step, because the process is still running and consuming the index files inside. 85 | # 86 | # # TODO: using a better wmodel than Tf? 87 | # pd_indexer = pt.DFIndexer(df_indexer_path, wmodel="Tf") 88 | # indexref = pd_indexer.index(text_df["text"], text_df["docno"]) 89 | # return indexref 90 | # 91 | # @staticmethod 92 | # def use_index_to_search(index, search_text): 93 | # result_df: pd.DataFrame = pt.BatchRetrieve(index).search(search_text) 94 | # return result_df 95 | # 96 | # def retrieve_result_by_search_text_from_text_df(self, search_text, text_df): 97 | # logger.info(f"PyTerrierService.retrieve_result_by_search_text_from_text_df. 
search_text: {search_text}, text_df.shape: {text_df.shape}") 98 | # text_df = self.create_index_column_in_df(text_df) 99 | # index = self.index_text_df(text_df, 'df_index') 100 | # result_df: pd.DataFrame = self.use_index_to_search(index, search_text) 101 | # return result_df.merge(text_df, on="docno", how="left") 102 | # 103 | # 104 | # class LangChainFAISSService(SemanticSearchService): 105 | # def __init__(self, config): 106 | # super().__init__(config) 107 | # self.provider = self.config.get('semantic_search').get('provider') 108 | # self.embeddings = None 109 | # if self.provider == 'faiss-openai': 110 | # self.embeddings = OpenAIEmbeddings(openai_api_key=self.config.get('llm_service').get('openai_api').get('api_key')) 111 | # elif self.provider == 'faiss-huggingface': 112 | # self.embeddings = HuggingFaceEmbeddings() 113 | # else: 114 | # raise Exception(f"provider {self.provider} is not supported") 115 | # 116 | # def index_text_df(self, text_df: pd.DataFrame, indexref_folder_name: str): 117 | # logger.info(f"LangChainFAISSService.index_text_df. text_df.shape: {text_df.shape}") 118 | # text_df['docno'] = text_df.index.tolist() 119 | # texts, docno_list = text_df['text'].tolist(), text_df['docno'].tolist() 120 | # docno_dict = [{'docno': docno} for docno in docno_list] 121 | # faiss_index = FAISS.from_texts(texts, self.embeddings, metadatas=docno_dict) 122 | # return faiss_index 123 | # 124 | # @staticmethod 125 | # def use_index_to_search(index, search_text): 126 | # index: FAISS 127 | # # k: Number of Documents to return. Defaults to 4. 128 | # # fetch_k: Number of Documents to fetch to pass to MMR algorithm. 129 | # 130 | # # k = 15 131 | # # # Cons: you can only pick k, but you cannot filter by score 132 | # # tuples = index.similarity_search_with_score(search_text, k=k) 133 | # # docno_list = [t[0].metadata['docno'] for t in tuples] 134 | # # score_list = [t[1] for t in tuples] 135 | # # result_df = pd.DataFrame({'docno': docno_list, 'score': score_list}) 136 | # # result_df['rank'] = result_df.index 137 | # 138 | # k = 30 139 | # docs = index.max_marginal_relevance_search(search_text, k=k, fetch_k=999) 140 | # docno_list = [doc.metadata['docno'] for doc in docs] 141 | # result_df = pd.DataFrame({'docno': docno_list}) 142 | # result_df['rank'] = result_df.index 143 | # result_df['score'] = 999 144 | # 145 | # return result_df 146 | # 147 | # def retrieve_result_by_search_text_from_text_df(self, search_text, text_df): 148 | # logger.info(f"LangChainFAISSService.retrieve_result_by_search_text_from_text_df. 
search_text: {search_text}, text_df.shape: {text_df.shape}")
149 | # faiss_index = self.index_text_df(text_df, '')
150 | # result_df = self.use_index_to_search(faiss_index, search_text)
151 | # return result_df.merge(text_df, on="docno", how="left")
152 | #
153 | #
154 | # class SemanticSearchServiceFactory:
155 | # @staticmethod
156 | # def create_semantic_search_service(config) -> SemanticSearchService:
157 | # provider = config.get('semantic_search').get('provider')
158 | # if provider == 'pyterrier':
159 | # return PyTerrierService(config)
160 | # elif provider in ['faiss-openai', 'faiss-huggingface']:
161 | # return LangChainFAISSService(config)
162 | # else:
163 | # logger.error(f'SemanticSearchService for {provider} is not yet implemented.')
164 | # raise NotImplementedError(f'SemanticSearchService - {provider} - is not supported')
165 |
166 |
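# Illustrative sketch (added for exposition; not part of the original file).
# batch_call_embeddings() below slices its input list because the OpenAI
# embedding endpoint takes a list of inputs per request; these are the slice
# boundaries produced by its range(0, len(texts), chunk_size) loop.
def _demo_embedding_chunks(n_texts: int, chunk_size: int = 1000):
    # e.g. n_texts=2500 -> [(0, 1000), (1000, 2000), (2000, 2500)]
    return [(i, min(i + chunk_size, n_texts)) for i in range(0, n_texts, chunk_size)]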
167 | class BatchOpenAISemanticSearchService:
168 | def __init__(self, config, sender: Sender = None):
169 | self.config = config
170 | openai.api_key = config.get('llm_service').get('openai_api').get('api_key')
171 | self.sender = sender
172 |
173 | @staticmethod
174 | def batch_call_embeddings(texts, chunk_size=1000):
175 | texts = [text.replace("\n", " ") for text in texts]
176 | embeddings = []
177 | for i in range(0, len(texts), chunk_size):
178 | response = openai.Embedding.create(
179 | input=texts[i: i + chunk_size], engine=BASE_MODEL
180 | )
181 | embeddings += [r["embedding"] for r in response["data"]]
182 | return embeddings
183 |
184 | @staticmethod
185 | def compute_embeddings_for_text_df(text_df: pd.DataFrame):
186 | """Compute embeddings for a text_df and return the text_df with the embeddings column added."""
187 | print(f'compute_embeddings_for_text_df() len(texts): {len(text_df)}')
188 | text_df['text'] = text_df['text'].apply(lambda x: x.replace("\n", " "))
189 | text_df['embedding'] = BatchOpenAISemanticSearchService.batch_call_embeddings(text_df['text'].tolist())
190 | return text_df
191 |
192 | def search_related_source(self, text_df: pd.DataFrame, target_text, n=30):
193 | if not self.config.get('source_service').get('is_use_source'):
194 | col = ['name', 'url', 'url_id', 'snippet', 'text', 'similarities', 'rank', 'docno']
195 | return pd.DataFrame(columns=col)
196 |
197 | if self.sender is not None:
198 | self.sender.send_message(msg_type=MSG_TYPE_SEARCH_STEP, msg="Searching from extracted text")
199 | print(f'search_related_source() text: {target_text}')
200 | embedding = BatchOpenAISemanticSearchService.batch_call_embeddings([target_text])[0]
201 | text_df = BatchOpenAISemanticSearchService.compute_embeddings_for_text_df(text_df)
202 | text_df['similarities'] = text_df['embedding'].apply(lambda x: cosine_similarity(x, embedding))
203 | result_df = text_df.sort_values('similarities', ascending=False).head(n)
204 | result_df['rank'] = range(1, len(result_df) + 1)
205 | result_df['docno'] = range(1, len(result_df) + 1)
206 | return result_df
207 |
208 | @staticmethod
209 | def post_process_gpt_input_text_df(gpt_input_text_df, prompt_token_limit):
210 | # strip pre-existing [1], [2], [3]... citation markers from the source text, for response output stability
211 | gpt_input_text_df['text'] = gpt_input_text_df['text'].apply(lambda x: re.sub(r'\[[0-9]+\]', '', x))
212 | # length of char and token
213 | gpt_input_text_df['len_text'] = gpt_input_text_df['text'].apply(lambda x: len(x))
214 | gpt_input_text_df['len_token'] = gpt_input_text_df['text'].apply(lambda x: num_tokens_from_string(x))
215 |
216 | gpt_input_text_df['cumsum_len_text'] = gpt_input_text_df['len_text'].cumsum()
217 | gpt_input_text_df['cumsum_len_token'] = gpt_input_text_df['len_token'].cumsum()
218 |
219 | max_rank = gpt_input_text_df[gpt_input_text_df['cumsum_len_token'] <= prompt_token_limit]['rank'].max() + 1
220 | gpt_input_text_df['in_scope'] = gpt_input_text_df['rank'] <= max_rank  # also keep the first row that slightly exceeds prompt_token_limit
221 | # renumber url_id for the URLs that are in scope.
222 | url_id_list = gpt_input_text_df['url_id'].unique()
223 | url_id_map = dict(zip(url_id_list, range(1, len(url_id_list) + 1)))
224 | gpt_input_text_df['url_id'] = gpt_input_text_df['url_id'].map(url_id_map)
225 | return gpt_input_text_df
226 |
227 |
228 |
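# Illustrative sketch (added for exposition; not part of the original file).
# Shows how the cumulative-token cutoff in post_process_gpt_input_text_df()
# behaves: with prompt_token_limit=1500 and four 600-token rows, cumsum is
# [600, 1200, 1800, 2400]; rank 2 is the last row fully within the limit, so
# max_rank becomes 3 and ranks 1-3 are kept in scope.
def _demo_in_scope_cutoff():
    df = pd.DataFrame({'rank': [1, 2, 3, 4], 'len_token': [600, 600, 600, 600]})
    df['cumsum_len_token'] = df['len_token'].cumsum()
    max_rank = df[df['cumsum_len_token'] <= 1500]['rank'].max() + 1
    df['in_scope'] = df['rank'] <= max_rank
    return df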
-------------------------------------------------------------------------------- /src/SourceService.py: --------------------------------------------------------------------------------
1 | import glob
2 | import os
3 |
4 | import pandas as pd
5 |
6 | from BingService import BingService
7 | from Util import setup_logger
8 | from text_extract.doc import support_doc_type, doc_extract_svc_map
9 | from text_extract.doc.abc_doc_extract import AbstractDocExtractSvc
10 | from website.sender import Sender, MSG_TYPE_SEARCH_STEP
11 |
12 | logger = setup_logger('SourceModule')
13 |
14 |
15 | class SourceService:
16 | def __init__(self, config, sender: Sender = None):
17 | self.config = config
18 | self.sender = sender
19 |
20 | def extract_bing_text_df(self, search_text):
21 | # BingSearch using search_text
22 | # check if bing search result is cached and load if exists
23 | bing_text_df = None
24 | if not self.config['source_service']['is_use_source'] or not self.config['source_service']['is_enable_bing_search']:
25 | return bing_text_df
26 |
27 | bing_service = BingService(self.config)
28 | if self.sender is not None:
29 | self.sender.send_message(msg_type=MSG_TYPE_SEARCH_STEP, msg="Calling bing search API")
30 | website_df = bing_service.call_bing_search_api(search_text=search_text)
31 | if self.sender is not None:
32 | self.sender.send_message(msg_type=MSG_TYPE_SEARCH_STEP, msg="Extracting sentences from bing search result ...")
33 | bing_text_df = bing_service.call_urls_and_extract_sentences_concurrent(website_df=website_df)
34 |
35 | return bing_text_df
36 |
37 | def extract_doc_text_df(self, bing_text_df):
38 | # DocSearch using doc_search_path
39 | # bing_text_df is used for doc_id arrangement
40 | if not self.config['source_service']['is_use_source'] or not self.config['source_service']['is_enable_doc_search']:
41 | return pd.DataFrame([])
42 | if self.sender is not None:
43 | self.sender.send_message(msg_type=MSG_TYPE_SEARCH_STEP, msg="Extracting sentences from document")
44 | files_grabbed = list()
45 | for doc_type in support_doc_type:
46 | tmp_file_list = glob.glob(self.config['source_service']['doc_search_path'] + os.sep + "*." + doc_type)
47 | files_grabbed.extend({"file_path": file_path, "doc_type": doc_type} for file_path in tmp_file_list)
48 |
49 | logger.info(f"File list: {files_grabbed}")
50 | doc_sentence_list = list()
51 |
52 | start_doc_id = 1 if bing_text_df is None else bing_text_df['url_id'].max() + 1
53 | for doc_id, file in enumerate(files_grabbed, start=start_doc_id):
54 | extract_svc: AbstractDocExtractSvc = doc_extract_svc_map[file['doc_type']]
55 | sentence_list = extract_svc.extract_from_doc(file['file_path'])
56 |
57 | file_name = file['file_path'].split(os.sep)[-1]
58 | for sentence in sentence_list:
59 | doc_sentence_list.append({
60 | 'name': file_name,
61 | 'url': file['file_path'],
62 | 'url_id': doc_id,
63 | 'snippet': '',
64 | 'text': sentence
65 | })
66 | doc_text_df = pd.DataFrame(doc_sentence_list)
67 | return doc_text_df
68 |
-------------------------------------------------------------------------------- /src/Util.py: --------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import pickle
4 | import re
5 | from copy import deepcopy
6 | from functools import wraps
7 | from hashlib import md5
8 | from pathlib import Path
9 |
10 |
11 | def get_project_root() -> Path:
12 | return Path(__file__).parent.parent
13 |
14 |
15 | def setup_logger(tag):
16 | logger = logging.getLogger(tag)
17 | logger.setLevel(logging.DEBUG)
18 |
19 | handler: logging.StreamHandler = logging.StreamHandler()
20 | formatter: logging.Formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
21 | handler.setFormatter(formatter)
22 | logger.addHandler(handler)
23 | return logger
24 |
25 |
26 | def save_result_cache(path: Path, hash: str, type: str, **kwargs):
27 | cache_dir = path / type
28 | os.makedirs(cache_dir, exist_ok=True)
29 | path = Path(cache_dir, f'{hash}.pickle')
30 | with open(path, 'wb') as f:
31 | pickle.dump(kwargs, f)
32 |
33 |
34 | def load_result_from_cache(path: Path, hash: str, type: str):
35 | path = path / type / f'{hash}.pickle'
36 | with open(path, 'rb') as f:
37 | return pickle.load(f)
38 |
39 |
40 | def check_result_cache_exists(path: Path, hash: str, type: str) -> bool:
41 | path = path / type / f'{hash}.pickle'
42 | return os.path.exists(path)
43 |
44 |
45 | def check_max_number_of_cache(path: Path, type, max_number_of_cache: int = 10):
46 | path = path / type
47 | if len(os.listdir(path)) > max_number_of_cache:
48 | ctime_list = [(os.path.getctime(path / file), file) for file in os.listdir(path)]
49 | oldest_file = sorted(ctime_list)[0][1]
50 | os.remove(path / oldest_file)
51 |
52 |
53 | def split_sentences_from_paragraph(text):
54 | sentences = re.split(r"(?
0] 14 | sentence_list = list()
15 | for raw_text in raw_text_list:
16 | sentence_list.extend(split_sentences_from_paragraph(raw_text))
17 |
18 | # Remove duplicates
19 | sentence_list = list(dict.fromkeys(sentence_list))
20 |
21 | return sentence_list
22 |
23 |
24 | docx_extract_svc = DocxSvc()
25 |
-------------------------------------------------------------------------------- /src/text_extract/doc/ppt_svc.py: --------------------------------------------------------------------------------
1 | import pptx
2 | from Util import split_sentences_from_paragraph
3 |
4 | from text_extract.doc.abc_doc_extract import AbstractDocExtractSvc
5 |
6 |
7 | class PptSvc(AbstractDocExtractSvc):
8 | def __init__(self):
9 | super().__init__()
10 |
11 | def extract_from_doc(self, path: str):
12 | prs = pptx.Presentation(path)
13 | sentence_list = list()
14 | for i, slide in enumerate(prs.slides, start=1):
15 | for j, shape in enumerate(slide.shapes, start=1):
16 | if hasattr(shape, "text"):
17 | sentence_list.extend(split_sentences_from_paragraph(shape.text))
18 |
19 | # Remove duplicates
20 | sentence_list = list(dict.fromkeys(sentence_list))
21 |
22 | return sentence_list
23 |
24 |
25 | ppt_extract_svc = PptSvc()
26 |
-------------------------------------------------------------------------------- /src/text_extract/html/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelthwan/searchGPT/f0adbd2eb5456c560a15883cd9befc82d385f3be/src/text_extract/html/__init__.py -------------------------------------------------------------------------------- /src/text_extract/html/abc_html_extract.py: --------------------------------------------------------------------------------
1 | import abc
2 |
3 |
4 | class AbstractHtmlExtractSvc(abc.ABC):
5 | def __init__(self):
6 | pass
7 |
8 | @abc.abstractmethod
9 | def extract_from_html(self, text: str):
10 | pass
11 |
-------------------------------------------------------------------------------- /src/text_extract/html/beautiful_soup.py: --------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 |
3 | from text_extract.html.abc_html_extract import AbstractHtmlExtractSvc
4 |
5 |
6 | class BeautifulSoupSvc(AbstractHtmlExtractSvc):
7 | def __init__(self):
8 | super().__init__()
9 |
10 | def extract_from_html(self, html_str: str):
11 | soup = BeautifulSoup(html_str, "html.parser")
12 | return [el.get_text() for el in soup.select('p')]
13 |
-------------------------------------------------------------------------------- /src/text_extract/html/trafilatura.py: --------------------------------------------------------------------------------
1 | from trafilatura import bare_extraction
2 | from trafilatura.meta import reset_caches
3 |
4 | from text_extract.html.abc_html_extract import AbstractHtmlExtractSvc
5 |
6 |
7 | class TrafilaturaSvc(AbstractHtmlExtractSvc):
8 | def __init__(self):
9 | super().__init__()
10 |
11 | def extract_from_html(self, html_str: str):
12 | extract = bare_extraction(html_str, favor_precision=True)
13 | # reset_caches()
14 | try:
15 | return extract['text'].split("\n")
16 | except Exception:  # bare_extraction may return None or a dict without 'text'
17 | return []
18 |
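# Illustrative sketch (added for exposition; not part of the original file).
# Both HTML extractors implement the same AbstractHtmlExtractSvc interface,
# so callers can fall back from the precise extractor to the simpler one.
def _demo_extract_with_fallback(html_str: str):
    from text_extract.html.beautiful_soup import BeautifulSoupSvc
    return TrafilaturaSvc().extract_from_html(html_str) or BeautifulSoupSvc().extract_from_html(html_str)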
-------------------------------------------------------------------------------- /src/website/__init__.py: --------------------------------------------------------------------------------
1 | from flask import Flask
2 |
3 |
4 | def create_app():
5 | app = Flask(__name__)
6 | app.config['SECRET_KEY'] = 'secret_key_xyz'
7 |
8 | from .views import views
9 | app.register_blueprint(views, url_prefix='/')
10 |
11 | return app
12 |
-------------------------------------------------------------------------------- /src/website/sender.py: --------------------------------------------------------------------------------
1 | from flask import render_template
2 |
3 | MSG_TYPE_SEARCH_STEP = 'search-step'
4 | MSG_TYPE_OPEN_AI_STREAM = 'openai-stream'
5 |
6 | # module-level store for per-request progress, polled by the client (a simple stand-in for a websocket)
7 | exporting_progress = {}
8 |
9 |
10 | class Sender:
11 | def __init__(self, request_id: str):
12 | self.request_id = request_id
13 | self.received_step_events = []
14 | self.openai_stream = ''
15 | self.search_result_step_html = ''
16 |
17 | def send_message(self, msg_type, msg: str):
18 | if msg_type == MSG_TYPE_SEARCH_STEP:
19 | self.received_step_events.append(msg)
20 | self.search_result_step_html = render_template('search_result_step.html',
21 | search_result_step_json=[{'msg': received_msg} for received_msg in self.received_step_events])
22 | elif msg_type == MSG_TYPE_OPEN_AI_STREAM:
23 | self.openai_stream += msg
24 | else:
25 | pass
26 | global exporting_progress
27 | exporting_progress[self.request_id] = {'html': self.search_result_step_html,
28 | 'openai_stream': self.openai_stream}
29 |
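# Illustrative sketch (added for exposition; not part of the original file).
# The consumer side of the polling loop: the /progress endpoint in views.py
# simply reads back what send_message() wrote for a given request_id.
def _demo_read_progress(request_id: str) -> dict:
    return exporting_progress.get(request_id, {})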
-------------------------------------------------------------------------------- /src/website/static/index.js: --------------------------------------------------------------------------------
1 | $(document).ready(function () {
2 | let refresh_progress = function () {
3 | let status = $('#status').val()
4 | if (status === 'done' || status === 'error') {
5 | return;
6 | }
7 | $.get("/progress",
8 | {request_id: $('#request_id').val()},
9 | function (data, status) {
10 | if (status === 'success') {
11 | $('#search-result-step').html(data.html);
12 | $('#result-text')[0].innerText = data.openai_stream;
13 | }
14 | }
15 | );
16 | }
17 |
18 | let submit_search = function (is_poll, event) {
19 | if (event) {
20 | event.preventDefault();
21 | }
22 | let search_text = $('#form1').val();
23 | $('#search-btn')[0].disabled = true;
24 | $('#status').val('processing');
25 | $('#search-result-spinner').addClass('d-flex');
26 | // $('#search-results').hide();
27 | $('#search_text')[0].innerText = search_text;
28 | $('#search_result_sources')[0].innerText = '';
29 | $('#explain_results').hide();
30 | $.ajax({
31 | url: '/search',
32 | type: 'POST',
33 | data: {
34 | q: search_text,
35 | request_id: $('#request_id').val(),
36 | bing_search_subscription_key: $('#bing_search_subscription_key').val(),
37 | openai_api_key: $('#openai_api_key').val(),
38 | is_use_source: $('input[name="is_use_source"]')[0].checked,
39 | llm_service_provider: $('#llm_service_provider').val(),
40 | llm_model: $('#llm_model').val(),
41 | language: $('#language').val()
42 | },
43 | success: function (response) {
44 | $('#' + response.id).html(response.html)
45 | $('#explain_results').html(response.explain_html)
46 | $('#request_id_status_html').html(response.request_id_status_html)
47 | $('#search-btn')[0].disabled = false;
48 | $('#search-result-spinner').removeClass('d-flex');
49 | $('#search-results').show();
50 | $('#explain_results').show();
51 | },
52 | error: function (error) {
53 | console.log(error)
54 | // 'response' is not defined in this callback; clear the explain panel instead
55 | $('#explain_results').empty();
56 | $('#search-btn')[0].disabled = false;
57 | $('#search-result-spinner').removeClass('d-flex');
58 | $('#search-results').show();
59 | $('#explain_results').show();
60 | }
61 | })
62 |
63 | // poll progress every 2 seconds, up to 15 times (~30 seconds in total)
64 | if (is_poll) {
65 | const CALL_TIMES = 15;
66 | for (let i = 1; i < CALL_TIMES + 1; i++) {
67 | setTimeout(refresh_progress, 2000 * i);
68 | }
69 | }
70 |
71 | }
72 |
73 | $('.prompt-ex-btn').click(function () {
74 | $('#form1').val($(this).text())
75 | submit_search(false, null);
76 | });
77 |
78 | $('form').submit(function (event) {
79 | submit_search(true, event);
80 | })
81 | })
-------------------------------------------------------------------------------- /src/website/templates/alert_box.html: --------------------------------------------------------------------------------
{% if error %}
[alert-box markup lost in extraction]
{% endif %}
-------------------------------------------------------------------------------- /src/website/templates/base.html: --------------------------------------------------------------------------------
[head/meta/CSS markup lost in extraction]
{% block title %}SearchGPT{% endblock %}
[navbar markup lost in extraction]
{% block content %} {% endblock %}
[script-include and footer markup lost in extraction]
-------------------------------------------------------------------------------- /src/website/templates/explain_result.html: --------------------------------------------------------------------------------
[surrounding layout markup lost in extraction; surviving template logic:]
{{search_text}}
{% for item in response_explain_json %}
{% if item['type'] == 'newline' %}
{% elif item['type'] == 'keyword' %}
{{ item['text'] }}
{% else %}
{{ item['text'] }}
{% endif %}
{% endfor %}
{% for item in source_explain_json %}
{% if item['type'] == 'newline' %}
{% elif item['type'] == 'keyword' %}
{{ item['text'] }}
{% else %}
{{ item['text'] }}
{% endif %}
{% endfor %}
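{# Illustrative note (added for exposition; the field values shown are assumptions):
   each *_explain_json list is a flat token stream, e.g.
   [{'type': 'keyword', 'text': 'ChatGPT'}, {'type': 'word', 'text': ' is '}, {'type': 'newline'}]
   where 'keyword' items are the highlighted overlap terms and 'newline' forces a break. #}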
-------------------------------------------------------------------------------- /src/website/templates/index.html: --------------------------------------------------------------------------------
{% extends "base.html" %} {% block title %}SearchGPT{% endblock %}
{% block content %}
[page layout markup lost in extraction; surviving template logic and text:]
{% include 'alert_box.html' %}
{% include 'prompt_examples.html' %}
[search input and submit-button markup lost in extraction]
{% include 'request_id_status_html.html' %}
Note:
- Calls normally take about 15 seconds to complete fully (~5 sec for Bing Search, ~10 sec for OpenAI).
- The API is sometimes unstable; if a call exceeds 30 seconds, please try again.
- Current source filtering: take the first 20 lines of each website, keep the best-matching results, and trim them to 1500 tokens.
- Non-English languages are experimental; they mostly work.
{% if request.args.get('is_use_source', 'true') != 'False' %}
[checked "use source" checkbox markup lost in extraction]
{% else %}
[unchecked "use source" checkbox markup lost in extraction]
{% endif %}
[API-key, LLM-provider, model and language form controls lost in extraction]
[spinner markup lost in extraction]
{% include 'search_result_step.html' %}
{% include 'search_result.html' %}
{% include 'explain_result.html' %}
{% endblock %}
-------------------------------------------------------------------------------- /src/website/templates/index_static.html: --------------------------------------------------------------------------------
{% extends "base.html" %} {% block title %}Home{% endblock %}
{% block content %}
[page layout markup lost in extraction; surviving static demo content:]
what is chatgpt
ChatGPT is a computer program that generates text responses in the form of a conversation. It was created by OpenAI in 2018, and it is free to use for anyone with an account on their website. [5] ChatGPT is designed to generate human-like responses, but it is not always accurate. Users can upvote or downvote the responses they receive, and OpenAI gathers data from users to further train and fine-tune ChatGPT. [4].
• [1] zdnet.com
What is ChatGPT and why does it matter? Here's everything you need to know
The launch of a paid version had been rumored for some time before the official release. In January, OpenAI announced on its Discord server that it was considering to start charging for ChatGPT with a version called ChatGPT Professional.
• [2] openai.com
ChatGPT: Optimizing Language Models for Dialogue
We are excited to introduce ChatGPT to get users' feedback and learn about its strengths and weaknesses. During the research preview, usage of ChatGPT is free. Try it now at chat.openai.com.
• [3] xxx.com
xxx title xxx
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec varius elit lectus, vitae faucibus mi egestas in. Nulla
• [4] xxx.com
xxx title xxx
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec varius elit lectus, vitae faucibus mi egestas in. Nulla
What is ChatGPT.
ChatGPT is a computer program that generates text responses in the form of a conversation. It was created by OpenAI in 2018, and it is free to use for anyone with an account on their website. [5] ChatGPT is designed to generate human-like responses, but it is not always accurate. Users can upvote or downvote the responses they receive, and OpenAI gathers data from users to further train and fine-tune ChatGPT.
{% endblock %}
-------------------------------------------------------------------------------- /src/website/templates/prompt_examples.html: --------------------------------------------------------------------------------
[accordion markup lost in extraction; surviving template logic and text:]
Prompts that SearchGPT > ChatGPT
{% for prompt in prompt_examples_json.col1_list %}
[prompt-example button markup lost in extraction]
{% endfor %}
Multi-language experiments
{% for prompt in prompt_examples_json.col2_list %}
[prompt-example button markup lost in extraction]
{% endfor %}
Users' last 5 queries (coming soon!)
-------------------------------------------------------------------------------- /src/website/templates/request_id_status_html.html: --------------------------------------------------------------------------------
[hidden request_id and status input markup lost in extraction]
-------------------------------------------------------------------------------- /src/website/templates/search_result.html: --------------------------------------------------------------------------------
[surrounding layout markup lost in extraction; surviving template logic:]
{{search_text}}
{% for item in response_json %}
{% if item['type'] == 'newline' %}
{% elif item['type'] == 'footnote' %}
{{ item['text'] }}
{% else %}
{{ item['text'] }}
{% endif %}
{% endfor %}
[source-list markup lost in extraction]
-------------------------------------------------------------------------------- /src/website/templates/search_result_step.html: --------------------------------------------------------------------------------
{% for item in search_result_step_json %}
[step-spinner markup lost in extraction]
{{ item['msg'] }}
{% endfor %}
-------------------------------------------------------------------------------- /src/website/views.py: --------------------------------------------------------------------------------
1 | import os
2 | import random
3 | import string
4 | import tracemalloc
5 |
6 | import psutil
7 | from flask import Blueprint, render_template, request
8 |
9 | from SearchGPTService import SearchGPTService
10 | from FrontendService import FrontendService
11 | from Util import setup_logger
12 | from website.sender import exporting_progress, Sender
13 |
14 | logger = setup_logger('Views')
15 | views = Blueprint('views', __name__)
16 |
17 | process = psutil.Process(os.getpid())
18 | tracemalloc.start()
19 | memory_snapshot = None
20 |
21 |
22 | @views.route('/', methods=['GET'])
23 | @views.route('/index', methods=['GET'])
24 | def start_page():
25 | request_id = ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(16))
26 |
27 | data_json = {'response_json': [], 'source_json': [], 'response_explain_json': [], 'source_explain_json': [],
28 | 'prompt_examples_json': FrontendService.get_prompt_examples_json()}
29 | return render_template("index.html",
30 | search_text="Please search for something.",
31 | response_json=data_json.get('response_json'),
32 | source_json=data_json.get('source_json'),
33 | response_explain_json=data_json.get('response_explain_json'),
34 | source_explain_json=data_json.get('source_explain_json'),
35 | prompt_examples_json=data_json.get('prompt_examples_json'),
36 | request_id=request_id, status="init",
37 | error=None
38 | )
39 |
40 |
41 | @views.route('/search', methods=['POST'])
42 | def index_page():
43 | error = None
44 | data_json = {'response_json': [], 'source_json': []}
45 | request_id = request.values.get('request_id')
46 | search_text = request.values.get('q')
47 |
48 | try:
49 | ui_overriden_config = {
50 | 'bing_search_subscription_key': request.values.get('bing_search_subscription_key'),
51 | 'openai_api_key': request.values.get('openai_api_key'),
52 | 'is_use_source': request.values.get('is_use_source'),
53 | 'llm_service_provider': request.values.get('llm_service_provider'),
54 | 'llm_model': request.values.get('llm_model'),
55 | 'language': request.values.get('language'),
56 | }
57 | logger.info(f"GET ui_overriden_config: {ui_overriden_config}")
58 |
59 | if search_text is not None:
60 | sender = Sender(request_id) if request_id is not None and request_id != "" else None
61 | search_gpt_service = SearchGPTService(ui_overriden_config, sender)
62 | _, _, data_json = search_gpt_service.query_and_get_answer(search_text=search_text)
63 | except Exception as e:
64 | error = str(e)
65 |
66 | if error is None:
67 | id = 'search-results'
68 | result_html = render_template('search_result.html',
69 | search_text=search_text,
70 | response_json=data_json.get('response_json'),
71 | source_json=data_json.get('source_json'),
72 | )
73 | explain_html = render_template('explain_result.html',
74 | search_text=search_text,
75 | response_explain_json=data_json.get('response_explain_json'),
76 | source_explain_json=data_json.get('source_explain_json'),
77 | )
78 | request_id_status_html = render_template('request_id_status_html.html', request_id=request_id, status="done")
79 | else:
80 | id = 'alert-box'
81 | result_html = render_template('alert_box.html', error=error)
82 | explain_html = render_template('explain_result.html',
83 | search_text=search_text,
84 | response_explain_json=[],
85 | source_explain_json=[],
86 | )
87 |
request_id_status_html = render_template('request_id_status_html.html', request_id=request_id, status="error") 88 | return { 89 | 'id': id, 90 | 'html': result_html, 91 | 'explain_html': explain_html, 92 | 'request_id_status_html': request_id_status_html, 93 | } 94 | 95 | 96 | @views.route('/progress') 97 | def progress(): 98 | request_id = request.values.get('request_id') 99 | request_dict = exporting_progress.get(request_id, '') 100 | return request_dict 101 | 102 | 103 | @views.route('/index_static', methods=['GET', 'POST']) 104 | def index_static_page(): 105 | return render_template("index_static.html") 106 | 107 | 108 | @views.route("/data", methods=["GET"]) 109 | def get_data(): 110 | return {'id': 1, 'test': 'test'} 111 | 112 | 113 | @views.route('/memory') 114 | def print_memory(): 115 | return {'memory': process.memory_info().rss} 116 | 117 | 118 | @views.route("/snapshot") 119 | def snap(): 120 | global memory_snapshot 121 | if not memory_snapshot: 122 | memory_snapshot = tracemalloc.take_snapshot() 123 | return "taken snapshot\n" 124 | else: 125 | lines = [] 126 | memory_snapshot_temp = tracemalloc.take_snapshot() 127 | top_stats = memory_snapshot_temp.compare_to(memory_snapshot, 'lineno') 128 | memory_snapshot = memory_snapshot_temp 129 | for stat in top_stats[:5]: 130 | lines.append(str(stat)) 131 | return "\n".join(lines) 132 | --------------------------------------------------------------------------------