├── .cache
    └── web
    │   ├── Alpaca lora.pickle
    │   ├── End of FTX.pickle
    │   ├── Hoe maak ik pasta.pickle
    │   ├── Why Llama LLM model is so popular.pickle
    │   ├── Why did SVB collapsed.pickle
    │   ├── digital twin有哪些用处.pickle
    │   ├── what is new for gpt4.pickle
    │   ├── when is End of FTX.pickle
    │   ├── 日本国憲法は誰が作ったのか.pickle
    │   └── 아가동산사건의 문제가 뭐야.pickle
├── .gitignore
├── .idea
    └── .gitignore
├── CONTRIBUTING.md
├── LICENSE
├── Procfile
├── README.md
├── app.py
├── img
    ├── architecture_roadmap.png
    ├── explainability.png
    └── webui.png
├── playground
    ├── test_OpenAIAPI.py
    ├── test_OpenAI_Embedding.py
    ├── test_langchain_faiss.py
    ├── test_nltk.py
    └── test_pyterrier.py
├── requirements.txt
├── runtime.txt
└── src
    ├── BingService.py
    ├── FrontendService.py
    ├── LLMService.py
    ├── NLPUtil.py
    ├── SearchGPTService.py
    ├── SemanticSearchService.py
    ├── SourceService.py
    ├── Util.py
    ├── config
        └── config.yaml
    ├── flask_app.py
    ├── gradio_app.py
    ├── main.py
    ├── text_extract
        ├── __init__.py
        ├── doc
        │   ├── __init__.py
        │   ├── abc_doc_extract.py
        │   ├── docx_svc.py
        │   └── ppt_svc.py
        └── html
        │   ├── __init__.py
        │   ├── abc_html_extract.py
        │   ├── beautiful_soup.py
        │   └── trafilatura.py
    └── website
        ├── __init__.py
        ├── sender.py
        ├── static
            └── index.js
        ├── templates
            ├── alert_box.html
            ├── base.html
            ├── explain_result.html
            ├── index.html
            ├── index_static.html
            ├── prompt_examples.html
            ├── request_id_status_html.html
            ├── search_result.html
            └── search_result_step.html
        └── views.py


/.cache/web/Alpaca lora.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelthwan/searchGPT/f0adbd2eb5456c560a15883cd9befc82d385f3be/.cache/web/Alpaca lora.pickle


--------------------------------------------------------------------------------
/.cache/web/End of FTX.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelthwan/searchGPT/f0adbd2eb5456c560a15883cd9befc82d385f3be/.cache/web/End of FTX.pickle


--------------------------------------------------------------------------------
/.cache/web/Hoe maak ik pasta.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelthwan/searchGPT/f0adbd2eb5456c560a15883cd9befc82d385f3be/.cache/web/Hoe maak ik pasta.pickle


--------------------------------------------------------------------------------
/.cache/web/Why Llama LLM model is so popular.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelthwan/searchGPT/f0adbd2eb5456c560a15883cd9befc82d385f3be/.cache/web/Why Llama LLM model is so popular.pickle


--------------------------------------------------------------------------------
/.cache/web/Why did SVB collapsed.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelthwan/searchGPT/f0adbd2eb5456c560a15883cd9befc82d385f3be/.cache/web/Why did SVB collapsed.pickle


--------------------------------------------------------------------------------
/.cache/web/digital twin有哪些用处.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelthwan/searchGPT/f0adbd2eb5456c560a15883cd9befc82d385f3be/.cache/web/digital twin有哪些用处.pickle


--------------------------------------------------------------------------------
/.cache/web/what is new for gpt4.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelthwan/searchGPT/f0adbd2eb5456c560a15883cd9befc82d385f3be/.cache/web/what is new for gpt4.pickle


--------------------------------------------------------------------------------
/.cache/web/when is End of FTX.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelthwan/searchGPT/f0adbd2eb5456c560a15883cd9befc82d385f3be/.cache/web/when is End of FTX.pickle


--------------------------------------------------------------------------------
/.cache/web/日本国憲法は誰が作ったのか.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelthwan/searchGPT/f0adbd2eb5456c560a15883cd9befc82d385f3be/.cache/web/日本国憲法は誰が作ったのか.pickle


--------------------------------------------------------------------------------
/.cache/web/아가동산사건의 문제가 뭐야.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelthwan/searchGPT/f0adbd2eb5456c560a15883cd9befc82d385f3be/.cache/web/아가동산사건의 문제가 뭐야.pickle


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.docx
 2 | *.doc
 3 | *.pptx
 4 | *.ppt
 5 | *.pdf
 6 | *.bf
 7 | *.fsarrayfile
 8 | *.fsomapfile
 9 | *.fsomaphash
10 | *.fsomapid
11 | *.fsomapfile
12 | *.fsomapfile.0
13 | *.idx
14 | *.zdata
15 | *.pyc
16 | data*.properties
17 | /.idea/*
18 | /venv/*
19 | cache
20 | *.pickle
21 | 


--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | # Editor-based HTTP Client requests
5 | /httpRequests/
6 | # Datasource local storage ignored files
7 | /dataSources/
8 | /dataSources.local.xml
9 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # I would like to make it better..
 2 | 
 3 | Thanks for your contribution!
 4 | 
 5 | ### Determine what to do
 6 | 
 7 | There are so many possibilities to start with. For example
 8 | 
 9 | - Studies on API calls like Toolformer
10 | - Studies on uncertainty in the search results. If the response is unsure or source didn't contain useful info, it shouldn't answer with confident.
11 | - Studies in open-source model integration and not relies on API. To be better, even using RLHF model
12 | - Studies on better generation of footnote. In word-level, or instrinsic model generated footnote.
13 | - Prompt engineering
14 | - Supports of markdown, code result, point form result, etc.
15 | - Use it to generate data for training of another LLM model
16 | - Make a better/robust UI
17 | - ...
18 | 
19 | ### Taking on Tasks
20 | 
21 | Please create a github issue on a problem that appeals to you.
22 | If there are any issues/features you want to address, mention them in your comment along with a brief explanation of
23 | how you'll resolve the issue. As soon as a project coordinator assigns you the problem, you can start working on it. (But you can still start first!)
24 | 
25 | ### Submitting a Pull Request
26 | (We are not familiar with Github process. This reference LAION-AI/Open-Assistant)
27 | 
28 | 1. [Fork this project repository](https://docs.github.com/en/get-started/quickstart/fork-a-repo)
29 |    and clone it to your local machine. (Read more
30 |    [About Forks](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/about-forks))
31 | 1. Before working on any changes, try to
32 |    [sync the forked repository](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork)
33 |    to keep it up-to-date with the upstream repository.
34 | 1. On a
35 |    [new branch](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-and-deleting-branches-within-your-repository)
36 |    in your fork (aka a "feature branch" and not `master`) work on a small focused change that only touches on a few files.
37 | 1. Package up a small bit of work that solves part of the problem
38 |    [into a Pull Request](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request-from-a-fork)
39 |    and
40 |    [send it out for review](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/requesting-a-pull-request-review).
41 |    [Here](https://github.com/michaelthwan/searchGPT/pull/42) is an example PR
42 |    for this project to illustrate this flow.
43 | 1. If you're lucky, we can merge your change into `master` without any problems.
44 |    If there are changes to files you're working on, resolve them by:
45 |     1. First try to rebase as suggested
46 |        [in these instructions](https://timwise.co.uk/2019/10/14/merge-vs-rebase/#should-you-rebase).
47 |     1. If rebasing feels too painful, merge as suggested
48 |        [in these instructions](https://timwise.co.uk/2019/10/14/merge-vs-rebase/#should-you-merge).
49 | 1. Once you've resolved conflicts (if any), finish the review and
50 |    [squash and merge](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/incorporating-changes-from-a-pull-request/about-pull-request-merges#squash-and-merge-your-commits)
51 |    your PR (when squashing try to clean up or update the individual commit
52 |    messages to be one sensible single one).
53 | 1. Merge in your change and move on to a new issue or the second step of your current issue.
54 | 
55 | Additionally, if someone is working on an issue that interests you, ask if they
56 | need help on it or would like suggestions on how to approach the issue. If so,
57 | share wildly. If they seem to have a good handle on it, let them work on their
58 | solution until a challenge comes up.
59 | 
60 | # Thank you for your contribution!


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 Michael Wan
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/Procfile:
--------------------------------------------------------------------------------
1 | web: gunicorn --workers 1 --threads 2 app:app


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | searchGPT - An Open-Source RAG-based LLM Search Engine
  2 | ==================================================
  3 | 
  4 | **searchGPT** is an open-source project to build a search engine based on Large Language Model (LLM) technology to give natural language answers.
  5 | 
  6 | You may take this is a **minimal implementation of new Bing mainly for search engine and question answering**. 
  7 | 
  8 | It supports answers based on web search content or file content.
  9 | 
 10 | Please give me a star if you like it! 🌟
 11 | 
 12 | ### **(Demo page link is available below!)**
 13 | 
 14 | ![webui](/img/webui.png)
 15 | ![explainability](/img/explainability.png)
 16 | 
 17 | Features
 18 | --------
 19 | 
 20 | * Source:
 21 |   * Web search with real-time results
 22 |   * File content search (PPT/DOC/PDF, etc.)
 23 | * Sematic search from source ([FAISS](https://github.com/facebookresearch/faiss) / [pyterrier](https://github.com/terrier-org/pyterrier))
 24 | * LLM integration: ([OpenAI](https://platform.openai.com/docs/api-reference?lang=python) / [GooseAI](https://goose.ai/), etc.)
 25 | * Frontend: Easy-to-use and intuitive user interface
 26 | 
 27 | Demo page
 28 | ---------------
 29 | https://searchgpt-demo.herokuapp.com/index
 30 | - Please do not abuse it with any programs.
 31 | - Please be patient with the loading time, which usually takes ~10 seconds.
 32 | 
 33 | Architecture and roadmap
 34 | ------------------------
 35 | ![architecture_roadmap](/img/architecture_roadmap.png)
 36 | 
 37 | Why RAG?
 38 | ---------------
 39 | Because it is impossible for the LLM to learn everything during the training, thus real-time factual information is needed for reference.
 40 | 
 41 | Example of ungrounded false answers (both using `text-curie-001` model):
 42 | 
 43 | **Prompt: what is ghost kitchen**
 44 | 
 45 | | Original answer                                                                                                                                                                                                           | RAG-based answer (SearchGPT)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
 46 | |---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
 47 | | The Ghost Kitchen is a place where the departed spirits of cooks and dishwashers can cook and wash dishes with ease. The kitchen is said to be especially busy on Full Moon nights                                        | A ghost kitchen is a physical space for operators to create food for off-premises consumption. [1] This type of kitchen doesn't have waiters, no dining room, and no parking lot – really, no public presence whatsoever. [1] This kitchen is designed to cut unnecessary costs, allowing a restaurant to expand easily with low capital. [2] With apps like GrubHub and DoorDash, restaurant owners can easily find orders and exceed their expected revenue. [3]                                                                                                                                                                                                                                                |
 48 | | Ghost Kitchen is a pop-up restaurant in London that is completely without food. Instead, the restaurant is filled with spirits, who banish negative energy and help to make the guests' dining experience more enjoyable. | A ghost kitchen is a professional kitchen that is set up to cook food for delivery or take-out but does not have a dine-in option. [1] Ghost kitchens are often located in areas with high foot traffic and are used by multiple restaurants. [1] This allows restaurant chains to test new menu items without the investment of a brick-and-mortar location. [2] By having all of your orders in one place, a ghost kitchen can minimize the chances of errors and increase efficiency. [3] Additionally, by being located close to customers, a ghost kitchen can increase revenue. [4] While ghost kitchens are here to stay, they will likely face competition from other restaurants in the near future. [1] |
 49 | 
 50 | **Prompt: what is the meaning of wwww in Japanese**
 51 | 
 52 | | Original answer                                                                                                                                                                                                           | RAG-based answer (SearchGPT)                                                                                                                                                                                                                                                                                  |
 53 | |---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
 54 | | Japanese "www" is typically used as a placeholder for "www." For example, "This is www." is typically translated to "Kore wa www.". Japanese "www" is also used to indicate that a webpage is not currently being viewed. | The meaning of "www" in Japanese is typically used to show amusement or to convey sarcasm. [1] It can also be used as a casual way to say "yes" or "okay." Additionally, speakers of Japanese may use "w" to represent the kana "笑" in online chat because it looks similar to the character for "laugh." [2] | 
 55 | 
 56 | 
 57 | Getting Started
 58 | ---------------
 59 | 
 60 | ### Prerequisites
 61 | 
 62 | To run `searchGPT`, you'll need:
 63 | 
 64 | * [Python 3.10.8](https://www.python.org/downloads/)
 65 | * [OpenAI API Key](https://beta.openai.com/signup) or [GooseAI API Key](https://goose.ai/)
 66 |     * OpenAI: First $18 is free (enough for 3000+ searches)
 67 |     * GooseAI: First $10 is free
 68 | * [Azure Bing Search Subscription Key](https://www.microsoft.com/en-us/bing/apis/bing-web-search-api/)
 69 |     * Free version is available (3 searches per second, 1000 searches per month
 70 | 
 71 | ### Installation
 72 | 
 73 | 1. Create your python or anaconda env and install python packages
 74 | 
 75 | Native
 76 | ```
 77 | # using python=3.10.8
 78 | pip install -r requirements.txt
 79 | ```
 80 | 
 81 | Anaconda
 82 | ```
 83 | conda create --name searchgpt python=3.10.8
 84 | conda activate searchgpt
 85 | pip install -r requirements.txt
 86 | ```
 87 | 
 88 | 2. Input API keys (OpenAI/Azure Bing Search) in `backend/src/config/config.yaml` (or via UI if web app is used)
 89 | 3. Run `app.py`, (or `flask_app.py`) for frontend web app launching.
 90 | 4. For quick testing, run `main.py`. Stdout output only.
 91 | 
 92 | Contributing
 93 | ------------
 94 | 
 95 | We welcome contributions to **searchGPT**! (Especially frontend developers)
 96 | 
 97 | If you're interested in contributing, please take a look at our [contributing guidelines](./CONTRIBUTING.md) for more information.
 98 | 
 99 | License
100 | -------
101 | 
102 | `searchGPT` is licensed under the [MIT License](./LICENSE).
103 | 


--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
 1 | from src.website import create_app
 2 | import os, sys
 3 | 
 4 | sys.path.append(os.path.join(os.path.dirname(__file__), "src"))
 5 | 
 6 | app = create_app()
 7 | 
 8 | if __name__ == '__main__':
 9 |     app.run(debug=True)
10 | 


--------------------------------------------------------------------------------
/img/architecture_roadmap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelthwan/searchGPT/f0adbd2eb5456c560a15883cd9befc82d385f3be/img/architecture_roadmap.png


--------------------------------------------------------------------------------
/img/explainability.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelthwan/searchGPT/f0adbd2eb5456c560a15883cd9befc82d385f3be/img/explainability.png


--------------------------------------------------------------------------------
/img/webui.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelthwan/searchGPT/f0adbd2eb5456c560a15883cd9befc82d385f3be/img/webui.png


--------------------------------------------------------------------------------
/playground/test_OpenAIAPI.py:
--------------------------------------------------------------------------------
 1 | import openai
 2 | 
 3 | # openai.organization = ""
 4 | openai.api_key = ""
 5 | # print(openai.Model.list())
 6 | 
 7 | # GPT-3
 8 | # text-davinci-003  $0.0200  /1K tokens
 9 | # text-curie-001    $0.0020  /1K tokens
10 | # text-babbage-001  $0.0005  /1K tokens
11 | # text-ada-001      $0.0004  /1K tokens
12 | 
13 | # Codex
14 | # code-davinci-002
15 | # code-cushman-001
16 | 
17 | response = openai.Completion.create(
18 |     # model="text-curie-001",
19 |     model="text-babbage-001",
20 |     prompt="Say this is a test",
21 |     max_tokens=16,
22 |     temperature=0 # default=1
23 | )
24 | 
25 | for r in response.choices:
26 |     print(r.text)
27 | 


--------------------------------------------------------------------------------
/playground/test_OpenAI_Embedding.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | 
  3 | import openai
  4 | import pandas as pd
  5 | import yaml
  6 | from openai.embeddings_utils import get_embedding, cosine_similarity
  7 | 
  8 | from Util import get_project_root
  9 | 
 10 | BASE_MODEL = "text-embedding-ada-002"  # default embedding of faiss-openai
 11 | 
 12 | 
 13 | def search_using_cosine_similarity(df, query):
 14 |     query_embedding = get_embedding(query, engine=BASE_MODEL)
 15 |     df["similarity"] = df['embeddings'].apply(lambda x: cosine_similarity(x, query_embedding))
 16 | 
 17 |     results = df.sort_values("similarity", ascending=False, ignore_index=True)
 18 | 
 19 |     k = 5
 20 |     results = results.head(k)
 21 |     global sources
 22 |     sources = []
 23 |     for i in range(k):
 24 |         sources.append({'Page ' + str(results.iloc[i]['page']): results.iloc[i]['text'][:150] + '...'})
 25 |     print(sources)
 26 |     return results.head(k)
 27 | 
 28 | 
 29 | def compute_embeddings(text, model="text-embedding-ada-002"):
 30 |     print(f'compute_embeddings() text: {text}')
 31 |     text = text.replace("\n", " ")
 32 |     return openai.Embedding.create(input=[text], model=model)['data'][0]['embedding']
 33 | 
 34 | 
 35 | def search_similar(df: pd.DataFrame, target_text, n=3, pprint=True):
 36 |     print(f'search_similar() text: {target_text}')
 37 |     embedding = compute_embeddings(target_text, model=BASE_MODEL)
 38 |     df['similarities'] = df['embedding'].apply(lambda x: cosine_similarity(x, embedding))
 39 |     res = df.sort_values('similarities', ascending=False).head(n)
 40 |     return res, df
 41 | 
 42 | 
 43 | def compute_embeddings_2(text_df, model=BASE_MODEL, chunk_size=1000):
 44 |     print(f'compute_embeddings_2() len(texts): {len(df)}')
 45 |     text_df['text'] = text_df['text'].apply(lambda x: x.replace("\n", " "))
 46 |     embeddings = []
 47 |     for i in range(0, len(texts), chunk_size):
 48 |         response = openai.Embedding.create(
 49 |             input=texts[i: i + chunk_size], engine=model
 50 |         )
 51 |         embeddings += [r["embedding"] for r in response["data"]]
 52 |     text_df['embedding'] = embeddings
 53 |     return text_df
 54 | 
 55 | 
 56 | if __name__ == '__main__':
 57 |     # text_df = pd.read_csv(os.path.join(get_project_root(), 'src/text_df.csv'))
 58 |     texts = [
 59 |         "Discover the world of delicious beans with our premium selection.",
 60 |         "Try our savory bean soup recipe for a delicious and nutritious meal.",
 61 |         "Our roasted coffee beans are carefully selected for their rich and delicious flavor.",
 62 |         "Beans are not only delicious, but also a great source of protein and dietary fiber.",
 63 |         "Looking for a delicious vegan meal? Try our spicy black bean burger recipe.",
 64 | 
 65 |         "The sky is blue and the sun is shining today.",
 66 |         "I need to go grocery shopping after work to pick up some milk and bread.",
 67 |         "Did you hear about the new movie that just came out? It's supposed to be really good.",
 68 |         "I'm planning a trip to Europe next summer and I'm so excited.",
 69 |         "My cat keeps meowing at me for no reason and it's driving me crazy.",
 70 |     ]
 71 |     text_df = pd.DataFrame({'text': texts, 'docno': range(len(texts))})
 72 |     print(text_df.shape)
 73 | 
 74 |     with open(os.path.join(get_project_root(), 'src/config/config.yaml')) as f:
 75 |         config = yaml.load(f, Loader=yaml.FullLoader)
 76 |         openai.api_key = config.get('openai_api').get('api_key')
 77 | 
 78 |         # text_df = compute_embeddings(text_df)
 79 |         # result_df = search_using_cosine_similarity(text_df, 'what is chatgpt?')
 80 |         # print(result_df)
 81 | 
 82 |         search_text = 'delicious beans'
 83 |         search_text = 'Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection '
 84 | 
 85 |         from pyinstrument import Profiler
 86 | 
 87 |         profiler = Profiler()
 88 |         profiler.start()
 89 |         print("Sequential call mode:")
 90 |         text_df['embedding'] = text_df['text'].apply(lambda x: compute_embeddings(x, model=BASE_MODEL))
 91 |         res, text_df = search_similar(text_df, search_text, n=3)
 92 |         print(res)
 93 |         profiler.stop()
 94 |         profiler.print()
 95 | 
 96 |         profiler = Profiler()
 97 |         profiler.start()
 98 |         print("Batch call mode:")
 99 |         text_df = compute_embeddings_2(text_df)
100 |         res, text_df = search_similar(text_df, search_text, n=3)
101 |         print(res)
102 |         profiler.stop()
103 |         profiler.print()
104 | 


--------------------------------------------------------------------------------
/playground/test_langchain_faiss.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | # from langchain.embeddings.openai import OpenAIEmbeddings
 3 | from langchain.embeddings import HuggingFaceEmbeddings
 4 | from langchain.vectorstores import FAISS
 5 | 
 6 | if __name__ == '__main__':
 7 |     search_text = 'the source of dark energy'
 8 |     index_path = r"C:\xxx\searchGPT\backend\src\.index"
 9 |     text_df = pd.read_csv(rf'C:\xxx\text_df.csv')
10 |     text_df['docno'] = text_df.index.tolist()
11 |     print(text_df.shape)
12 |     print(text_df)
13 |     texts, docno_list = text_df['text'].tolist(), text_df['docno'].tolist()
14 |     docno_dict = [{'docno': docno} for docno in docno_list]
15 |     embeddings = HuggingFaceEmbeddings()  # OpenAIEmbeddings() cost money (OPENAI_API_KEY)
16 |     faiss_index = FAISS.from_texts(texts, embeddings, metadatas=docno_dict)
17 | 
18 |     # k: Number of Documents to return. Defaults to 4.
19 |     # fetch_k: Number of Documents to fetch to pass to MMR algorithm.
20 |     k, fetch_k = 10, 999
21 |     # docs = faiss_index.max_marginal_relevance_search(search_text, k=k, fetch_k=fetch_k)
22 |     docs = faiss_index.similarity_search_with_score(search_text, k=k)
23 |     text_list, docno_list, score_list = [], [], []
24 |     for t in docs:
25 |         doc, score = t
26 |         print(doc)
27 |         text_list.append(doc.page_content)
28 |         docno_list.append(doc.metadata['docno'])
29 |         score_list.append(score)
30 |     gpt_df = pd.DataFrame({'text': text_list, 'docno': docno_list, 'score': score_list})
31 |     print("=====gpt_df====")
32 |     print(gpt_df.shape)
33 |     print(gpt_df)
34 | 


--------------------------------------------------------------------------------
/playground/test_nltk.py:
--------------------------------------------------------------------------------
1 | # import nltk
2 | #
3 | # if __name__ == '__main__':
4 | #     text = "There are many things you can do to learn how to run faster, such as incorporating speed workouts into your running schedule, running hills, counting your strides, and adjusting your running form. Lean forward when you run and push off firmly with each foot. Pump your arms actively and keep your elbows bent at a 90-degree angle. Try to run every day, and gradually increase the distance you run for long-distance runs. Make sure you rest at least one day per week to allow your body to recover. Avoid running with excess gear that could slow you down."
5 | #     nltk.download('punkt')
6 | #     sentences = nltk.sent_tokenize(text)
7 | #     for sentence in sentences:
8 | #         print(sentence)
9 | 


--------------------------------------------------------------------------------
/playground/test_pyterrier.py:
--------------------------------------------------------------------------------
 1 | import pyterrier as pt
 2 | import pandas as pd
 3 | import os
 4 | 
 5 | 
 6 | def pd_indexer():
 7 |     df = pd.DataFrame({
 8 |         'docno':
 9 |             ['1', '2', '3'],
10 |         'url':
11 |             ['url1', 'url2', 'url3'],
12 |         'text':
13 |             ['He ran out of money, so he had to stop playing',
14 |              'The waves were crashing on the shore; it was a',
15 |              'The body may perhaps compensates for the loss']
16 |     })
17 |     files = pt.io.find_files("./var/files")
18 | 
19 |     indexref_file = pt.FilesIndexer("./file_index", overwrite=True).index(files)
20 | 
21 |     # pd_indexer = pt.DFIndexer("./var")
22 |     # indexref2 = pd_indexer.index(df["text"], df["docno"])
23 |     index = pt.IndexFactory.of(indexref_file)
24 |     print(index.getCollectionStatistics().toString())
25 |     # pt.BatchRetrieve(indexref2).search("waves")
26 |     return
27 | 
28 | 
29 | if __name__ == '__main__':
30 | 
31 |     if not pt.started():
32 |         pt.init()
33 |     # dataset = pt.datasets.get_dataset("vaswani")
34 |     # index_path = "./index"
35 |     print(os.getcwd())
36 | 
37 |     # files = pt.io.find_files("./var/files")
38 |     files = pt.io.find_files(os.path.join(os.getcwd(), "var/files"))
39 |     indexref_file = pt.FilesIndexer(os.path.join(os.getcwd(), "var/file_index"), overwrite=True).index(files)
40 |     # indexref_file = pt.IndexFactory.of(os.path.join(os.getcwd(), "var/file_index/data.properties"))
41 |     print(type(indexref_file))
42 |     result_df = pt.BatchRetrieve(indexref_file).search("NSSO")  # Can feed both jnius.reflect.org.terrier.querying.IndexRef / jnius.reflect.org.terrier.querying.Index
43 |     print(result_df)
44 | 
45 |     # index = pt.IndexFactory.of(indexref_file)
46 |     # print(index.getCollectionStatistics().toString())
47 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | ## python 3.10.8
 2 | # basic
 3 | openai==0.27.0
 4 | pandas==1.5.3
 5 | PyYAML==6.0
 6 | 
 7 | # frontend
 8 | Flask==2.2.3
 9 | Werkzeug==2.2.2
10 | requests==2.28.2
11 | gunicorn==20.1.0
12 | 
13 | # embedding
14 | tiktoken==0.3.2
15 | matplotlib==3.7.1
16 | plotly==5.13.1
17 | scipy==1.10.1
18 | scikit-learn==1.2.1
19 | 
20 | # doc extraction
21 | python-docx==0.8.11
22 | python-pptx==0.6.21
23 | 
24 | # html extraction
25 | beautifulsoup4==4.11.2
26 | trafilatura==1.4.1
27 | 
28 | # misc
29 | psutil==5.9.4
30 | 


--------------------------------------------------------------------------------
/runtime.txt:
--------------------------------------------------------------------------------
1 | python-3.10.8


--------------------------------------------------------------------------------
/src/BingService.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import re
  3 | import concurrent.futures
  4 | import pandas as pd
  5 | import requests
  6 | import yaml
  7 | 
  8 | from Util import setup_logger, get_project_root, storage_cached
  9 | from text_extract.html.beautiful_soup import BeautifulSoupSvc
 10 | from text_extract.html.trafilatura import TrafilaturaSvc
 11 | 
 12 | logger = setup_logger('BingService')
 13 | 
 14 | 
 15 | class BingService:
 16 |     def __init__(self, config):
 17 |         self.config = config
 18 |         extract_svc = self.config.get('source_service').get('bing_search').get('text_extract')
 19 |         if extract_svc == 'trafilatura':
 20 |             self.txt_extract_svc = TrafilaturaSvc()
 21 |         elif extract_svc == 'beautifulsoup':
 22 |             self.txt_extract_svc = BeautifulSoupSvc()
 23 | 
 24 |     @storage_cached('bing_search_website', 'search_text')
 25 |     def call_bing_search_api(self, search_text: str) -> pd.DataFrame:
 26 |         logger.info("BingService.call_bing_search_api. query: " + search_text)
 27 |         subscription_key = self.config.get('source_service').get('bing_search').get('subscription_key')
 28 |         endpoint = self.config.get('source_service').get('bing_search').get('end_point') + "/v7.0/search"
 29 |         mkt = self.config.get('general').get('language')
 30 |         params = {'q': search_text, 'mkt': mkt}
 31 |         headers = {'Ocp-Apim-Subscription-Key': subscription_key}
 32 | 
 33 |         try:
 34 |             response = requests.get(endpoint, headers=headers, params=params)
 35 |             response.raise_for_status()
 36 | 
 37 |             columns = ['name', 'url', 'snippet']
 38 |             if response.json().get('webPages'):
 39 |                 website_df = pd.DataFrame(response.json()['webPages']['value'])[columns]
 40 |                 website_df['url_id'] = website_df.index + 1
 41 |                 website_df = website_df[:self.config.get('source_service').get('bing_search').get('result_count')]
 42 |             else:
 43 |                 website_df = pd.DataFrame(columns=columns + ['url_id'])
 44 |         except Exception as ex:
 45 |             raise ex
 46 |         return website_df
 47 | 
 48 |     def call_urls_and_extract_sentences(self, website_df) -> pd.DataFrame:
 49 |         """
 50 |         :param:
 51 |             website_df: one row = one website with url
 52 |                 name: website title name
 53 |                 url: url
 54 |                 snippet: snippet of the website given by BingAPI
 55 |         :return:
 56 |             text_df: one row = one website sentence
 57 |             columns:
 58 |                 name: website title name
 59 |                 url: url
 60 |                 snippet: snippet of the website given by BingAPI
 61 |                 text: setences extracted from the website
 62 |         """
 63 |         logger.info(f"BingService.call_urls_and_extract_sentences. website_df.shape: {website_df.shape}")
 64 |         name_list, url_list, url_id_list, snippet_list, text_list = [], [], [], [], []
 65 |         for index, row in website_df.iterrows():
 66 |             logger.info(f"Processing url: {row['url']}")
 67 |             sentences = self.extract_sentences_from_url(row['url'])
 68 |             for text in sentences:
 69 |                 word_count = len(re.findall(r'\w+', text))  # approximate number of words
 70 |                 if word_count < 8:
 71 |                     continue
 72 |                 name_list.append(row['name'])
 73 |                 url_list.append(row['url'])
 74 |                 url_id_list.append(row['url_id'])
 75 |                 snippet_list.append(row['snippet'])
 76 |                 text_list.append(text)
 77 |         text_df = pd.DataFrame(data=zip(name_list, url_list, url_id_list, snippet_list, text_list),
 78 |                                columns=['name', 'url', 'url_id', 'snippet', 'text'])
 79 |         return text_df
 80 | 
 81 |     def call_one_url(self, website_tuple):
 82 |         name, url, snippet, url_id = website_tuple
 83 |         logger.info(f"Processing url: {url}")
 84 |         sentences = self.extract_sentences_from_url(url)
 85 |         logger.info(f"  receive sentences: {len(sentences)}")
 86 |         return sentences, name, url, url_id, snippet
 87 | 
 88 |     @storage_cached('bing_search_website_content', 'website_df')
 89 |     def call_urls_and_extract_sentences_concurrent(self, website_df):
 90 |         logger.info(f"BingService.call_urls_and_extract_sentences_async. website_df.shape: {website_df.shape}")
 91 |         with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
 92 |             results = list(executor.map(self.call_one_url, website_df.itertuples(index=False)))
 93 |         name_list, url_list, url_id_list, snippet_list, text_list = [], [], [], [], []
 94 |         for result in results:
 95 |             sentences, name, url, url_id, snippet = result
 96 |             sentences = sentences[:self.config['source_service']['bing_search']['sentence_count_per_site']]  # filter top N only for stability
 97 |             for text in sentences:
 98 |                 word_count = len(re.findall(r'\w+', text))  # approximate number of words
 99 |                 if word_count < 8:
100 |                     continue
101 |                 name_list.append(name)
102 |                 url_list.append(url)
103 |                 url_id_list.append(url_id)
104 |                 snippet_list.append(snippet)
105 |                 text_list.append(text)
106 |         text_df = pd.DataFrame(data=zip(name_list, url_list, url_id_list, snippet_list, text_list),
107 |                                columns=['name', 'url', 'url_id', 'snippet', 'text'])
108 |         return text_df
109 | 
110 |     def extract_sentences_from_url(self, url):
111 |         # Fetch the HTML content of the page
112 |         try:
113 |             response = requests.get(url, timeout=3)
114 |         except:
115 |             logger.error(f"Failed to fetch url: {url}")
116 |             return []
117 |         html_content = response.text
118 | 
119 |         # Use BeautifulSoup to parse the HTML and extract the text
120 |         extract_text = self.txt_extract_svc.extract_from_html(html_content)
121 |         return extract_text
122 | 
123 | 
124 | if __name__ == '__main__':
125 |     # Load config
126 |     with open(os.path.join(get_project_root(), 'src/config/config.yaml'), encoding='utf-8') as f:
127 |         config = yaml.load(f, Loader=yaml.FullLoader)
128 |         service = BingService(config)
129 |         website_df = service.call_bing_search_api('What is ChatGPT')
130 |         print("===========Website df:============")
131 |         print(website_df)
132 |         # text_df = service.call_urls_and_extract_sentences(website_df)
133 |         text_df = service.call_urls_and_extract_sentences_concurrent(website_df)
134 |         print("===========text df:============")
135 |         print(text_df)
136 | 


--------------------------------------------------------------------------------
/src/FrontendService.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import re
  3 | from urllib.parse import urlparse
  4 | 
  5 | import yaml
  6 | 
  7 | from NLPUtil import split_with_delimiters, get_longest_common_word_sequences
  8 | from Util import setup_logger, get_project_root
  9 | 
 10 | logger = setup_logger('FootnoteService')
 11 | 
 12 | 
 13 | class FrontendService:
 14 |     def __init__(self, config, response_text, gpt_input_text_df):
 15 |         self.config = config
 16 |         self.response_text = response_text
 17 |         used_columns = ['docno', 'name', 'url', 'url_id', 'text', 'len_text', 'in_scope']  # TODO: add url_id
 18 |         self.gpt_input_text_df = gpt_input_text_df[used_columns]
 19 | 
 20 |     @staticmethod
 21 |     def get_prompt_examples_json():
 22 |         with open(os.path.join(get_project_root(), 'src/config/config.yaml'), encoding='utf-8') as f:
 23 |             config = yaml.load(f, Loader=yaml.FullLoader)
 24 |             col1_list = config['frontend_service']['prompt_examples']['col1_list']
 25 |             col2_list = config['frontend_service']['prompt_examples']['col2_list']
 26 |             prompt_examples_json = {
 27 |                 'col1_list': col1_list,
 28 |                 'col2_list': col2_list,
 29 |             }
 30 |             return prompt_examples_json
 31 | 
 32 |     def get_data_json(self, response_text, gpt_input_text_df):
 33 |         def create_response_json_object(text, type):
 34 |             return {"text": text, "type": type}
 35 | 
 36 |         def create_source_json_object(footnote, domain, url, title, text):
 37 |             return {"footnote": footnote, "domain": domain, "url": url, "title": title, "text": text}
 38 | 
 39 |         def reorder_url_id(response_text, gpt_input_text_df):
 40 |             # response_text: find reference in text & re-order
 41 |             url_id_list = [int(x) for x in dict.fromkeys(re.findall(r'\[([0-9]+)\]', response_text))]
 42 |             url_id_map = dict(zip(url_id_list, range(1, len(url_id_list) + 1)))
 43 | 
 44 |             response_text = re.sub(r'\[([0-9]+)\]', lambda x: f"[{url_id_map[int(x.group(1))]}]", response_text)
 45 |             # for multiple references in same sentence, sort as per url_id
 46 |             refs = set(re.findall(r'(\[[0-9\]\[]+\])', response_text))
 47 |             for ref in refs:
 48 |                 response_text = response_text.replace(ref, '[' + ']['.join(sorted(re.findall(r'\[([0-9]+)\]', ref))) + ']')
 49 | 
 50 |             # gpt_input_text_df: find reference in text & re-order
 51 |             in_scope_source_df = gpt_input_text_df[gpt_input_text_df['url_id'].isin(url_id_map.keys()) & gpt_input_text_df['in_scope']].copy()
 52 |             in_scope_source_df['url_id'] = in_scope_source_df['url_id'].map(url_id_map)
 53 |             return response_text, in_scope_source_df
 54 | 
 55 |         def get_response_json(response_text):
 56 |             def create_response_json_object(text, type):
 57 |                 return {"text": text, "type": type}
 58 | 
 59 |             response_json = []
 60 |             split_sentence = re.findall(r'\[[0-9]+\]|[^\[\]]+', response_text)
 61 | 
 62 |             components = []
 63 |             for component in split_sentence:
 64 |                 components.extend(split_with_delimiters(component, ['\n']))
 65 |             for sentence in components:
 66 |                 if sentence.startswith('[') and sentence.endswith(']'):
 67 |                     response_json.append(create_response_json_object(sentence, "footnote"))
 68 |                 elif sentence == '\n':
 69 |                     response_json.append(create_response_json_object(sentence, "newline"))
 70 |                 else:
 71 |                     response_json.append(create_response_json_object(sentence, "response"))
 72 |             return response_json
 73 | 
 74 |         def get_source_json(in_scope_source_df):
 75 |             in_scope_source_df.loc[:, 'docno'] = in_scope_source_df['docno'].astype(int)
 76 |             in_scope_source_df.sort_values('docno', inplace=True)
 77 |             source_text_list = []
 78 |             source_json = []
 79 |             source_url_df = in_scope_source_df[['url_id', 'url', 'name', 'snippet']].drop_duplicates().sort_values('url_id').reset_index(drop=True)
 80 |             for index, row in source_url_df.iterrows():
 81 |                 url_text = ''
 82 |                 url_text += f"[{row['url_id']}] {row['url']}\n"
 83 | 
 84 |                 for index, row in in_scope_source_df[in_scope_source_df['url_id'] == row['url_id']].iterrows():
 85 |                     url_text += f"  {row['text']}\n"
 86 | 
 87 |                 source_text_list.append(url_text)
 88 | 
 89 |                 domain_name = urlparse(row['url']).netloc.replace('www.', '')
 90 |                 source_json.append(create_source_json_object(f"[{row['url_id']}]", domain_name, row['url'], row['name'], row['snippet']))
 91 |             source_text = ''.join(sorted(source_text_list))
 92 | 
 93 |             source_json = sorted(source_json, key=lambda x: x['footnote'])
 94 |             return source_json, source_text
 95 | 
 96 |         def get_explainability_json(response_text, source_text):
 97 |             def get_colors():
 98 |                 return ['#ffe3e8', '#f1e1ff', '#c5d5ff', '#c5efff', '#d6fffa', '#e7ffe7', '#f7ffa7', '#fff3b3', '#ffdfdf', '#ffcaca']
 99 | 
100 |             def create_response_json_object(text, type, color):
101 |                 return {"text": text, "type": type, "color": color}
102 | 
103 |             def get_explain_json(text, word_color_dict):
104 |                 common_word_sequences = list(word_color_dict.keys())
105 |                 word_list = split_with_delimiters(text.lower(), common_word_sequences + ['\n'])
106 |                 explain_json = []
107 |                 for word in word_list:
108 |                     if word == '\n':
109 |                         explain_json.append(create_response_json_object(word, "newline", ""))
110 |                     elif word.lower() in common_word_sequences:
111 |                         explain_json.append(create_response_json_object(word, "keyword", word_color_dict[word.lower()]))
112 |                     else:
113 |                         explain_json.append(create_response_json_object(word, "word", ""))
114 |                 return explain_json
115 | 
116 |             longest_common_word_sequences = get_longest_common_word_sequences(response_text, source_text, k=10)
117 |             word_color_dict = {longest_common_word_sequences[i]: get_colors()[i] for i in range(min(len(longest_common_word_sequences), len(get_colors())))}
118 | 
119 |             response_explain_json = get_explain_json(response_text, word_color_dict)
120 |             source_explain_json = get_explain_json(source_text, word_color_dict)
121 |             return response_explain_json, source_explain_json
122 | 
123 |         response_text, in_scope_source_df = reorder_url_id(response_text, gpt_input_text_df)
124 |         response_json = get_response_json(response_text)
125 |         source_json, source_text = get_source_json(in_scope_source_df)
126 |         response_explain_json, source_explain_json = get_explainability_json(response_text, source_text)
127 |         prompt_examples_json = FrontendService.get_prompt_examples_json()
128 | 
129 |         return source_text, {'response_json': response_json,
130 |                              'source_json': source_json,
131 |                              'response_explain_json': response_explain_json,
132 |                              'source_explain_json': source_explain_json,
133 |                              'prompt_examples_json': prompt_examples_json,
134 |                              }
135 | 
136 | 
137 | if __name__ == '__main__':
138 |     # str_list = ['Alpaca lora',
139 |     #             'what is new for gpt4?',
140 |     #             'Why Llama LLM model is so popular?',
141 |     #             'Why did SVB collapsed?',
142 |     #             'End of FTX',
143 |     #             'digital twin有哪些用处',
144 |     #             '아가동산사건의 문제가 뭐야',
145 |     #             'Hoe maak ik pasta',
146 |     #             '日本国憲法は誰が作ったのか？',
147 |     #             "Comment gagner de l'argent"]
148 |     # for s in str_list:
149 |     #     print(path_safe_string_conversion(s))
150 | 
151 |     import os
152 |     import pickle
153 | 
154 |     folder_path = r""
155 |     pickle_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path)]
156 |     for file_path in pickle_files:
157 |         if not '8d' in file_path:
158 |             continue
159 |         with open(file_path, "rb") as f:
160 |             obj = pickle.load(f)
161 |             print(file_path)
162 |             print(obj['result'][0])
163 | 


--------------------------------------------------------------------------------
/src/LLMService.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from abc import ABC, abstractmethod
  3 | from urllib.parse import urlparse
  4 | 
  5 | import openai
  6 | import pandas as pd
  7 | import yaml
  8 | 
  9 | from Util import setup_logger, get_project_root, storage_cached
 10 | from website.sender import Sender, MSG_TYPE_SEARCH_STEP, MSG_TYPE_OPEN_AI_STREAM
 11 | 
 12 | logger = setup_logger('LLMService')
 13 | 
 14 | 
 15 | class LLMService(ABC):
 16 |     def __init__(self, config):
 17 |         self.config = config
 18 | 
 19 |     def clean_response_text(self, response_text: str):
 20 |         return response_text.replace("\n", "")
 21 | 
 22 |     def get_prompt(self, search_text: str, gpt_input_text_df: pd.DataFrame):
 23 |         logger.info(f"OpenAIService.get_prompt. search_text: {search_text}, gpt_input_text_df.shape: {gpt_input_text_df.shape}")
 24 |         prompt_length_limit = 3000  # obsolete
 25 |         is_use_source = self.config.get('source_service').get('is_use_source')
 26 |         if is_use_source:
 27 |             prompt_engineering = f"\n\nAnswer the question '{search_text}' using above information with about 100 words:"
 28 |             prompt = ""
 29 |             for index, row in gpt_input_text_df.iterrows():
 30 |                 prompt += f"""{row['text']}\n"""
 31 |             # limit the prompt length
 32 |             prompt = prompt[:prompt_length_limit]
 33 |             return prompt + prompt_engineering
 34 |         else:
 35 |             return f"\n\nAnswer the question '{search_text}' with about 100 words:"
 36 | 
 37 |     def get_prompt_v2(self, search_text: str, gpt_input_text_df: pd.DataFrame):
 38 |         logger.info(f"OpenAIService.get_prompt_v2. search_text: {search_text}, gpt_input_text_df.shape: {gpt_input_text_df.shape}")
 39 |         context_str = ""
 40 |         gpt_input_text_df = gpt_input_text_df.sort_values('url_id')
 41 |         url_id_list = gpt_input_text_df['url_id'].unique()
 42 |         for url_id in url_id_list:
 43 |             context_str += f"Source ({url_id})\n"
 44 |             for index, row in gpt_input_text_df[gpt_input_text_df['url_id'] == url_id].iterrows():
 45 |                 context_str += f"{row['text']}\n"
 46 |             context_str += "\n"
 47 |         prompt_length_limit = 3000 # obsolete
 48 |         context_str = context_str[:prompt_length_limit]
 49 |         prompt = \
 50 |             f"""
 51 | Answer with 100 words for the question below based on the provided sources using a scientific tone. 
 52 | If the context is insufficient, reply "I cannot answer".
 53 | Use Markdown for formatting code or text.
 54 | Source:
 55 | {context_str}
 56 | Question: {search_text}
 57 | Answer:
 58 | """
 59 |         return prompt
 60 | 
 61 |     def get_prompt_v3(self, search_text: str, gpt_input_text_df: pd.DataFrame):
 62 |         language = self.config.get('general').get('language')
 63 |         if not self.config.get('source_service').get('is_use_source'):
 64 |             prompt = \
 65 |                 f"""
 66 | Instructions: Write a comprehensive reply to the given query.  
 67 | If the context is insufficient, reply "I cannot answer".
 68 | Query: {search_text}
 69 | """
 70 |             return prompt
 71 | 
 72 |         logger.info(f"OpenAIService.get_prompt_v3. search_text: {search_text}, gpt_input_text_df.shape: {gpt_input_text_df.shape}")
 73 |         context_str = ""
 74 |         for _, row_url in gpt_input_text_df[['url_id', 'url']].drop_duplicates().iterrows():
 75 |             domain = urlparse(row_url['url']).netloc.replace('www.', '')
 76 |             context_str += f"Source [{row_url['url_id']}] {domain}\n"
 77 |             for index, row in gpt_input_text_df[(gpt_input_text_df['url_id'] == row_url['url_id']) & gpt_input_text_df['in_scope']].iterrows():
 78 |                 context_str += f"{row['text']}\n"
 79 |             context_str += "\n\n"
 80 |         prompt_length_limit = self.config.get('llm_service').get('openai_api').get('prompt').get('prompt_length_limit')
 81 |         context_str = context_str[:prompt_length_limit]
 82 |         prompt = \
 83 |             f"""
 84 | Web search result:
 85 | {context_str}
 86 | 
 87 | Instructions: Using the provided web search results, write a comprehensive reply to the given query. 
 88 | Make sure to cite results using [number] notation after the reference.
 89 | If the provided search results refer to multiple subjects with the same name, write separate answers for each subject.
 90 | Answer in language: {language}
 91 | If the context is insufficient, reply "I cannot answer because my reference sources don't have related info" in language {language}.
 92 | Query: {search_text}
 93 | """
 94 |         return prompt
 95 | 
 96 |     @abstractmethod
 97 |     def call_api(self, prompt):
 98 |         pass
 99 | 
100 | 
101 | class OpenAIService(LLMService):
102 |     def __init__(self, config, sender: Sender = None):
103 |         super().__init__(config)
104 |         self.sender = sender
105 |         open_api_key = config.get('llm_service').get('openai_api').get('api_key')
106 |         if open_api_key is None:
107 |             raise Exception("OpenAI API key is not set.")
108 |         openai.api_key = open_api_key
109 | 
110 |     @storage_cached('openai', 'prompt')
111 |     def call_api(self, prompt: str):
112 |         if self.sender is not None:
113 |             self.sender.send_message(msg_type=MSG_TYPE_SEARCH_STEP, msg='Calling OpenAI API ...')
114 | 
115 |         openai_api_config = self.config.get('llm_service').get('openai_api')
116 |         model = openai_api_config.get('model')
117 |         is_stream = openai_api_config.get('stream')
118 |         logger.info(f"OpenAIService.call_api. model: {model}, len(prompt): {len(prompt)}")
119 | 
120 |         if model in ['gpt-3.5-turbo', 'gpt-4']:
121 |             try:
122 |                 response = openai.ChatCompletion.create(
123 |                     model=model,
124 |                     messages=[
125 |                         {"role": "system", "content": "You are a helpful search engine."},
126 |                         {"role": "user", "content": prompt}
127 |                     ],
128 |                     stream=is_stream
129 |                 )
130 |             except Exception as ex:
131 |                 raise ex
132 | 
133 |             if is_stream:
134 |                 collected_messages = []
135 |                 # iterate through the stream of events
136 |                 for chunk in response:
137 |                     chunk_message = chunk['choices'][0]['delta'].get("content", None)  # extract the message
138 |                     if chunk_message is not None:
139 |                         if self.sender is not None:
140 |                             self.sender.send_message(msg_type=MSG_TYPE_OPEN_AI_STREAM, msg=chunk_message)
141 |                         collected_messages.append(chunk_message)  # save the message
142 | 
143 |                 full_reply_content = ''.join(collected_messages)
144 |                 return full_reply_content
145 |             else:
146 |                 return response.choices[0].message.content
147 |         else:
148 |             try:
149 |                 response = openai.Completion.create(
150 |                     model=model,
151 |                     prompt=prompt,
152 |                     max_tokens=openai_api_config.get('max_tokens'),
153 |                     temperature=openai_api_config.get('temperature'),
154 |                     top_p=openai_api_config.get('top_p'),
155 |                 )
156 |             except Exception as ex:
157 |                 raise ex
158 |             return self.clean_response_text(response.choices[0].text)
159 | 
160 | 
161 | class GooseAIService(LLMService):
162 |     def __init__(self, config, sender: Sender = None):
163 |         super().__init__(config)
164 |         self.sender = sender
165 |         goose_api_key = config.get('goose_ai_api').get('api_key')
166 |         if goose_api_key is None:
167 |             raise Exception("Goose API key is not set.")
168 |         openai.api_key = goose_api_key
169 |         openai.api_base = config.get('goose_ai_api').get('api_base')
170 | 
171 |     @storage_cached('gooseai', 'prompt')
172 |     def call_api(self, prompt: str, sender: Sender = None):
173 |         if self.sender is not None:
174 |             self.sender.send_message(msg_type=MSG_TYPE_SEARCH_STEP, msg='Calling gooseAI API ...')
175 |         logger.info(f"GooseAIService.call_openai_api. len(prompt): {len(prompt)}")
176 |         goose_api_config = self.config.get('goose_ai_api')
177 |         try:
178 |             response = openai.Completion.create(
179 |                 engine=goose_api_config.get('model'),
180 |                 prompt=prompt,
181 |                 max_tokens=goose_api_config.get('max_tokens'),
182 |                 # stream=True
183 |             )
184 |         except Exception as ex:
185 |             raise ex
186 |         return self.clean_response_text(response.choices[0].text)
187 | 
188 | 
189 | class LLMServiceFactory:
190 |     @staticmethod
191 |     def create_llm_service(config, sender: Sender = None) -> LLMService:
192 |         provider = config.get('llm_service').get('provider')
193 |         if provider == 'openai':
194 |             return OpenAIService(config, sender)
195 |         elif provider == 'goose_ai':
196 |             return GooseAIService(config, sender)
197 |         else:
198 |             logger.error(f'LLM Service for {provider} is not yet implemented.')
199 |             raise NotImplementedError(f'LLM Service - {provider} - not is supported')
200 | 
201 | 
202 | if __name__ == '__main__':
203 |     # Load config
204 |     with open(os.path.join(get_project_root(), 'src/config/config.yaml'), encoding='utf-8') as f:
205 |         config = yaml.load(f, Loader=yaml.FullLoader)
206 |         service_factory = LLMServiceFactory()
207 |         service = service_factory.create_llm_service(config)
208 |         prompt = """
209 |         """
210 |         # response_text = service.call_openai_api('What is ChatGPT')
211 |         response_text = service.call_api(prompt)
212 |         print(response_text)
213 | 


--------------------------------------------------------------------------------
/src/NLPUtil.py:
--------------------------------------------------------------------------------
  1 | import tiktoken
  2 | 
  3 | 
  4 | def remove_substrings(strings):
  5 |     """
  6 |     Remove any string that is a substring of another string
  7 |     Input ["abc", "ab", "c"]
  8 |     Output ["abc", "c"]
  9 |     """
 10 |     # Sort the strings by length in descending order
 11 |     strings_sorted = sorted(strings, key=len, reverse=False)
 12 | 
 13 |     # Remove any string that is a substring of another string
 14 |     result = []
 15 |     for i in range(len(strings_sorted)):
 16 |         is_substring = False
 17 |         for j in range(i + 1, len(strings_sorted)):
 18 |             if strings_sorted[i] in strings_sorted[j]:
 19 |                 is_substring = True
 20 |                 break
 21 |         if not is_substring:
 22 |             result.append(strings_sorted[i])
 23 | 
 24 |     return result
 25 | 
 26 | 
 27 | def get_longest_common_word_sequences(paragraph1, paragraph2, k=10):
 28 |     """
 29 |     Find the longest common subsequences of words between two paragraphs
 30 |     Input: p1: "The quick brown fox jumps over the lazy dog", p2: "The quick brown dog jumps over the lazy fox"
 31 |     Output: ["jumps over the lazy", "the quick brown"]
 32 |     """
 33 |     # Tokenize the paragraphs into lists of words
 34 |     word_lists1 = [word.lower() for word in paragraph1.split()]
 35 |     word_lists2 = [word.lower() for word in paragraph2.split()]
 36 | 
 37 |     # Initialize a table to store the lengths of common subsequences
 38 |     table = [[0] * (len(word_lists2) + 1) for _ in range(len(word_lists1) + 1)]
 39 | 
 40 |     # Fill in the table by comparing each pair of words
 41 |     common_sequences = []
 42 |     for i in range(1, len(word_lists1) + 1):
 43 |         for j in range(1, len(word_lists2) + 1):
 44 |             if word_lists1[i - 1] == word_lists2[j - 1]:
 45 |                 table[i][j] = table[i - 1][j - 1] + 1
 46 |                 sequence_len = table[i][j]
 47 |                 # if sequence_len >= k:
 48 |                 sequence = ' '.join(word_lists1[i - sequence_len:i])
 49 |                 if sequence not in common_sequences:
 50 |                     common_sequences.append(sequence)
 51 |             else:
 52 |                 table[i][j] = 0
 53 | 
 54 |     # Sort the common sequences by length in descending order and return the top k longest sequences
 55 |     common_sequences = remove_substrings(common_sequences)
 56 |     longest_sequences = sorted(common_sequences, key=len, reverse=True)[:k]
 57 |     min_sequence_len = 10
 58 |     longest_sequences = [sequence for sequence in longest_sequences if len(sequence) >= min_sequence_len]
 59 |     return longest_sequences
 60 | 
 61 | 
 62 | def split_with_delimiters(string, delimiter_list):
 63 |     """
 64 |     Key point if this function is it will preserve the delimiters to serve the purpose
 65 |     Input: ("is fine-tuned from a gpt-3.5 series", ["fine-tuned", "gpt-3.5"])
 66 |     Output: ['is ', 'fine-tuned', ' from a ', 'gpt-3.5', ' series']
 67 |     """
 68 |     result = []
 69 |     start = 0
 70 |     for i in range(len(string)):
 71 |         for delimiter in delimiter_list:
 72 |             delimiter_len = len(delimiter)
 73 |             if string[i:i + delimiter_len] == delimiter:
 74 |                 if i > start:
 75 |                     result.append(string[start:i])
 76 |                 result.append(delimiter)
 77 |                 start = i + delimiter_len
 78 |                 break
 79 |         else:
 80 |             continue
 81 |     if start < len(string):
 82 |         result.append(string[start:])
 83 |     return result
 84 | 
 85 | 
 86 | def num_tokens_from_string(string: str) -> int:
 87 |     """
 88 |     Returns the number of tokens in a text string.
 89 |     https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
 90 |     """
 91 |     encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
 92 |     num_tokens = len(encoding.encode(string))
 93 |     return num_tokens
 94 | 
 95 | 
 96 | if __name__ == '__main__':
 97 |     paragraph1 = "ChatGPT is an AI chatbot that can understand and generate human-like answers to text prompts, as well as create code from natural speech [3]. It is built on a family of large language models collectively called GPT-3, which is trained on huge amounts of data [3][1]. The model is fine-tuned from a model in the GPT-3.5 series, which finished training in early 2022 and trained on an Azure AI supercomputing infrastructure [1]. ChatGPT is also sensitive to tweaks to the input phrasing or attempting the same prompt multiple times [1]. The objective of ChatGPT is to predict the next word in a sentence based on what it has learned [3]. The research release of ChatGPT in November 2022 is among OpenAI's iterative deployment of increasingly safe and useful AI systems [1]. ChatGPT Plus also exists, which brings a few benefits over the free tier [3]."
 98 |     paragraph2 = """
 99 | Source (1)
100 | ChatGPT is a sibling model to InstructGPT, which is trained to follow an instruction in a prompt and provide a detailed response.
101 | - ChatGPT is sensitive to tweaks to the input phrasing or attempting the same prompt multiple times. For example, given one phrasing of a question, the model can claim to not know the answer, but given a slight rephrase, can answer correctly.
102 | ChatGPT is fine-tuned from a model in the GPT-3.5 series, which finished training in early 2022. You can learn more about the 3.5 series here. ChatGPT and GPT-3.5 were trained on an Azure AI supercomputing infrastructure.
103 | Todayâs research release of ChatGPT is the latest step in OpenAI iterative deployment of increasingly safe and useful AI systems. Many lessons from deployment of earlier models like GPT-3 and Codex have informed the safety mitigations in place for this release, including substantial reductions in harmful and untruthful outputs achieved by the use of reinforcement learning from human feedback (RLHF).
104 | 
105 | Source (3)
106 | ChatGPT is an AI chatbot that's built on a family of large language models (LLMs) that are collectively called GPT-3. These models can understand and generate human-like answers to text prompts, because they've been trained on huge amounts of data.
107 | But ChatGPT is also equally talented at coding and productivity tasks. For the former, its ability to create code from natural speech makes it a powerful ally for both new and experienced coders who either aren't familiar with a particular language or want to troubleshoot existing code. Unfortunately, there is also the potential for it to be misused to create malicious emails and malware.
108 | ChatGPT stands for "Chat Generative Pre-trained Transformer". Let's take a look at each of those words in turn.
109 | But the short answer? ChatGPT works thanks to a combination of deep learning algorithms, a dash of natural language processing, and a generous dollop of generative pre-training, which all combine to help it produce disarmingly human-like responses to text questions. Even if all it's ultimately been trained to do is fill in the next word, based on its experience of being the world's most voracious reader.
110 | ChatGPT has been created with one main objective to predict the next word in a sentence, based on what's typically happened in the gigabytes of text data that it's been trained on.
111 | ChatGPT was released as a "research preview" on November 30, 2022. A blog post (opens in new tab) casually introduced the AI chatbot to the world, with OpenAI stating that "we’ve trained a model called ChatGPT which interacts in a conversational way".
112 | ChatGPT Plus costs $20 p/month (around £17 / AU$30) and brings a few benefits over the free tier. It promises to give you full access to ChatGPT even during peak times, which is when you'll otherwise frequently see "ChatGPT is at capacity right now messages during down times.
113 | ChatGPT has been trained on a vast amount of text covering a huge range of subjects, so its poss
114 |     """
115 | 
116 |     # common_stems = FrontendService.longest_common_word_sequences(paragraph1, paragraph2)
117 |     # # print(common_stems)
118 |     # for common_stem in common_stems:
119 |     #     print(common_stem)
120 | 
121 |     # text_list = ["is fine-tuned from a model in the gpt-3.5 series, which finished training in early",
122 |     #              "sensitive to tweaks to the input phrasing or attempting the same prompt multiple",
123 |     #              "is fine-tuned from a model in the gpt-3.5 series, which finished training in",
124 |     #              "is fine-tuned from a model in the gpt-3.5 series, which finished training",
125 |     #              "sensitive to tweaks to the input phrasing or attempting the same prompt",
126 |     #              "is fine-tuned from a model in the gpt-3.5 series, which finished",
127 |     #              "sensitive to tweaks to the input phrasing or attempting the same",
128 |     #              "sensitive to tweaks to the input phrasing or attempting the",
129 |     #              "is fine-tuned from a model in the gpt-3.5 series, which"]
130 |     # text_list = FrontendService.remove_substrings(text_list)
131 |     # for text in text_list:
132 |     #     print(text)
133 | 
134 |     # response_text = "is fine-tuned from a gpt-3.5 series"
135 |     # split_list = split_with_delimiters(response_text, ["fine-tuned", "gpt-3.5"])
136 |     # print(split_list)
137 | 
138 |     s = "OpenAI 推出了一個新型聊天機器人模型ChatGPT"
139 |     print(num_tokens_from_string(s))
140 | 


--------------------------------------------------------------------------------
/src/SearchGPTService.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | 
  3 | import pandas as pd
  4 | import yaml
  5 | 
  6 | from FrontendService import FrontendService
  7 | from LLMService import LLMServiceFactory
  8 | from SemanticSearchService import BatchOpenAISemanticSearchService
  9 | from SourceService import SourceService
 10 | from Util import setup_logger, get_project_root, storage_cached
 11 | from website.sender import Sender
 12 | 
 13 | logger = setup_logger('SearchGPTService')
 14 | 
 15 | 
 16 | class SearchGPTService:
 17 |     """
 18 |     SearchGPT app->service->child-service structure
 19 |     - (Try to) app import service, child-service inherit service
 20 | 
 21 |     SearchGPT class
 22 |     - SourceService
 23 |     -- BingService
 24 |     -- Doc/PPT/PDF Service
 25 |     - SemanticSearchModule
 26 |     - LLMService
 27 |     -- OpenAIService
 28 |     -- GooseAPIService
 29 |     - FrontendService
 30 | 
 31 |     """
 32 | 
 33 |     def __init__(self, ui_overriden_config=None, sender: Sender = None):
 34 |         with open(os.path.join(get_project_root(), 'src/config/config.yaml'), encoding='utf-8') as f:
 35 |             self.config = yaml.load(f, Loader=yaml.FullLoader)
 36 |         self.overide_config_by_query_string(ui_overriden_config)
 37 |         self.validate_config()
 38 |         self.sender = sender
 39 | 
 40 |     def overide_config_by_query_string(self, ui_overriden_config):
 41 |         if ui_overriden_config is None:
 42 |             return
 43 |         for key, value in ui_overriden_config.items():
 44 |             if value is not None and value != '':
 45 |                 # query_string is flattened (one level) while config.yaml is nested (two+ levels)
 46 |                 # Any better way to handle this?
 47 |                 if key == 'bing_search_subscription_key':
 48 |                     self.config['source_service']['bing_search']['subscription_key'] = value
 49 |                 elif key == 'openai_api_key':
 50 |                     self.config['llm_service']['openai_api']['api_key'] = value
 51 |                 elif key == 'is_use_source':
 52 |                     self.config['source_service']['is_use_source'] = False if value.lower() in ['false', '0'] else True
 53 |                 elif key == 'llm_service_provider':
 54 |                     self.config['llm_service']['provider'] = value
 55 |                 elif key == 'llm_model':
 56 |                     if self.config['llm_service']['provider'] == 'openai':
 57 |                         self.config['llm_service']['openai_api']['model'] = value
 58 |                     elif self.config['llm_service']['provider'] == 'goose_ai':
 59 |                         self.config['llm_service']['goose_ai_api']['model'] = value
 60 |                     else:
 61 |                         raise Exception(f"llm_model is not supported for llm_service_provider: {self.config['llm_service']['provider']}")
 62 |                 elif key == 'language':
 63 |                     self.config['general']['language'] = value
 64 |                 else:
 65 |                     # invalid query_string but not throwing exception first
 66 |                     pass
 67 | 
 68 |     def validate_config(self):
 69 |         if self.config['source_service']['is_enable_bing_search']:
 70 |             assert self.config['source_service']['bing_search']['subscription_key'], 'bing_search_subscription_key is required'
 71 |         if self.config['llm_service']['provider'] == 'openai':
 72 |             assert self.config['llm_service']['openai_api']['api_key'], 'openai_api_key is required'
 73 | 
 74 |     @storage_cached('web', 'search_text')
 75 |     def query_and_get_answer(self, search_text):
 76 |         source_module = SourceService(self.config, self.sender)
 77 |         bing_text_df = source_module.extract_bing_text_df(search_text)
 78 |         doc_text_df = source_module.extract_doc_text_df(bing_text_df)
 79 |         text_df = pd.concat([bing_text_df, doc_text_df], ignore_index=True)
 80 | 
 81 |         semantic_search_service = BatchOpenAISemanticSearchService(self.config, self.sender)
 82 |         gpt_input_text_df = semantic_search_service.search_related_source(text_df, search_text)
 83 |         gpt_input_text_df = BatchOpenAISemanticSearchService.post_process_gpt_input_text_df(gpt_input_text_df,
 84 |                                                                                             self.config.get('llm_service').get('openai_api').get('prompt').get('prompt_token_limit'))
 85 | 
 86 |         llm_service = LLMServiceFactory.create_llm_service(self.config, self.sender)
 87 |         prompt = llm_service.get_prompt_v3(search_text, gpt_input_text_df)
 88 |         response_text = llm_service.call_api(prompt=prompt)
 89 | 
 90 |         frontend_service = FrontendService(self.config, response_text, gpt_input_text_df)
 91 |         source_text, data_json = frontend_service.get_data_json(response_text, gpt_input_text_df)
 92 | 
 93 |         print('===========Prompt:============')
 94 |         print(prompt)
 95 |         print('===========Search:============')
 96 |         print(search_text)
 97 |         print('===========Response text:============')
 98 |         print(response_text)
 99 |         print('===========Source text:============')
100 |         print(source_text)
101 | 
102 |         return response_text, source_text, data_json
103 | 


--------------------------------------------------------------------------------
/src/SemanticSearchService.py:
--------------------------------------------------------------------------------
  1 | import openai
  2 | import pandas as pd
  3 | import re
  4 | from openai.embeddings_utils import cosine_similarity
  5 | from website.sender import Sender, MSG_TYPE_SEARCH_STEP
  6 | 
  7 | from Util import setup_logger
  8 | from NLPUtil import num_tokens_from_string
  9 | 
 10 | # from abc import ABC, abstractmethod
 11 | # from langchain.embeddings import HuggingFaceEmbeddings
 12 | # from langchain.vectorstores import FAISS
 13 | BASE_MODEL = "text-embedding-ada-002"  # default embedding of faiss-openai
 14 | logger = setup_logger('SemanticSearchService')
 15 | 
 16 | 
 17 | # class SemanticSearchService(ABC):
 18 | #     def __init__(self, config):
 19 | #         self.cwd = os.getcwd()
 20 | #         self.config = config
 21 | #         self.index = None
 22 | #         self.provider = ''
 23 | #
 24 | #     @abstractmethod
 25 | #     def index_text_df(self, text_df: pd.DataFrame, indexref_folder_name: str):
 26 | #         pass
 27 | #
 28 | #     @abstractmethod
 29 | #     def retrieve_result_by_search_text_from_text_df(self, search_text, text_df) -> pd.DataFrame:
 30 | #         pass
 31 | #
 32 | #     @staticmethod
 33 | #     def use_index_to_search(index, search_text):
 34 | #         pass
 35 | #
 36 | #     def clean_sentence_to_avoid_lexical_error(self, text):
 37 | #         """
 38 | #         Clean sentence. Pyterrier will throw error if the search query contains some special characters shown below
 39 | #             jnius.JavaException: JVM exception occurred: Failed to process qid 1 '
 40 | #             <search query>' -- Lexical error at line 3, column 90.  Encountered: "\'" (39), after : "" org.terrier.querying.parser.QueryParserException
 41 | #             python-BaseException
 42 | #         :return:
 43 | #         """
 44 | #         # TODO: good way to clean
 45 | #         return text.replace("'", "").replace("?", "").replace("!", "").replace(":", "").replace(";", "")
 46 | #
 47 | #
 48 | # class PyTerrierService(SemanticSearchService):
 49 | #     def __init__(self, config):
 50 | #         super().__init__(config)
 51 | #         self.provider = 'pyterrier'
 52 | #
 53 | #     def create_index_column_in_df(self, text_df: pd.DataFrame) -> pd.DataFrame:
 54 | #         """
 55 | #         add a docno column (primary key / index column) to the dataframe
 56 | #         :param text_df:
 57 | #         :return: text_df with docno column
 58 | #         """
 59 | #         text_df["docno"] = text_df.index + 1
 60 | #         text_df["docno"] = text_df["docno"].astype(str)
 61 | #         return text_df
 62 | #
 63 | #     def index_text_df(self, text_df: pd.DataFrame, indexref_folder_name: str):
 64 | #         """
 65 | #         index the text_df to get a indexref
 66 | #         :param text_df:
 67 | #             required columns:
 68 | #                 docno: as primary key for later process to retrieve back the row
 69 | #                 text: the text to be indexed
 70 | #         :return:
 71 | #             indexref:
 72 | #         """
 73 | #         import pyterrier as pt
 74 | #         if not pt.started():
 75 | #             pt.init()
 76 | #         datetime_str = datetime.now().strftime("%Y%m%d_%H%M%S")
 77 | #         df_indexer_path = os.path.join(self.cwd, f".index/{indexref_folder_name}_" + datetime_str)
 78 | #         if not os.path.exists(df_indexer_path):
 79 | #             os.makedirs(df_indexer_path)
 80 | #
 81 | #         # TODO: using overwrite?
 82 | #         # Currently I cannot use overwrite=True to directly overwrite the existing index folder
 83 | #         #   when I index for the second time, it will throw error. Therefore need to create a new folder
 84 | #         #   I also cannot delete it in the last step, because the process is still running and consuming the index files inside.
 85 | #
 86 | #         # TODO: using a better wmodel than Tf?
 87 | #         pd_indexer = pt.DFIndexer(df_indexer_path, wmodel="Tf")
 88 | #         indexref = pd_indexer.index(text_df["text"], text_df["docno"])
 89 | #         return indexref
 90 | #
 91 | #     @staticmethod
 92 | #     def use_index_to_search(index, search_text):
 93 | #         result_df: pd.DataFrame = pt.BatchRetrieve(index).search(search_text)
 94 | #         return result_df
 95 | #
 96 | #     def retrieve_result_by_search_text_from_text_df(self, search_text, text_df):
 97 | #         logger.info(f"PyTerrierService.retrieve_result_by_search_text_from_text_df. search_text: {search_text}, text_df.shape: {text_df.shape}")
 98 | #         text_df = self.create_index_column_in_df(text_df)
 99 | #         index = self.index_text_df(text_df, 'df_index')
100 | #         result_df: pd.DataFrame = self.use_index_to_search(index, search_text)
101 | #         return result_df.merge(text_df, on="docno", how="left")
102 | #
103 | #
104 | # class LangChainFAISSService(SemanticSearchService):
105 | #     def __init__(self, config):
106 | #         super().__init__(config)
107 | #         self.provider = self.config.get('semantic_search').get('provider')
108 | #         self.embeddings = None
109 | #         if self.provider == 'faiss-openai':
110 | #             self.embeddings = OpenAIEmbeddings(openai_api_key=self.config.get('llm_service').get('openai_api').get('api_key'))
111 | #         elif self.provider == 'faiss-huggingface':
112 | #             self.embeddings = HuggingFaceEmbeddings()
113 | #         else:
114 | #             raise Exception(f"provider {self.provider} is not supported")
115 | #
116 | #     def index_text_df(self, text_df: pd.DataFrame, indexref_folder_name: str):
117 | #         logger.info(f"LangChainFAISSService.index_text_df. text_df.shape: {text_df.shape}")
118 | #         text_df['docno'] = text_df.index.tolist()
119 | #         texts, docno_list = text_df['text'].tolist(), text_df['docno'].tolist()
120 | #         docno_dict = [{'docno': docno} for docno in docno_list]
121 | #         faiss_index = FAISS.from_texts(texts, self.embeddings, metadatas=docno_dict)
122 | #         return faiss_index
123 | #
124 | #     @staticmethod
125 | #     def use_index_to_search(index, search_text):
126 | #         index: FAISS
127 | #         # k: Number of Documents to return. Defaults to 4.
128 | #         # fetch_k: Number of Documents to fetch to pass to MMR algorithm.
129 | #
130 | #         # k = 15
131 | #         # # Cons: you can only pick k, but you cannot filter by score
132 | #         # tuples = index.similarity_search_with_score(search_text, k=k)
133 | #         # docno_list = [t[0].metadata['docno'] for t in tuples]
134 | #         # score_list = [t[1] for t in tuples]
135 | #         # result_df = pd.DataFrame({'docno': docno_list, 'score': score_list})
136 | #         # result_df['rank'] = result_df.index
137 | #
138 | #         k = 30
139 | #         docs = index.max_marginal_relevance_search(search_text, k=k, fetch_k=999)
140 | #         docno_list = [doc.metadata['docno'] for doc in docs]
141 | #         result_df = pd.DataFrame({'docno': docno_list})
142 | #         result_df['rank'] = result_df.index
143 | #         result_df['score'] = 999
144 | #
145 | #         return result_df
146 | #
147 | #     def retrieve_result_by_search_text_from_text_df(self, search_text, text_df):
148 | #         logger.info(f"LangChainFAISSService.retrieve_result_by_search_text_from_text_df. search_text: {search_text}, text_df.shape: {text_df.shape}")
149 | #         faiss_index = self.index_text_df(text_df, '')
150 | #         result_df = self.use_index_to_search(faiss_index, search_text)
151 | #         return result_df.merge(text_df, on="docno", how="left")
152 | #
153 | #
154 | # class SemanticSearchServiceFactory:
155 | #     @staticmethod
156 | #     def create_semantic_search_service(config) -> SemanticSearchService:
157 | #         provider = config.get('semantic_search').get('provider')
158 | #         if provider == 'pyterrier':
159 | #             return PyTerrierService(config)
160 | #         elif provider in ['faiss-openai', 'faiss-huggingface']:
161 | #             return LangChainFAISSService(config)
162 | #         else:
163 | #             logger.error(f'SemanticSearchService for {provider} is not yet implemented.')
164 | #             raise NotImplementedError(f'SemanticSearchService - {provider} - is not supported')
165 | 
166 | 
167 | class BatchOpenAISemanticSearchService:
168 |     def __init__(self, config, sender: Sender = None):
169 |         self.config = config
170 |         openai.api_key = config.get('llm_service').get('openai_api').get('api_key')
171 |         self.sender = sender
172 | 
173 |     @staticmethod
174 |     def batch_call_embeddings(texts, chunk_size=1000):
175 |         texts = [text.replace("\n", " ") for text in texts]
176 |         embeddings = []
177 |         for i in range(0, len(texts), chunk_size):
178 |             response = openai.Embedding.create(
179 |                 input=texts[i: i + chunk_size], engine=BASE_MODEL
180 |             )
181 |             embeddings += [r["embedding"] for r in response["data"]]
182 |         return embeddings
183 | 
184 |     @staticmethod
185 |     def compute_embeddings_for_text_df(text_df: pd.DataFrame):
186 |         """Compute embeddings for a text_df and return the text_df with the embeddings column added."""
187 |         print(f'compute_embeddings_for_text_df() len(texts): {len(text_df)}')
188 |         text_df['text'] = text_df['text'].apply(lambda x: x.replace("\n", " "))
189 |         text_df['embedding'] = BatchOpenAISemanticSearchService.batch_call_embeddings(text_df['text'].tolist())
190 |         return text_df
191 | 
192 |     def search_related_source(self, text_df: pd.DataFrame, target_text, n=30):
193 |         if not self.config.get('source_service').get('is_use_source'):
194 |             col = ['name', 'url', 'url_id', 'snippet', 'text', 'similarities', 'rank', 'docno']
195 |             return pd.DataFrame(columns=col)
196 | 
197 |         if self.sender is not None:
198 |             self.sender.send_message(msg_type=MSG_TYPE_SEARCH_STEP, msg="Searching from extracted text")
199 |         print(f'search_similar() text: {target_text}')
200 |         embedding = BatchOpenAISemanticSearchService.batch_call_embeddings([target_text])[0]
201 |         text_df = BatchOpenAISemanticSearchService.compute_embeddings_for_text_df(text_df)
202 |         text_df['similarities'] = text_df['embedding'].apply(lambda x: cosine_similarity(x, embedding))
203 |         result_df = text_df.sort_values('similarities', ascending=False).head(n)
204 |         result_df['rank'] = range(1, len(result_df) + 1)
205 |         result_df['docno'] = range(1, len(result_df) + 1)
206 |         return result_df
207 | 
208 |     @staticmethod
209 |     def post_process_gpt_input_text_df(gpt_input_text_df, prompt_token_limit):
210 |         # clean out of prompt texts for existing [1], [2], [3]... in the source_text for response output stability
211 |         gpt_input_text_df['text'] = gpt_input_text_df['text'].apply(lambda x: re.sub(r'\[[0-9]+\]', '', x))
212 |         # length of char and token
213 |         gpt_input_text_df['len_text'] = gpt_input_text_df['text'].apply(lambda x: len(x))
214 |         gpt_input_text_df['len_token'] = gpt_input_text_df['text'].apply(lambda x: num_tokens_from_string(x))
215 | 
216 |         gpt_input_text_df['cumsum_len_text'] = gpt_input_text_df['len_text'].cumsum()
217 |         gpt_input_text_df['cumsum_len_token'] = gpt_input_text_df['len_token'].cumsum()
218 | 
219 |         max_rank = gpt_input_text_df[gpt_input_text_df['cumsum_len_token'] <= prompt_token_limit]['rank'].max() + 1
220 |         gpt_input_text_df['in_scope'] = gpt_input_text_df['rank'] <= max_rank  # In order to get also the row slightly larger than prompt_length_limit
221 |         # reorder url_id with url that in scope.
222 |         url_id_list = gpt_input_text_df['url_id'].unique()
223 |         url_id_map = dict(zip(url_id_list, range(1, len(url_id_list) + 1)))
224 |         gpt_input_text_df['url_id'] = gpt_input_text_df['url_id'].map(url_id_map)
225 |         return gpt_input_text_df
226 | 
227 | 
228 | 


--------------------------------------------------------------------------------
/src/SourceService.py:
--------------------------------------------------------------------------------
 1 | import glob
 2 | import os
 3 | 
 4 | import pandas as pd
 5 | 
 6 | from BingService import BingService
 7 | from Util import setup_logger
 8 | from text_extract.doc import support_doc_type, doc_extract_svc_map
 9 | from text_extract.doc.abc_doc_extract import AbstractDocExtractSvc
10 | from website.sender import Sender, MSG_TYPE_SEARCH_STEP
11 | 
12 | logger = setup_logger('SourceModule')
13 | 
14 | 
15 | class SourceService:
16 |     def __init__(self, config, sender: Sender = None):
17 |         self.config = config
18 |         self.sender = sender
19 | 
20 |     def extract_bing_text_df(self, search_text):
21 |         # BingSearch using search_text
22 |         #   check if bing search result is cached and load if exists
23 |         bing_text_df = None
24 |         if not self.config['source_service']['is_use_source'] or not self.config['source_service']['is_enable_bing_search']:
25 |             return bing_text_df
26 | 
27 |         bing_service = BingService(self.config)
28 |         if self.sender is not None:
29 |             self.sender.send_message(msg_type=MSG_TYPE_SEARCH_STEP, msg="Calling bing search API")
30 |         website_df = bing_service.call_bing_search_api(search_text=search_text)
31 |         if self.sender is not None:
32 |             self.sender.send_message(msg_type=MSG_TYPE_SEARCH_STEP, msg="Extracting sentences from bing search result ...")
33 |         bing_text_df = bing_service.call_urls_and_extract_sentences_concurrent(website_df=website_df)
34 | 
35 |         return bing_text_df
36 | 
37 |     def extract_doc_text_df(self, bing_text_df):
38 |         # DocSearch using doc_search_path
39 |         #  bing_text_df is used for doc_id arrangement
40 |         if not self.config['source_service']['is_use_source'] or not self.config['source_service']['is_enable_doc_search']:
41 |             return pd.DataFrame([])
42 |         if self.sender is not None:
43 |             self.sender.send_message(msg_type=MSG_TYPE_SEARCH_STEP, msg="Extracting sentences from document")
44 |         files_grabbed = list()
45 |         for doc_type in support_doc_type:
46 |             tmp_file_list = glob.glob(self.config['source_service']['doc_search_path'] + os.sep + "*." + doc_type)
47 |             files_grabbed.extend({"file_path": file_path, "doc_type": doc_type} for file_path in tmp_file_list)
48 | 
49 |         logger.info(f"File list: {files_grabbed}")
50 |         doc_sentence_list = list()
51 | 
52 |         start_doc_id = 1 if bing_text_df is None else bing_text_df['url_id'].max() + 1
53 |         for doc_id, file in enumerate(files_grabbed, start=start_doc_id):
54 |             extract_svc: AbstractDocExtractSvc = doc_extract_svc_map[file['doc_type']]
55 |             sentence_list = extract_svc.extract_from_doc(file['file_path'])
56 | 
57 |             file_name = file['file_path'].split(os.sep)[-1]
58 |             for sentence in sentence_list:
59 |                 doc_sentence_list.append({
60 |                     'name': file_name,
61 |                     'url': file['file_path'],
62 |                     'url_id': doc_id,
63 |                     'snippet': '',
64 |                     'text': sentence
65 |                 })
66 |         doc_text_df = pd.DataFrame(doc_sentence_list)
67 |         return doc_text_df
68 | 


--------------------------------------------------------------------------------
/src/Util.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import os
  3 | import pickle
  4 | import re
  5 | from copy import deepcopy
  6 | from functools import wraps
  7 | from hashlib import md5
  8 | from pathlib import Path
  9 | 
 10 | 
 11 | def get_project_root() -> Path:
 12 |     return Path(__file__).parent.parent
 13 | 
 14 | 
 15 | def setup_logger(tag):
 16 |     logger = logging.getLogger(tag)
 17 |     logger.setLevel(logging.DEBUG)
 18 | 
 19 |     handler: logging.StreamHandler = logging.StreamHandler()
 20 |     formatter: logging.Formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 21 |     handler.setFormatter(formatter)
 22 |     logger.addHandler(handler)
 23 |     return logger
 24 | 
 25 | 
 26 | def save_result_cache(path: Path, hash: str, type: str, **kwargs):
 27 |     cache_dir = path / type
 28 |     os.makedirs(cache_dir, exist_ok=True)
 29 |     path = Path(cache_dir, f'{hash}.pickle')
 30 |     with open(path, 'wb') as f:
 31 |         pickle.dump(kwargs, f)
 32 | 
 33 | 
 34 | def load_result_from_cache(path: Path, hash: str, type: str):
 35 |     path = path / type / f'{hash}.pickle'
 36 |     with open(path, 'rb') as f:
 37 |         return pickle.load(f)
 38 | 
 39 | 
 40 | def check_result_cache_exists(path: Path, hash: str, type: str) -> bool:
 41 |     path = path / type / f'{hash}.pickle'
 42 |     return True if os.path.exists(path) else False
 43 | 
 44 | 
 45 | def check_max_number_of_cache(path: Path, type, max_number_of_cache: int = 10):
 46 |     path = path / type
 47 |     if len(os.listdir(path)) > max_number_of_cache:
 48 |         ctime_list = [(os.path.getctime(path / file), file) for file in os.listdir(path)]
 49 |         oldest_file = sorted(ctime_list)[0][1]
 50 |         os.remove(path / oldest_file)
 51 | 
 52 | 
 53 | def split_sentences_from_paragraph(text):
 54 |     sentences = re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", text)
 55 |     return sentences
 56 | 
 57 | 
 58 | def remove_api_keys(d):
 59 |     key_to_remove = ['api_key', 'subscription_key']
 60 |     temp_key_list = []
 61 |     for key, value in d.items():
 62 |         if key in key_to_remove:
 63 |             temp_key_list += [key]
 64 |         if isinstance(value, dict):
 65 |             remove_api_keys(value)
 66 | 
 67 |     for key in temp_key_list:
 68 |         d.pop(key)
 69 |     return d
 70 | 
 71 | 
 72 | def path_safe_string_conversion(filename: str):
 73 |     # https://stackoverflow.com/questions/7406102/create-sane-safe-filename-from-any-unsafe-string
 74 |     return "".join([c for c in filename if c.isalpha() or c.isdigit() or c == ' ']).rstrip()
 75 | 
 76 | 
 77 | def storage_cached(cache_type: str, cache_hash_key_name: str):
 78 |     def storage_cache_decorator(func):
 79 |         @wraps(func)
 80 |         def wrapper(*args, **kwargs):
 81 |             assert getattr(args[0], 'config'), 'storage_cached is only applicable to class method with config attribute'
 82 |             assert cache_hash_key_name in kwargs, f'Target method does not have {cache_hash_key_name} keyword argument'
 83 | 
 84 |             config = getattr(args[0], 'config')
 85 |             if config.get('cache').get('is_enable').get(cache_type):
 86 |                 hash_key = str(kwargs[cache_hash_key_name])
 87 | 
 88 |                 cache_path = Path(get_project_root(), config.get('cache').get('path'))
 89 | 
 90 |                 cache_hash = md5(str(config).encode() + hash_key.encode()).hexdigest()
 91 |                 if cache_type == 'web':
 92 |                     cache_hash = path_safe_string_conversion(hash_key)
 93 | 
 94 |                 if check_result_cache_exists(cache_path, cache_hash, cache_type):
 95 |                     result = load_result_from_cache(cache_path, cache_hash, cache_type)['result']
 96 |                 else:
 97 |                     result = func(*args, **kwargs)
 98 |                     config_for_cache = deepcopy(config)
 99 |                     config_for_cache = remove_api_keys(config_for_cache)  # remove api keys
100 |                     save_result_cache(cache_path, cache_hash, cache_type, result=result, config=config_for_cache)
101 | 
102 |                     check_max_number_of_cache(cache_path, cache_type, config.get('cache').get('max_number_of_cache'))
103 |             else:
104 |                 result = func(*args, **kwargs)
105 | 
106 |             return result
107 | 
108 |         return wrapper
109 | 
110 |     return storage_cache_decorator
111 | 
112 | 
113 | if __name__ == '__main__':
114 |     text = "There are many things you can do to learn how to run faster, Mr. Wan, such as incorporating speed workouts into your running schedule, running hills, counting your strides, and adjusting your running form. Lean forward when you run and push off firmly with each foot. Pump your arms actively and keep your elbows bent at a 90-degree angle. Try to run every day, and gradually increase the distance you run for long-distance runs. Make sure you rest at least one day per week to allow your body to recover. Avoid running with excess gear that could slow you down."
115 |     sentences = split_sentences_from_paragraph(text)
116 |     print(len(sentences))
117 |     print(sentences)
118 | 


--------------------------------------------------------------------------------
/src/config/config.yaml:
--------------------------------------------------------------------------------
 1 | general:
 2 |   language: en-US # It will determine bing search market and LLM prompt language. en-US / zh-CN / it-IT / fr-FR / de-DE / es-ES / ja-JP / ko-KR / pt-BR / ru-RU / etc.
 3 | source_service:
 4 |   is_use_source: true # grounded or not grounded. If not grounded, meaning just Q&A via LLM
 5 |   is_enable_bing_search: true
 6 |   is_enable_doc_search: false
 7 |   doc_search_path:
 8 |   bing_search:
 9 |     end_point: https://api.bing.microsoft.com
10 |     subscription_key:
11 |     result_count: 3
12 |     sentence_count_per_site: 20
13 |     text_extract: trafilatura # beautifulsoup / trafilatura
14 | llm_service:
15 |   provider: openai # openai/goose_ai
16 |   openai_api:
17 |     api_key:
18 |     # model: gpt-3.5-turbo is the best one. Details: https://platform.openai.com/docs/models/gpt-3-5.
19 |     # model: gpt-4 is in limited preview
20 |     # model: text-babbage-001
21 |     # model: text-curie-001
22 |     # model: text-davinci-003
23 |     model: gpt-3.5-turbo # default
24 |     max_tokens: 300
25 |     temperature: 1
26 |     top_p: 1
27 |     prompt:
28 |       prompt_token_limit: 1500
29 |     stream: true
30 |   goose_ai_api:
31 |     api_key:
32 |     api_base: https://api.goose.ai/v1
33 |     # https://goose.ai/docs/models
34 |     model: gpt-neo-20b
35 |     max_tokens: 100
36 | cache:  # .cache result for efficiency and consistency
37 |   is_enable:
38 |     web: true
39 |     bing_search_website: false
40 |     bing_search_website_content: false
41 |     openai: false
42 |     gooseai: false
43 |   path: .cache
44 |   max_number_of_cache: 50
45 | frontend_service:
46 |   prompt_examples:
47 |     col1_list:
48 |       - Alpaca lora
49 |       - what is new for gpt4?
50 |       - Why Llama LLM model is so popular?
51 |       - Why did SVB collapsed?
52 |       - End of FTX
53 |     col2_list:
54 |       - digital twin有哪些用处
55 |       - 아가동산사건의 문제가 뭐야
56 |       - Hoe maak ik pasta
57 |       - 日本国憲法は誰が作ったのか？
58 |       - Comment gagner de l'argent
59 | 
60 | 


--------------------------------------------------------------------------------
/src/flask_app.py:
--------------------------------------------------------------------------------
1 | from website import create_app
2 | 
3 | app = create_app()
4 | 
5 | if __name__ == '__main__':
6 |     app.run(debug=True)
7 | 


--------------------------------------------------------------------------------
/src/gradio_app.py:
--------------------------------------------------------------------------------
 1 | import gradio as gr
 2 | 
 3 | from SearchGPTService import SearchGPTService
 4 | 
 5 | 
 6 | def query_and_get_answer(search_text):
 7 |     search_gpt_service = SearchGPTService()
 8 |     response_text, source_text, data_json = search_gpt_service.query_and_get_answer(search_text)
 9 |     return response_text, source_text
10 | 
11 | 
12 | demo = gr.Interface(fn=query_and_get_answer,
13 |                     inputs=gr.Textbox(placeholder="What is chatgpt"),
14 |                     outputs=["text", "text", "text"])
15 | demo.launch()
16 | 


--------------------------------------------------------------------------------
/src/main.py:
--------------------------------------------------------------------------------
 1 | from SearchGPTService import SearchGPTService
 2 | 
 3 | if __name__ == '__main__':
 4 |     search_text = 'the source of dark energy'
 5 | 
 6 |     search_gpt_service = SearchGPTService()
 7 |     response_text, source_text, data_json = search_gpt_service.query_and_get_answer(search_text=search_text)
 8 |     print()
 9 | 
10 | 


--------------------------------------------------------------------------------
/src/text_extract/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelthwan/searchGPT/f0adbd2eb5456c560a15883cd9befc82d385f3be/src/text_extract/__init__.py


--------------------------------------------------------------------------------
/src/text_extract/doc/__init__.py:
--------------------------------------------------------------------------------
 1 | from .docx_svc import docx_extract_svc
 2 | from .ppt_svc import ppt_extract_svc
 3 | 
 4 | support_doc_type = ['doc', 'docx', 'ppt', 'pptx']
 5 | doc_extract_svc_map = {
 6 |     'doc': docx_extract_svc,
 7 |     'docx': docx_extract_svc,
 8 |     'ppt': ppt_extract_svc,
 9 |     'pptx': ppt_extract_svc
10 | }
11 | 


--------------------------------------------------------------------------------
/src/text_extract/doc/abc_doc_extract.py:
--------------------------------------------------------------------------------
 1 | import abc
 2 | 
 3 | 
 4 | class AbstractDocExtractSvc(abc.ABC):
 5 |     def __init__(self):
 6 |         pass
 7 | 
 8 |     @abc.abstractmethod
 9 |     def extract_from_doc(self, path: str):
10 |         pass
11 | 


--------------------------------------------------------------------------------
/src/text_extract/doc/docx_svc.py:
--------------------------------------------------------------------------------
 1 | import docx
 2 | 
 3 | from Util import split_sentences_from_paragraph
 4 | from text_extract.doc.abc_doc_extract import AbstractDocExtractSvc
 5 | 
 6 | 
 7 | class DocxSvc(AbstractDocExtractSvc):
 8 |     def __init__(self):
 9 |         super().__init__()
10 | 
11 |     def extract_from_doc(self, path: str):
12 |         doc_file = docx.Document(path)
13 |         raw_text_list = [paragraph.text for paragraph in doc_file.paragraphs if len(paragraph.text) > 0]
14 |         sentence_list = list()
15 |         for raw_text in raw_text_list:
16 |             sentence_list.extend(split_sentences_from_paragraph(raw_text))
17 | 
18 |         # Remove duplicates
19 |         sentence_list = list(dict.fromkeys(sentence_list))
20 | 
21 |         return sentence_list
22 | 
23 | 
24 | docx_extract_svc = DocxSvc()
25 | 


--------------------------------------------------------------------------------
/src/text_extract/doc/ppt_svc.py:
--------------------------------------------------------------------------------
 1 | import pptx
 2 | from Util import split_sentences_from_paragraph
 3 | 
 4 | from text_extract.doc.abc_doc_extract import AbstractDocExtractSvc
 5 | 
 6 | 
 7 | class PptSvc(AbstractDocExtractSvc):
 8 |     def __init__(self):
 9 |         super().__init__()
10 | 
11 |     def extract_from_doc(self, path: str):
12 |         prs = pptx.Presentation(path)
13 |         sentence_list = list()
14 |         for i, slide in enumerate(prs.slides, start=1):
15 |             for j, shape in enumerate(slide.shapes, start=1):
16 |                 if hasattr(shape, "text"):
17 |                     sentence_list.extend(split_sentences_from_paragraph(shape.text))
18 | 
19 |         # Remove duplicates
20 |         sentence_list = list(dict.fromkeys(sentence_list))
21 | 
22 |         return sentence_list
23 | 
24 | 
25 | ppt_extract_svc = PptSvc()
26 | 


--------------------------------------------------------------------------------
/src/text_extract/html/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelthwan/searchGPT/f0adbd2eb5456c560a15883cd9befc82d385f3be/src/text_extract/html/__init__.py


--------------------------------------------------------------------------------
/src/text_extract/html/abc_html_extract.py:
--------------------------------------------------------------------------------
 1 | import abc
 2 | 
 3 | 
 4 | class AbstractHtmlExtractSvc(abc.ABC):
 5 |     def __init__(self):
 6 |         pass
 7 | 
 8 |     @abc.abstractmethod
 9 |     def extract_from_html(self, text: str):
10 |         pass
11 | 


--------------------------------------------------------------------------------
/src/text_extract/html/beautiful_soup.py:
--------------------------------------------------------------------------------
 1 | from bs4 import BeautifulSoup
 2 | 
 3 | from text_extract.html.abc_html_extract import AbstractHtmlExtractSvc
 4 | 
 5 | 
 6 | class BeautifulSoupSvc(AbstractHtmlExtractSvc):
 7 |     def __init__(self):
 8 |         super().__init__()
 9 | 
10 |     def extract_from_html(self, html_str: str):
11 |         soup = BeautifulSoup(html_str, "html.parser")
12 |         return [el.get_text() for el in soup.select('p')]
13 | 


--------------------------------------------------------------------------------
/src/text_extract/html/trafilatura.py:
--------------------------------------------------------------------------------
 1 | from trafilatura import bare_extraction
 2 | from trafilatura.meta import reset_caches
 3 | 
 4 | from text_extract.html.abc_html_extract import AbstractHtmlExtractSvc
 5 | 
 6 | 
 7 | class TrafilaturaSvc(AbstractHtmlExtractSvc):
 8 |     def __init__(self):
 9 |         super().__init__()
10 | 
11 |     def extract_from_html(self, html_str: str):
12 |         extract = bare_extraction(html_str, favor_precision=True)
13 |         # reset_caches()
14 |         try:
15 |             return extract['text'].split("\n")
16 |         except:
17 |             return []
18 | 


--------------------------------------------------------------------------------
/src/website/__init__.py:
--------------------------------------------------------------------------------
 1 | from flask import Flask
 2 | 
 3 | 
 4 | def create_app():
 5 |     app = Flask(__name__)
 6 |     app.config['SECRET_KEY'] = 'secret_key_xyz'
 7 | 
 8 |     from .views import views
 9 |     app.register_blueprint(views, url_prefix='/')
10 | 
11 |     return app
12 | 


--------------------------------------------------------------------------------
/src/website/sender.py:
--------------------------------------------------------------------------------
 1 | from flask import render_template
 2 | 
 3 | MSG_TYPE_SEARCH_STEP = 'search-step'
 4 | MSG_TYPE_OPEN_AI_STREAM = 'openai-stream'
 5 | 
 6 | # global var to store progress. Native polling 'socket'
 7 | exporting_progress = {}
 8 | 
 9 | 
10 | class Sender:
11 |     def __init__(self, request_id: str):
12 |         self.request_id = request_id
13 |         self.received_step_events = []
14 |         self.openai_stream = ''
15 |         self.search_result_step_html = ''
16 | 
17 |     def send_message(self, msg_type, msg: str):
18 |         if msg_type == MSG_TYPE_SEARCH_STEP:
19 |             self.received_step_events.append(msg)
20 |             self.search_result_step_html = render_template('search_result_step.html',
21 |                                                            search_result_step_json=[{'msg': received_msg} for received_msg in self.received_step_events])
22 |         elif msg_type == MSG_TYPE_OPEN_AI_STREAM:
23 |             self.openai_stream += msg
24 |         else:
25 |             pass
26 |         global exporting_progress
27 |         exporting_progress[self.request_id] = {'html': self.search_result_step_html,
28 |                                                'openai_stream': self.openai_stream}
29 | 


--------------------------------------------------------------------------------
/src/website/static/index.js:
--------------------------------------------------------------------------------
 1 | $(document).ready(function () {
 2 |     let refresh_progress = function () {
 3 |         let status = $('#status').val()
 4 |         if (status === 'done' || status === 'error') {
 5 |             return;
 6 |         }
 7 |         $.get("/progress",
 8 |             {request_id: $('#request_id').val()},
 9 |             function (data, status) {
10 |                 if (status === 'success') {
11 |                     $('#search-result-step').html(data.html);
12 |                     $('#result-text')[0].innerText = data.openai_stream;
13 |                 }
14 |             }
15 |         );
16 |     }
17 | 
18 |     let submit_search = function (is_poll, event) {
19 |         if (event) {
20 |             event.preventDefault();
21 |         }
22 |         let search_text = $('#form1').val();
23 |         $('#search-btn')[0].disabled = true;
24 |         $('#status').val('processing');
25 |         $('#search-result-spinner').addClass('d-flex');
26 |         // $('#search-results').hide();
27 |         $('#search_text')[0].innerText = search_text;
28 |         $('#search_result_sources')[0].innerText = '';
29 |         $('#explain_results').hide();
30 |         $.ajax({
31 |             url: '/search',
32 |             type: 'POST',
33 |             data: {
34 |                 q: search_text,
35 |                 request_id: $('#request_id').val(),
36 |                 bing_search_subscription_key: $('#bing_search_subscription_key').val(),
37 |                 openai_api_key: $('#openai_api_key').val(),
38 |                 is_use_source: $('input[name="is_use_source"]')[0].checked,
39 |                 llm_service_provider: $('#llm_service_provider').val(),
40 |                 llm_model: $('#llm_model').val(),
41 |                 language: $('#language').val()
42 |             },
43 |             success: function (response) {
44 |                 $('#' + response.id).html(response.html)
45 |                 $('#explain_results').html(response.explain_html)
46 |                 $('#request_id_status_html').html(response.request_id_status_html)
47 |                 $('#search-btn')[0].disabled = false;
48 |                 $('#search-result-spinner').removeClass('d-flex');
49 |                 $('#search-results').show();
50 |                 $('#explain_results').show();
51 |             },
52 |             error: function (error) {
53 |                 console.log(error)
54 |                 $('#explain_results').html(response.explain_html)
55 |                 $('#request_id_status_html').html(response.request_id_status_html)
56 |                 $('#search-btn')[0].disabled = false;
57 |                 $('#search-result-spinner').removeClass('d-flex');
58 |                 $('#search-results').show();
59 |                 $('#explain_results').show();
60 |             }
61 |         })
62 | 
63 |         // call 10 times progress each sec
64 |         if (is_poll) {
65 |             CALL_TIMES = 15; // 2 sec for 30 sec
66 |             for (let i = 1; i < CALL_TIMES + 1; i++) {
67 |                 setTimeout(refresh_progress, 2000 * i);
68 |             }
69 |         }
70 | 
71 |     }
72 | 
73 |     $('.prompt-ex-btn').click(function () {
74 |         $('#form1').val($(this).text())
75 |         submit_search(false, null);
76 |     });
77 | 
78 |     $('form').submit(function (event) {
79 |         submit_search(true, event);
80 |     })
81 | })


--------------------------------------------------------------------------------
/src/website/templates/alert_box.html:
--------------------------------------------------------------------------------
1 | {% if error %}
2 | <div class="alert alert-danger alert-dismissible fade show" role="alert">
3 |     <strong>Encountered error</strong> {{ error }}
4 |     <button type="button" class="btn-close" data-bs-dismiss="alert" aria-label="Close"></button>
5 | </div>
6 | {% endif %}


--------------------------------------------------------------------------------
/src/website/templates/base.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |     <meta charset="utf-8"/>
 5 |     <meta name="viewport" content="width=device-width, initial-scale=1"/>
 6 |     <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0-alpha1/dist/css/bootstrap.min.css" rel="stylesheet"
 7 |           integrity="sha384-GLhlTQ8iRABdZLl6O3oVMWSktQOp6b7In1Zl3/Jr59b6EGGoI1aFkw7cmDA6j6gD" crossorigin="anonymous">
 8 |     <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.7.0/css/font-awesome.min.css">
 9 |     <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Noto+Sans:400,700&display=swap">
10 |     <style>
11 |         body, .container {
12 |             font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol";
13 |         }
14 | 
15 |         .list-group {
16 |             font-size: 0.8rem;
17 |         }
18 |     </style>
19 | 
20 |     <title>{% block title %}SearchGPT{% endblock %}</title>
21 | </head>
22 | <body>
23 | <nav class="navbar navbar-expand-lg bg-body-tertiary">
24 |     <div class="container-fluid">
25 |         <a class="navbar-brand" href="#">SearchGPT</a>
26 |     </div>
27 | </nav>
28 | <div class="container">{% block content %} {% endblock %}</div>
29 | 
30 | <footer class="bg-light p-3">
31 |     <div class="container text-center">
32 |         <div class="row">
33 |             <div class="col-lg-12">
34 |                 <p class="text-muted small mb-0">SearchGPT 20230323 Version (
35 |                     <a href="https://github.com/michaelthwan/searchGPT"><i class="fa fa-github"></i> Github</a>
36 |                     ). Your feedback will help us to improve</p>
37 |             </div>
38 |         </div>
39 |     </div>
40 | </footer>
41 | <script src="https://cdn.jsdelivr.net/npm/@popperjs/core@2.11.6/dist/umd/popper.min.js"
42 |         integrity="sha384-oBqDVmMz9ATKxIep9tiCxS/Z9fNfEXiDAYTujMAeBAsjFuCZSmKbSSUnQlmh/jp3"
43 |         crossorigin="anonymous"></script>
44 | <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0-alpha1/dist/js/bootstrap.min.js"
45 |         integrity="sha384-mQ93GR66B00ZXjt0YO5KlohRA5SY2XofN4zfuZxLkoj1gXtW8ANNCe9d5Y3eG5eD"
46 |         crossorigin="anonymous"></script>
47 | <script src="https://code.jquery.com/jquery-3.2.1.min.js"
48 |         integrity="sha384-xBuQ/xzmlsLoJpyjoggmTEz8OWUFM0/RC5BsqQBDX2v5cMvDHcMakNTNrHIW2I5f"
49 |         crossorigin="anonymous"></script>
50 | <script type="text/javascript" src="{{ url_for('static', filename='index.js') }}"></script>
51 | </body>
52 | </html>
53 | 


--------------------------------------------------------------------------------
/src/website/templates/explain_result.html:
--------------------------------------------------------------------------------
 1 | <div class="accordion" id="accordionExample">
 2 |     <div class="accordion-item">
 3 |         <h2 class="accordion-header" id="headingOne">
 4 |             <button class="accordion-button" type="button" data-bs-toggle="collapse"
 5 |                     data-bs-target="#collapseOne" aria-expanded="true" aria-controls="collapseOne">
 6 |                 Click to expand Explainability 👆
 7 |             </button>
 8 |         </h2>
 9 |         <div id="collapseOne" class="accordion-collapse collapse" aria-labelledby="headingOne"
10 |              data-bs-parent="#accordionExample">
11 |             <div class="accordion-body">
12 |                 <div class="row">
13 |                     <div class="col-md-4">
14 |                         <h2>{{search_text}}</h2>
15 |                         <p>
16 |                             {% for item in response_explain_json %}
17 |                             {% if item['type'] == 'newline' %}
18 |                             <br>
19 |                             {% elif item['type'] == 'keyword' %}
20 |                             <span style="background-color: {{item['color']}}">{{ item['text'] }}</span>
21 |                             {% else %}
22 |                             <span>{{ item['text'] }}</span>
23 |                             {% endif %}
24 |                             {% endfor %}
25 |                         </p>
26 |                     </div>
27 |                     <div class="col-md-8">
28 |                         <p style="font-size: 0.8rem">
29 |                             {% for item in source_explain_json %}
30 |                             {% if item['type'] == 'newline' %}
31 |                             <br>
32 |                             {% elif item['type'] == 'keyword' %}
33 |                             <span style="background-color: {{item['color']}}">{{ item['text'] }}</span>
34 |                             {% else %}
35 |                             <span>{{ item['text'] }}</span>
36 |                             {% endif %}
37 |                             {% endfor %}
38 |                         </p>
39 |                     </div>
40 |                 </div>
41 |             </div>
42 |         </div>
43 |     </div>
44 | </div>


--------------------------------------------------------------------------------
/src/website/templates/index.html:
--------------------------------------------------------------------------------
  1 | {% extends "base.html" %} {% block title %}SearchGPT{% endblock %}
  2 | {% block content %}
  3 | <div class="container mt-4">
  4 |     <div id="alert-box">
  5 |         {% include 'alert_box.html' %}
  6 |     </div>
  7 |     <div>
  8 |         <div id="prompt_examples">
  9 |             {% include 'prompt_examples.html' %}
 10 |         </div>
 11 |         <hr>
 12 |         <form action="/search" method="post">
 13 |             <div class="input-group">
 14 |                 <input type="search" id="form1" class="form-control"
 15 |                        placeholder="Ask: e.g. What is ChatGPT"
 16 |                        name="q" value="{{ request.args.get('q', '') }}"
 17 |                        minlength="5" maxlength="200" required/>
 18 |                 <button type="submit" class="btn btn-primary" id="search-btn"><i class="fa fa-search"></i></button>
 19 |             </div>
 20 |             <div class="row mt-4">
 21 |                 <div class="col-md-6">
 22 |                     <div class="form-group">
 23 |                         <label for="bing_search_subscription_key">Bing Search Subscription Key</label>
 24 |                         <input type="text" class="form-control" id="bing_search_subscription_key"
 25 |                                placeholder="Enter key"
 26 |                                name="bing_search_subscription_key"
 27 |                                value="{{ request.args.get('bing_search_subscription_key', '') }}">
 28 |                     </div>
 29 |                     <div class="form-group">
 30 |                         <label for="openai_api_key">OpenAI API Key</label>
 31 |                         <input type="text" class="form-control" id="openai_api_key" placeholder="Enter key"
 32 |                                name="openai_api_key" value="{{ request.args.get('openai_api_key', '') }}">
 33 |                     </div>
 34 |                     <div id="request_id_status_html" style="font-size: 4px; color: #f7f7f7;" >
 35 |                         {% include 'request_id_status_html.html' %}
 36 |                     </div>
 37 |                     <div style="font-size: 0.6rem; color: #4f71a8">
 38 |                         Note: <br>
 39 |                         - Calls took about 15 seconds to fully complete normally. <br>
 40 |                              (~5 sec for Bing Search, ~10 sec for OpenAI) <br>
 41 |                         - Sometimes the API is not stable. If it exceeded 30 seconds, please try again. <br>
 42 |                         - Current source filtering: first 20 lines for each websites, then get the best results and trimmed to 1500 tokens <br>
 43 |                         - Non-English language is experimental. It will somehow work.
 44 |                     </div>
 45 |                 </div>
 46 |                 <div class="col-md-6">
 47 |                     <div class="form-group">
 48 |                         <label>Use source/grounded search?</label>
 49 |                         <div class="form-check">
 50 |                             {% if request.args.get('is_use_source', 'true') != 'False' %}
 51 |                             <input class="form-check-input" type="checkbox" name="is_use_source" checked>
 52 |                             {% else %}
 53 |                             <input class="form-check-input" type="checkbox" name="is_use_source">
 54 |                             {% endif %}
 55 |                             <label class="form-check-label">
 56 |                                 Check to enable
 57 |                             </label>
 58 |                         </div>
 59 |                         <input type="hidden" name="is_use_source" value="False">
 60 |                     </div>
 61 |                     <div class="form-group">
 62 |                         <label for="llm_service_provider">LLM Service Provider</label>
 63 |                         <select class="form-control" id="llm_service_provider">
 64 |                             <option value="openai">OpenAI</option>
 65 |                         </select>
 66 |                     </div>
 67 |                     <div class="form-group">
 68 |                         <label for="llm_model">LLM Model</label>
 69 |                         <select class="form-control" id="llm_model" name="llm_model">
 70 |                             <option value="gpt-3.5-turbo">gpt-3.5-turbo (Best model that supporting ChatGPT)</option>
 71 |                             <option value="gpt-4">gpt-4 (GPT4 is in Limited preview)</option>
 72 |                             <option value="text-davinci-003"
 73 |                                     {% if request.args.get('llm_model') == 'text-davinci-003' %} selected {% endif %}
 74 |                             >text-davinci-003</option>
 75 |                             <option value="text-curie-001"
 76 |                                     {% if request.args.get('llm_model') == 'text-curie-001' %} selected {% endif %}
 77 |                             >text-curie-001</option>
 78 |                         </select>
 79 |                     </div>
 80 |                     <div class="form-group">
 81 |                         <label for="language">Language<div style="display: inline; font-size: 0.8rem"> (Control bing search region and LLM response)</div></label>
 82 |                         <select class="form-control" id="language" name="language">
 83 |                             <option value="en-US">en-US English</option>
 84 |                             <option value="zh-CN" {% if request.args.get('language') == 'zh-CN' %} selected {% endif %}>zh-CN Chinese 中文</option>
 85 |                             <option value="it-IT" {% if request.args.get('language') == 'it-IT' %} selected {% endif %}>it-IT Italian Italiano</option>
 86 |                             <option value="fr-FR" {% if request.args.get('language') == 'fr-FR' %} selected {% endif %}>fr-FR French Français</option>
 87 |                             <option value="de-DE" {% if request.args.get('language') == 'de-DE' %} selected {% endif %}>de-DE German Deutsch</option>
 88 |                             <option value="es-ES" {% if request.args.get('language') == 'es-ES' %} selected {% endif %}>es-ES Spanish Español</option>
 89 |                             <option value="ja-JP" {% if request.args.get('language') == 'ja-JP' %} selected {% endif %}>ja-JP Japanese 日本語</option>
 90 |                             <option value="ko-KR" {% if request.args.get('language') == 'ko-KR' %} selected {% endif %}>ko-KR Korean 한국어</option>
 91 |                             <option value="ru-RU" {% if request.args.get('language') == 'ru-RU' %} selected {% endif %}>ru-RU Russian Русский</option>
 92 |                         </select>
 93 |                     </div>
 94 |                 </div>
 95 |             </div>
 96 |         </form>
 97 |     </div>
 98 |     <hr>
 99 |     <div class="container mt-4">
100 |         <div id="search-result-spinner-div" style="height: 36px">
101 |             <div class="justify-content-center" id="search-result-spinner" hidden>
102 |                 <div class="spinner-border" role="status">
103 |                     <span class="sr-only">Loading...</span>
104 |                 </div>
105 |             </div>
106 |         </div>
107 |         <div id="search-result-step">
108 |             {% include 'search_result_step.html' %}
109 |         </div>
110 |         <div id="search-results">
111 |             {% include 'search_result.html' %}
112 |         </div>
113 |         <hr>
114 |         <div id="explain_results">
115 |             {% include 'explain_result.html' %}
116 |         </div>
117 | 
118 |     </div>
119 |     <hr>
120 | </div>
121 | 
122 | 
123 | {% endblock %}
124 | 


--------------------------------------------------------------------------------
/src/website/templates/index_static.html:
--------------------------------------------------------------------------------
  1 | {% extends "base.html" %} {% block title %}Home{% endblock %}
  2 | {% block content %}
  3 |     <div class="container mt-4">
  4 |         <div class="alert alert-danger alert-dismissible fade show" role="alert">
  5 |             <strong>Encountered error!</strong> {{ "error message." }}
  6 |             <button type="button" class="btn-close" data-bs-dismiss="alert" aria-label="Close"></button>
  7 |         </div>
  8 |         <div>
  9 |             <form action="#" method="get">
 10 |                 <div class="input-group">
 11 |                     <input type="search" id="form1" class="form-control" placeholder="What is ChatGPT"/>
 12 |                     <button type="button" class="btn btn-primary">
 13 |                         <i class="fa fa-search"></i>
 14 |                     </button>
 15 |                 </div>
 16 |             </form>
 17 |         </div>
 18 |         <hr>
 19 |         <div class="container mt-4">
 20 |             <div class="row">
 21 |                 <div class="col-md-6">
 22 |                     <h2>what is chatgpt</h2>
 23 |                     <p>ChatGPT is a computer program that generates text responses in the form of a conversation. It was
 24 |                         created by OpenAI in 2018, and it is free to use for anyone with an account on their website.
 25 |                         [5] ChatGPT is designed to generate human-like responses, but it is not always
 26 |                         accurate. Users can upvote or downvote the responses they receive, and OpenAI gathers data from
 27 |                         users to further train and fine-tune ChatGPT. [4].</p>
 28 |                 </div>
 29 |                 <div class="col-md-6">
 30 |                     <ul class="list-group">
 31 |                         <li class="list-group-item">
 32 |                             <h5 class="mb-1">
 33 |                                 <img src="https://www.google.com/s2/favicons?domain=zdnet.com" alt="Favicon">
 34 |                                 <a href="https://www.zdnet.com/article/what-is-chatgpt-and-why-does-it-matter-heres-everything-you-need-to-know/">[1]
 35 |                                     zdnet.com</a>
 36 |                             </h5>
 37 |                             <h4 class="mb-1" style="font-family: 'Playfair Display', Georgia">What is ChatGPT and why
 38 |                                 does it matter? Here's everything you need to know</h4>
 39 |                             <p class="mb-1"> The launch of a paid version had been rumored for some time before the
 40 |                                 official release. In January, OpenAI announced on its Discord server that it was
 41 |                                 considering to start charging for ChatGPT with a version called ChatGPT
 42 |                                 Professional.</p>
 43 |                         </li>
 44 |                         <li class="list-group-item">
 45 |                             <h5 class="mb-1">
 46 |                                 <img src="https://www.google.com/s2/favicons?domain=openai.com" alt="Favicon">
 47 |                                 <a href="https://openai.com/blog/chatgpt/">[2] openai.com</a>
 48 |                             </h5>
 49 |                             <h4 class="mb-1" style="font-family: 'Playfair Display', Georgia">ChatGPT: Optimizing
 50 |                                 Language Models for Dialogue</h4>
 51 |                             <p class="mb-1">We are excited to introduce ChatGPT to get users' feedback and learn about
 52 |                                 its strengths and weaknesses. During the research preview, usage of ChatGPT is free. Try
 53 |                                 it now at chat.openai.com.</p>
 54 |                         </li>
 55 |                         <li class="list-group-item">
 56 |                             <h5 class="mb-1">[3] xxx.com</h5>
 57 |                             <h4 class="mb-1" style="font-family: 'Playfair Display', Georgia">xxx title xxx</h4>
 58 |                             <p class="mb-1">Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec varius elit
 59 |                                 lectus, vitae faucibus mi egestas in. Nulla</p>
 60 |                         </li>
 61 |                         <li class="list-group-item">
 62 |                             <h5 class="mb-1">[4] xxx.com</h5>
 63 |                             <h4 class="mb-1" style="font-family: 'Playfair Display', Georgia">xxx title xxx</h4>
 64 |                             <p class="mb-1">Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec varius elit
 65 |                                 lectus, vitae faucibus mi egestas in. Nulla</p>
 66 |                         </li>
 67 |                     </ul>
 68 |                 </div>
 69 |             </div>
 70 |             <div class="accordion" id="accordionExample">
 71 |                 <div class="accordion-item">
 72 |                     <h2 class="accordion-header" id="headingOne">
 73 |                         <button class="accordion-button" type="button" data-bs-toggle="collapse"
 74 |                                 data-bs-target="#collapseOne" aria-expanded="true" aria-controls="collapseOne">
 75 |                             Click to expand Explainability 👆
 76 |                         </button>
 77 |                     </h2>
 78 |                     <div id="collapseOne" class="accordion-collapse collapse" aria-labelledby="headingOne"
 79 |                          data-bs-parent="#accordionExample">
 80 |                         <div class="accordion-body">
 81 |                             <div class="row">
 82 |                                 <div class="col-md-4">
 83 |                                     <p>What is ChatGPT.</p>
 84 |                                 </div>
 85 |                                 <div class="col-md-8">
 86 |                                     <p>ChatGPT is a computer program that generates text responses in the form of a
 87 |                                         conversation. It was created by OpenAI in 2018, and it is free to use for anyone
 88 |                                         with an account on their website. [5] ChatGPT is designed to generate human-like
 89 |                                         responses, but it is not always
 90 |                                         accurate. Users can upvote or downvote the responses they receive, and OpenAI
 91 |                                         gathers data from users to further train and fine-tune ChatGPT.</p>
 92 |                                 </div>
 93 |                             </div>
 94 |                         </div>
 95 |                     </div>
 96 |                 </div>
 97 |             </div>
 98 |         </div>
 99 |         <hr>
100 |     </div>
101 | 
102 | {% endblock %}
103 | 


--------------------------------------------------------------------------------
/src/website/templates/prompt_examples.html:
--------------------------------------------------------------------------------
 1 | <div class="accordion" id="explain-result">
 2 |     <div class="accordion-item">
 3 |         <h2 class="accordion-header" id="headingOne">
 4 |             <button class="accordion-button" type="button" data-bs-toggle="collapse"
 5 |                     data-bs-target="#collapsePrompt" aria-expanded="true" aria-controls="collapsePrompt">
 6 |                 Click to obtain examples of prompt 👆
 7 |             </button>
 8 |         </h2>
 9 |         <div id="collapsePrompt" class="accordion-collapse collapse" aria-labelledby="headingOne"
10 |              data-bs-parent="#accordionExample">
11 |             <div class="accordion-body">
12 |                 <div class="row">
13 |                     <div class="col-md-4">
14 |                         <div class="d-grid gap-2 mx-auto">
15 |                             <div style="font-weight: bold">Prompts that SearchGPT > ChatGPT</div>
16 |                             {% for prompt in prompt_examples_json.col1_list %}
17 |                             <button class="btn btn-light prompt-ex-btn" type="button">{{prompt}}</button>
18 |                             {% endfor %}
19 |                         </div>
20 |                     </div>
21 |                     <div class="col-md-4">
22 |                         <div class="d-grid gap-2 mx-auto">
23 |                             <div style="font-weight: bold">Multi-language experiments</div>
24 |                             {% for prompt in prompt_examples_json.col2_list %}
25 |                             <button class="btn btn-light prompt-ex-btn" type="button">{{prompt}}</button>
26 |                             {% endfor %}
27 |                         </div>
28 |                     </div>
29 |                     <div class="col-md-4">
30 |                         <div class="d-grid gap-2 mx-auto">
31 |                             <div style="font-weight: bold">Users' last 5 queries (coming soon!)</div>
32 |                         </div>
33 |                     </div>
34 |                 </div>
35 |             </div>
36 |         </div>
37 |     </div>
38 | </div>


--------------------------------------------------------------------------------
/src/website/templates/request_id_status_html.html:
--------------------------------------------------------------------------------
1 | <div class="form-group">
2 |     <label>Request Id</label>
3 |     <input type="text" id="request_id" style="font-size: 4px; color: #f7f7f7; border: #f7f7f7;" value="{{request_id}}"></input>
4 | </div>
5 | <div class="form-group">
6 |     <label>Status</label>
7 |     <input type="text" id="status" style="font-size: 4px; color: #f7f7f7; border: #f7f7f7;" value="{{status}}"></input>
8 | </div>
9 | 


--------------------------------------------------------------------------------
/src/website/templates/search_result.html:
--------------------------------------------------------------------------------
 1 | <div class="row">
 2 |     <div class="col-md-6">
 3 |         <h2 id="search_text">{{search_text}}</h2>
 4 |         <p id="result-text">
 5 |             {% for item in response_json %}
 6 |             {% if item['type'] == 'newline' %}
 7 |             <br>
 8 |             {% elif item['type'] == 'footnote' %}
 9 |             <span style="font-weight: bold; font-size: 0.8rem; background-color: #88effc;">{{ item['text'] }}</span>
10 |             {% else %}
11 |             {{ item['text'] }}
12 |             {% endif %}
13 |             {% endfor %}
14 |         </p>
15 |     </div>
16 |     <div class="col-md-6">
17 |         <ul id="search_result_sources" class="list-group">
18 |             {% for item in source_json %}
19 |             <li class="list-group-item">
20 |                 <h5 class="mb-1">
21 |                     <img src="https://www.google.com/s2/favicons?domain={{item['domain']}}" alt="Favicon">
22 |                     <a href="{{item['url']}}">{{item['footnote']}} {{item['domain']}}</a>
23 |                 </h5>
24 |                 <h4 class="mb-1" style="font-family: 'Playfair Display', Georgia">{{item['title']}}</h4>
25 |                 <p class="mb-1">{{item['text']}}</p>
26 |             </li>
27 |             {% endfor %}
28 |         </ul>
29 |     </div>
30 | </div>


--------------------------------------------------------------------------------
/src/website/templates/search_result_step.html:
--------------------------------------------------------------------------------
 1 | {% for item in search_result_step_json %}
 2 | <div class="row">
 3 |     <span class="cardSpan">
 4 |         <span class="icon">
 5 |             <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24">
 6 |               <path fill="#4CAF50" d="M9 16.17l-4.17-4.17-1.41 1.41 5.58 5.58 12-12-1.41-1.41z"/>
 7 |             </svg>
 8 |         </span>
 9 |         <span class="text">{{ item['msg'] }}</span>
10 |     </span>
11 | </div>
12 | {% endfor %}


--------------------------------------------------------------------------------
/src/website/views.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import random
  3 | import string
  4 | import tracemalloc
  5 | 
  6 | import psutil
  7 | from flask import Blueprint, render_template, request
  8 | 
  9 | from SearchGPTService import SearchGPTService
 10 | from FrontendService import FrontendService
 11 | from Util import setup_logger
 12 | from website.sender import exporting_progress, Sender
 13 | 
 14 | logger = setup_logger('Views')
 15 | views = Blueprint('views', __name__)
 16 | 
 17 | process = psutil.Process(os.getpid())
 18 | tracemalloc.start()
 19 | memory_snapshot = None
 20 | 
 21 | 
 22 | @views.route('/', methods=['GET'])
 23 | @views.route('/index', methods=['GET'])
 24 | def start_page():
 25 |     request_id = ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(16))
 26 | 
 27 |     data_json = {'response_json': [], 'source_json': [], 'response_explain_json': [], 'source_explain_json': [],
 28 |                  'prompt_examples_json': FrontendService.get_prompt_examples_json()}
 29 |     return render_template("index.html",
 30 |                            search_text='' or "Please search for something.",
 31 |                            response_json=data_json.get('response_json'),
 32 |                            source_json=data_json.get('source_json'),
 33 |                            response_explain_json=data_json.get('response_explain_json'),
 34 |                            source_explain_json=data_json.get('source_explain_json'),
 35 |                            prompt_examples_json=data_json.get('prompt_examples_json'),
 36 |                            request_id=request_id, status="init",
 37 |                            error=None
 38 |                            )
 39 | 
 40 | 
 41 | @views.route('/search', methods=['POST'])
 42 | def index_page():
 43 |     error = None
 44 |     data_json = {'response_json': [], 'source_json': []}
 45 |     request_id = request.values.get('request_id')
 46 |     search_text = request.values.get('q')
 47 | 
 48 |     try:
 49 |         ui_overriden_config = {
 50 |             'bing_search_subscription_key': request.values.get('bing_search_subscription_key'),
 51 |             'openai_api_key': request.values.get('openai_api_key'),
 52 |             'is_use_source': request.values.get('is_use_source'),
 53 |             'llm_service_provider': request.values.get('llm_service_provider'),
 54 |             'llm_model': request.values.get('llm_model'),
 55 |             'language': request.values.get('language'),
 56 |         }
 57 |         logger.info(f"GET ui_overriden_config: {ui_overriden_config}")
 58 | 
 59 |         if search_text is not None:
 60 |             sender = Sender(request_id) if request_id is not None and request_id != "" else None
 61 |             search_gpt_service = SearchGPTService(ui_overriden_config, sender)
 62 |             _, _, data_json = search_gpt_service.query_and_get_answer(search_text=search_text)
 63 |     except Exception as e:
 64 |         error = str(e)
 65 | 
 66 |     if error is None:
 67 |         id = 'search-results'
 68 |         result_html = render_template('search_result.html',
 69 |                                       search_text=search_text,
 70 |                                       response_json=data_json.get('response_json'),
 71 |                                       source_json=data_json.get('source_json'),
 72 |                                       )
 73 |         explain_html = render_template('explain_result.html',
 74 |                                        search_text=search_text,
 75 |                                        response_explain_json=data_json.get('response_explain_json'),
 76 |                                        source_explain_json=data_json.get('source_explain_json'),
 77 |                                        )
 78 |         request_id_status_html = render_template('request_id_status_html.html', request_id=request_id, status="done")
 79 |     else:
 80 |         id = 'alert-box'
 81 |         result_html = render_template('alert_box.html', error=error)
 82 |         explain_html = render_template('explain_result.html',
 83 |                                        search_text=search_text,
 84 |                                        response_explain_json=[],
 85 |                                        source_explain_json=[],
 86 |                                        )
 87 |         request_id_status_html = render_template('request_id_status_html.html', request_id=request_id, status="error")
 88 |     return {
 89 |         'id': id,
 90 |         'html': result_html,
 91 |         'explain_html': explain_html,
 92 |         'request_id_status_html': request_id_status_html,
 93 |     }
 94 | 
 95 | 
 96 | @views.route('/progress')
 97 | def progress():
 98 |     request_id = request.values.get('request_id')
 99 |     request_dict = exporting_progress.get(request_id, '')
100 |     return request_dict
101 | 
102 | 
103 | @views.route('/index_static', methods=['GET', 'POST'])
104 | def index_static_page():
105 |     return render_template("index_static.html")
106 | 
107 | 
108 | @views.route("/data", methods=["GET"])
109 | def get_data():
110 |     return {'id': 1, 'test': 'test'}
111 | 
112 | 
113 | @views.route('/memory')
114 | def print_memory():
115 |     return {'memory': process.memory_info().rss}
116 | 
117 | 
118 | @views.route("/snapshot")
119 | def snap():
120 |     global memory_snapshot
121 |     if not memory_snapshot:
122 |         memory_snapshot = tracemalloc.take_snapshot()
123 |         return "taken snapshot\n"
124 |     else:
125 |         lines = []
126 |         memory_snapshot_temp = tracemalloc.take_snapshot()
127 |         top_stats = memory_snapshot_temp.compare_to(memory_snapshot, 'lineno')
128 |         memory_snapshot = memory_snapshot_temp
129 |         for stat in top_stats[:5]:
130 |             lines.append(str(stat))
131 |         return "\n".join(lines)
132 | 


--------------------------------------------------------------------------------