├── static
│   └── favicon.png
├── requirements.txt
├── README.md
├── LICENSE
├── app.py
└── templates
    └── index.html

/static/favicon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LexiestLeszek/sova_ollama/HEAD/static/favicon.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
beautifulsoup4==4.11.1
Flask==3.0.2
google==3.0.0
gunicorn==21.2.0
langchain==0.1.6
langchain_community==0.0.19
ollama==0.1.6
Requests==2.31.0
translators==5.8.9
# Required by the HuggingFace embeddings and Chroma vector store used in app.py:
sentence-transformers
chromadb
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Sova

## Description

Sova is a web search engine that uses the power of Large Language Models (LLMs) to return a full-text answer instead of the list of links that traditional search engines produce.

## Installation

Before installing Sova, make sure Python is installed on your system. You will also need pip, which is typically bundled with Python.

To install Sova, follow these steps:

1. Clone the repository: `git clone https://github.com/LexiestLeszek/sova_ollama.git`
2. Install the required dependencies: `pip install -r requirements.txt`
3. Install Ollama from https://ollama.com/
4. Pull the local LLM (in our case, qwen:1.8b): `ollama pull qwen:1.8b`
5. Start the gunicorn server: `gunicorn -b 0.0.0.0:8080 app:app`
6. Open http://localhost:8080 in your browser and use the search bar to enter your query or question.

(The full command sequence is sketched after the LICENSE section below.)

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2024 Leszek Mielnikow

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
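The README's installation steps condense into the following shell session — a sketch, assuming a Unix-like shell, a working Ollama install, and a free port 8080:

```bash
# Clone the project and install its Python dependencies.
git clone https://github.com/LexiestLeszek/sova_ollama.git
cd sova_ollama
pip install -r requirements.txt

# Pull the small local model that app.py queries (requires Ollama).
ollama pull qwen:1.8b

# Serve the Flask app with gunicorn on all interfaces, port 8080.
gunicorn -b 0.0.0.0:8080 app:app
```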
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
from flask import Flask, render_template, request
from googlesearch import search
import requests
import ollama
from bs4 import BeautifulSoup
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

app = Flask(__name__)

def get_context(question):
    print("Init Search")
    # Take the top five Google results, then drop domains that rarely expose useful article text.
    search_results = search(question, num=5, stop=5, pause=2)
    top_links = list(search_results)
    exclude_list = ["google", "facebook", "twitter", "instagram", "youtube", "tiktok", "kremlin"]
    top_links = [link for link in top_links if not any(domain in link for domain in exclude_list)]
    scraped_texts = []
    for link in top_links:
        print(link)
        try:
            page = requests.get(link)
            soup = BeautifulSoup(page.content, 'html.parser')
            # Keep only paragraph text; navigation and other boilerplate tags are ignored.
            text = ' '.join(p.get_text() for p in soup.find_all('p'))
            scraped_texts.append(text)
        except Exception as e:
            print(f"Failed to scrape the link: {link}\nError: {e}")

    all_scraped_text = '.\n'.join(scraped_texts)

    # Chunk the scraped pages, embed the chunks, and keep the three most relevant to the question.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    documents = text_splitter.split_text(all_scraped_text)
    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

    db = Chroma.from_texts(documents, embedding=embeddings)
    retriever = db.as_retriever(search_kwargs={"k": 3})

    context = retriever.get_relevant_documents(question)

    return context

# Function to generate output using the LLM model
def generate_output(question, context):
    # Prompt template: constrain the model to the retrieved context.
    prompt = f"""Use the following context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Answer only factual information based on the context below.
Context: {context}.\n
Question: {question}\n
Helpful Answer:"""

    response = ollama.chat(model='qwen:1.8b', messages=[
        {
            'role': 'system',
            'content': 'You are a helpful AI assistant that answers based on the information in the user prompt Context and nothing else.',
        },
        {
            'role': 'user',
            'content': prompt,
        },
    ])

    output = response['message']['content']

    # Return the generated text
    return output
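# Example usage of the two helpers above (illustrative; assumes an Ollama
# daemon is running locally and qwen:1.8b has already been pulled):
#
#     ctx = get_context("Who wrote The Master and Margarita?")
#     print(generate_output("Who wrote The Master and Margarita?", ctx))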
@app.route('/', methods=['GET', 'POST'])
def index():
    query = ""
    answer = ""

    if request.method == 'POST':
        query = request.form['query']

        if query:
            # Retrieve web context for the query, then answer from it.
            context = get_context(query)
            answer = generate_output(query, context)
            print(answer)

    return render_template('index.html', query=query, answer=answer)

# if __name__ == '__main__':
#     app.run(debug=True, port="8021")

# To run behind ngrok:
# 1. In the main Mac terminal: "ngrok http 8080"
# 2. Run this app in the IDE terminal: "gunicorn -b 0.0.0.0:8080 app:app"

# Find the process running on a given port:
# lsof -ti tcp:PORT

# Kill the process (replace PID with the actual process ID):
# kill PID
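Once the server is running, the route can be exercised from the command line as well as from the browser. A sketch (the form field name `query` and port 8080 come from app.py and the README; the question text is arbitrary):

```bash
# POST a question the same way the HTML form does.
curl -X POST -d "query=Who wrote The Master and Margarita?" http://localhost:8080/
```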
--------------------------------------------------------------------------------
/templates/index.html:
--------------------------------------------------------------------------------
[171-line HTML template; only the {{ answer }} output placeholder survived extraction. The page renders the search form whose "query" field app.py reads, and displays the generated answer.]
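Nothing beyond the `{{ answer }}` placeholder is known for certain about the template. A minimal sketch compatible with app.py's route — a form posting a `query` field, with the answer echoed back — could look like this; the favicon link, layout, and element choices are all assumptions:

```html
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>Sova</title>
    <!-- Favicon shipped in the repository's static directory. -->
    <link rel="icon" href="{{ url_for('static', filename='favicon.png') }}">
</head>
<body>
    <!-- Search form: posts the "query" field that index() reads. -->
    <form method="post" action="/">
        <input type="text" name="query" value="{{ query }}" placeholder="Ask anything...">
        <button type="submit">Search</button>
    </form>

    <!-- Generated answer, if any. -->
    {% if answer %}
    <p>{{ answer }}</p>
    {% endif %}
</body>
</html>
```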