├── .gitattributes
├── .gitignore
├── README.md
├── detective_test.py
├── main.py
├── modules
│   ├── chatbot.py
│   ├── database
│   │   ├── db.py
│   │   └── sql-murder-mystery.db
│   ├── model.py
│   ├── settings.py
│   └── tools.py
├── requirements.txt
└── translator_test.py

/.gitattributes:
--------------------------------------------------------------------------------
1 | *.ipynb linguist-documentation
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110 | .pdm.toml
111 | .pdm-python
112 | .pdm-build/
113 |
114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115 | __pypackages__/
116 |
117 | # Celery stuff
118 | celerybeat-schedule
119 | celerybeat.pid
120 |
121 | # SageMath parsed files
122 | *.sage.py
123 |
124 | # Environments
125 | .env
126 | .venv
127 | env/
128 | venv/
129 | ENV/
130 | env.bak/
131 | venv.bak/
132 |
133 | # Spyder project settings
134 | .spyderproject
135 | .spyproject
136 |
137 | # Rope project settings
138 | .ropeproject
139 |
140 | # mkdocs documentation
141 | /site
142 |
143 | # mypy
144 | .mypy_cache/
145 | .dmypy.json
146 | dmypy.json
147 |
148 | # Pyre type checker
149 | .pyre/
150 |
151 | # pytype static type analyzer
152 | .pytype/
153 |
154 | # Cython debug symbols
155 | cython_debug/
156 |
157 | # PyCharm
158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160 | # and can be added to the global gitignore or merged into this file. For a more nuclear
161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162 | #.idea/
163 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SQL Detective - A Murder Mystery RAG Game
2 |
3 | SQL Detective is an interactive investigation game where you solve a murder mystery by querying a police database using natural language. This application showcases Retrieval-Augmented Generation (RAG) techniques with local small language models and database integration.
4 |
5 | ## 🔎 Overview
6 |
7 | As a detective in SQL City, you're tasked with solving a murder that occurred on January 15, 2018. What sets this application apart is its approach:
8 |
9 | 1. **Natural Language to SQL Translation**: Ask questions in plain English
10 | 2. **Database Query Execution**: Retrieve specific information from the police database
11 | 3. **Context-Aware Analysis**: Get intelligent insights based on the query results
12 | 4. **Progressive Investigation**: Follow leads from clue to clue to solve the case
13 |
14 | ## 🚀 Features
15 |
16 | - **Conversational Interface**: Interact with the investigation through a chat interface
17 | - **Automatic SQL Translation**: No SQL knowledge required - just ask questions naturally
18 | - **Intelligent Analysis**: The detective AI analyzes query results and suggests next steps
19 | - **Investigation Notes**: Keep track of your findings in the detective's notebook
20 | - **Local Processing**: All processing happens locally with no data sent to external APIs
21 |
22 | ## 🛠️ Technical Stack
23 |
24 | - **Backend**: `Python`
25 | - **Frontend**: `Streamlit`
26 | - **Database**: `SQLite`
27 | - **LLM Platform**: `Ollama`
28 | - **Language Model**: `DeepSeek-R1:1.5b`
29 | - **RAG Implementation**: `LangChain` for orchestration and context management
30 |
31 | ## 📂 Project Structure
32 | ```
33 | Ollama-SQLite-RAG/
34 | │
35 | ├── main.py                    # Application entry point
36 | ├── modules/
37 | │   ├── chatbot.py             # Main conversation handler
38 | │   ├── model.py               # Base classes for language model interactions
39 | │   ├── settings.py            # Configuration settings
40 | │   ├── tools.py               # Utility functions
41 | │   └── database/
42 | │       ├── db.py              # Database connection and query handling
43 | │       └── sql-murder-mystery.db  # SQLite database file
44 | │
45 | ├── detective_test.py          # Test script for detective model
46 | └── translator_test.py         # Test script for translator model
47 | ```
48 | ## 🧠 How It Works
49 |
50 | 1. **User Input**: The user submits a natural language question about the case
51 | 2. **Translation**: The Translator model converts the question to a valid SQL query
52 | 3. **Database Query**: The system executes the SQL query against the police database
53 | 4. **Analysis**: The Detective model analyzes the query results in the context of the question
54 | 5. **Response**: The system presents findings and suggests next investigative steps
55 |
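For illustration, the same five steps can be driven outside Streamlit. A minimal sketch, assuming Ollama is running locally with the model already pulled:

```python
from modules.database.db import execute_sql_query
from modules.model import Detective, Translator

question = "find the murder reported in SQL City on january 15, 2018"
translation = Translator(question)               # steps 1-2: question -> SQL
rows = execute_sql_query(translation.sql_query)  # step 3: run the query
detective = Detective(question, context=rows)    # step 4: analyze the results
print(detective.answer)                          # step 5: findings
print(detective.next_step)
```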
56 | ## 📖 RAG Architecture
57 |
58 | This application implements a complete RAG pipeline:
59 |
60 | 1. **Retrieval**: SQL queries retrieve relevant information from the database
61 | 2. **Augmentation**: The retrieved data augments the context for the language model
62 | 3. **Generation**: The language model generates insights based on the augmented context
63 |
64 | ## 🚦 Getting Started
65 |
66 | ### Prerequisites
67 | - Python 3.8+
68 | - [Ollama](https://ollama.ai/) installed locally
69 | - `DeepSeek-R1:1.5b` model downloaded to Ollama
70 |
71 | ### Installation
72 |
73 | ```bash
74 | # Clone the repository
75 | git clone https://github.com/Ne0bliviscaris/Ollama-SQLite-RAG.git
76 | cd Ollama-SQLite-RAG
77 |
78 | # Create a virtual environment (optional but recommended)
79 | python -m venv venv
80 | source venv/bin/activate  # On Windows: venv\Scripts\activate
81 |
82 | # Install dependencies
83 | pip install -r requirements.txt
84 |
85 | # Download the model (the Ollama service must be running)
86 | ollama pull deepseek-r1:1.5b
87 | # Run the application
88 | python main.py
89 | ```
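Before launching, you can optionally sanity-check that the bundled database is readable, using nothing but the standard library:

```python
import sqlite3

# List the tables shipped in the mystery database
with sqlite3.connect("modules/database/sql-murder-mystery.db") as conn:
    tables = conn.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall()
print([name for (name,) in tables])
```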
90 |
91 | ## 🙏 Acknowledgements
92 |
93 | - [SQL Murder Mystery](https://github.com/NUKnightLab/sql-mysteries) for the original game concept
94 | - [LangChain](https://langchain.com/) for the RAG framework
95 | - [Ollama](https://ollama.ai/) for local language model support
96 | - [DeepSeek](https://www.deepseek.com) for their R1:1.5b model
97 | - [Streamlit](https://streamlit.io/) for the interactive web interface
98 |
--------------------------------------------------------------------------------
/detective_test.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | from modules.model import Detective
4 |
5 | # question = "find last house on Franklin Ave"
6 | # question = "find people living in the house with the largest address number on street named 'Franklin Ave'"
7 |
8 | question = "there was a 'murder' in 'SQL City' on 'january 15, 2018'. Find the report, i want to find any clues"
9 | context = r"""
10 | {"user_inputs":["there was a 'murder' in 'SQL City' on 'january 15, 2018'. Find the report, i want to find any clues"],"sql_queries":["SELECT * FROM crime_scene_report WHERE date = '20180115' AND type = 'murder'"],"query_results":[[{"date":20180115,"type":"murder","description":"Life? Dont talk to me about life.","city":"Albany"},{"date":20180115,"type":"murder","description":"Mama, I killed a man, put a gun against his head...","city":"Reno"},{"date":20180115,"type":"murder","description":"Security footage shows that there were 2 witnesses. The first witness lives at the last house on \"Northwestern Dr\". The second witness, named Annabel, lives somewhere on \"Franklin Ave\".","city":"SQL City"}]]}
11 | """
12 |
13 | context_dict = json.loads(context.strip())
14 | query_results = context_dict["query_results"][0]
15 | response = Detective(question, query_results)
16 |
17 |
18 | print(response.full_response)
19 | print("\nAnswer:\n", response.answer)
20 | print("\nNext step:\n", response.next_step)
21 | print("\nThinking:\n", response.thinking)
22 | print("\nRules:\n", response.rules)
23 | print("\nPrompt:\n", response.prompt())
24 | # print("\nContext:\n", response.context)
25 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | import os
2 | import subprocess
3 |
4 | import streamlit as st
5 |
6 | from modules.chatbot import chatbot
7 |
8 | st.set_page_config(
9 |     page_title="Local SQL RAG",
10 |     page_icon="🤖",
11 |     menu_items={"About": "https://github.com/Ne0bliviscaris/Ollama-SQLite-RAG"},
12 | )
13 |
14 |
15 | def title_screen():
16 |     st.title("SQL RAG - local Ollama - Langchain")
17 |     st.markdown(
18 |         """
19 |         ### 🔍 Welcome, Detective
20 |
21 |         A murder has been committed in SQL City, and you've been called to solve the case.
22 |
23 |         Your only lead is that the crime occurred on **January 15, 2018**, but the crime scene report has gone missing.
24 |         Using your detective skills and SQL knowledge, you must:
25 |
26 |         1. Query the police database to find relevant information
27 |         2. Follow leads by asking the right questions
28 |         3. Connect the dots to identify the killer
29 |
30 |         Type your investigation queries in natural language, and the system will:
31 |         - Translate your questions to SQL
32 |         - Search the database
33 |         - Help you analyze the results
34 |
35 |         Can you solve the mystery before the trail goes cold?
36 | """ 37 | ) 38 | 39 | 40 | def main(): 41 | if not st.session_state: 42 | title_screen() 43 | chatbot() 44 | 45 | 46 | if __name__ == "__main__": # Poprawiono cudzysłowy 47 | # Launch streamlit and check if it's not already running 48 | if not os.environ.get("RUNNING_IN_STREAMLIT"): 49 | # Mark streamlit as running 50 | os.environ["RUNNING_IN_STREAMLIT"] = "1" 51 | file_path = os.path.abspath(__file__) 52 | # Run streamlit with correct command list 53 | subprocess.run(["streamlit", "run", file_path], check=True) 54 | else: 55 | main() 56 | -------------------------------------------------------------------------------- /modules/chatbot.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import streamlit as st 3 | 4 | from modules.database.db import execute_sql_query 5 | from modules.model import Detective, Translator 6 | 7 | 8 | def chatbot(): 9 | """Main chatbot function.""" 10 | game_rules_sidebar() 11 | initialize_chat_session() 12 | display_chat_input() 13 | with st.container(): 14 | show_chat_history() 15 | rag_pipeline() 16 | 17 | 18 | def initialize_chat_session(): 19 | """Initialize chat session state variables.""" 20 | if "messages" not in st.session_state: 21 | st.session_state.messages = [] 22 | if "index" not in st.session_state: 23 | st.session_state.index = 0 24 | if "context" not in st.session_state: 25 | st.session_state.context = { 26 | "user_inputs": [], 27 | "sql_queries": [], 28 | "query_results": [], 29 | "detective_answers": [], 30 | "detective_thinking": [], 31 | } 32 | if "current_state" not in st.session_state: 33 | st.session_state.current_state = None 34 | 35 | 36 | def display_chat_input(): 37 | if prompt := st.chat_input("Ask about the case..."): 38 | index = st.session_state.index 39 | st.session_state.current_state = "translator" 40 | prefill_context() 41 | update_context("user_inputs", prompt, index) 42 | update_messages("user", prompt, index) 43 | st.session_state.index += 1 44 | st.rerun() 45 | 46 | 47 | def prefill_context(): 48 | """Prefill context with empty messages.""" 49 | for key in st.session_state.context: 50 | st.session_state.context[key].append(None) 51 | 52 | 53 | def show_chat_history(): 54 | """Display chat history with synchronized context data.""" 55 | for index, message_group in enumerate(st.session_state.messages): 56 | if not message_group: 57 | continue 58 | 59 | for message in message_group: 60 | with st.chat_message(message["role"]): 61 | st.markdown(message["content"]) 62 | 63 | if message["role"] == "database": 64 | show_query_results(index) 65 | if message["role"] == "assistant": 66 | show_thinking_process(index) 67 | 68 | 69 | def rag_pipeline(): 70 | """Main RAG pipeline.""" 71 | if st.session_state.current_state == "translator": 72 | translate_question() 73 | if st.session_state.current_state == "database": 74 | execute_query() 75 | if st.session_state.current_state == "detective": 76 | detective_conclusion() 77 | 78 | 79 | def translate_question(): 80 | """Translate the natural question to SQL query""" 81 | with st.spinner("Translating to SQL..."): 82 | current_index = st.session_state.index - 1 83 | 84 | user_input = st.session_state.context["user_inputs"][current_index] 85 | translation = Translator(user_input) 86 | 87 | update_context("sql_queries", translation.sql_query, current_index) 88 | update_messages("database", translation.sql_query, current_index) 89 | 90 | st.session_state.current_state = "database" 91 | st.rerun() 92 | 93 | 94 | def execute_query(): 
95 | """Execute SQL query and return results.""" 96 | with st.spinner("Executing SQL query..."): 97 | current_index = st.session_state.index - 1 98 | query = st.session_state.context["sql_queries"][current_index] 99 | try: 100 | query_results = execute_sql_query(query) 101 | update_context("query_results", query_results, current_index) 102 | st.session_state.current_state = "detective" 103 | except Exception as e: 104 | print(f"Error executing translated query.\nRephrase the question and try again.\nError message: {e}") 105 | st.session_state.current_state = None 106 | 107 | st.rerun() 108 | 109 | 110 | def detective_conclusion(): 111 | """Detective's conclusion.""" 112 | with st.spinner("Detective is analyzing the results..."): 113 | current_index = st.session_state.index - 1 114 | 115 | user_input = st.session_state.context["user_inputs"][current_index] 116 | query_results = st.session_state.context["query_results"][current_index] 117 | 118 | detective = Detective(user_input=user_input, context=query_results) 119 | 120 | update_context("detective_answers", detective.answer, current_index) 121 | update_context("detective_thinking", detective.thinking, current_index) 122 | update_messages("assistant", detective.answer, current_index) 123 | 124 | st.session_state.current_state = None 125 | st.rerun() 126 | 127 | 128 | def update_context(key, value, index): 129 | """ "Update context value at specific index.""" 130 | context = st.session_state.context[key] 131 | while len(context) <= index: 132 | context.append(None) 133 | context[index] = value 134 | 135 | 136 | def update_messages(role, content, index): 137 | messages = st.session_state.messages 138 | while len(messages) <= index: 139 | messages.append([]) 140 | 141 | messages[index].append({"role": role, "content": content}) 142 | 143 | 144 | def show_query_results(index): 145 | """Display query results for given message index.""" 146 | query_results = st.session_state.context["query_results"][index] 147 | 148 | # Validate index 149 | if not query_results: 150 | st.warning("Error executing translated query. Rephrase the question and try again.") 151 | return 152 | 153 | with st.expander("Query Results"): 154 | df = convert_results_to_dataframe(query_results) 155 | st.dataframe(df) 156 | 157 | 158 | def convert_results_to_dataframe(results): 159 | """Convert query results to DataFrame.""" 160 | return pd.DataFrame(results) 161 | 162 | 163 | def show_thinking_process(index): 164 | """Display detective's thinking process in expandable section.""" 165 | with st.expander("Detective's Thinking Process"): 166 | st.write(st.session_state.context["detective_thinking"][index]) 167 | 168 | 169 | def game_rules_sidebar(): 170 | """Displays game rules, hints and tools in the sidebar.""" 171 | with st.sidebar: 172 | st.title("Detective's Handbook") 173 | 174 | with st.expander("Case Brief", expanded=True): 175 | st.markdown( 176 | """ 177 | **THE CASE** 178 | 179 | - Crime: Murder 180 | - Date: January 15, 2018 181 | - Location: SQL City 182 | - Status: Unsolved 183 | 184 | Begin by finding the crime scene report. 
185 | """ 186 | ) 187 | 188 | with st.expander("Investigation Tips"): 189 | st.markdown( 190 | """ 191 | - Ask specific questions about people, places, or evidence 192 | - Follow leads from one piece of evidence to another 193 | - Look for connections between witnesses and suspects 194 | - Pay attention to alibis and timelines 195 | """ 196 | ) 197 | 198 | # Save notes in session state for persistence 199 | if "notes" not in st.session_state: 200 | st.session_state.notes = "" 201 | 202 | st.session_state.notes = st.text_area( 203 | label="Detective's Notes", 204 | value=st.session_state.notes, 205 | height=300, 206 | placeholder="Record your clues, suspects and theories here...", 207 | ) 208 | -------------------------------------------------------------------------------- /modules/database/db.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | from langchain_community.utilities import SQLDatabase 4 | 5 | from modules.settings import DB_FILE 6 | 7 | 8 | def execute_sql_query(extracted_query): 9 | """Connect to database and execute SQL query""" 10 | with sqlite3.connect(DB_FILE) as db_connection: 11 | cursor = db_connection.cursor() 12 | cursor.execute(extracted_query) 13 | results = cursor.fetchall() 14 | 15 | column_names = [description[0] for description in cursor.description] 16 | 17 | return convert_results_to_dict(results, column_names) 18 | 19 | 20 | def convert_results_to_dict(records, column_names): 21 | """Convert list of tuples to a list of dictionaries.""" 22 | return [dict(zip(column_names, row)) for row in records] 23 | 24 | 25 | def get_db_schema(): 26 | db = db_without_solution() 27 | return db.get_table_info() 28 | 29 | 30 | def database_connect(): 31 | """Establishes a connection to the SQL database using the provided URI.""" 32 | db = SQLDatabase.from_uri(f"sqlite:///{DB_FILE}") 33 | return db 34 | 35 | 36 | def db_without_solution(): 37 | """Connects to SQL database. 
36 | def db_without_solution():
37 |     """Connects to SQL database. Solution table is excluded."""
38 |     return SQLDatabase.from_uri(f"sqlite:///{DB_FILE}", ignore_tables=["solution"])
39 |
--------------------------------------------------------------------------------
/modules/database/sql-murder-mystery.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ne0bliviscaris/SQL-Detective-RAG-Game/bd36e67fe4d5ef8cbc816ed94f8ac8f78fcf8596/modules/database/sql-murder-mystery.db
--------------------------------------------------------------------------------
/modules/model.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | from langchain.chains.sql_database.query import create_sql_query_chain
4 | from langchain_core.output_parsers import StrOutputParser
5 | from langchain_core.prompts import PromptTemplate
6 | from langchain_core.runnables import RunnablePassthrough
7 | from langchain_ollama import ChatOllama
8 |
9 | from modules.database.db import db_without_solution, get_db_schema
10 | from modules.settings import MODEL
11 | from modules.tools import model_answer_regex
12 |
13 |
14 | class Model:
15 |     """Base class for all models."""
16 |
17 |     def __init__(self, user_input, context=None):
18 |         self.user_input = user_input
19 |         self.context = context
20 |         self.full_response = self.get_model_response()
21 |         self.sql_query = self.get_field("sql_query")
22 |         self.answer = self.get_field("answer")
23 |         self.next_step = self.get_field("next_step")
24 |         self.thinking = self.get_field("thinking")
25 |         self.rules = self.get_field("rules_followed")
26 |
27 |     def get_model_response(self):
28 |         """Get response using instance attributes."""
29 |         try:
30 |             langchain = self.build_langchain()
31 |             response_str = langchain.invoke(self.model_input())
32 |             try:
33 |                 parsed_response = json.loads(response_str)
34 |                 return parsed_response
35 |             except (json.JSONDecodeError, TypeError):  # fall back to the raw text
36 |                 return response_str
37 |         except Exception as e:
38 |             return f"Model Connection error. Make sure Ollama is running and {MODEL} is installed.\n{e}"
39 |
40 |     def get_field(self, field=None):
41 |         """Extract a named field from the response: dict lookup, or regex on raw text."""
42 |         if isinstance(self.full_response, dict):
43 |             return self.full_response.get(field)
44 |
45 |         if isinstance(self.full_response, str):
46 |             return model_answer_regex(self.full_response, field)
47 |         return None
48 |
49 |     def get_thinking_process(self):
50 |         """Get thinking process from the parsed response."""
51 |         return self.get_field("thinking")
52 |
53 |
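# Subclasses provide three hooks that Model.__init__ consumes eagerly:
#   model_input()     -> dict passed to the chain's invoke()
#   prompt()          -> PromptTemplate defining the model's role
#   build_langchain() -> the runnable pipeline itself
# Constructing a Model therefore performs a full model call; afterwards the
# extracted fields (sql_query, answer, ...) are plain attributes.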
54 | class Translator(Model):
55 |     """SQL Translator model class."""
56 |
57 |     def model_input(self):
58 |         return {
59 |             "input": self.user_input,
60 |             "question": self.user_input,
61 |         }
62 |
63 |     def prompt(self) -> str:
64 |         """Prompt template to translate text instructions into SQL query"""
65 |         translator = """
66 | **ROLE:** You are a SQL Translator. Your task is to translate the following question into a valid SQL query. Use {dialect} dialect.
67 |
68 | **Database Schema:**
69 | {table_info}
70 |
71 | **Rules:**
72 | 1. Ensure the output contains a valid SQL query.
73 | 2. The query must strictly follow the provided database schema and use only the available tables and columns.
74 | 3. Keep the thinking process brief, ensuring it logically aligns with the user input.
75 | 4. Avoid unnecessary complexity—only join tables or include conditions that are directly relevant to the user's question.
76 | 5. Fetch all columns by default using 'SELECT *', unless a specific column is mentioned in the input.
77 | 6. Be flexible in interpreting imprecise or incomplete user input while providing a valid SQL query.
78 | 7. Do not use your own knowledge or external sources.
79 | 8. Do not assume anything that is not explicitly present in the schema.
80 | 9. ONLY return the SQL query, no additional explanations or text.
81 | 10. If the user is asking about the order of items (first, last etc.), use an ORDER BY clause based on the relevant column.
82 | 11. When using columns from multiple tables, make sure each column is referenced through the correct table.
83 |
84 |
85 | **Input:** {input}
86 |
87 | **Output:**
88 | ```json
89 | {{
90 |     "user_input": "{input}",
91 |     "sql_query": "SELECT * FROM table_name WHERE condition;",
92 |     "thinking": "Thinking process.",
93 |     "rules_followed": "Rules followed while generating answer."
94 | }}
95 | top_k: {top_k}
96 | ```
97 | """
98 |         return PromptTemplate(
99 |             template=translator,
100 |             input_variables=["input"],
101 |             partial_variables={
102 |                 "dialect": "sqlite",
103 |                 "table_info": get_db_schema(),
104 |             },
105 |         )
106 |
107 |     def build_langchain(self):
108 |         """Builds and returns a language chain with database and Ollama connections."""
109 |         db = db_without_solution()
110 |         llm = ChatOllama(
111 |             temperature=0,
112 |             seed=1,
113 |             model=MODEL,
114 |             num_predict=512,  # Output tokens limit
115 |             top_p=0.95,
116 |             format="json",
117 |             mirostat=2,
118 |             mirostat_eta=2,
119 |             mirostat_tau=1,
120 |             tfs_z=50,  # reduce the impact of less probable tokens
121 |             repeat_penalty=1.5,
122 |             top_k=2,
123 |         )
124 |         prompt = self.prompt()
125 |         return create_sql_query_chain(llm, db, prompt)
126 |
127 |
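# create_sql_query_chain appears to require a custom prompt to declare the
# "input", "table_info" and "top_k" variables (with "dialect" filled in from
# the db); the stray "top_k: {top_k}" line inside the JSON fence above seems
# to exist only to satisfy that contract.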
128 | class Detective(Model):
129 |     """Detective model for analyzing case data and providing conclusions."""
130 |
131 |     def model_input(self):
132 |         return {
133 |             "user_input": self.user_input,
134 |             "context": self.context,
135 |         }
136 |
137 |     def prompt(self) -> str:
138 |         """Prompt template for analyzing query results against the user's question"""
139 |         detective = """
140 | **ROLE:** You are a detective solving a case. Analyze the provided information in reference to the user's question. Your response must be concise, fact-based, and logically structured.
141 |
142 | **Rules:**
143 | 1. Provide an "answer" based only on the given context.
144 | 2. Suggest a "next_step" that logically follows from the answer.
145 | 3. Keep "thinking" brief but clear, explaining how the answer was derived.
146 | 4. Use only the received data—do not assume or use external knowledge.
147 | 5. Use exact column names from the query results in responses.
148 | 6. For **last, highest, or largest values**, return the maximum in the relevant column.
149 | 7. For **first, lowest, or smallest values**, return the minimum in the relevant column.
150 | 8. For **specific persons, objects, or events**, find an exact match in the data.
151 | 9. For **patterns, summaries, or trends**, analyze and summarize the provided data.
152 | 10. Dates are stored in the database as integers in the format YYYYMMDD. Make sure you read them properly.
153 | 11. If query results contain relevant data, extract and summarize key information.
154 | 12. If multiple records match, summarize them concisely.
155 | 13. If no relevant data is found, return `"answer": "No relevant data available."`
156 | 14. If the user input is ambiguous, provide a response based on the most likely interpretation.
157 | 15. Ensure the answer is correct—scan the received context carefully.
158 |
159 | **User_input:** {user_input}
160 |
161 | **Context:** {context}
162 |
163 | **Output:**
164 | ```json
165 | {{
166 |     "answer": "Your conclusion here.",
167 |     "next_step": "Logical next step based on the answer.",
168 |     "thinking": "Reasoning behind the answer and next step.",
169 |     "user_input": "{user_input}",
170 |     "rules_followed": "[List rules followed while generating answer.]"
171 | }}
172 | ```
173 | """
174 |         # Single braces so PromptTemplate actually injects user_input and context;
175 |         # double braces render as literal braces and are kept only in the JSON example.
176 |         return PromptTemplate(
177 |             template=detective,
178 |             input_variables=["user_input", "context"],
179 |             partial_variables={
180 |                 "top_k": 1,
181 |             },
182 |         )
183 |
184 |     def build_langchain(self):
185 |         """Builds and returns a language chain with database and Ollama connections."""
186 |         llm = ChatOllama(
187 |             temperature=0,
188 |             seed=1,
189 |             model=MODEL,
190 |             num_predict=1024,  # Output tokens limit
191 |             top_p=0.95,
192 |             format="json",
193 |             mirostat=2,
194 |             mirostat_eta=2,
195 |             mirostat_tau=1,
196 |             tfs_z=50,  # reduce the impact of less probable tokens
197 |             repeat_penalty=1.5,
198 |         )
199 |         prompt = self.prompt()
200 |
201 |         # Pick the question out of the input dict; a bare RunnablePassthrough()
202 |         # would pass the whole dict through as user_input.
203 |         return (
204 |             {"user_input": lambda x: x["user_input"], "context": lambda x: self.context} | prompt | llm | StrOutputParser()
205 |         )
--------------------------------------------------------------------------------
/modules/settings.py:
--------------------------------------------------------------------------------
1 | # Easy Model Switch
2 | # MODEL = "llama3"
3 | # MODEL = "zephyr:7b"
4 | MODEL = "deepseek-r1:1.5b"
5 | # MODEL = "llama3.2:1b"
6 |
7 | # Switched to local model
8 | # if MODEL == "llama3":
9 | #     MODEL_URL = "http://localhost:11434"
10 | # elif MODEL == "zephyr:7b":
11 | #     MODEL_URL = "http://localhost:55555"
12 |
13 |
14 | DB_FILE = "modules/database/sql-murder-mystery.db"  # forward slashes work on all platforms
--------------------------------------------------------------------------------
/modules/tools.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 |
4 | def model_answer_regex(response, field_name):
5 |     """Extract field value from JSON-like text response."""
6 |     FIELD = rf'"{field_name}"'  # Field name in double quotes
7 |     SEPARATOR = r"\s*:\s*"  # Colon with optional surrounding whitespace
8 |     QUOTE = r'"'  # Opening double quote
9 |     CONTENT = r'([^"]*)'  # Capture group for the field content
10 |     END = r'(?:"|$)'  # Closing double quote or end of string
11 |
12 |     regex = f"{FIELD}{SEPARATOR}{QUOTE}{CONTENT}{END}"
13 |     match = re.search(regex, response, re.VERBOSE)
14 |
15 |     if match:
16 |         return match.group(1)
17 |     return None
18 |
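# Example (hypothetical model output):
#   model_answer_regex('{"answer": "Annabel lives on Franklin Ave"}', "answer")
#   returns 'Annabel lives on Franklin Ave'
# Note: the ([^"]*) group stops at the first double quote, so values that
# contain escaped quotes are truncated there.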
Order house numbers") 5 | # response = Translator("Find who lives on the Franklin Ave") 6 | # response = Translator("find all people with blue eyes. drivers_license has info about eye color") 7 | # response = Translator("find names of all people with blue eyes. Eyes color is in drivers_license table") 8 | # response = Translator("find all people with blue eyes. 'drivers_license' table has info about 'eye_color'") 9 | # response = Translator("show all police murder case reports") 10 | 11 | 12 | print((response.full_response)) 13 | print("\nQuery:\n", response.sql_query) 14 | print("\nThinking:\n", response.thinking) 15 | # print("Prompt:\n", response.prompt()) 16 | --------------------------------------------------------------------------------