├── .dockerignore
├── images
│   ├── app.png
│   └── diagram.png
├── .gitignore
├── utils.py
├── requirements.txt
├── bot.Dockerfile
├── pull_model.Dockerfile
├── .example.env
├── CONTRIBUTING.md
├── docker-compose.yml
├── db.py
├── README.md
├── chains.py
├── agent.py
├── LICENSE
└── bot.py
/.dockerignore:
--------------------------------------------------------------------------------
1 | *
2 | !*.py
3 | !requirements.txt
--------------------------------------------------------------------------------
/images/app.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dockersamples/CodeExplorer/HEAD/images/app.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .env
2 | data/
3 | embedding_model/
4 | .DS_Store
5 | __pycache__
6 | .venv
7 | test
--------------------------------------------------------------------------------
/images/diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dockersamples/CodeExplorer/HEAD/images/diagram.png
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | class BaseLogger:
2 | def __init__(self) -> None:
3 | self.info = print
4 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | openai==0.27.10
2 | python-dotenv==1.0.0
3 | wikipedia==1.4.0
4 | tiktoken==0.5.1
5 | neo4j==5.2.1
6 | streamlit==1.27.2
7 | # sentence_transformers==2.2.2
8 | transformers==4.34.1
9 | Pillow==10.1.0
10 | fastapi==0.103.2
11 | PyPDF2==3.0.1
12 | # torch==2.0.1
13 | pydantic==1.10.13
14 | uvicorn==0.23.2
15 | sse-starlette==1.6.5
16 | boto3==1.28.69
17 | numexpr==2.8.7
18 | # langchain==0.0.323 (already provided by the langchain/langchain:0.0.323 base image in bot.Dockerfile)
19 |
--------------------------------------------------------------------------------
/bot.Dockerfile:
--------------------------------------------------------------------------------
1 | FROM langchain/langchain:0.0.323
2 |
3 | WORKDIR /app
4 |
5 | RUN apt-get update && apt-get install -y \
6 | build-essential \
7 | curl \
8 | software-properties-common \
9 | && rm -rf /var/lib/apt/lists/*
10 |
11 | COPY requirements.txt .
12 |
13 | RUN pip install -r requirements.txt
14 |
15 | # set email for streamlit so it doesn't ask in docker container
16 | RUN mkdir -p ~/.streamlit/
17 | RUN echo "[general]" > ~/.streamlit/credentials.toml
18 | RUN echo "email = \"\"" >> ~/.streamlit/credentials.toml
19 |
20 | COPY *.py .
21 |
22 | EXPOSE 8501
23 |
24 | HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
25 |
26 | ENTRYPOINT ["streamlit", "run", "bot.py", "--server.port=8501", "--server.address=0.0.0.0"]
27 |
--------------------------------------------------------------------------------
/pull_model.Dockerfile:
--------------------------------------------------------------------------------
1 | #syntax = docker/dockerfile:1.4
2 |
3 | FROM ollama/ollama:latest AS ollama
4 | FROM babashka/babashka:latest
5 |
6 | # just using as a client - never as a server
7 | COPY --from=ollama /bin/ollama ./bin/ollama
8 |
9 | COPY <<EOF pull_model.clj
30 | (async/>!! done :stop))
31 |
32 | (println "OLLAMA model only pulled if both LLM and OLLAMA_BASE_URL are set and the LLM model is not gpt")))
33 | (catch Throwable _ (System/exit 1)))
34 | EOF
35 |
36 | ENTRYPOINT ["bb", "-f", "pull_model.clj"]
37 |
38 |
--------------------------------------------------------------------------------
/.example.env:
--------------------------------------------------------------------------------
1 | #*****************************************************************
2 | # LLM and Embedding Model
3 | #*****************************************************************
4 | LLM=codellama:7b-instruct #or any Ollama model tag, gpt-4, gpt-3.5, or claudev2
5 | EMBEDDING_MODEL=ollama #or sentence_transformer, openai, ollama, or aws
6 |
7 | #*****************************************************************
8 | # Neo4j
9 | #*****************************************************************
10 | NEO4J_URI=neo4j://database:7687
11 | NEO4J_USERNAME=neo4j
12 | NEO4J_PASSWORD=password
13 |
14 | #*****************************************************************
15 | # Langchain
16 | #*****************************************************************
17 | # Optional for enabling Langchain Smith API
18 |
19 | #LANGCHAIN_TRACING_V2=true # false
20 | #LANGCHAIN_ENDPOINT="https://api.smith.langchain.com"
21 | #LANGCHAIN_PROJECT=#your-project-name
22 | #LANGCHAIN_API_KEY=#your-api-key ls_...
23 |
24 | #*****************************************************************
25 | # Ollama
26 | #*****************************************************************
27 | OLLAMA_BASE_URL=http://host.docker.internal:11434
28 | # OLLAMA_BASE_URL=http://llm:11434
29 |
30 | #*****************************************************************
31 | # OpenAI
32 | #*****************************************************************
33 | # Only required when using OpenAI LLM or embedding model
34 |
35 | #OPENAI_API_KEY=sk-...
36 |
37 | #*****************************************************************
38 | # AWS
39 | #*****************************************************************
40 | # Only required when using AWS Bedrock LLM or embedding model
41 |
42 | #AWS_ACCESS_KEY_ID=
43 | #AWS_SECRET_ACCESS_KEY=
44 | #AWS_DEFAULT_REGION=us-east-1
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to Code Explorer
2 |
3 | First off, thank you for considering contributing to Code Explorer! We're excited to have you on board.
4 |
5 | Please take a moment to review this document in order to make the contribution process easy and effective for everyone involved.
6 |
7 | ## Code of Conduct
8 |
9 | This project and everyone participating in it are governed by the [Code Explorer Code of Conduct](CODE_OF_CONDUCT.md). By participating, you are expected to uphold this code. Please report unacceptable behavior to the project maintainers.
10 |
11 | ## How Can I Contribute?
12 |
13 | ### Reporting Bugs
14 |
15 | If you find a bug in Code Explorer, please open an issue on the [GitHub issue tracker](https://github.com/dockersamples/CodeExplorer/issues). Be sure to include a clear title and description of the problem, as well as any relevant details such as the version of Code Explorer you're using.
16 |
17 | ### Suggesting Enhancements
18 |
19 | If you have an idea for a new feature or improvement to Code Explorer, please open an issue on the [GitHub issue tracker](https://github.com/dockersamples/CodeExplorer/issues). Be sure to provide a clear title and description of your suggestion, as well as any relevant details or examples.
20 |
21 | ### Pull Requests
22 |
23 | If you'd like to contribute code to Code Explorer, please follow these steps:
24 |
25 | 1. Fork the repository and create a new branch for your changes.
26 | 2. Make your changes and ensure that your code is well-formatted and follows the project's coding style guidelines.
27 | 3. Write clear and concise commit messages that describe your changes.
28 | 4. Open a pull request on the [GitHub repository](https://github.com/dockersamples/CodeExplorer) and provide a clear title and description of your changes.
29 | 5. Wait for feedback from the project maintainers and make any necessary adjustments.
30 | 6. Once your pull request is approved, it will be merged into the main branch.
31 |
32 | ## Styleguides
33 |
34 | ### Git Commit Messages
35 |
36 | * Use the present tense ("Add feature" not "Added feature")
37 | * Use the imperative mood ("Move cursor to..." not "Moves cursor to...")
38 | * Limit the first line to 72 characters or less
39 | * Reference issues and pull requests using the `#` symbol
40 |
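For example, a commit message in this style (illustrative only, not from this repo's history):

```
Add folder picker validation

Show an error when the entered path does not exist. Fixes #42.
```
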
41 | ### Python Code
42 |
43 | * Follow the PEP 8 style guide
44 | * Use 4 spaces per indentation level
45 | * Use blank lines to separate functions and classes, and larger blocks of code inside functions
46 | * Add comments to explain complex sections of code
47 |
48 | ### Documentation
49 |
50 | * Use Markdown for documentation
51 | * Keep documentation up-to-date with any changes to the code
52 | * Use clear and concise language
53 |
54 | Thank you for your interest in contributing to Code Explorer! We're excited to see what you can bring to the project.
55 |
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | services:
2 | llm:
3 | image: ollama/ollama:latest
4 | profiles: ['linux']
5 | networks:
6 | - net
7 | # deploy:
8 | # resources:
9 | # reservations:
10 | # devices:
11 | # - driver: nvidia
12 | # count: all
13 | # capabilities: [gpu]
14 |
15 | pull-model:
16 | build:
17 | dockerfile: pull_model.Dockerfile
18 | environment:
19 | - OLLAMA_BASE_URL=${OLLAMA_BASE_URL-http://host.docker.internal:11434}
20 | - LLM=${LLM-codellama:7b-instruct}
21 | networks:
22 | - net
23 | depends_on:
24 | - llm
25 |
26 | database:
27 | image: neo4j:5.11
28 | ports:
29 | - 7687:7687
30 | - 7474:7474
31 | volumes:
32 | - ./data:/data
33 | environment:
34 | - NEO4J_AUTH=${NEO4J_USERNAME-neo4j}/${NEO4J_PASSWORD-password}
35 | - NEO4J_PLUGINS=["apoc"]
36 | - NEO4J_db_tx__log_rotation_retention__policy=false
37 | healthcheck:
38 | test:
39 | [
40 | 'CMD-SHELL',
41 | 'wget --no-verbose --tries=1 --spider localhost:7474 || exit 1'
42 | ]
43 | interval: 5s
44 | timeout: 3s
45 | retries: 5
46 | networks:
47 | - net
48 |
49 | bot:
50 | build:
51 | dockerfile: bot.Dockerfile
52 | volumes:
53 | - ./embedding_model:/embedding_model
54 | - type: bind
55 | source: /home/${whoami} # expects an env var literally named "whoami" on the host, e.g. export whoami=$(whoami)
56 | target: /home/${whoami}
57 | environment:
58 | - NEO4J_URI=${NEO4J_URI-neo4j://database:7687}
59 | - NEO4J_PASSWORD=${NEO4J_PASSWORD-password}
60 | - NEO4J_USERNAME=${NEO4J_USERNAME-neo4j}
61 | - OPENAI_API_KEY=${OPENAI_API_KEY-}
62 | - OLLAMA_BASE_URL=${OLLAMA_BASE_URL-http://host.docker.internal:11434}
63 | - LLM=${LLM-codellama:7b-instruct}
64 | - EMBEDDING_MODEL=${EMBEDDING_MODEL-ollama}
65 | - LANGCHAIN_ENDPOINT=${LANGCHAIN_ENDPOINT-"https://api.smith.langchain.com"}
66 | - LANGCHAIN_TRACING_V2=${LANGCHAIN_TRACING_V2-false}
67 | - LANGCHAIN_PROJECT=${LANGCHAIN_PROJECT}
68 | - LANGCHAIN_API_KEY=${LANGCHAIN_API_KEY}
69 | - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
70 | - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
71 | - AWS_DEFAULT_REGION=${AWS_DEFAULT_REGION}
72 | networks:
73 | - net
74 | depends_on:
75 | database:
76 | condition: service_healthy
77 | pull-model:
78 | condition: service_completed_successfully
79 | develop:
80 | watch:
81 | - action: rebuild
82 | path: .
83 | ports:
84 | - 8501:8501
85 |
86 | networks:
87 | net:
88 |
--------------------------------------------------------------------------------
/db.py:
--------------------------------------------------------------------------------
1 | import os
2 | from streamlit.logger import get_logger
3 | from dotenv import load_dotenv
4 | from chains import (
5 | load_embedding_model
6 | )
7 | from langchain.text_splitter import RecursiveCharacterTextSplitter
8 | from langchain.vectorstores.neo4j_vector import Neo4jVector
9 | from langchain.text_splitter import Language
10 | from langchain.document_loaders.generic import GenericLoader
11 | from langchain.document_loaders.parsers import LanguageParser
12 | from langchain.agents.conversational_chat.prompt import FORMAT_INSTRUCTIONS
13 |
14 | load_dotenv(".env")
15 |
16 | url = os.getenv("NEO4J_URI")
17 | username = os.getenv("NEO4J_USERNAME")
18 | password = os.getenv("NEO4J_PASSWORD")
19 | ollama_base_url = os.getenv("OLLAMA_BASE_URL")
20 | embedding_model_name = os.getenv("EMBEDDING_MODEL")
21 | llm_name = os.getenv("LLM")
22 | # Remapping for Langchain Neo4j integration
23 | os.environ["NEO4J_URL"] = url
24 |
25 | logger = get_logger(__name__)
26 |
27 | embeddings, dimension = load_embedding_model(
28 | embedding_model_name, config={"ollama_base_url": ollama_base_url}, logger=logger
29 | )
30 |
31 | def process_documents(language, directory) -> (str, Neo4jVector):
32 | print("File chunking begins...", language, directory)
33 |
34 | # Create a dictionary mapping languages to file extensions
35 | language_suffix_mapping = {
36 | Language.CPP: ".cpp",
37 | Language.GO: ".go",
38 | Language.JAVA: ".java",
39 | Language.KOTLIN: ".kt",
40 | Language.JS: ".js",
41 | Language.TS: ".ts",
42 | Language.PHP: ".php",
43 | Language.PROTO: ".proto",
44 | Language.PYTHON: ".py",
45 | Language.RST: ".rst",
46 | Language.RUBY: ".rb",
47 | Language.RUST: ".rs",
48 | Language.SCALA: ".scala",
49 | Language.SWIFT: ".swift",
50 | Language.MARKDOWN: ".md",
51 | Language.LATEX: ".tex",
52 | Language.HTML: ".html",
53 | Language.SOL: ".sol",
54 | Language.CSHARP: ".cs",
55 | }
56 | # Get the corresponding suffix based on the selected language
57 | suffix = language_suffix_mapping.get(language, "")
58 | print("language file extension:", suffix)
59 |
60 | loader = GenericLoader.from_filesystem(
61 | path=directory,
62 | glob="**/*",
63 | suffixes=[suffix],
64 | parser=LanguageParser(language=language, parser_threshold=500)
65 | )
66 | documents = loader.load()
67 | print("Total documents:", len(documents))
68 | if len(documents) == 0:
69 | return ("0 documents found", None)
70 |
71 | text_splitter = RecursiveCharacterTextSplitter.from_language(language=language,
72 | chunk_size=5000,
73 | chunk_overlap=500)
74 |
75 | chunks = text_splitter.split_documents(documents)
76 | print("Chunks:", len(chunks))
77 |
78 | hashStr = "myHash" # str(abs(hash(directory)))
79 |
80 | # Store the chunks part in db (vector)
81 | vectorstore = Neo4jVector.from_documents(
82 | chunks,
83 | url=url,
84 | username=username,
85 | password=password,
86 | embedding=embeddings,
87 | index_name=f"index_{hashStr}",
88 | node_label=f"node_{hashStr}",
89 | pre_delete_collection=True, # Delete existing data
90 | )
91 |
92 | print("Files are now chunked up")
93 |
94 | return (None, vectorstore)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Code Explorer
2 |
3 | This app lets you ask questions about your code and get answers, given the folder location of your code. It is a retrieval-augmented generation (RAG) LLM chain. The app is based on the GenAI stack from https://github.com/docker/genai-stack.
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 | # Configure
13 |
14 | Create a `.env` file from the environment template file `.example.env`.
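
For example, from a POSIX shell:

```
cp .example.env .env
```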
15 |
16 | Available variables:
17 | | Variable Name | Default value | Description |
18 | |------------------------|------------------------------------|-------------------------------------------------------------------------|
19 | | OLLAMA_BASE_URL | http://host.docker.internal:11434 | REQUIRED - URL to Ollama LLM API |
20 | | NEO4J_URI | neo4j://database:7687 | REQUIRED - URL to Neo4j database |
21 | | NEO4J_USERNAME | neo4j | REQUIRED - Username for Neo4j database |
22 | | NEO4J_PASSWORD | password | REQUIRED - Password for Neo4j database |
23 | | LLM | codellama:7b-instruct | REQUIRED - Can be any Ollama model tag, gpt-4, gpt-3.5, or claudev2 |
24 | | EMBEDDING_MODEL | ollama | REQUIRED - Can be sentence_transformer, openai, aws, or ollama |
25 | | AWS_ACCESS_KEY_ID | | REQUIRED - Only if LLM=claudev2 or EMBEDDING_MODEL=aws |
26 | | AWS_SECRET_ACCESS_KEY | | REQUIRED - Only if LLM=claudev2 or EMBEDDING_MODEL=aws |
27 | | AWS_DEFAULT_REGION | | REQUIRED - Only if LLM=claudev2 or EMBEDDING_MODEL=aws |
28 | | OPENAI_API_KEY | | REQUIRED - Only if LLM=gpt-4 or LLM=gpt-3.5 or EMBEDDING_MODEL=openai |
29 | | LANGCHAIN_ENDPOINT | "https://api.smith.langchain.com" | OPTIONAL - URL to Langchain Smith API |
30 | | LANGCHAIN_TRACING_V2 | false | OPTIONAL - Enable Langchain tracing v2 |
31 | | LANGCHAIN_PROJECT | | OPTIONAL - Langchain project name |
32 | | LANGCHAIN_API_KEY | | OPTIONAL - Langchain API key |
33 |
34 | NOTE: If using `EMBEDDING_MODEL=sentence_transformer`, uncomment the relevant lines in `requirements.txt` and `chains.py` (see below). They were commented out to reduce the Docker image size.
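
Concretely, these are the commented-out pins to restore in `requirements.txt` (the `SentenceTransformerEmbeddings` import and the fallback branch in `chains.py` need to be uncommented as well):

```
sentence_transformers==2.2.2
torch==2.0.1
```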
35 |
36 | NOTE: Make sure to set `OLLAMA_BASE_URL=http://llm:11434` in the `.env` file when running Ollama in a Docker container (the `llm` service in `docker-compose.yml`).
37 |
38 | ## Project Structure
39 |
40 | * `LICENSE`: Contains the project's license information.
41 | * `README.md`: The main README file, providing an introduction and overview of the project.
42 | * `CONTRIBUTING.md`: Guidelines for contributing to the project.
43 | * `CODE_OF_CONDUCT.md`: The code of conduct for the project.
44 | * `.env` and `.example.env`: Configuration files for environment variables.
45 | * `agent.py`, `bot.py`, `chains.py`, `db.py`, and `utils.py`: Core logic and functionality of the project.
46 | * `docker-compose.yml`: Defines and manages multi-container Docker applications.
47 | * `bot.Dockerfile` and `pull_model.Dockerfile`: Instructions for building Docker images.
48 | * `requirements.txt`: Lists the project's dependencies and their versions.
49 | * `images`: Contains visual assets used in the README and other documentation.
50 |
51 |
52 | # Docker (Linux only)
53 |
54 | **Build only**
55 |
56 | ```
57 | docker compose --profile linux build
58 | ```
59 |
60 | **To start everything (Linux)**
61 |
62 | ```
63 | docker compose --profile linux up --build
64 | ```
65 |
66 | To enter **watch mode** (auto rebuild on file changes),
67 | first start everything, then run this in a new terminal:
68 |
69 | ```
70 | docker compose --profile linux watch   # on older Compose versions: docker compose --profile linux alpha watch
71 | ```
72 |
73 | **Shutdown**
74 | If the health check fails or containers don't start up as expected, shut down
75 | completely before starting up again.
76 |
77 | ```
78 | docker compose --profile linux down
79 | ```
80 |
81 | # Application
82 |
83 | Access the app at http://localhost:8501. In the sidebar, enter the path to your code folder and click "Process files". Then start asking questions about your code in the main chat. The detailed mode toggle switches between querying the QA chain directly (detailedMode=true) and querying an agent that uses the QA chain as a tool (detailedMode=false). In testing, the agent tends to summarize rather than give a technical answer, unlike the QA chain alone.
84 |
85 | The Neo4j vector database can be explored at http://localhost:7474.
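
For example, you can count the stored chunk nodes from the Neo4j browser with a Cypher query (illustrative only; the `node_myHash` label comes from `db.py`):

```
MATCH (n:node_myHash) RETURN count(n)
```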
86 |
87 | ## Example query
88 |
89 | What is the purpose of the LLMSingleActionAgent() function?
90 |
--------------------------------------------------------------------------------
/chains.py:
--------------------------------------------------------------------------------
1 | from langchain.embeddings.openai import OpenAIEmbeddings
2 | from langchain.embeddings import (
3 | OllamaEmbeddings,
4 | # SentenceTransformerEmbeddings,
5 | BedrockEmbeddings,
6 | )
7 | from langchain.chat_models import ChatOpenAI, ChatOllama, BedrockChat
8 | from langchain.chains import RetrievalQAWithSourcesChain
9 | from langchain.chains.qa_with_sources import load_qa_with_sources_chain
10 | from langchain.prompts.chat import (
11 | ChatPromptTemplate,
12 | SystemMessagePromptTemplate,
13 | HumanMessagePromptTemplate,
14 | MessagesPlaceholder
15 | )
16 | from typing import List, Any
17 | from utils import BaseLogger
18 | from langchain.chains import LLMChain
19 | from langchain.memory import ConversationBufferMemory
20 |
21 |
22 | def load_embedding_model(embedding_model_name: str, logger=BaseLogger(), config={}):
23 | if embedding_model_name == "ollama":
24 | embeddings = OllamaEmbeddings(
25 | base_url=config["ollama_base_url"], model="codellama:7b-instruct"
26 | )
27 | dimension = 4096
28 | logger.info("Embedding: Using Ollama")
29 | elif embedding_model_name == "openai":
30 | embeddings = OpenAIEmbeddings()
31 | dimension = 1536
32 | logger.info("Embedding: Using OpenAI")
33 | elif embedding_model_name == "aws":
34 | embeddings = BedrockEmbeddings()
35 | dimension = 1536
36 | logger.info("Embedding: Using AWS")
37 | # else:
38 | # embeddings = SentenceTransformerEmbeddings(
39 | # model_name="all-MiniLM-L6-v2", cache_folder="./embedding_model"
40 | # )
41 | # dimension = 384
42 | # logger.info("Embedding: Using SentenceTransformer")
43 | return embeddings, dimension
44 |
45 |
46 | def load_llm(llm_name: str, logger=BaseLogger(), config={}):
47 | if llm_name == "gpt-4":
48 | logger.info("LLM: Using GPT-4")
49 | return ChatOpenAI(temperature=0, model_name="gpt-4", streaming=True)
50 | elif llm_name == "gpt-3.5":
51 | logger.info("LLM: Using GPT-3.5")
52 | return ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo", streaming=True)
53 | elif llm_name == "claudev2":
54 | logger.info("LLM: ClaudeV2")
55 | return BedrockChat(
56 | model_id="anthropic.claude-v2",
57 | model_kwargs={"temperature": 0.0, "max_tokens_to_sample": 1024},
58 | streaming=True,
59 | )
60 | elif llm_name:  # avoids a TypeError when the LLM env var is unset; falls through to the GPT-3.5 default below
61 | logger.info(f"LLM: Using Ollama: {llm_name}")
62 | return ChatOllama(
63 | temperature=0,
64 | base_url=config["ollama_base_url"],
65 | model=llm_name,
66 | streaming=True,
67 | # seed=2,
68 | top_k=10, # A higher value (100) will give more diverse answers, while a lower value (10) will be more conservative.
69 | top_p=0.3, # Higher value (0.95) will lead to more diverse text, while a lower value (0.5) will generate more focused text.
70 | num_ctx=3072, # Sets the size of the context window used to generate the next token.
71 | )
72 | logger.info("LLM: Using GPT-3.5")
73 | return ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo", streaming=True)
74 |
75 |
76 | def configure_llm_only_chain(llm):
77 | # LLM only response
78 | template = """
79 | You are a helpful assistant that helps a support agent with answering programming questions.
80 | If you don't know the answer, just say that you don't know, you must not make up an answer.
81 | """
82 | human_template = "{question}"
83 |
84 | chat_prompt = ChatPromptTemplate.from_messages([
85 | SystemMessagePromptTemplate.from_template(template), # The persistent system prompt
86 | MessagesPlaceholder(variable_name="chat_history"), # Where the memory will be stored.
87 | HumanMessagePromptTemplate.from_template(human_template) # Where the human input will be injected
88 | ])
89 |
90 | memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
91 | chain = LLMChain(
92 | llm=llm,
93 | prompt=chat_prompt,
94 | verbose=False,
95 | memory=memory,
96 | )
97 |
98 | def generate_llm_output(
99 | user_input: str, callbacks: List[Any]
100 | ) -> str:
101 | answer = chain.invoke(user_input, config={"callbacks": callbacks})["text"]
102 | return answer
103 |
104 | return generate_llm_output
105 |
106 |
107 | def get_qa_rag_chain(_vectorstore, llm):
108 | # Create qa RAG chain
109 | system_template = """
110 | Use the following pieces of context to answer the question at the end.
111 | The context contains code source files which can be used to answer the question as well as be used as references.
112 | If you don't know the answer, just say that you don't know, don't try to make up an answer.
113 | ----
114 | {summaries}
115 | ----
116 | Generate concise answers with references to code source files at the end of every answer.
117 | """
118 | user_template = "Question:```{question}```"
119 | chat_prompt = ChatPromptTemplate.from_messages([
120 | SystemMessagePromptTemplate.from_template(system_template), # The persistent system prompt
121 | HumanMessagePromptTemplate.from_template(user_template), # Where the human input will be injected
122 | ])
123 | qa_chain = load_qa_with_sources_chain(
124 | llm,
125 | chain_type="stuff",
126 | prompt=chat_prompt,
127 | )
128 | qa = RetrievalQAWithSourcesChain(
129 | combine_documents_chain=qa_chain,
130 | retriever=_vectorstore.as_retriever(search_kwargs={"k": 2}),
131 | reduce_k_below_max_tokens=False,
132 | max_tokens_limit=3375,
133 | return_source_documents=True
134 | )
135 |
136 | return qa
--------------------------------------------------------------------------------
/agent.py:
--------------------------------------------------------------------------------
1 | from langchain.memory import ConversationBufferWindowMemory
2 | from langchain.chains import LLMChain
3 | from langchain.agents import Tool
4 | from langchain.agents import Tool, AgentExecutor, LLMSingleActionAgent, AgentOutputParser
5 | from typing import List, Union
6 | from langchain.schema import AgentAction, AgentFinish, HumanMessage
7 | import re
8 | from langchain.prompts import BaseChatPromptTemplate
9 | from langchain.chains import LLMMathChain
10 |
11 |
12 | # Set up the base template
13 | template = """
14 | Answer the following question as best you can. You have access to the following tools:
15 |
16 | {tools}
17 |
18 | Use the following format:
19 |
20 | \nQuestion: the input question you must answer
21 | \nThought: you should always think about what to do. If you have an answer to the question, then submit the final answer.
22 | \nAction: the action to take, should be one of [{tool_names}]
23 | \nAction Input: the input to the action
24 | \nObservation: the result of the action
25 | \n... (this Thought/Action/Action Input/Observation can repeat N times)
26 | \nThought: I now know the final answer
27 | \nFinal Answer: the final answer to the original input question
28 |
29 | Begin!
30 |
31 | Previous conversation history:
32 | {chat_history}
33 |
34 | New Question: {input}
35 | {agent_scratchpad}"""
36 |
37 |
38 | # Set up a prompt template
39 | class CustomPromptTemplate(BaseChatPromptTemplate):
40 | # The template to use
41 | template: str
42 | # The list of tools available
43 | tools: List[Tool]
44 |
45 | def format_messages(self, **kwargs) -> List[HumanMessage]:
46 | # Get the intermediate steps (AgentAction, Observation tuples)
47 | # Format them in a particular way
48 | intermediate_steps = kwargs.pop("intermediate_steps")
49 | thoughts = ""
50 | for action, observation in intermediate_steps:
51 | thoughts += action.log
52 | thoughts += f"\nObservation: {observation}\nThought: "
53 | # Set the agent_scratchpad variable to that value
54 | kwargs["agent_scratchpad"] = thoughts
55 | # Create a tools variable from the list of tools provided
56 | kwargs["tools"] = "\n".join([f"{tool.name}: {tool.description}" for tool in self.tools])
57 | # Create a list of tool names for the tools provided
58 | kwargs["tool_names"] = ", ".join([tool.name for tool in self.tools])
59 | formatted = self.template.format(**kwargs)
60 | return [HumanMessage(content=formatted)]
61 |
62 | class CustomOutputParser(AgentOutputParser):
63 | def parse(self, llm_output: str) -> Union[AgentAction, AgentFinish]:
64 | # Check if agent should finish
65 | if "Final Answer:" in llm_output:
66 | return AgentFinish(
67 | # Return values is generally always a dictionary with a single `output` key
68 | # It is not recommended to try anything else at the moment :)
69 | return_values={"output": llm_output.split("Final Answer:")[-1].strip()},
70 | log=llm_output,
71 | )
72 | # Parse out the action and action input
73 | regex = r"Action\s*\d*\s*:(.*?)\nAction\s*\d*\s*Input\s*\d*\s*:[\s]*(.*)"
74 | match = re.search(regex, llm_output, re.DOTALL)
75 | if not match:
76 | # raise ValueError(f"Could not parse LLM output: `{llm_output}`")
77 | print(f"Could not parse LLM output, but finishing agent anyway: `{llm_output}`")
78 | return AgentFinish(
79 | # Return values is generally always a dictionary with a single `output` key
80 | # It is not recommended to try anything else at the moment :)
81 | return_values={"output": llm_output.split("Final Answer:")[-1].strip()},
82 | log=llm_output,
83 | )
84 | action = match.group(1).strip()
85 | action_input = match.group(2)
86 | # Return the action and action input
87 | return AgentAction(tool=action, tool_input=action_input.strip(" ").strip('"'), log=llm_output)
88 |
89 | def get_agent_executor(_qa, llm):
90 | # Create agent executor and use qa and math as tools
91 | llm_math_chain = LLMMathChain.from_llm(llm=llm)
92 | tools = [
93 | Tool.from_function(
94 | name = "Code",
95 | func=_qa.run,
96 | description="useful for when you need to answer questions about code"
97 | ),
98 | Tool.from_function(
99 | func=llm_math_chain.run,
100 | name="Calculator",
101 | description="useful for when you need to answer questions about math"
102 | )
103 | ]
104 |
105 | prompt = CustomPromptTemplate(
106 | template=template,
107 | tools=tools,
108 | # This omits the `agent_scratchpad`, `tools`, and `tool_names` variables because those are generated dynamically
109 | # This includes the `intermediate_steps` variable because that is needed
110 | input_variables=["input", "chat_history", "intermediate_steps"]
111 | )
112 | output_parser = CustomOutputParser()
113 | llm_chain = LLMChain(llm=llm, prompt=prompt)
114 | tool_names = [tool.name for tool in tools]
115 | agent = LLMSingleActionAgent(
116 | llm_chain=llm_chain,
117 | handle_parsing_errors=False,
118 | output_parser=output_parser,
119 | stop=["\nObservation:", "Observation:", "\nObservation", "Observation"],
120 | allowed_tools=tool_names,
121 | max_iterations=2,
122 | )
123 | # only retain the last message in the chat history
124 | memory = ConversationBufferWindowMemory(k=1, memory_key="chat_history", return_messages=True)
125 | agent_executor = AgentExecutor.from_agent_and_tools(
126 | agent=agent,
127 | tools=tools,
128 | verbose=True,
129 | memory=memory,
130 | max_iterations=2,
131 | )
132 |
133 | # agent_executor = initialize_agent(
134 | # tools,
135 | # llm,
136 | # agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
137 | # verbose=True,
138 | # handle_parsing_errors=False,
139 | # agent_kwargs={
140 | # 'output_parser': output_parser
141 | # },
142 | # memory=memory,
143 | # max_iterations=2,
144 | # )
145 |
146 | return agent_executor
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Creative Commons Legal Code
2 |
3 | CC0 1.0 Universal
4 |
5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
8 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
12 | HEREUNDER.
13 |
14 | Statement of Purpose
15 |
16 | The laws of most jurisdictions throughout the world automatically confer
17 | exclusive Copyright and Related Rights (defined below) upon the creator
18 | and subsequent owner(s) (each and all, an "owner") of an original work of
19 | authorship and/or a database (each, a "Work").
20 |
21 | Certain owners wish to permanently relinquish those rights to a Work for
22 | the purpose of contributing to a commons of creative, cultural and
23 | scientific works ("Commons") that the public can reliably and without fear
24 | of later claims of infringement build upon, modify, incorporate in other
25 | works, reuse and redistribute as freely as possible in any form whatsoever
26 | and for any purposes, including without limitation commercial purposes.
27 | These owners may contribute to the Commons to promote the ideal of a free
28 | culture and the further production of creative, cultural and scientific
29 | works, or to gain reputation or greater distribution for their Work in
30 | part through the use and efforts of others.
31 |
32 | For these and/or other purposes and motivations, and without any
33 | expectation of additional consideration or compensation, the person
34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she
35 | is an owner of Copyright and Related Rights in the Work, voluntarily
36 | elects to apply CC0 to the Work and publicly distribute the Work under its
37 | terms, with knowledge of his or her Copyright and Related Rights in the
38 | Work and the meaning and intended legal effect of CC0 on those rights.
39 |
40 | 1. Copyright and Related Rights. A Work made available under CC0 may be
41 | protected by copyright and related or neighboring rights ("Copyright and
42 | Related Rights"). Copyright and Related Rights include, but are not
43 | limited to, the following:
44 |
45 | i. the right to reproduce, adapt, distribute, perform, display,
46 | communicate, and translate a Work;
47 | ii. moral rights retained by the original author(s) and/or performer(s);
48 | iii. publicity and privacy rights pertaining to a person's image or
49 | likeness depicted in a Work;
50 | iv. rights protecting against unfair competition in regards to a Work,
51 | subject to the limitations in paragraph 4(a), below;
52 | v. rights protecting the extraction, dissemination, use and reuse of data
53 | in a Work;
54 | vi. database rights (such as those arising under Directive 96/9/EC of the
55 | European Parliament and of the Council of 11 March 1996 on the legal
56 | protection of databases, and under any national implementation
57 | thereof, including any amended or successor version of such
58 | directive); and
59 | vii. other similar, equivalent or corresponding rights throughout the
60 | world based on applicable law or treaty, and any national
61 | implementations thereof.
62 |
63 | 2. Waiver. To the greatest extent permitted by, but not in contravention
64 | of, applicable law, Affirmer hereby overtly, fully, permanently,
65 | irrevocably and unconditionally waives, abandons, and surrenders all of
66 | Affirmer's Copyright and Related Rights and associated claims and causes
67 | of action, whether now known or unknown (including existing as well as
68 | future claims and causes of action), in the Work (i) in all territories
69 | worldwide, (ii) for the maximum duration provided by applicable law or
70 | treaty (including future time extensions), (iii) in any current or future
71 | medium and for any number of copies, and (iv) for any purpose whatsoever,
72 | including without limitation commercial, advertising or promotional
73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
74 | member of the public at large and to the detriment of Affirmer's heirs and
75 | successors, fully intending that such Waiver shall not be subject to
76 | revocation, rescission, cancellation, termination, or any other legal or
77 | equitable action to disrupt the quiet enjoyment of the Work by the public
78 | as contemplated by Affirmer's express Statement of Purpose.
79 |
80 | 3. Public License Fallback. Should any part of the Waiver for any reason
81 | be judged legally invalid or ineffective under applicable law, then the
82 | Waiver shall be preserved to the maximum extent permitted taking into
83 | account Affirmer's express Statement of Purpose. In addition, to the
84 | extent the Waiver is so judged Affirmer hereby grants to each affected
85 | person a royalty-free, non transferable, non sublicensable, non exclusive,
86 | irrevocable and unconditional license to exercise Affirmer's Copyright and
87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the
88 | maximum duration provided by applicable law or treaty (including future
89 | time extensions), (iii) in any current or future medium and for any number
90 | of copies, and (iv) for any purpose whatsoever, including without
91 | limitation commercial, advertising or promotional purposes (the
92 | "License"). The License shall be deemed effective as of the date CC0 was
93 | applied by Affirmer to the Work. Should any part of the License for any
94 | reason be judged legally invalid or ineffective under applicable law, such
95 | partial invalidity or ineffectiveness shall not invalidate the remainder
96 | of the License, and in such case Affirmer hereby affirms that he or she
97 | will not (i) exercise any of his or her remaining Copyright and Related
98 | Rights in the Work or (ii) assert any associated claims and causes of
99 | action with respect to the Work, in either case contrary to Affirmer's
100 | express Statement of Purpose.
101 |
102 | 4. Limitations and Disclaimers.
103 |
104 | a. No trademark or patent rights held by Affirmer are waived, abandoned,
105 | surrendered, licensed or otherwise affected by this document.
106 | b. Affirmer offers the Work as-is and makes no representations or
107 | warranties of any kind concerning the Work, express, implied,
108 | statutory or otherwise, including without limitation warranties of
109 | title, merchantability, fitness for a particular purpose, non
110 | infringement, or the absence of latent or other defects, accuracy, or
111 | the present or absence of errors, whether or not discoverable, all to
112 | the greatest extent permissible under applicable law.
113 | c. Affirmer disclaims responsibility for clearing rights of other persons
114 | that may apply to the Work or any use thereof, including without
115 | limitation any person's Copyright and Related Rights in the Work.
116 | Further, Affirmer disclaims responsibility for obtaining any necessary
117 | consents, permissions or other rights required for any use of the
118 | Work.
119 | d. Affirmer understands and acknowledges that Creative Commons is not a
120 | party to this document and has no duty or obligation with respect to
121 | this CC0 or use of the Work.
122 |
--------------------------------------------------------------------------------
/bot.py:
--------------------------------------------------------------------------------
1 | import os
2 | import streamlit as st
3 | from streamlit.logger import get_logger
4 | # import tkinter as tk
5 | # from tkinter import filedialog
6 | from langchain.callbacks.base import BaseCallbackHandler
7 | from dotenv import load_dotenv
8 | from chains import (
9 | load_llm,
10 | configure_llm_only_chain,
11 | get_qa_rag_chain
12 | )
13 | from langchain.vectorstores.neo4j_vector import Neo4jVector
14 | from langchain.text_splitter import Language
15 | from agent import get_agent_executor
16 | from db import process_documents
17 |
18 | # set page title
19 | st.set_page_config(
20 | page_title="Code Explorer",
21 | page_icon="👨‍💻",
22 | layout="centered",
23 | initial_sidebar_state="expanded",
24 | menu_items={
25 | "About": "GitHub: https://github.com/tobyloki/CodeExplorer"
26 | }
27 | )
28 |
29 | load_dotenv(".env")
30 |
31 | url = os.getenv("NEO4J_URI")
32 | username = os.getenv("NEO4J_USERNAME")
33 | password = os.getenv("NEO4J_PASSWORD")
34 | ollama_base_url = os.getenv("OLLAMA_BASE_URL")
35 | embedding_model_name = os.getenv("EMBEDDING_MODEL")
36 | llm_name = os.getenv("LLM")
37 | # Remapping for Langchain Neo4j integration
38 | os.environ["NEO4J_URL"] = url
39 |
40 | logger = get_logger(__name__)
41 |
42 | @st.cache_resource
43 | def initLLM():
44 | # create llm
45 | llm = load_llm(llm_name, logger=logger, config={"ollama_base_url": ollama_base_url})
46 |
47 | return llm
48 |
49 | llm = initLLM()
50 |
51 | @st.cache_resource
52 | def get_llm_chain():
53 | chain = configure_llm_only_chain(llm)
54 | return chain
55 |
56 | @st.cache_resource
57 | def process_directory(language, directory, count) -> (str, Neo4jVector):
58 | error, vectorstore = process_documents(language, directory)
59 | return (error, vectorstore)
60 |
61 | @st.cache_resource
62 | def get_qa_chain(_vectorstore, count):
63 | qa = get_qa_rag_chain(_vectorstore, llm)
64 | return qa
65 |
66 | @st.cache_resource
67 | def get_agent(_qa, count):
68 | qa = get_agent_executor(_qa, llm)
69 | return qa
70 |
71 | class StreamHandler(BaseCallbackHandler):
72 | def __init__(self, container, initial_text=""):
73 | self.container = container
74 | self.text = initial_text
75 |
76 | def on_llm_new_token(self, token: str, **kwargs) -> None:
77 | # if token.endswith('?'):
78 | # token += '\n\n\n'
79 | # token = token.replace('"', '')
80 | self.text += token
81 | self.container.markdown(self.text)
82 |
83 | def main():
84 | qa = None
85 | agent = None
86 | llm_chain = get_llm_chain()
87 |
88 | if "language" not in st.session_state:
89 | st.session_state[f"language"] = None
90 | if "directory" not in st.session_state:
91 | st.session_state[f"directory"] = None
92 | if "detailedMode" not in st.session_state:
93 | st.session_state[f"detailedMode"] = True
94 | if "vectorstoreCount" not in st.session_state: # only incremented to reset cache for processDocuments()
95 | st.session_state[f"vectorstoreCount"] = 0
96 | if "qaCount" not in st.session_state: # only incremented to reset cache for get_qa_rag_chain()
97 | st.session_state[f"qaCount"] = 0
98 | if "user_input" not in st.session_state:
99 | st.session_state[f"user_input"] = []
100 | if "generated" not in st.session_state:
101 | st.session_state[f"generated"] = []
102 |
103 | # # Set up tkinter
104 | # root = tk.Tk()
105 | # root.withdraw()
106 |
107 | # # Make folder picker dialog appear on top of other windows
108 | # root.wm_attributes('-topmost', 1)
109 |
110 | # sidebar
111 | with st.sidebar:
112 | # Convert enum values to a list of strings
113 | languages_list = [lang.value for lang in Language]
114 | default_index = languages_list.index(Language.PYTHON)
115 | languageSelected = st.selectbox(
116 | 'Select language',
117 | languages_list,
118 | index=default_index
119 | )
120 |
121 | # show folder picker dialog
122 | # st.title('Select Folder')
123 | # folderClicked = st.button('Folder Picker')
124 |
125 | currentPath = os.getcwd()
126 | directory = st.text_input('Enter folder path', currentPath)
127 | directory = directory.strip()
128 |
129 | processBtnClicked = st.button('Process files')
130 | if processBtnClicked:
131 | if not os.path.exists(directory):
132 | st.error("Path doesn't exist!")
133 | else:
134 | # directory = filedialog.askdirectory(master=root)
135 | if isinstance(directory, str) and directory:
136 | st.session_state[f"language"] = languageSelected
137 | st.session_state[f"directory"] = directory
138 | st.session_state[f"vectorstoreCount"] += 1
139 | st.session_state[f"qaCount"] += 1
140 | st.session_state[f"user_input"] = []
141 | st.session_state[f"generated"] = []
142 |
143 | # show folder selected
144 | if st.session_state[f"directory"]:
145 | st.code(st.session_state[f"directory"])
146 |
147 | error, vectorstore = process_directory(st.session_state[f"language"], st.session_state[f"directory"], st.session_state[f"vectorstoreCount"])
148 |
149 | if error:
150 | st.error(error)
151 | elif vectorstore:
152 | qa = get_qa_chain(vectorstore, st.session_state[f"qaCount"])
153 | agent = get_agent(qa, st.session_state[f"qaCount"])
154 |
155 | # show clear chat history button
156 | clearMemoryClicked = st.button("🧹 Reset chat history")
157 | if clearMemoryClicked:
158 | st.session_state[f"qaCount"] += 1
159 | st.session_state[f"user_input"] = []
160 | st.session_state[f"generated"] = []
161 |
162 | qa = get_qa_chain(vectorstore, st.session_state[f"qaCount"])
163 | agent = get_agent(qa, st.session_state[f"qaCount"])
164 |
165 | # show toggle to switch between qa and agent mode
166 | detailedMode = st.toggle('Detailed mode', value=True)
167 | st.session_state[f"detailedMode"] = detailedMode
168 |
169 | # load previous chat history
170 | if st.session_state[f"generated"]:
171 | size = len(st.session_state[f"generated"])
172 | # Display all exchanges
173 | for i in range(0, size):
174 | with st.chat_message("user"):
175 | st.write(st.session_state[f"user_input"][i])
176 | with st.chat_message("assistant"):
177 | st.write(st.session_state[f"generated"][i])
178 |
179 | # user chat
180 | user_input = st.chat_input("What coding issue can I help you resolve today?")
181 | if user_input:
182 | with st.chat_message("user"):
183 | st.write(user_input)
184 | st.session_state[f"user_input"].append(user_input)
185 | with st.chat_message("assistant"):
186 | with st.spinner("Generating..."):
187 | stream_handler = StreamHandler(st.empty())
188 | if qa:
189 | if st.session_state[f"detailedMode"]:
190 | print("Using QA")
191 | result = qa(
192 | {"question": user_input},
193 | callbacks=[stream_handler]
194 | )
195 | answer = result["answer"]
196 | else:
197 | print("Using Agent")
198 | result = agent(
199 | {"input": user_input},
200 | callbacks=[stream_handler]
201 | )
202 | answer = result["output"]
203 |
204 | # print("result:", result)
205 | else:
206 | print("Using LLM only")
207 | answer = llm_chain(
208 | {"question": user_input},
209 | callbacks=[stream_handler]
210 | )
211 |
212 | # answer = answer.replace('"', '')
213 | st.session_state[f"generated"].append(answer)
214 |
215 |
216 | if __name__ == "__main__":
217 | main()
218 |
219 |
220 |
221 |
--------------------------------------------------------------------------------