├── .env
├── .gitignore
├── README.md
├── main.py
├── requirements.txt
├── streamlit_app.py
└── utils.py

/.env:
--------------------------------------------------------------------------------
OPENAI_API_KEY=your_openai_api_key_here
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Python
__pycache__/
*.py[cod]
*$py.class

# Virtual Environment
venv/
env/
.env

# IDEs and Editors
.vscode/
.idea/
*.swp
*.swo

# OS generated files
.DS_Store
Thumbs.db

# Project specific
.aider*
*.log
*.sqlite

# Jupyter Notebooks
.ipynb_checkpoints

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Streamlit
.streamlit/

# LangChain related
vectorstore/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Web Page Question Answering App

This application allows users to ask questions about the content of a web page. It combines web scraping, text processing, and AI-powered question answering to provide informative responses based on the content of any given web page.

## Features

- Web page content extraction
- Text processing and vectorization
- AI-powered question answering using GPT-4
- Command-line interface
- Web-based user interface using Streamlit

## Technologies Used

- Python 3.8+
- LangChain: For building the retrieval-augmented generation (RAG) pipeline
- OpenAI GPT-4: For natural language understanding and generation
- Chroma: For vector storage and similarity search
- BeautifulSoup4: For web scraping
- Streamlit: For the web-based user interface

## Setup and Installation

1. Clone the repository:
   ```
   git clone https://github.com/timkitch/yt-coding-assistants-rag-tutorial.git
   cd yt-coding-assistants-rag-tutorial
   ```

2. Install the required dependencies:
   ```
   pip install -r requirements.txt
   ```

3. Set up your OpenAI API key:
   - Create a `.env` file in the project root
   - Add your OpenAI API key: `OPENAI_API_KEY=your_api_key_here`

## Usage

### Command-line Interface

Run the main script:

```
python main.py
```

Follow the prompts to enter a URL and ask questions about the web page content.
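
For example, a session might look like this (the URL and the answer text are illustrative and will vary with the page you choose):

```
Please enter the URL of the web page: https://example.com
Vector store created successfully.
Ask a question about the web page (or type 'quit' to exit): What is this page about?

Answer:
The page explains that this domain is reserved for use in documentation examples.

Ask a question about the web page (or type 'quit' to exit): quit
```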

### Web Interface

Launch the Streamlit app:

```
streamlit run streamlit_app.py
```

Open your web browser and navigate to the URL provided by Streamlit (usually `http://localhost:8501`).

1. Enter the URL of the web page you want to analyze.
2. Wait for the page to be processed.
3. Enter your questions in the text input field.
4. View the AI-generated answers based on the web page content.

## Project Structure

- `main.py`: Contains the core logic for document processing, vector store creation, and RAG chain setup.
- `utils.py`: Utility functions for user input and output handling.
- `streamlit_app.py`: Streamlit web application interface.
- `requirements.txt`: List of Python dependencies.
- `.env`: Configuration file for environment variables (not tracked in git).

## Contributing

Contributions are welcome! Please feel free to submit a Pull Request.

## License

This project is open source and available under the [MIT License](LICENSE).
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
from dotenv import load_dotenv
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from utils import get_user_input, display_answer, get_user_question

# Load environment variables (OPENAI_API_KEY) from the .env file
load_dotenv()

def main():
    url = get_user_input()
    documents = load_and_process_document(url)
    vectorstore = create_vectorstore(documents)
    print("Vector store created successfully.")

    retriever = vectorstore.as_retriever()
    rag_chain = create_rag_chain(retriever)

    # Question-answering loop: keep answering until the user types 'quit'.
    while True:
        question = get_user_question()
        if question.lower() == 'quit':
            break
        response = rag_chain.invoke({"input": question})
        display_answer(response['answer'])

def load_and_process_document(url):
    # Fetch the page and split it into overlapping chunks for retrieval.
    loader = WebBaseLoader(url)
    documents = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(documents)
    return splits

def create_vectorstore(documents):
    # Embed the chunks and store them in a Chroma collection.
    embeddings = OpenAIEmbeddings()
    vectorstore = Chroma.from_documents(documents, embeddings)
    return vectorstore

def create_rag_chain(retriever):
    llm = ChatOpenAI(model="gpt-4")

    prompt = ChatPromptTemplate.from_template("""Answer the following question based only on the provided context:

Context: {context}

Question: {input}

Answer: """)

    # "Stuff" the retrieved chunks into the prompt, then wrap it in a retrieval chain.
    document_chain = create_stuff_documents_chain(llm, prompt)

    return create_retrieval_chain(retriever, document_chain)

if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
langchain==0.2.0
langchain-community
langchain-openai
chromadb
beautifulsoup4
python-dotenv
streamlit
--------------------------------------------------------------------------------
/streamlit_app.py:
--------------------------------------------------------------------------------
import streamlit as st
from main import load_and_process_document, create_vectorstore, create_rag_chain

def main():
    # Custom CSS (placeholder styles for the answer box; adjust to taste)
    st.markdown("""
        <style>
        .answer-box {
            background-color: #f0f2f6;
            padding: 1rem;
            border-radius: 0.5rem;
        }
        </style>
        """, unsafe_allow_html=True)

    st.title("🌐 Web Page Question Answering")

    url = st.text_input("🔗 Enter the URL of the web page:")

    if url:
        # Note: Streamlit reruns this script on every interaction, so the page is
        # re-processed for each question; st.cache_resource could avoid that.
        with st.spinner("🔍 Loading and processing the web page..."):
            documents = load_and_process_document(url)
            vectorstore = create_vectorstore(documents)
            retriever = vectorstore.as_retriever()
            rag_chain = create_rag_chain(retriever)

        st.success("✅ Web page processed successfully!")

        question = st.text_input("❓ Ask a question about the web page:")

        if question:
            with st.spinner("🤔 Generating answer..."):
                response = rag_chain.invoke({"input": question})

            st.subheader("💡 Answer:")
            # Render the answer inside the styled container defined above.
            st.markdown(f"<div class='answer-box'>{response['answer']}</div>", unsafe_allow_html=True)

if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
def get_user_input():
    return input("Please enter the URL of the web page: ")

def get_user_question():
    return input("Ask a question about the web page (or type 'quit' to exit): ")

def display_answer(answer):
    print("\nAnswer:")
    print(answer)
    print()
--------------------------------------------------------------------------------