├── backend
│   ├── __init__.py
│   └── core.py
├── consts.py
├── .gitattributes
├── .idea
│   ├── .gitignore
│   ├── vcs.xml
│   ├── inspectionProfiles
│   │   ├── profiles_settings.xml
│   │   └── Project_Default.xml
│   ├── modules.xml
│   ├── documentation-helper.iml
│   └── misc.xml
├── static
│   ├── banner.gif
│   ├── Tavily Logo.png
│   ├── LangChain Logo.png
│   ├── Trimmed Padded Langchain.png
│   └── Tavily Logo Trimmed Padded.png
├── .streamlit
│   └── config.toml
├── Pipfile
├── logger.py
├── .gitignore
├── main.py
├── ingestion.py
├── README.md
├── LICENSE
├── Tavily Crawl Demo Tutorial.ipynb
└── Tavily Demo Tutorial.ipynb
/backend/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/consts.py:
--------------------------------------------------------------------------------
1 | INDEX_NAME = "langchain-docs-2025"
2 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.ipynb filter=nbstripout
2 | *.ipynb diff=ipynb
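3 |
4 | # Note (added for clarity): the filter/diff drivers above take effect only if
5 | # nbstripout is installed and registered locally, e.g. via `nbstripout --install`.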
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 |
--------------------------------------------------------------------------------
/static/banner.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/emarco177/documentation-helper/HEAD/static/banner.gif
--------------------------------------------------------------------------------
/static/Tavily Logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/emarco177/documentation-helper/HEAD/static/Tavily Logo.png
--------------------------------------------------------------------------------
/static/LangChain Logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/emarco177/documentation-helper/HEAD/static/LangChain Logo.png
--------------------------------------------------------------------------------
/static/Trimmed Padded Langchain.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/emarco177/documentation-helper/HEAD/static/Trimmed Padded Langchain.png
--------------------------------------------------------------------------------
/static/Tavily Logo Trimmed Padded.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/emarco177/documentation-helper/HEAD/static/Tavily Logo Trimmed Padded.png
--------------------------------------------------------------------------------
/.streamlit/config.toml:
--------------------------------------------------------------------------------
1 | [theme]
2 | primaryColor = "#4CAF50"
3 | backgroundColor = "#1E1E1E"
4 | secondaryBackgroundColor = "#252526"
5 | textColor = "#FFFFFF"
6 | font = "sans serif"
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
(XML content not captured in this dump)
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
(XML content not captured in this dump)
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
(XML content not captured in this dump)
--------------------------------------------------------------------------------
/.idea/documentation-helper.iml:
--------------------------------------------------------------------------------
(XML content not captured in this dump)
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
(XML content not captured in this dump)
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
(XML content not captured in this dump)
--------------------------------------------------------------------------------
/Pipfile:
--------------------------------------------------------------------------------
1 | [[source]]
2 | url = "https://pypi.org/simple"
3 | verify_ssl = true
4 | name = "pypi"
5 |
6 | [packages]
7 | beautifulsoup4 = "*"
8 | black = "*"
9 | tiktoken = "*"
10 | openai = "*"
11 | unstructured = "*"
12 | nltk = "*"
13 | fastapi = "*"
14 | jinja2 = "*"
15 | uvicorn = "*"
16 | streamlit = "*"
17 | streamlit-chat = "*"
18 | tqdm = "*"
19 | isort = "*"
20 | langchainhub = "*"
21 | firecrawl-py = "*"
22 | langchain-community = "*"
23 | langsmith = "*"
24 | pillow = "*"
25 | certifi = "*"
26 | langchain-chroma = "*"
27 | python-dotenv = "*"
28 | langchain-tavily = "*"
29 | langchain-pinecone = "*"
30 | # Added: these distributions are imported by backend/core.py and ingestion.py
31 | # (langchain_openai, langchain_classic) but were missing from the manifest.
32 | langchain-openai = "*"
33 | langchain-classic = "*"
30 |
31 | [dev-packages]
32 | ipykernel = "*"
33 |
34 | [requires]
35 | python_version = "3.11"
36 | python_full_version = "3.11.0"
37 |
38 | [pipenv]
39 | allow_prereleases = true
40 |
--------------------------------------------------------------------------------
/logger.py:
--------------------------------------------------------------------------------
1 | # Color codes for better logging
2 | class Colors:
3 | PURPLE = "\033[95m"
4 | CYAN = "\033[96m"
5 | DARKCYAN = "\033[36m"
6 | BLUE = "\033[94m"
7 | GREEN = "\033[92m"
8 | YELLOW = "\033[93m"
9 | RED = "\033[91m"
10 | BOLD = "\033[1m"
11 | UNDERLINE = "\033[4m"
12 | END = "\033[0m"
13 |
14 |
15 | def log_info(message: str, color: str = Colors.CYAN):
16 | """Log info message with color"""
17 | print(f"{color}ℹ️ {message}{Colors.END}")
18 |
19 |
20 | def log_success(message: str):
21 | """Log success message in green"""
22 | print(f"{Colors.GREEN}✅ {message}{Colors.END}")
23 |
24 |
25 | def log_error(message: str):
26 | """Log error message in red"""
27 | print(f"{Colors.RED}❌ {message}{Colors.END}")
28 |
29 |
30 | def log_warning(message: str):
31 | """Log warning message in yellow"""
32 | print(f"{Colors.YELLOW}⚠️ {message}{Colors.END}")
33 |
34 |
35 | def log_header(message: str):
36 | """Log header message with emphasis"""
37 | print(f"\n{Colors.BOLD}{Colors.PURPLE}{'='*60}{Colors.END}")
38 | print(f"{Colors.BOLD}{Colors.PURPLE}🚀 {message}{Colors.END}")
39 | print(f"{Colors.BOLD}{Colors.PURPLE}{'='*60}{Colors.END}\n")
40 |
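41 |
42 | if __name__ == "__main__":
43 |     # Illustrative usage sketch (not part of the original module): running
44 |     # `python logger.py` prints one sample line per log level.
45 |     log_header("Logger demo")
46 |     log_info("Crawling documentation site")
47 |     log_success("Batch 1/3 indexed")
48 |     log_warning("Batch 2/3 retried after timeout")
49 |     log_error("Batch 3/3 failed")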
--------------------------------------------------------------------------------
/backend/core.py:
--------------------------------------------------------------------------------
1 | from dotenv import load_dotenv
2 |
3 | load_dotenv()
4 | from operator import itemgetter
5 | from typing import Any, Dict, List, Optional
5 |
6 | from langchain import hub
7 | from langchain_chroma import Chroma
8 | from langchain_classic.chains.combine_documents import \
9 | create_stuff_documents_chain
10 | from langchain_classic.chains.history_aware_retriever import \
11 | create_history_aware_retriever
12 | from langchain_classic.chains.retrieval import create_retrieval_chain
13 | from langchain_core.output_parsers import StrOutputParser
14 | from langchain_core.runnables import RunnablePassthrough
15 | from langchain_openai import ChatOpenAI, OpenAIEmbeddings
16 |
17 | from consts import INDEX_NAME
18 |
19 | embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
20 | chroma = Chroma(persist_directory="chroma_db", embedding_function=embeddings)
21 |
22 |
23 | def run_llm(query: str, chat_history: Optional[List[Dict[str, Any]]] = None):
24 |     # Avoid the mutable-default-argument pitfall: normalize to an empty history.
25 |     chat_history = chat_history or []
24 | embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
25 | docsearch = Chroma(persist_directory="chroma_db", embedding_function=embeddings)
26 | chat = ChatOpenAI(verbose=True, temperature=0)
27 |
28 | rephrase_prompt = hub.pull("langchain-ai/chat-langchain-rephrase")
29 |
30 | retrieval_qa_chat_prompt = hub.pull("langchain-ai/retrieval-qa-chat")
31 | stuff_documents_chain = create_stuff_documents_chain(chat, retrieval_qa_chat_prompt)
32 |
33 | history_aware_retriever = create_history_aware_retriever(
34 | llm=chat, retriever=docsearch.as_retriever(), prompt=rephrase_prompt
35 | )
36 | qa = create_retrieval_chain(
37 | retriever=history_aware_retriever, combine_docs_chain=stuff_documents_chain
38 | )
39 |
40 | result = qa.invoke(input={"input": query, "chat_history": chat_history})
41 | return result
42 |
43 |
44 | def format_docs(docs):
45 | return "\n\n".join(doc.page_content for doc in docs)
46 |
47 |
48 | def run_llm2(query: str, chat_history: Optional[List[Dict[str, Any]]] = None):
49 |     # Avoid the mutable-default-argument pitfall: normalize to an empty history.
50 |     chat_history = chat_history or []
49 | embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
50 | docsearch = Chroma(persist_directory="chroma_db", embedding_function=embeddings)
51 | chat = ChatOpenAI(model="gpt-4o-mini", verbose=True, temperature=0)
52 |
53 | rephrase_prompt = hub.pull("langchain-ai/chat-langchain-rephrase")
54 |
55 | retrieval_qa_chat_prompt = hub.pull("langchain-ai/retrieval-qa-chat")
56 |
57 |     rag_chain = (
58 |         {
59 |             # `chain` below assigns the retrieved documents to "context" before
60 |             # invoking this chain, so we format those documents here instead of
61 |             # re-running the retriever on the whole input dict (a retriever
62 |             # expects a query string, not a dict).
63 |             "context": lambda x: format_docs(x["context"]),
64 |             "input": itemgetter("input"),
65 |         }
66 |         | retrieval_qa_chat_prompt
67 |         | chat
68 |         | StrOutputParser()
69 |     )
66 |
67 | retrieve_docs_chain = (lambda x: x["input"]) | docsearch.as_retriever()
68 |
69 | chain = RunnablePassthrough.assign(context=retrieve_docs_chain).assign(
70 | answer=rag_chain
71 | )
72 |
73 | result = chain.invoke({"input": query, "chat_history": chat_history})
74 | return result
75 |
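76 |
77 | if __name__ == "__main__":
78 |     # Illustrative sketch (assumes ingestion.py has already populated ./chroma_db
79 |     # and that OPENAI_API_KEY is set): run a single query through the chain.
80 |     res = run_llm(query="What is a retriever in LangChain?")
81 |     print(res["answer"])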
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 | langchain-docs/
162 | chroma_db/
163 | .DS_Store
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | from dotenv import load_dotenv
2 |
3 | load_dotenv()
4 | from typing import Set
5 |
6 | import streamlit as st
7 |
8 | from backend.core import run_llm
9 |
10 | st.set_page_config(
11 |     page_title="LangChain Documentation Helper",
12 | page_icon="🧊",
13 | layout="wide",
14 | initial_sidebar_state="expanded",
15 | )
16 | import hashlib
17 | from io import BytesIO
18 |
19 | import requests
20 | from PIL import Image
21 |
22 |
23 | def create_sources_string(source_urls: Set[str]) -> str:
24 | if not source_urls:
25 | return ""
26 | sources_list = list(source_urls)
27 | sources_list.sort()
28 | sources_string = "sources:\n"
29 | for i, source in enumerate(sources_list):
30 | sources_string += f"{i+1}. {source}\n"
31 | return sources_string
32 |
33 |
34 | # Helper: fetch a Gravatar-based profile picture for the sidebar
35 | def get_profile_picture(email):
36 |     # Gravatar expects the MD5 hex digest of the trimmed, lowercased email.
37 |     # Python's built-in hash() is process-salted and would never match a
38 |     # Gravatar record, so hashlib.md5 is used instead.
39 |     email_hash = hashlib.md5(email.strip().lower().encode("utf-8")).hexdigest()
40 |     gravatar_url = f"https://www.gravatar.com/avatar/{email_hash}?d=identicon&s=200"
41 |     response = requests.get(gravatar_url)
42 |     img = Image.open(BytesIO(response.content))
43 |     return img
42 |
43 |
44 | # Custom CSS for dark theme and modern look
45 | st.markdown(
46 | """
47 |     <!-- custom dark-theme CSS rules were stripped from this dump -->
67 | """,
68 | unsafe_allow_html=True,
69 | )
70 |
72 |
73 |
74 | # Sidebar user information
75 | with st.sidebar:
76 | st.title("User Profile")
77 |
78 | # You can replace these with actual user data
79 | user_name = "John Doe"
80 | user_email = "john.doe@example.com"
81 |
82 | profile_pic = get_profile_picture(user_email)
83 | st.image(profile_pic, width=150)
84 | st.write(f"**Name:** {user_name}")
85 | st.write(f"**Email:** {user_email}")
86 |
87 | st.header("LangChain🦜🔗 Udemy Course- Helper Bot")
88 |
89 | # Initialize session state
90 | if "chat_answers_history" not in st.session_state:
91 | st.session_state["chat_answers_history"] = []
92 | st.session_state["user_prompt_history"] = []
93 | st.session_state["chat_history"] = []
94 |
95 | # Create two columns for a more modern layout
96 | col1, col2 = st.columns([2, 1])
97 |
98 | with col1:
99 | prompt = st.text_input("Prompt", placeholder="Enter your message here...")
100 |
101 | with col2:
102 | if st.button("Submit", key="submit"):
103 | prompt = prompt or "Hello" # Default message if input is empty
104 |
105 | if prompt:
106 | with st.spinner("Generating response..."):
107 | generated_response = run_llm(
108 | query=prompt, chat_history=st.session_state["chat_history"]
109 | )
110 |
111 | sources = set(doc.metadata["source"] for doc in generated_response["context"])
112 | formatted_response = (
113 | f"{generated_response['answer']} \n\n {create_sources_string(sources)}"
114 | )
115 |
116 | st.session_state["user_prompt_history"].append(prompt)
117 | st.session_state["chat_answers_history"].append(formatted_response)
118 | st.session_state["chat_history"].append(("human", prompt))
119 | st.session_state["chat_history"].append(("ai", generated_response["answer"]))
120 |
121 | # Display chat history
122 | if st.session_state["chat_answers_history"]:
123 | for generated_response, user_query in zip(
124 | st.session_state["chat_answers_history"],
125 | st.session_state["user_prompt_history"],
126 | ):
127 | st.chat_message("user").write(user_query)
128 | st.chat_message("assistant").write(generated_response)
129 |
130 |
131 | # Add a footer
132 | st.markdown("---")
133 | st.markdown("Powered by LangChain and Streamlit")
134 |
--------------------------------------------------------------------------------
/ingestion.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import os
3 | import ssl
4 | from typing import Any, Dict, List
5 |
6 | import certifi
7 | from dotenv import load_dotenv
8 | from langchain_chroma import Chroma
9 | from langchain_classic.text_splitter import RecursiveCharacterTextSplitter
10 | from langchain_core.documents import Document
11 | from langchain_openai import OpenAIEmbeddings
12 | from langchain_pinecone import PineconeVectorStore
13 | from langchain_tavily import TavilyCrawl, TavilyExtract, TavilyMap
14 |
15 | from logger import (Colors, log_error, log_header, log_info, log_success,
16 | log_warning)
17 |
18 | load_dotenv()
19 |
20 | # Configure SSL context to use certifi certificates
21 | ssl_context = ssl.create_default_context(cafile=certifi.where())
22 | os.environ["SSL_CERT_FILE"] = certifi.where()
23 | os.environ["REQUESTS_CA_BUNDLE"] = certifi.where()
24 |
25 |
26 | embeddings = OpenAIEmbeddings(
27 | model="text-embedding-3-small",
28 | show_progress_bar=False,
29 | chunk_size=50,
30 | retry_min_seconds=10,
31 | )
32 | vectorstore = Chroma(persist_directory="chroma_db", embedding_function=embeddings)
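33 | # Optional swap (not enabled here): to index into Pinecone instead of the local
34 | # Chroma store, comment out the Chroma line above and uncomment the block below.
35 | # This path requires PINECONE_API_KEY in the environment.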
33 | # vectorstore = PineconeVectorStore(
34 | # index_name="langchain-docs-2025", embedding=embeddings
35 | # )
36 | tavily_extract = TavilyExtract()
37 | tavily_map = TavilyMap(max_depth=5, max_breadth=20, max_pages=1000)
38 | tavily_crawl = TavilyCrawl()
39 |
40 |
41 | async def index_documents_async(documents: List[Document], batch_size: int = 50):
42 | """Process documents in batches asynchronously."""
43 | log_header("VECTOR STORAGE PHASE")
44 | log_info(
45 | f"📚 VectorStore Indexing: Preparing to add {len(documents)} documents to vector store",
46 | Colors.DARKCYAN,
47 | )
48 |
49 | # Create batches
50 | batches = [
51 | documents[i : i + batch_size] for i in range(0, len(documents), batch_size)
52 | ]
53 |
54 | log_info(
55 | f"📦 VectorStore Indexing: Split into {len(batches)} batches of {batch_size} documents each"
56 | )
57 |
58 | # Process all batches concurrently
59 | async def add_batch(batch: List[Document], batch_num: int):
60 | try:
61 | await vectorstore.aadd_documents(batch)
62 | log_success(
63 | f"VectorStore Indexing: Successfully added batch {batch_num}/{len(batches)} ({len(batch)} documents)"
64 | )
65 | except Exception as e:
66 | log_error(f"VectorStore Indexing: Failed to add batch {batch_num} - {e}")
67 | return False
68 | return True
69 |
70 | # Process batches concurrently
71 | tasks = [add_batch(batch, i + 1) for i, batch in enumerate(batches)]
72 | results = await asyncio.gather(*tasks, return_exceptions=True)
73 |
74 | # Count successful batches
75 | successful = sum(1 for result in results if result is True)
76 |
77 | if successful == len(batches):
78 | log_success(
79 | f"VectorStore Indexing: All batches processed successfully! ({successful}/{len(batches)})"
80 | )
81 | else:
82 | log_warning(
83 | f"VectorStore Indexing: Processed {successful}/{len(batches)} batches successfully"
84 | )
85 |
86 |
87 | async def main():
88 | """Main async function to orchestrate the entire process."""
89 | log_header("DOCUMENTATION INGESTION PIPELINE")
90 |
91 | log_info(
92 | "🗺️ TavilyCrawl: Starting to crawl the documentation site",
93 | Colors.PURPLE,
94 | )
95 | # Crawl the documentation site
96 |
97 | res = tavily_crawl.invoke(
98 | {
99 | "url": "https://python.langchain.com/",
100 | "max_depth": 2,
101 | "extract_depth": "advanced",
102 | }
103 | )
104 |
105 | # Convert Tavily crawl results to LangChain Document objects
106 | all_docs = []
107 | for tavily_crawl_result_item in res["results"]:
108 | log_info(
109 | f"TavilyCrawl: Successfully crawled {tavily_crawl_result_item['url']} from documentation site"
110 | )
111 | all_docs.append(
112 | Document(
113 | page_content=tavily_crawl_result_item["raw_content"],
114 | metadata={"source": tavily_crawl_result_item["url"]},
115 | )
116 | )
117 |
118 | # Split documents into chunks
119 | log_header("DOCUMENT CHUNKING PHASE")
120 | log_info(
121 | f"✂️ Text Splitter: Processing {len(all_docs)} documents with 4000 chunk size and 200 overlap",
122 | Colors.YELLOW,
123 | )
124 | text_splitter = RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=200)
125 | splitted_docs = text_splitter.split_documents(all_docs)
126 | log_success(
127 | f"Text Splitter: Created {len(splitted_docs)} chunks from {len(all_docs)} documents"
128 | )
129 |
130 | # Process documents asynchronously
131 | await index_documents_async(splitted_docs, batch_size=500)
132 |
133 | log_header("PIPELINE COMPLETE")
134 | log_success("🎉 Documentation ingestion pipeline finished successfully!")
135 | log_info("📊 Summary:", Colors.BOLD)
136 | log_info(f" • Documents extracted: {len(all_docs)}")
137 | log_info(f" • Chunks created: {len(splitted_docs)}")
138 |
139 |
140 | if __name__ == "__main__":
141 | asyncio.run(main())
142 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # 🦜 LangChain Documentation Helper
3 |
4 |
5 |
6 | **An intelligent documentation assistant powered by LangChain and vector search**
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 | [Python](https://www.python.org/downloads/)
16 | [LangChain](https://langchain.com/)
17 | [Streamlit](https://streamlit.io/)
18 | [Pinecone](https://pinecone.io/)
19 | [Tavily](https://app.tavily.com/home?utm_campaign=eden_marco&utm_medium=socials&utm_source=linkedin)
20 | [License](LICENSE)
21 |
22 | [Udemy Course](https://www.udemy.com/course/langchain/?couponCode=DEC-2025)
23 |
24 |
25 |
26 | ## 🎯 Overview
27 |
28 | The **LangChain Documentation Helper** is an AI-powered web application that serves as a slim version of [chat.langchain.com](https://chat.langchain.com/). It answers questions about the LangChain documentation using Retrieval-Augmented Generation (RAG), enhanced with web crawling and conversational memory.
29 |
30 | ### ✨ Key Features
31 |
32 | **RAG Pipeline Flow** (see the code sketch after the list):
33 |
34 | 1. 🌐 **Web Crawling**: Real-time web scraping and content extraction using Tavily's advanced crawling capabilities
35 | 2. 📚 **Document Processing**: Intelligent chunking and preprocessing of LangChain documentation
36 | 3. 🔍 **Vector Storage**: Embedding and indexing in a local Chroma store (with optional Pinecone support) for fast similarity search
37 | 4. 🎯 **Intelligent Retrieval**: Context-aware document retrieval based on user queries
38 | 5. 🧩 **Memory System**: Conversational memory for coreference resolution and context continuity
39 | 6. 🧠 **Context-Aware Generation**: Provides accurate, contextual answers with source citations
40 | 7. 💬 **Interactive Interface**: User-friendly chat interface powered by Streamlit
41 | 8. 🚀 **Real-time Processing**: Fast end-to-end pipeline from query to response
42 |
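43 | Once `ingestion.py` has populated the local Chroma store, the whole pipeline can be exercised directly from Python. A minimal sketch (the query string is illustrative; it assumes the default `chroma_db` directory and a valid `OPENAI_API_KEY`):
44 |
45 | ```python
46 | from backend.core import run_llm
47 |
48 | result = run_llm(query="What is a retriever in LangChain?", chat_history=[])
49 | print(result["answer"])        # generated answer
50 | for doc in result["context"]:  # retrieved source documents
51 |     print(doc.metadata["source"])
52 | ```
53 |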
43 | ## 🎬 Demo
44 |
45 |
46 | ![Demo](static/banner.gif)
47 | *Interactive demo showing the LangChain Documentation Helper in action*
48 |
49 |
50 | ## 🛠️ Tech Stack
51 |
52 |
53 |
54 | | Component | Technology | Description |
55 | |-----------|------------|-------------|
56 | | 🖥️ **Frontend** | Streamlit | Interactive web interface |
57 | | 🧠 **AI Framework** | LangChain 🦜🔗 | Orchestrates the AI pipeline |
58 | | 🔍 **Vector Database** | Chroma / Pinecone 🌲 | Stores and retrieves document embeddings (local Chroma by default) |
59 | | 🌐 **Web Crawling** | Tavily | Intelligent web scraping and content extraction |
60 | | 🧩 **Memory** | Conversational Memory | Coreference resolution and context continuity |
61 | | 🤖 **LLM** | OpenAI GPT | Powers the conversational AI |
62 | | 🐍 **Backend** | Python | Core application logic |
63 |
64 |
65 |
66 | ## 🚀 Quick Start
67 |
68 | ### Prerequisites
69 |
70 | - Python 3.11 (the version pinned in the Pipfile)
71 | - OpenAI API key
72 | - Pinecone API key (optional; the default setup uses a local Chroma store)
73 | - [Tavily API key](https://app.tavily.com/home?utm_campaign=eden_marco&utm_medium=socials&utm_source=linkedin) (required - for documentation crawling and web search)
74 |
75 | ### Installation
76 |
77 | 1. **Clone the repository**
78 | ```bash
79 | git clone https://github.com/emarco177/documentation-helper.git
80 | cd documentation-helper
81 | ```
82 |
83 | 2. **Set up environment variables**
84 |
85 | Create a `.env` file in the root directory:
86 | ```env
87 | # Optional - only needed when using the Pinecone vector store
88 | PINECONE_API_KEY=your_pinecone_api_key_here
88 | OPENAI_API_KEY=your_openai_api_key_here
89 | TAVILY_API_KEY=your_tavily_api_key_here # Required - for documentation crawling
90 | ```
91 |
92 | 3. **Install dependencies**
93 | ```bash
94 | pipenv install
95 | ```
96 |
97 | 4. **Ingest LangChain Documentation** (Run the ingestion pipeline)
98 | ```bash
99 | pipenv run python ingestion.py  # Uses Tavily to crawl and index documentation
100 | ```
101 |
102 | 5. **Run the application**
103 | ```bash
104 | pipenv run streamlit run main.py
105 | ```
106 |
107 | 6. **Open your browser** and navigate to `http://localhost:8501`
108 |
109 | ## 🧪 Testing
110 |
111 | The repository does not ship with automated tests yet. If you add some (pytest is a natural fit; add it to `[dev-packages]` in the Pipfile), run them with:
112 |
113 | ```bash
114 | pipenv run pytest .
115 | ```
116 |
117 | ## 📁 Project Structure
118 |
119 | ```
120 | documentation-helper/
121 | ├── backend/ # Core backend logic
122 | │ ├── __init__.py
123 | │ └── core.py
124 | ├── static/ # Static assets (images, logos)
125 | │ ├── banner.gif
126 | │ ├── LangChain Logo.png
127 | │ ├── Tavily Logo.png
128 | │ ├── Tavily Logo Trimmed Padded.png
129 | │ └── Trimmed Padded Langchain.png
130 | ├── chroma_db/ # Local vector database
131 | ├── main.py # Streamlit application entry point
132 | ├── ingestion.py # Document ingestion pipeline
133 | ├── consts.py # Configuration constants
134 | ├── logger.py # Logging utilities
135 | ├── Tavily Demo Tutorial.ipynb # 📚 Tutorial: Introduction to Tavily API
136 | ├── Tavily Crawl Demo Tutorial.ipynb # 📚 Tutorial: Advanced Tavily crawling techniques
137 | └── Pipfile                          # Project dependencies (pipenv)
138 | ```
139 |
140 | ### 📚 Tutorial Notebooks
141 |
142 | The project includes comprehensive Jupyter notebooks that serve as hands-on tutorials:
143 |
144 | - **`Tavily Demo Tutorial.ipynb`**: Introduction to Tavily API basics and core functionality
145 | - **`Tavily Crawl Demo Tutorial.ipynb`**: Advanced tutorial covering Tavily's crawling capabilities, including TavilyMap and TavilyExtract features
146 |
147 | These tutorials provide step-by-step guidance on integrating Tavily's powerful web search and crawling capabilities into your AI applications.
148 |
149 | ## 🔧 Configuration
150 |
151 | ### Environment Variables
152 |
153 | | Variable | Description | Required |
154 | |----------|-------------|----------|
155 | | `PINECONE_API_KEY` | Your Pinecone API key for vector storage | Optional (default setup uses local Chroma) |
156 | | `OPENAI_API_KEY` | Your OpenAI API key for LLM access | ✅ |
157 | | `TAVILY_API_KEY` | Your Tavily API key for documentation crawling and web search | ✅ |
158 |
159 | ## 🤝 Contributing
160 |
161 | Contributions are welcome! Please feel free to submit a Pull Request. For major changes, please open an issue first to discuss what you would like to change.
162 |
163 | ## 📚 Learning Resources
164 |
165 | This project is designed as a learning tool for understanding:
166 | - 🦜 LangChain framework implementation
167 | - 🔍 Vector search and embeddings
168 | - 💬 Conversational AI development
169 | - 🏗️ RAG (Retrieval-Augmented Generation) architecture
170 |
171 | ## 📄 License
172 |
173 | This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENSE) file for details.
174 |
175 | ## 🌟 Support
176 |
177 | If you find this project helpful, please consider:
178 | - ⭐ Starring the repository
179 | - 🐛 Reporting issues
180 | - 💡 Contributing improvements
181 | - 📢 Sharing with others
182 |
183 | ---
184 |
185 |
186 |
187 | ### 🔗 Connect with Me
188 |
189 | [Udemy Course](https://www.udemy.com/course/langchain/?referralCode=D981B8213164A3EA91AC)
190 | [LinkedIn](https://www.linkedin.com/in/eden-marco/)
191 | [Twitter](https://twitter.com/EdenEmarco177)
192 |
193 | **Built with ❤️ by Eden Marco**
194 |
195 |
196 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/Tavily Crawl Demo Tutorial.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "vscode": {
7 | "languageId": "raw"
8 | }
9 | },
10 | "source": [
11 | "# TavilyCrawl Tutorial: Intelligent Web Crawling\n",
12 | "\n",
13 | "## What We'll Build\n",
14 | "\n",
15 | "In this tutorial, you'll learn how to use TavilyCrawl to intelligently crawl websites using AI-guided instructions. We'll demonstrate:\n",
16 | "\n",
17 | "1. **Basic Web Crawling** - Crawl a website without specific instructions\n",
18 | "2. **Instruction-Guided Crawling** - Use natural language to target specific content\n",
19 | "3. **Results Comparison** - Compare the effectiveness of both approaches\n",
20 | "4. **Best Practices** - Learn how to write effective crawling instructions\n",
21 | "\n",
22 | "### Target Website\n",
23 | "We'll crawl the LangChain documentation (https://python.langchain.com/) to find content about AI agents.\n",
24 | "\n",
25 | "## What is TavilyCrawl?\n",
26 | "\n",
27 | "TavilyCrawl is an intelligent web crawler that uses AI to determine which paths to explore during crawling. It combines AI-powered decision making with parallel processing capabilities.\n",
28 | "\n",
29 | "### Key Features:\n",
30 | "\n",
31 | "- **AI-Powered Path Selection**: Uses AI to determine which paths to explore\n",
32 | "- **Parallel Processing**: Explores hundreds of paths simultaneously \n",
33 | "- **Advanced Extraction**: Extracts content from dynamically rendered pages\n",
34 | "- **Instruction-Driven**: Follows natural language instructions to guide exploration\n",
35 | "- **Targeted Content**: Returns content tailored for LLM integration and RAG systems\n",
36 | "\n",
37 | "### Tavily Resources:\n",
38 | "- Official Website\n",
39 | "- API Documentation\n",
40 | "- Crawl API Reference\n",
41 | "- LangChain Python Integration\n",
42 | "- Get API Key\n",
43 | "\n",
44 | "This tutorial demonstrates TavilyCrawl by comparing crawl results with and without instructions on the LangChain documentation.\n",
45 | "\n",
46 | "---"
47 | ]
48 | },
49 | {
50 | "cell_type": "markdown",
51 | "metadata": {
52 | "vscode": {
53 | "languageId": "raw"
54 | }
55 | },
56 | "source": [
57 | "## Setup & Installation\n",
58 | "\n",
59 | "First, let's install the required packages and set up our environment.\n"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": null,
65 | "metadata": {},
66 | "outputs": [],
67 | "source": [
68 | "# Install required packages\n",
69 | "%pip install langchain-tavily certifi\n",
70 | "\n",
71 | "# For pretty printing and visualization\n",
72 | "%pip install rich pandas"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": null,
78 | "metadata": {},
79 | "outputs": [],
80 | "source": [
81 | "import os\n",
82 | "import ssl\n",
83 | "import json\n",
84 | "from typing import Any, Dict, List\n",
85 | "\n",
86 | "import certifi\n",
87 | "from langchain_tavily import TavilyCrawl\n",
88 | "from rich.console import Console\n",
89 | "from rich.panel import Panel\n",
90 | "from rich.table import Table\n",
91 | "from rich.json import JSON\n",
92 | "\n",
93 | "# Configure SSL context\n",
94 | "ssl_context = ssl.create_default_context(cafile=certifi.where())\n",
95 | "os.environ[\"SSL_CERT_FILE\"] = certifi.where()\n",
96 | "os.environ[\"REQUESTS_CA_BUNDLE\"] = certifi.where()\n",
97 | "\n",
98 | "# Initialize rich console for pretty printing\n",
99 | "console = Console()\n",
100 | "\n",
101 | "print(\"All imports successful!\")"
102 | ]
103 | },
104 | {
105 | "cell_type": "markdown",
106 | "metadata": {
107 | "vscode": {
108 | "languageId": "raw"
109 | }
110 | },
111 | "source": [
112 | "## API Key Setup\n",
113 | "\n",
114 | "You'll need a Tavily API key to use TavilyCrawl. Get yours at [https://app.tavily.com/home?utm_campaign=eden_marco&utm_medium=socials&utm_source=linkedin](https://app.tavily.com/home?utm_campaign=eden_marco&utm_medium=socials&utm_source=linkedin).\n",
115 | "\n",
116 | "Set environment variable `TAVILY_API_KEY`"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": null,
122 | "metadata": {},
123 | "outputs": [],
124 | "source": [
125 | "# Set your Tavily API key here\n",
126 | "import getpass\n",
127 | "\n",
128 | "# For Google Colab, you can use getpass for secure input\n",
129 | "if 'TAVILY_API_KEY' not in os.environ:\n",
130 | " os.environ['TAVILY_API_KEY'] = getpass.getpass('Enter your Tavily API key: ')\n",
131 | "\n",
132 | "# Alternative: Set directly (uncomment and add your key)\n",
133 | "# os.environ[\"TAVILY_API_KEY\"] = \"your_tavily_api_key_here\"\n",
134 | "\n",
135 | "print(\"API key set successfully!\")"
136 | ]
137 | },
138 | {
139 | "cell_type": "markdown",
140 | "metadata": {
141 | "vscode": {
142 | "languageId": "raw"
143 | }
144 | },
145 | "source": [
146 | "## Initialize TavilyCrawl\n",
147 | "\n",
148 | "Initialize TavilyCrawl and set up target URL for demonstration."
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": null,
154 | "metadata": {},
155 | "outputs": [],
156 | "source": [
157 | "# Initialize TavilyCrawl\n",
158 | "tavily_crawl = TavilyCrawl()\n",
159 | "\n",
160 | "# Target URL: LangChain Documentation\n",
161 | "target_url = \"https://python.langchain.com/\"\n",
162 | "\n",
163 | "console.print(Panel.fit(\n",
164 | " f\"Target Website: {target_url}\\nCrawler: TavilyCrawl\",\n",
165 | " title=\"Demo Setup\",\n",
166 | " border_style=\"bright_blue\"\n",
167 | "))\n",
168 | "\n",
169 | "print(\"TavilyCrawl initialized successfully\")"
170 | ]
171 | },
172 | {
173 | "cell_type": "markdown",
174 | "metadata": {
175 | "vscode": {
176 | "languageId": "raw"
177 | }
178 | },
179 | "source": [
180 | "## Demo 1: Crawl Without Instructions\n",
181 | "\n",
182 | "Crawl without specific instructions to show baseline behavior on the LangChain documentation."
183 | ]
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": null,
188 | "metadata": {},
189 | "outputs": [],
190 | "source": [
191 | "# Demo 1: Crawl without instructions\n",
192 | "console.print(Panel.fit(\n",
193 | " f\"Target: {target_url}\\nInstructions: None (baseline crawl)\\nMax Depth: 1\\nExtract Depth: advanced\",\n",
194 | " title=\"Demo 1: Crawl Without Instructions\",\n",
195 | " border_style=\"blue\"\n",
196 | "))\n",
197 | "\n",
198 | "console.print(\"Running TavilyCrawl without instructions...\", style=\"blue\")\n",
199 | "\n",
200 | "# Basic crawl without instructions\n",
201 | "basic_result = tavily_crawl.invoke({\n",
202 | " \"url\": target_url,\n",
203 | " \"max_depth\": 1,\n",
204 | " \"extract_depth\": \"advanced\"\n",
205 | "})\n",
206 | "\n",
207 | "# Show raw output immediately\n",
208 | "console.print(basic_result)\n",
209 | "\n",
210 | "# Extract results for analysis\n",
211 | "basic_results = basic_result.get(\"results\", [])"
212 | ]
213 | },
214 | {
215 | "cell_type": "markdown",
216 | "metadata": {},
217 | "source": [
218 | "# Now display the formatted results nicely\n"
219 | ]
220 | },
221 | {
222 | "cell_type": "code",
223 | "execution_count": null,
224 | "metadata": {},
225 | "outputs": [],
226 | "source": [
227 | "console.print(f\"\\nResults Without Instructions: {len(basic_results)} pages\", style=\"cyan\")\n",
228 | "console.print(\" Mix of all content types from LangChain docs\")\n",
229 | "console.print(\" No filtering - everything from the crawled sections\")\n",
230 | "console.print(\" Requires manual work to find specific content\")\n",
231 | "\n",
232 | "console.print(\"\\nSample Results from Basic Crawl (No Filtering):\\n\", style=\"cyan\")\n",
233 | "\n",
234 | "for i, result in enumerate(basic_results[:3], 1): # Show first 3 results\n",
235 | " url = result.get(\"url\", \"No URL\")\n",
236 | " content = result.get(\"raw_content\", \"No content\")[:150] + \"...\"\n",
237 | " \n",
238 | " panel_content = f\"\"\"URL: {url}\n",
239 | "\n",
240 | "Content Preview:\n",
241 | "{content}\"\"\"\n",
242 | " \n",
243 | " console.print(Panel(\n",
244 | " panel_content,\n",
245 | " title=f\"{i}. {url}\",\n",
246 | " border_style=\"blue\"\n",
247 | " ))\n",
248 | " print()\n",
249 | "\n",
250 | "console.print(f\"... and {len(basic_results) - 3} more mixed results\", style=\"italic cyan\")\n",
251 | "console.print(\"Note: Mixed content types - guides, integrations, concepts, etc.\", style=\"cyan\")"
252 | ]
253 | },
254 | {
255 | "cell_type": "markdown",
256 | "metadata": {},
257 | "source": [
258 | "## Demo 2: Crawl With Instructions\n",
259 | "\n",
260 | "Use specific instructions to improve the quality and relevance of crawl results. Instructions can dramatically improve targeting and filtering."
261 | ]
262 | },
263 | {
264 | "cell_type": "code",
265 | "execution_count": null,
266 | "metadata": {},
267 | "outputs": [],
268 | "source": [
269 | "instructions = \"Find all pages about ai agents\"\n",
270 | "\n",
271 | "console.print(Panel.fit(\n",
272 | " f\"Target: {target_url} (same as Demo 1)\\nInstructions: {instructions}\\nType: Specific, action-oriented\\nMax Depth: 3\\nExtract Depth: advanced\",\n",
273 | " title=\"Demo 2: Crawl With Instructions\", \n",
274 | " border_style=\"green\"\n",
275 | "))\n",
276 | "\n",
277 | "console.print(\"Starting crawl with instructions...\", style=\"green\")\n",
278 | "console.print(\"Instructions will guide the AI to target specific content\", style=\"italic\")"
279 | ]
280 | },
281 | {
282 | "cell_type": "code",
283 | "execution_count": null,
284 | "metadata": {},
285 | "outputs": [],
286 | "source": [
287 | "# Execute the crawl with instructions\n",
288 | "result_with_instructions = tavily_crawl.invoke({\n",
289 | " \"url\": target_url,\n",
290 | " \"instructions\": instructions,\n",
291 | " \"max_depth\": 3,\n",
292 | " \"extract_depth\": \"advanced\"\n",
293 | "})\n",
294 | "\n",
295 | "# Show raw output immediately\n",
296 | "console.print(\"\\nRaw TavilyCrawl Output:\", style=\"yellow\")\n",
297 | "console.print(result_with_instructions)\n",
298 | "\n",
299 | "console.print(\"\\nCrawl with instructions completed\", style=\"green\")\n",
300 | "\n",
301 | "# Show the results of instruction-based filtering\n",
302 | "results_with_instructions = result_with_instructions.get(\"results\", [])"
303 | ]
304 | },
305 | {
306 | "cell_type": "code",
307 | "execution_count": null,
308 | "metadata": {},
309 | "outputs": [],
310 | "source": [
311 | "# Display the targeted agent documentation found\n",
312 | "console.print(\"\\nLangChain Agent Documentation Found:\\n\", style=\"green\")\n",
313 | "\n",
314 | "for i, result in enumerate(results_with_instructions, 1):\n",
315 | " url = result.get(\"url\", \"No URL\")\n",
316 | " content = result.get(\"raw_content\", \"No content\")[:200] + \"...\"\n",
317 | " \n",
318 | " panel_content = f\"\"\"URL: {url}\n",
319 | "\n",
320 | "Content Preview:\n",
321 | "{content}\"\"\"\n",
322 | " \n",
323 | " console.print(Panel(\n",
324 | " panel_content,\n",
325 | " title=f\"{i}. {url}\",\n",
326 | " border_style=\"green\"\n",
327 | " ))\n",
328 | " print()\n",
329 | "\n",
330 | "console.print(\"Note: All results are specifically about agents in LangChain\", style=\"green\")"
331 | ]
332 | },
333 | {
334 | "cell_type": "markdown",
335 | "metadata": {},
336 | "source": [
337 | "## Comparison of Both Approaches\n",
338 | "\n",
339 | "Compare both approaches to understand the impact of instruction quality."
340 | ]
341 | },
342 | {
343 | "cell_type": "code",
344 | "execution_count": null,
345 | "metadata": {},
346 | "outputs": [],
347 | "source": [
348 | "# Create comparison table\n",
349 | "comparison_table = Table(title=\"TavilyCrawl: Instruction Quality Comparison\")\n",
350 | "comparison_table.add_column(\"Approach\", style=\"cyan\", no_wrap=True)\n",
351 | "comparison_table.add_column(\"Instructions\", style=\"yellow\")\n",
352 | "comparison_table.add_column(\"Pages Found\", style=\"blue\")\n",
353 | "comparison_table.add_column(\"Content Quality\", style=\"green\")\n",
354 | "comparison_table.add_column(\"Usefulness\", style=\"red\")\n",
355 | "\n",
356 | "comparison_table.add_row(\n",
357 | " \"No Instructions\",\n",
358 | " \"None (baseline)\",\n",
359 | " f\"{len(basic_results)}\",\n",
360 | " \"Mixed (all types)\",\n",
361 | " \"Low (requires filtering)\"\n",
362 | ")\n",
363 | "\n",
364 | "comparison_table.add_row(\n",
365 | " \"With Instructions\",\n",
366 | " instructions,\n",
367 | " f\"{len(results_with_instructions)}\",\n",
368 | " \"Highly targeted\",\n",
369 | " \"High (ready to use)\"\n",
370 | ")\n",
371 | "\n",
372 | "console.print(comparison_table)\n",
373 | "\n",
374 | "console.print(\"\\nKey Observations:\", style=\"blue\")\n",
375 | "console.print(\" No instructions return everything, requiring manual filtering\")\n",
376 | "console.print(\" Instructions provide highly targeted, ready-to-use results\")\n",
377 | "console.print(\" Best practice: Use specific, action-oriented instructions\")\n",
378 | "\n",
379 | "console.print(f\"\\nEfficiency with Instructions:\", style=\"green\")\n",
380 | "console.print(f\" Filtering efficiency: {((len(basic_results) - len(results_with_instructions)) / len(basic_results) * 100):.1f}% reduction in noise\")\n",
381 | "console.print(\" Time saved: No manual post-processing required\")\n",
382 | "console.print(\" AI-powered: Intelligent path selection and content filtering\")"
383 | ]
384 | }
385 | ],
386 | "metadata": {
387 | "kernelspec": {
388 | "display_name": "documentation-helper-zmyxh5Q8",
389 | "language": "python",
390 | "name": "python3"
391 | },
392 | "language_info": {
393 | "codemirror_mode": {
394 | "name": "ipython",
395 | "version": 3
396 | },
397 | "file_extension": ".py",
398 | "mimetype": "text/x-python",
399 | "name": "python",
400 | "nbconvert_exporter": "python",
401 | "pygments_lexer": "ipython3",
402 | "version": "3.10.11"
403 | }
404 | },
405 | "nbformat": 4,
406 | "nbformat_minor": 2
407 | }
408 |
--------------------------------------------------------------------------------
/Tavily Demo Tutorial.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "id": "MhwXdmHllHtV"
7 | },
8 | "source": [
9 | "# 🗺️ TavilyMap & TavilyExtract Tutorial\n",
10 | "\n",
11 | "> **📚 Part of the LangChain Course: Building AI Agents & RAG Apps** \n",
12 | "> [🎓 Get the full course](https://www.udemy.com/course/langchain/?referralCode=D981B8213164A3EA91AC)\n",
13 | "\n",
14 | "\n",
15 | "This notebook demonstrates two powerful tools from Tavily AI:\n",
16 | "- **TavilyMap**: Automatically discovers and maps website structures\n",
17 | "- **TavilyExtract**: Extracts clean, structured content from web pages\n",
18 | "\n",
19 | "Perfect for documentation scraping, research, and content extraction! 🚀\n",
20 | "\n",
21 | "---\n"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {
27 | "id": "znycq-qvlG1R"
28 | },
29 | "source": [
30 | "## 📦 Setup & Installation\n",
31 | "\n",
32 | "First, let's install the required packages and set up our environment.\n"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": null,
38 | "metadata": {
39 | "colab": {
40 | "base_uri": "https://localhost:8080/"
41 | },
42 | "id": "4rOvqccCKFpU",
43 | "outputId": "744a63dc-7970-4f82-d5b7-11209d95e3f8"
44 | },
45 | "outputs": [],
46 | "source": [
47 | "# Install required packages\n",
48 | "!pip install langchain-tavily certifi\n",
49 | "\n",
50 | "# For pretty printing and visualization\n",
51 | "!pip install rich pandas\n"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {
58 | "colab": {
59 | "base_uri": "https://localhost:8080/"
60 | },
61 | "id": "rlIwFtP0KFpU",
62 | "outputId": "a0b47c17-8469-46b8-d0aa-54217cbb86ff"
63 | },
64 | "outputs": [],
65 | "source": [
66 | "import asyncio\n",
67 | "import os\n",
68 | "import ssl\n",
69 | "from typing import Any, Dict, List\n",
70 | "\n",
71 | "import certifi\n",
72 | "from langchain_tavily import TavilyExtract, TavilyMap\n",
73 | "from rich.console import Console\n",
74 | "from rich.panel import Panel\n",
75 | "\n",
76 | "# Configure SSL context\n",
77 | "ssl_context = ssl.create_default_context(cafile=certifi.where())\n",
78 | "os.environ[\"SSL_CERT_FILE\"] = certifi.where()\n",
79 | "os.environ[\"REQUESTS_CA_BUNDLE\"] = certifi.where()\n",
80 | "\n",
81 | "# Initialize rich console for pretty printing\n",
82 | "console = Console()\n",
83 | "\n",
84 | "\n",
85 | "print(\"✅ All imports successful!\")\n"
86 | ]
87 | },
88 | {
89 | "cell_type": "markdown",
90 | "metadata": {
91 | "id": "EjtXG-Y3lVNi"
92 | },
93 | "source": [
94 | "## 🔑 API Key Setup\n",
95 | "\n",
96 | "You'll need a Tavily API key to use these tools. Get yours at [tavily.com](https://app.tavily.com/home?utm_campaign=eden_marco&utm_medium=socials&utm_source=linkedin).\n",
97 | "\n",
98 | "Set environment variable `TAVILY_API_KEY`"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": null,
104 | "metadata": {
105 | "id": "C3N8qUKSKFpU"
106 | },
107 | "outputs": [],
108 | "source": [
109 | "\n",
110 | "\n",
111 | "# Set directly (uncomment and add your key)\n",
112 | "# tavily_api_key = \"your_tavily_api_key_here\"\n",
113 | "\n",
114 | "os.environ[\"TAVILY_API_KEY\"] = \"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\""
115 | ]
116 | },
117 | {
118 | "cell_type": "markdown",
119 | "metadata": {
120 | "id": "7hPhgHXkl937"
121 | },
122 | "source": [
123 | "## 🗺️ TavilyMap: Website Structure Discovery\n",
124 | "\n",
125 | "TavilyMap automatically discovers and maps website structures by crawling through links. It's perfect for:\n",
126 | "- Documentation sites\n",
127 | "- Blog archives\n",
128 | "- Knowledge bases\n",
129 | "- Any structured website\n",
130 | "\n",
131 | "### Key Parameters:\n",
132 | "- `max_depth`: How deep to crawl (default: 3)\n",
133 | "- `max_breadth`: How many links per page (default: 10)\n",
134 | "- `max_pages`: Maximum total pages to discover (default: 100)\n"
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": null,
140 | "metadata": {
141 | "colab": {
142 | "base_uri": "https://localhost:8080/"
143 | },
144 | "id": "DbAVGf-jKFpU",
145 | "outputId": "a1c161ef-1ec9-4849-beaa-8c67fa8a32ff"
146 | },
147 | "outputs": [],
148 | "source": [
149 | "# Initialize TavilyMap with custom settings\n",
150 | "tavily_map = TavilyMap(\n",
151 | " max_depth=3, # Crawl up to 3 levels deep\n",
152 | " max_breadth=15, # Follow up to 15 links per page\n",
153 | " max_pages=50 # Limit to 50 total pages for demo\n",
154 | ")\n",
155 | "\n",
156 | "print(\"✅ TavilyMap initialized successfully!\")\n"
157 | ]
158 | },
159 | {
160 | "cell_type": "markdown",
161 | "metadata": {
162 | "id": "l1GJ0FPKmBhz"
163 | },
164 | "source": [
165 | "### 📊 Demo: Mapping a Documentation Site\n",
166 | "\n",
167 | "Let's map the structure of a popular documentation site. We'll use the FastAPI documentation as an example.\n"
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": null,
173 | "metadata": {
174 | "colab": {
175 | "base_uri": "https://localhost:8080/",
176 | "height": 918
177 | },
178 | "id": "bh_ZAl7MKFpV",
179 | "outputId": "85982542-7b66-48af-fa4d-8f495b701281"
180 | },
181 | "outputs": [],
182 | "source": [
183 | "# Example website to map\n",
184 | "demo_url = \"https://python.langchain.com/docs/introduction/\"\n",
185 | "\n",
186 | "console.print(f\"🔍 Mapping website structure for: {demo_url}\", style=\"bold blue\")\n",
187 | "console.print(\"This may take a moment...\")\n",
188 | "\n",
189 | "# Map the website structure\n",
190 | "site_map = tavily_map.invoke(demo_url)\n",
191 | "\n",
192 | "# Display results\n",
193 | "urls = site_map.get('results', [])\n",
194 | "console.print(f\"\\n✅ Successfully mapped {len(urls)} URLs!\", style=\"bold green\")\n",
195 | "\n",
196 | "# Show first 10 URLs as examples\n",
197 | "console.print(\"\\n📋 First 50 discovered URLs:\", style=\"bold yellow\")\n",
198 | "for i, url in enumerate(urls[:50], 1):\n",
199 | " console.print(f\" {i:2d}. {url}\")\n",
200 | "\n",
201 | "if len(urls) > 10:\n",
202 | " console.print(f\" ... and {len(urls) - 50} more URLs\")\n"
203 | ]
204 | },
205 | {
206 | "cell_type": "markdown",
207 | "metadata": {
208 | "id": "UHv-60zimbyH"
209 | },
210 | "source": [
211 | "## 🔍 TavilyExtract: Clean Content Extraction\n",
212 | "\n",
213 | "TavilyExtract takes URLs and returns clean, structured content without ads, navigation, or other noise. Perfect for:\n",
214 | "- Documentation processing\n",
215 | "- Content analysis\n",
216 | "- Research and data collection\n",
217 | "- Building knowledge bases\n",
218 | "\n",
219 | "### Key Features:\n",
220 | "- Removes HTML markup and navigation\n",
221 | "- Extracts main content only\n",
222 | "- Handles JavaScript-rendered content\n",
223 | "- Batch processing support"
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": null,
229 | "metadata": {
230 | "colab": {
231 | "base_uri": "https://localhost:8080/",
232 | "height": 0
233 | },
234 | "id": "5Vj1ZeaqKFpV",
235 | "outputId": "36e93bf4-29d5-43c9-fd44-11c6c8ce76a3"
236 | },
237 | "outputs": [],
238 | "source": [
239 | "# Initialize TavilyExtract\n",
240 | "tavily_extract = TavilyExtract()\n",
241 | "\n",
242 | "print(\"✅ TavilyExtract initialized successfully!\")\n"
243 | ]
244 | },
245 | {
246 | "cell_type": "markdown",
247 | "metadata": {
248 | "id": "QtLQklhamjiZ"
249 | },
250 | "source": [
251 | "### 📄 Demo: Extracting Content from URLs\n",
252 | "\n",
253 | "Let's extract clean content from some of the URLs we discovered earlier.\n"
254 | ]
255 | },
256 | {
257 | "cell_type": "code",
258 | "execution_count": null,
259 | "metadata": {
260 | "colab": {
261 | "base_uri": "https://localhost:8080/",
262 | "height": 9443
263 | },
264 | "id": "hJ8NNHBDKFpV",
265 | "outputId": "b9e0c404-af90-48c8-e508-23dbd90a7e7a"
266 | },
267 | "outputs": [],
268 | "source": [
269 | "# Select a few interesting URLs for extraction\n",
270 | "sample_urls = [urls[15]] # Take first 5 URLs\n",
271 | "console.print(f\"📚 Extracting content from {len(sample_urls)} URLs...\", style=\"bold blue\")\n",
272 | "\n",
273 | "# Extract content\n",
274 | "extraction_result = await tavily_extract.ainvoke(input={\"urls\": sample_urls})\n",
275 | "\n",
276 | "# Display results\n",
277 | "extracted_docs = extraction_result.get('results', [])\n",
278 | "console.print(f\"\\n✅ Successfully extracted {len(extracted_docs)} documents!\", style=\"bold green\")\n",
279 | "\n",
280 | "# Show summary of each extracted document\n",
281 | "for i, doc in enumerate(extracted_docs, 1):\n",
282 | " url = doc.get('url', 'Unknown')\n",
283 | " content = doc.get('raw_content', '')\n",
284 | "\n",
285 | " # Create a panel for each document\n",
286 | " panel_content = f\"\"\"URL: {url}\n",
287 | "Content Length: {len(content):,} characters\n",
288 | "Preview: {content}...\"\"\"\n",
289 | "\n",
290 | " console.print(Panel(panel_content, title=f\"Document {i}\", border_style=\"blue\"))\n",
291 | " print() # Add spacing\n"
292 | ]
293 | },
294 | {
295 | "cell_type": "markdown",
296 | "metadata": {
297 | "id": "hxYWKCEDnvVU"
298 | },
299 | "source": [
300 | "### ⚡ Batch Processing Demo\n",
301 | "\n",
302 | "For larger datasets, we can process URLs in batches to optimize performance and handle rate limits.\n"
303 | ]
304 | },
305 | {
306 | "cell_type": "code",
307 | "execution_count": null,
308 | "metadata": {
309 | "colab": {
310 | "base_uri": "https://localhost:8080/",
311 | "height": 167
312 | },
313 | "id": "9iqi-NRFKFpV",
314 | "outputId": "a8e88357-bf6c-4b6e-be52-e6e9d87aced7"
315 | },
316 | "outputs": [],
317 | "source": [
318 | "def chunk_urls(urls: List[str], chunk_size: int = 3) -> List[List[str]]:\n",
319 | " \"\"\"Split URLs into chunks of specified size.\"\"\"\n",
320 | " chunks = []\n",
321 | " for i in range(0, len(urls), chunk_size):\n",
322 | " chunk = urls[i:i + chunk_size]\n",
323 | " chunks.append(chunk)\n",
324 | " return chunks\n",
325 | "\n",
326 | "async def extract_batch(urls: List[str], batch_num: int) -> List[Dict[str, Any]]:\n",
327 | " \"\"\"Extract documents from a batch of URLs.\"\"\"\n",
328 | " try:\n",
329 | " console.print(f\"🔄 Processing batch {batch_num} with {len(urls)} URLs\", style=\"blue\")\n",
330 | " docs = await tavily_extract.ainvoke(input={\"urls\": urls})\n",
331 | " results = docs.get('results', [])\n",
332 | " console.print(f\"✅ Batch {batch_num} completed - extracted {len(results)} documents\", style=\"green\")\n",
333 | " return results\n",
334 | " except Exception as e:\n",
335 | " console.print(f\"❌ Batch {batch_num} failed: {e}\", style=\"red\")\n",
336 | " return []\n",
337 | "\n",
338 | "# Process a larger set of URLs in batches\n",
339 | "url_batches = chunk_urls(urls[:9], chunk_size=3) # Take first 9 URLs for batch demo, split to batches of 3\n",
340 | "\n",
341 | "console.print(f\"📦 Processing 9 URLs in {len(url_batches)} batches\", style=\"bold yellow\")\n",
342 | "\n",
343 | "# Process batches concurrently\n",
344 | "tasks = [extract_batch(batch, i + 1) for i, batch in enumerate(url_batches)]\n",
345 | "batch_results = await asyncio.gather(*tasks)\n",
346 | "\n",
347 | "# Flatten results\n",
348 | "all_extracted = []\n",
349 | "for batch_result in batch_results:\n",
350 | " all_extracted.extend(batch_result)\n",
351 | "\n",
352 | "console.print(f\"\\n🎉 Batch processing complete! Total documents extracted: {len(all_extracted)}\", style=\"bold green\")\n"
353 | ]
354 | },
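355 | {
356 | "cell_type": "markdown",
357 | "metadata": {},
358 | "source": [
359 | "#### ⏱️ Optional: Throttling Concurrent Batches\n",
360 | "\n",
361 | "The batches above are all launched at once; on a larger crawl that can trip API rate limits. Here is a hedged sketch that caps in-flight batches with `asyncio.Semaphore`; the limit of 2 is an illustrative value, not a documented Tavily quota.\n"
362 | ]
363 | },
364 | {
365 | "cell_type": "code",
366 | "execution_count": null,
367 | "metadata": {},
368 | "outputs": [],
369 | "source": [
370 | "async def extract_batch_throttled(urls: List[str], batch_num: int, semaphore: asyncio.Semaphore) -> List[Dict[str, Any]]:\n",
371 | "    \"\"\"Like extract_batch, but waits for a semaphore slot before hitting the API.\"\"\"\n",
372 | "    async with semaphore:\n",
373 | "        return await extract_batch(urls, batch_num)\n",
374 | "\n",
375 | "# Allow at most 2 batches in flight at any moment (illustrative limit)\n",
376 | "throttle = asyncio.Semaphore(2)\n",
377 | "throttled_tasks = [extract_batch_throttled(batch, i + 1, throttle) for i, batch in enumerate(url_batches)]\n",
378 | "throttled_results = await asyncio.gather(*throttled_tasks)\n",
379 | "\n",
380 | "console.print(f\"Throttled run extracted {sum(len(r) for r in throttled_results)} documents\", style=\"green\")"
381 | ]
382 | },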
355 | {
356 | "cell_type": "markdown",
357 | "metadata": {
358 | "id": "YkP6R-Sgn3dY"
359 | },
360 | "source": [
361 | "## 🎯 Real-World Use Cases\n",
362 | "\n",
363 | "Here are some practical applications of TavilyMap and TavilyExtract:\n",
364 | "\n",
365 | "### 1. Documentation Scraping\n",
366 | "- Map entire documentation sites\n",
367 | "- Extract clean content for search indexes\n",
368 | "- Build knowledge bases from existing docs\n",
369 | "\n",
370 | "### 2. Competitive Analysis\n",
371 | "- Map competitor websites\n",
372 | "- Extract product information\n",
373 | "- Monitor content changes\n",
374 | "\n",
375 | "### 3. Research & Content Collection\n",
376 | "- Gather information from multiple sources\n",
377 | "- Build datasets for analysis\n",
378 | "- Create content archives\n",
379 | "\n",
380 | "### 4. SEO & Site Analysis\n",
381 | "- Discover all pages on a site\n",
382 | "- Analyze content structure\n",
383 | "- Identify content gaps\n"
384 | ]
385 | },
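386 | {
387 | "cell_type": "markdown",
388 | "metadata": {},
389 | "source": [
390 | "### 🛠️ Putting It Together: A Mini Scraping Pipeline\n",
391 | "\n",
392 | "To make the documentation-scraping use case concrete, here is a hedged sketch that chains `tavily_map` and the batch helpers from above into one function and writes the results to a JSONL file. The `output_path` default is just an illustration; tune the batch size and crawl limits to your own quotas.\n"
393 | ]
394 | },
395 | {
396 | "cell_type": "code",
397 | "execution_count": null,
398 | "metadata": {},
399 | "outputs": [],
400 | "source": [
401 | "import json\n",
402 | "\n",
403 | "async def scrape_site(start_url: str, output_path: str = \"docs.jsonl\", batch_size: int = 3) -> int:\n",
404 | "    \"\"\"Map a site, extract every discovered page in batches, and save to JSONL.\"\"\"\n",
405 | "    # Step 1: discover URLs with the tavily_map instance from earlier\n",
406 | "    discovered = tavily_map.invoke(start_url).get(\"results\", [])\n",
407 | "\n",
408 | "    # Step 2: extract content concurrently, reusing chunk_urls and extract_batch\n",
409 | "    batches = chunk_urls(discovered, chunk_size=batch_size)\n",
410 | "    tasks = [extract_batch(batch, i + 1) for i, batch in enumerate(batches)]\n",
411 | "    results = await asyncio.gather(*tasks)\n",
412 | "\n",
413 | "    # Step 3: persist one JSON object per extracted page\n",
414 | "    count = 0\n",
415 | "    with open(output_path, \"w\", encoding=\"utf-8\") as f:\n",
416 | "        for batch in results:\n",
417 | "            for doc in batch:\n",
418 | "                f.write(json.dumps({\"url\": doc.get(\"url\"), \"content\": doc.get(\"raw_content\", \"\")}) + \"\\n\")\n",
419 | "                count += 1\n",
420 | "    return count\n",
421 | "\n",
422 | "# Example (uncomment to run; this re-crawls the whole site):\n",
423 | "# saved = await scrape_site(demo_url, output_path=\"langchain_docs.jsonl\")\n",
424 | "# console.print(f\"Saved {saved} documents\")"
425 | ]
426 | },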
386 | {
387 | "cell_type": "markdown",
388 | "metadata": {
389 | "id": "DoyoDss4n8Tj"
390 | },
391 | "source": [
392 | "## 🎯 Conclusion\n",
393 | "\n",
394 | "This tutorial demonstrated the power of TavilyMap and TavilyExtract for automated web content discovery and extraction:\n",
395 | "\n",
396 | "### Key Takeaways:\n",
397 | "\n",
398 | "1. **TavilyMap** is perfect for:\n",
399 | " - Discovering website structures\n",
400 | " - Finding all pages on a site\n",
401 | " - Site auditing\n",
402 | "\n",
403 | "2. **TavilyExtract** excels at:\n",
404 | " - Clean content extraction\n",
405 | " - Removing HTML noise\n",
406 | " - Batch processing\n",
407 | " - Structured data collection\n",
408 | "\n",
409 | "3. **Combined** they enable:\n",
410 | " - Complete documentation scraping\n",
411 | " - Automated content pipelines\n",
412 | " - Knowledge base creation\n",
413 | " - Research automation\n",
414 | "\n",
415 | "### Next Steps:\n",
416 | "- Integrate with vector databases for semantic search\n",
417 | "- Add content filtering and classification\n",
418 | "- Build monitoring systems for content changes\n",
419 | "- Create automated reporting dashboards\n",
420 | "\n",
421 | "---\n",
422 | "\n",
423 | "**Happy scraping!** 🚀"
424 | ]
425 | },
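426 | {
427 | "cell_type": "markdown",
428 | "metadata": {},
429 | "source": [
430 | "### 🧩 Appendix: A First Step Toward Semantic Search\n",
431 | "\n",
432 | "As a taste of the vector-database next step, here is a hedged sketch that chunks the extracted documents with LangChain's text splitter (it may require `pip install langchain-text-splitters`). The chunk sizes are illustrative defaults, and the commented-out indexing call assumes an embeddings provider and index you would configure separately.\n"
433 | ]
434 | },
435 | {
436 | "cell_type": "code",
437 | "execution_count": null,
438 | "metadata": {},
439 | "outputs": [],
440 | "source": [
441 | "from langchain_core.documents import Document\n",
442 | "from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
443 | "\n",
444 | "# Wrap each extracted page in a LangChain Document, keeping the URL as metadata\n",
445 | "documents = [\n",
446 | "    Document(page_content=doc.get(\"raw_content\", \"\"), metadata={\"source\": doc.get(\"url\")})\n",
447 | "    for doc in all_extracted\n",
448 | "]\n",
449 | "\n",
450 | "# Split into overlapping chunks sized for typical embedding models (illustrative values)\n",
451 | "splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)\n",
452 | "chunks = splitter.split_documents(documents)\n",
453 | "console.print(f\"Split {len(documents)} documents into {len(chunks)} chunks\", style=\"green\")\n",
454 | "\n",
455 | "# From here you could embed and index, e.g. with a vector store of your choice:\n",
456 | "# PineconeVectorStore.from_documents(chunks, embeddings, index_name=\"your-index\")"
457 | ]
458 | }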
426 | ],
427 | "metadata": {
428 | "colab": {
429 | "collapsed_sections": [
430 | "znycq-qvlG1R"
431 | ],
432 | "provenance": []
433 | },
434 | "kernelspec": {
435 | "display_name": "documentation-helper-H_4XlsCB",
436 | "language": "python",
437 | "name": "python3"
438 | },
439 | "language_info": {
440 | "codemirror_mode": {
441 | "name": "ipython",
442 | "version": 3
443 | },
444 | "file_extension": ".py",
445 | "mimetype": "text/x-python",
446 | "name": "python",
447 | "nbconvert_exporter": "python",
448 | "pygments_lexer": "ipython3",
449 | "version": "3.12.3"
450 | }
451 | },
452 | "nbformat": 4,
453 | "nbformat_minor": 0
454 | }
455 |
--------------------------------------------------------------------------------