├── requirements.txt
├── LICENSE
├── docstoteles
│   ├── presentation
│   │   ├── chat.py
│   │   └── scraping.py
│   ├── app.py
│   └── service
│       ├── scraping.py
│       └── rag.py
├── .gitignore
└── README.md

/requirements.txt:
--------------------------------------------------------------------------------
streamlit
python-dotenv
groq
firecrawl
langchain
langchain-community
langchain-groq
faiss-cpu
sentence-transformers
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2025 Asimov Academy

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/docstoteles/presentation/chat.py:
--------------------------------------------------------------------------------
import streamlit as st
from service.rag import RAGService

def show():
    st.header("💬 Chat with the Documentation")

    if "collection" not in st.session_state or not st.session_state.collection:
        st.warning("Select a collection in the sidebar to get started.")
        return

    rag = RAGService()
    loaded = rag.load_collection(st.session_state.collection)
    if not loaded:
        st.error("Could not load the selected collection.")
        return

    if "messages" not in st.session_state:
        st.session_state.messages = []

    question = st.text_input("Ask something about the documentation:")
    if st.button("Send") and question:
        with st.spinner("Querying the AI..."):
            answer = rag.ask_question(question)
            st.session_state.messages.append((question, answer))

    st.divider()
    st.subheader("History")
    # Show the newest exchange first
    for q, a in st.session_state.messages[::-1]:
        st.markdown(f"**You:** {q}")
        st.markdown(f"**Docstóteles:** {a}")
--------------------------------------------------------------------------------
/docstoteles/app.py:
--------------------------------------------------------------------------------
import streamlit as st
import os
from dotenv import load_dotenv
from presentation import scraping
from presentation import chat

load_dotenv()

st.set_page_config(page_title="Docstóteles", page_icon="📚", layout="wide")
st.title("📚 Docstóteles - Simple RAG")

# Sidebar for mode and collection selection
with st.sidebar:
    st.header("Collections")
    mode = st.radio("Mode:", ["Chat", "Scraping"])

    st.divider()
    st.subheader("Available Collections")

    collections_dir = "data/collections"
    if os.path.exists(collections_dir):
        collections = [d for d in os.listdir(collections_dir)
                       if os.path.isdir(os.path.join(collections_dir, d))]

        for collection in collections:
            col1, col2 = st.columns([3, 1])
            with col1:
                st.write(f"📁 {collection}")
            with col2:
                if st.button("Use", key=f"use_{collection}"):
                    st.session_state.collection = collection
                    st.rerun()

# Session state
if "messages" not in st.session_state:
    st.session_state.messages = []
if "collection" not in st.session_state:
    st.session_state.collection = None

# Route to the selected page
if mode == "Scraping":
    scraping.show()
else:
    chat.show()
--------------------------------------------------------------------------------
/docstoteles/presentation/scraping.py:
--------------------------------------------------------------------------------
import streamlit as st
import os
from service.scraping import ScrapingService

def show():
    st.header("🔍 Web Scraping")

    scraper = ScrapingService()

    with st.form("scraping_form"):
        url = st.text_input("Site URL:", placeholder="https://example.com")
        collection_name = st.text_input("Collection name:", placeholder="my-collection")
        submitted = st.form_submit_button("Start Scraping")

    if submitted and url and collection_name:
        with st.spinner("Extracting content..."):
            result = scraper.scrape_website(url, collection_name)

        if result["success"]:
            st.success(f"✅ {result['files']} files saved!")
            if st.button("Go to Chat"):
                st.rerun()
        else:
            st.error(f"❌ Error: {result['error']}")

    st.divider()
    st.subheader("Available Collections")

    collections_dir = "data/collections"
    if os.path.exists(collections_dir):
        collections = [d for d in os.listdir(collections_dir)
                       if os.path.isdir(os.path.join(collections_dir, d))]

        for collection in collections:
            col1, col2 = st.columns([3, 1])
            with col1:
                st.write(f"📁 {collection}")
--------------------------------------------------------------------------------
/docstoteles/service/scraping.py:
--------------------------------------------------------------------------------
import os
from firecrawl import FirecrawlApp

class ScrapingService:
    def __init__(self):
        self.api_key = os.getenv("FIRECRAWL_API_KEY")
        self.api_url = os.getenv("FIRECRAWL_API_URL")

        self.app = FirecrawlApp(api_key=self.api_key, api_url=self.api_url)

    def scrape_website(self, url, collection_name):
        """Run the whole scraping flow in a single function"""
        try:
            # 1. Map the site's URLs. map_result is a MapResponse object,
            # not a dict, so read the links attribute directly. Keep at
            # most 10 links in every branch (the app's default limit).
            map_result = self.app.map_url(url)

            if hasattr(map_result, 'links'):
                links = map_result.links[:10]
            elif hasattr(map_result, 'data') and hasattr(map_result.data, 'links'):
                links = map_result.data.links[:10]
            else:
                # Fallback if neither attribute shape matches
                links = getattr(map_result, 'links', [])[:10]

            if not links:
                raise Exception("No links found!")

            print(f"Found {len(links)} links")

            # 2. Scrape the pages. Note: batch_scrape_urls takes just the
            # list of URLs as its only positional argument.
            scrape_result = self.app.batch_scrape_urls(links)

            # 3. Extract the data from the result
            if hasattr(scrape_result, 'data'):
                scraped_data = scrape_result.data
            else:
                scraped_data = scrape_result.get("data", []) if hasattr(scrape_result, 'get') else []

            # 4. Save each page as a Markdown file
            collection_path = f"data/collections/{collection_name}"
            os.makedirs(collection_path, exist_ok=True)

            saved_count = 0
            for i, page in enumerate(scraped_data, 1):
                # The markdown may live on the page object, on page.data,
                # or in a plain dict, depending on the SDK version
                if hasattr(page, 'markdown') and page.markdown:
                    markdown_content = page.markdown
                elif hasattr(page, 'data') and hasattr(page.data, 'markdown'):
                    markdown_content = page.data.markdown
                elif isinstance(page, dict) and page.get("markdown"):
                    markdown_content = page["markdown"]
                else:
                    continue

                with open(f"{collection_path}/{i}.md", "w", encoding="utf-8") as f:
                    f.write(markdown_content)
                saved_count += 1

            return {"success": True, "files": saved_count}

        except Exception as e:
            print(f"Scraping error: {str(e)}")
            return {"success": False, "error": str(e)}
--------------------------------------------------------------------------------
/docstoteles/service/rag.py:
--------------------------------------------------------------------------------
import os
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

class RAGService:
    def __init__(self):
        # Initialize embeddings (runs locally via sentence-transformers)
        self.embeddings = HuggingFaceEmbeddings(
            model_name="all-MiniLM-L6-v2"
        )

        # Initialize the Groq-hosted LLM
        self.llm = ChatGroq(
            groq_api_key=os.getenv("GROQ_API_KEY"),
            model_name="llama3-8b-8192"
        )

        # Text splitter: 1000-character chunks with 200 characters of overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200
        )

        self.vector_store = None
        self.qa_chain = None

    def load_collection(self, collection_name):
        """Load the collection's documents and build the vector store"""
        collection_path = f"data/collections/{collection_name}"

        # Load every Markdown file in the collection
        loader = DirectoryLoader(
            collection_path,
            glob="**/*.md",
            loader_cls=TextLoader,
            loader_kwargs={'encoding': 'utf-8'}
        )

        documents = loader.load()

        if not documents:
            return False

        # Split into chunks
        texts = self.text_splitter.split_documents(documents)

        # Build the vector store
        self.vector_store = FAISS.from_documents(texts, self.embeddings)

        # Build the QA chain
        template = """
        Use the following documents to answer the question. If you don't know the answer, say you don't know.

        {context}

        Question: {question}
        Answer:
        """

        prompt = PromptTemplate(
            template=template,
            input_variables=["context", "question"]
        )

        self.qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=self.vector_store.as_retriever(search_kwargs={"k": 3}),
            chain_type_kwargs={"prompt": prompt}
        )

        return True

    def ask_question(self, question):
        """Answer a question using RAG"""
        if not self.qa_chain:
            return "No collection loaded."

        try:
            result = self.qa_chain.run(question)
            return result
        except Exception as e:
            return f"Error while processing the question: {str(e)}"
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# Abstra
# Abstra is an AI-powered process automation framework.
# Ignore directories containing user credentials, local state, and settings.
# Learn more at https://abstra.io/docs
.abstra/

# Visual Studio Code
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
# and can be added to the global gitignore or merged into this file. However, if you prefer,
# you could uncomment the following to ignore the entire vscode folder
# .vscode/

# Ruff stuff:
.ruff_cache/

# PyPI configuration file
.pypirc

# Cursor
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
# refer to https://docs.cursor.com/context/ignore-files
.cursorignore
.cursorindexingignore
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# 📚 Docstóteles - ALWAYS UP-TO-DATE AI (Web Scraping + RAG)

Turn any documentation into an up-to-date AI assistant!
Build a chat that answers questions about any technology, using smart scraping and RAG, with 100% free tools.

## ✨ What is Docstóteles?

Docstóteles is an application that combines smart web scraping (Firecrawl) with RAG (LangChain + Groq) to create an AI assistant that knows any documentation on the web.
You paste the link to a documentation site (Django, React, Vue, etc.), and the app downloads everything, indexes it, and creates a chat for fresh, up-to-date questions and answers.
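
Under the hood, the whole pipeline boils down to the two service classes documented later in this README. A minimal end-to-end sketch, assuming the `.env` keys from the Installation section are set (run it from the project root with `PYTHONPATH=docstoteles` so the `service` package resolves; the collection name is just an example):

```python
from dotenv import load_dotenv
from service.scraping import ScrapingService
from service.rag import RAGService

load_dotenv()  # loads GROQ_API_KEY, FIRECRAWL_API_KEY, FIRECRAWL_API_URL

# 1. Scrape a docs site into data/collections/streamlit-docs/
print(ScrapingService().scrape_website("https://docs.streamlit.io", "streamlit-docs"))

# 2. Index the collection and ask a question about it
rag = RAGService()
if rag.load_collection("streamlit-docs"):
    print(rag.ask_question("How do I create a button in Streamlit?"))
```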

## 🚀 Technologies Used

- [Streamlit](https://streamlit.io/) — Graphical interface
- [Firecrawl](https://firecrawl.dev/) — Smart web scraping
- [Groq API](https://console.groq.com/) — Free LLM
- [LangChain](https://python.langchain.com/) — RAG orchestration
- [Hugging Face](https://huggingface.co/) — Embeddings
- [FAISS](https://github.com/facebookresearch/faiss) — Vector store

## 🛠️ Installation

1. **Clone the repository:**
```bash
git clone https://github.com/asimov-academy/video-docstoteles-material.git
cd video-docstoteles-material
```

2. **Create and activate a virtual environment (recommended):**
```bash
python3 -m venv .venv
source .venv/bin/activate
```
> On Windows, use: `.venv\Scripts\activate`

3. **Install the dependencies:**
```bash
pip install -r requirements.txt
```

4. **Configure the API keys:**
- Create a `.env` file at the project root and fill in your keys:
```
GROQ_API_KEY=your_groq_key
FIRECRAWL_API_KEY=your_firecrawl_key
FIRECRAWL_API_URL=firecrawl_url
```

5. **Create the required folders:**
```bash
mkdir -p data/collections
```

## 🏃‍♂️ How to run

```bash
streamlit run docstoteles/app.py
```

Open the app in your browser using the link shown in the terminal.

---

## 📝 How to use

### 1. Scraping

- Switch to "Scraping" mode in the sidebar.
- Paste the documentation URL (e.g. https://docs.streamlit.io).
- Give the collection a name.
- Click "Start Scraping".
- Wait for the files to download.

### 2. Chat

- Select "Chat" mode in the sidebar.
- Pick the collection you created.
- Ask anything about the documentation!

---

## 🌐 Suggested sites to try

- https://docs.streamlit.io
- https://python.langchain.com/docs
- https://docs.python.org/3/tutorial

---

## 📦 Project Structure

```
docstoteles/
  app.py
  presentation/
    scraping.py
    chat.py
  service/
    scraping.py
    rag.py
data/
  collections/
requirements.txt
README.md
.env (you must create this)
```

---

## 💡 Tips

- The project is a starting point: you can extend it, hook up other models, add file uploads, etc.
- Firecrawl and Groq are free (Groq doesn't ask for a credit card).
- The scraper downloads up to 10 pages by default (adjust it in the code if you want more; see the sketch below).
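
Concretely, the limit is the `[:10]` slice applied to the link list inside `scrape_website` (`docstoteles/service/scraping.py`). A sketch of the edit; `MAX_PAGES` is an illustrative name, not something that already exists in the code:

```python
MAX_PAGES = 25  # replaces the hard-coded 10

if hasattr(map_result, 'links'):
    links = map_result.links[:MAX_PAGES]
elif hasattr(map_result, 'data') and hasattr(map_result.data, 'links'):
    links = map_result.data.links[:MAX_PAGES]
else:
    links = getattr(map_result, 'links', [])[:MAX_PAGES]
```

More pages means more scraping and a longer indexing step, so raise the limit gradually.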

---

## 🧑‍💻 Contribute!

Suggestions, issues, and PRs are welcome!

---

# 🚦 Docstóteles Step by Step

## 1️⃣ Basic Setup

### 1.1 Install the dependencies

```bash
pip install streamlit python-dotenv groq firecrawl langchain langchain-community langchain-groq faiss-cpu sentence-transformers
```

### 1.2 Create the `.env` file

```env
GROQ_API_KEY=your_groq_key
FIRECRAWL_API_KEY=your_firecrawl_key
FIRECRAWL_API_URL=firecrawl_url
```

### 1.3 Main App Structure (`docstoteles/app.py`)

```python
import streamlit as st
import os
from dotenv import load_dotenv
from presentation import scraping
from presentation import chat

load_dotenv()

st.set_page_config(page_title="Docstóteles", page_icon="📚", layout="wide")
st.title("📚 Docstóteles - Simple RAG")

# Sidebar for mode and collection selection
with st.sidebar:
    st.header("Collections")
    mode = st.radio("Mode:", ["Chat", "Scraping"])
    st.divider()
    st.subheader("Available Collections")
    collections_dir = "data/collections"
    if os.path.exists(collections_dir):
        collections = [d for d in os.listdir(collections_dir)
                       if os.path.isdir(os.path.join(collections_dir, d))]
        for collection in collections:
            col1, col2 = st.columns([3, 1])
            with col1:
                st.write(f"📁 {collection}")
            with col2:
                if st.button("Use", key=f"use_{collection}"):
                    st.session_state.collection = collection
                    st.rerun()

# Session state
if "messages" not in st.session_state:
    st.session_state.messages = []
if "collection" not in st.session_state:
    st.session_state.collection = None

# Route to the selected page
if mode == "Scraping":
    scraping.show()
else:
    chat.show()
```

---

## 2️⃣ Scraping System

### 2.1 Directory Structure

```bash
mkdir -p docstoteles/presentation docstoteles/service data/collections
```

### 2.2 Scraping Service (`docstoteles/service/scraping.py`)

```python
import os
from firecrawl import FirecrawlApp

class ScrapingService:
    def __init__(self):
        self.api_key = os.getenv("FIRECRAWL_API_KEY")
        self.api_url = os.getenv("FIRECRAWL_API_URL")
        self.app = FirecrawlApp(api_key=self.api_key, api_url=self.api_url)

    def scrape_website(self, url, collection_name):
        """Run the whole scraping flow in a single function"""
        try:
            # 1. Map the site's URLs (map_url returns a MapResponse object)
            map_result = self.app.map_url(url)
            if hasattr(map_result, 'links'):
                links = map_result.links[:10]
            elif hasattr(map_result, 'data') and hasattr(map_result.data, 'links'):
                links = map_result.data.links[:10]
            else:
                links = getattr(map_result, 'links', [])[:10]
            if not links:
                raise Exception("No links found!")
            print(f"Found {len(links)} links")
            # 2. Scrape the pages (batch_scrape_urls takes just the URL list)
            scrape_result = self.app.batch_scrape_urls(links)
            # 3. Extract the data from the result
            if hasattr(scrape_result, 'data'):
                scraped_data = scrape_result.data
            else:
                scraped_data = scrape_result.get("data", []) if hasattr(scrape_result, 'get') else []
            # 4. Save each page as a Markdown file
            collection_path = f"data/collections/{collection_name}"
            os.makedirs(collection_path, exist_ok=True)
            saved_count = 0
            for i, page in enumerate(scraped_data, 1):
                if hasattr(page, 'markdown') and page.markdown:
                    markdown_content = page.markdown
                elif hasattr(page, 'data') and hasattr(page.data, 'markdown'):
                    markdown_content = page.data.markdown
                elif isinstance(page, dict) and page.get("markdown"):
                    markdown_content = page["markdown"]
                else:
                    continue
                with open(f"{collection_path}/{i}.md", "w", encoding="utf-8") as f:
                    f.write(markdown_content)
                saved_count += 1
            return {"success": True, "files": saved_count}
        except Exception as e:
            print(f"Scraping error: {str(e)}")
            return {"success": False, "error": str(e)}
```
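
After a successful run, a collection is nothing more than a folder of numbered Markdown files under `data/collections/`; that folder is exactly what the RAG service indexes in the next step. An illustrative layout ("streamlit-docs" is just an example name):

```
data/collections/streamlit-docs/
├── 1.md
├── 2.md
└── ... (up to 10.md by default)
```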

### 2.3 Scraping Page (`docstoteles/presentation/scraping.py`)

```python
import streamlit as st
import os
from service.scraping import ScrapingService

def show():
    st.header("🔍 Web Scraping")
    scraper = ScrapingService()
    with st.form("scraping_form"):
        url = st.text_input("Site URL:", placeholder="https://example.com")
        collection_name = st.text_input("Collection name:", placeholder="my-collection")
        submitted = st.form_submit_button("Start Scraping")
    if submitted and url and collection_name:
        with st.spinner("Extracting content..."):
            result = scraper.scrape_website(url, collection_name)
        if result["success"]:
            st.success(f"✅ {result['files']} files saved!")
            if st.button("Go to Chat"):
                st.rerun()
        else:
            st.error(f"❌ Error: {result['error']}")
    st.divider()
    st.subheader("Available Collections")
    collections_dir = "data/collections"
    if os.path.exists(collections_dir):
        collections = [d for d in os.listdir(collections_dir)
                       if os.path.isdir(os.path.join(collections_dir, d))]
        for collection in collections:
            col1, col2 = st.columns([3, 1])
            with col1:
                st.write(f"📁 {collection}")
```

---

## 3️⃣ RAG System

### 3.1 RAG Dependencies

```bash
pip install langchain langchain-community langchain-groq faiss-cpu
```

### 3.2 RAG Service with LangChain (`docstoteles/service/rag.py`)

```python
import os
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

class RAGService:
    def __init__(self):
        self.embeddings = HuggingFaceEmbeddings(
            model_name="all-MiniLM-L6-v2"
        )
        self.llm = ChatGroq(
            groq_api_key=os.getenv("GROQ_API_KEY"),
            model_name="llama3-8b-8192"
        )
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200
        )
        self.vector_store = None
        self.qa_chain = None

    def load_collection(self, collection_name):
        collection_path = f"data/collections/{collection_name}"
        loader = DirectoryLoader(
            collection_path,
            glob="**/*.md",
            loader_cls=TextLoader,
            loader_kwargs={'encoding': 'utf-8'}
        )
        documents = loader.load()
        if not documents:
            return False
        texts = self.text_splitter.split_documents(documents)
        self.vector_store = FAISS.from_documents(texts, self.embeddings)
        template = """
        Use the following documents to answer the question. If you don't know the answer, say you don't know.

        {context}

        Question: {question}
        Answer:
        """
        prompt = PromptTemplate(
            template=template,
            input_variables=["context", "question"]
        )
        self.qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=self.vector_store.as_retriever(search_kwargs={"k": 3}),
            chain_type_kwargs={"prompt": prompt}
        )
        return True

    def ask_question(self, question):
        if not self.qa_chain:
            return "No collection loaded."
        try:
            result = self.qa_chain.run(question)
            return result
        except Exception as e:
            return f"Error while processing the question: {str(e)}"
```
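
Before wiring this into the chat page, it helps to see what the retriever actually hands the LLM: the `k=3` most similar chunks, which the "stuff" chain pastes into the prompt's `{context}`. A small sketch, assuming the `streamlit-docs` collection from earlier exists (run from the project root with `PYTHONPATH=docstoteles` so the import resolves):

```python
from dotenv import load_dotenv
from service.rag import RAGService

load_dotenv()
rag = RAGService()
rag.load_collection("streamlit-docs")

# Peek at the k=3 chunks retrieved for a query, before any LLM call
docs = rag.vector_store.similarity_search("How do I create a button?", k=3)
for d in docs:
    print(d.metadata.get("source"), "->", d.page_content[:80].replace("\n", " "))
```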

### 3.3 Chat Page (`docstoteles/presentation/chat.py`)

```python
import streamlit as st
from service.rag import RAGService

def show():
    st.header("💬 Chat with the Documentation")
    if "collection" not in st.session_state or not st.session_state.collection:
        st.warning("Select a collection in the sidebar to get started.")
        return
    rag = RAGService()
    loaded = rag.load_collection(st.session_state.collection)
    if not loaded:
        st.error("Could not load the selected collection.")
        return
    if "messages" not in st.session_state:
        st.session_state.messages = []
    question = st.text_input("Ask something about the documentation:")
    if st.button("Send") and question:
        with st.spinner("Querying the AI..."):
            answer = rag.ask_question(question)
            st.session_state.messages.append((question, answer))
    st.divider()
    st.subheader("History")
    for q, a in st.session_state.messages[::-1]:
        st.markdown(f"**You:** {q}")
        st.markdown(f"**Docstóteles:** {a}")
```

---
--------------------------------------------------------------------------------