├── requirements.txt
├── LICENSE
├── docstoteles
│   ├── presentation
│   │   ├── chat.py
│   │   └── scraping.py
│   ├── app.py
│   └── service
│       ├── scraping.py
│       └── rag.py
├── .gitignore
└── README.md

/requirements.txt:
--------------------------------------------------------------------------------
streamlit
python-dotenv
groq
firecrawl
langchain
langchain-community
langchain-groq
faiss-cpu
sentence-transformers
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2025 Asimov Academy

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/docstoteles/presentation/chat.py:
--------------------------------------------------------------------------------
import streamlit as st
from service.rag import RAGService

def show():
    st.header("💬 Chat with the Documentation")

    if "collection" not in st.session_state or not st.session_state.collection:
        st.warning("Select a collection in the sidebar to get started.")
        return

    rag = RAGService()
    loaded = rag.load_collection(st.session_state.collection)
    if not loaded:
        st.error("Could not load the selected collection.")
        return

    if "messages" not in st.session_state:
        st.session_state.messages = []

    question = st.text_input("Ask something about the documentation:")
    if st.button("Send") and question:
        with st.spinner("Querying the AI..."):
            answer = rag.ask_question(question)
            st.session_state.messages.append((question, answer))

    st.divider()
    st.subheader("History")
    # Show the newest exchange first
    for q, a in st.session_state.messages[::-1]:
        st.markdown(f"**You:** {q}")
        st.markdown(f"**Docstóteles:** {a}")
--------------------------------------------------------------------------------
/docstoteles/app.py:
--------------------------------------------------------------------------------
import streamlit as st
import os
from dotenv import load_dotenv
from presentation import scraping
from presentation import chat

load_dotenv()

st.set_page_config(page_title="Docstóteles", page_icon="📚", layout="wide")
st.title("📚 Docstóteles - Simple RAG")

# Sidebar for mode and collection selection
with st.sidebar:
    st.header("Collections")
    mode = st.radio("Mode:", ["Chat", "Scraping"])

    st.divider()
    st.subheader("Available Collections")

    collections_dir = "data/collections"
    if os.path.exists(collections_dir):
        collections = [d for d in os.listdir(collections_dir)
                       if os.path.isdir(os.path.join(collections_dir, d))]

        for collection in collections:
            col1, col2 = st.columns([3, 1])
            with col1:
                st.write(f"📁 {collection}")
            with col2:
                if st.button("Use", key=f"use_{collection}"):
                    st.session_state.collection = collection
                    st.rerun()

# Session state
if "messages" not in st.session_state:
    st.session_state.messages = []
if "collection" not in st.session_state:
    st.session_state.collection = None

# Route to the selected page
if mode == "Scraping":
    scraping.show()
else:
    chat.show()
--------------------------------------------------------------------------------
/docstoteles/presentation/scraping.py:
--------------------------------------------------------------------------------
import streamlit as st
import os
from service.scraping import ScrapingService

def show():
    st.header("🔍 Web Scraping")

    scraper = ScrapingService()

    with st.form("scraping_form"):
        url = st.text_input("Site URL:", placeholder="https://example.com")
        collection_name = st.text_input("Collection name:", placeholder="my-collection")
        submitted = st.form_submit_button("Start Scraping")

    if submitted and url and collection_name:
        with st.spinner("Extracting content..."):
            result = scraper.scrape_website(url, collection_name)

        if result["success"]:
            st.success(f"✅ {result['files']} files saved!")
            if st.button("Go to Chat"):
                st.rerun()
        else:
            st.error(f"❌ Error: {result['error']}")

    st.divider()
    st.subheader("Available Collections")

    collections_dir = "data/collections"
    if os.path.exists(collections_dir):
        collections = [d for d in os.listdir(collections_dir)
                       if os.path.isdir(os.path.join(collections_dir, d))]

        for collection in collections:
            col1, col2 = st.columns([3, 1])
            with col1:
                st.write(f"📁 {collection}")
--------------------------------------------------------------------------------
/docstoteles/service/scraping.py:
--------------------------------------------------------------------------------
import os
from firecrawl import FirecrawlApp

class ScrapingService:
    def __init__(self):
        self.api_key = os.getenv("FIRECRAWL_API_KEY")
        self.api_url = os.getenv("FIRECRAWL_API_URL")

        self.app = FirecrawlApp(api_key=self.api_key, api_url=self.api_url)

    def scrape_website(self, url, collection_name):
        """Run the whole scraping flow in a single function"""
        try:
            # 1. Map the site's URLs. map_result is a MapResponse object,
            # not a dict, so read the links attribute directly. Keep at
            # most 10 links in every branch (the app's default limit).
            map_result = self.app.map_url(url)

            if hasattr(map_result, 'links'):
                links = map_result.links[:10]
            elif hasattr(map_result, 'data') and hasattr(map_result.data, 'links'):
                links = map_result.data.links[:10]
            else:
                # Fallback if neither attribute shape matches
                links = getattr(map_result, 'links', [])[:10]

            if not links:
                raise Exception("No links found!")

            print(f"Found {len(links)} links")

            # 2. Scrape the pages. Note: batch_scrape_urls takes just the
            # list of URLs as its only positional argument.
            scrape_result = self.app.batch_scrape_urls(links)

            # 3. Extract the data from the result
            if hasattr(scrape_result, 'data'):
                scraped_data = scrape_result.data
            else:
                scraped_data = scrape_result.get("data", []) if hasattr(scrape_result, 'get') else []

            # 4. Save each page as a Markdown file
            collection_path = f"data/collections/{collection_name}"
            os.makedirs(collection_path, exist_ok=True)

            saved_count = 0
            for i, page in enumerate(scraped_data, 1):
                # The markdown may live on the page object, on page.data,
                # or in a plain dict, depending on the SDK version
                if hasattr(page, 'markdown') and page.markdown:
                    markdown_content = page.markdown
                elif hasattr(page, 'data') and hasattr(page.data, 'markdown'):
                    markdown_content = page.data.markdown
                elif isinstance(page, dict) and page.get("markdown"):
                    markdown_content = page["markdown"]
                else:
                    continue

                with open(f"{collection_path}/{i}.md", "w", encoding="utf-8") as f:
                    f.write(markdown_content)
                saved_count += 1

            return {"success": True, "files": saved_count}

        except Exception as e:
            print(f"Scraping error: {str(e)}")
            return {"success": False, "error": str(e)}
--------------------------------------------------------------------------------
/docstoteles/service/rag.py:
--------------------------------------------------------------------------------
import os
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

class RAGService:
    def __init__(self):
        # Initialize embeddings (runs locally via sentence-transformers)
        self.embeddings = HuggingFaceEmbeddings(
            model_name="all-MiniLM-L6-v2"
        )

        # Initialize the Groq-hosted LLM
        self.llm = ChatGroq(
            groq_api_key=os.getenv("GROQ_API_KEY"),
            model_name="llama3-8b-8192"
        )

        # Text splitter: 1000-character chunks with 200 characters of overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200
        )

        self.vector_store = None
        self.qa_chain = None

    def load_collection(self, collection_name):
        """Load the collection's documents and build the vector store"""
        collection_path = f"data/collections/{collection_name}"

        # Load every Markdown file in the collection
        loader = DirectoryLoader(
            collection_path,
            glob="**/*.md",
            loader_cls=TextLoader,
            loader_kwargs={'encoding': 'utf-8'}
        )

        documents = loader.load()

        if not documents:
            return False

        # Split into chunks
        texts = self.text_splitter.split_documents(documents)

        # Build the vector store
        self.vector_store = FAISS.from_documents(texts, self.embeddings)

        # Build the QA chain
        template = """
        Use the following documents to answer the question. If you don't know the answer, say you don't know.

        {context}

        Question: {question}
        Answer:
        """

        prompt = PromptTemplate(
            template=template,
            input_variables=["context", "question"]
        )

        self.qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=self.vector_store.as_retriever(search_kwargs={"k": 3}),
            chain_type_kwargs={"prompt": prompt}
        )

        return True

    def ask_question(self, question):
        """Answer a question using RAG"""
        if not self.qa_chain:
            return "No collection loaded."

        try:
            result = self.qa_chain.run(question)
            return result
        except Exception as e:
            return f"Error while processing the question: {str(e)}"
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# Abstra
# Abstra is an AI-powered process automation framework.
# Ignore directories containing user credentials, local state, and settings.
# Learn more at https://abstra.io/docs
.abstra/

# Visual Studio Code
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
# and can be added to the global gitignore or merged into this file. However, if you prefer,
# you could uncomment the following to ignore the entire vscode folder
# .vscode/

# Ruff stuff:
.ruff_cache/

# PyPI configuration file
.pypirc

# Cursor
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
# refer to https://docs.cursor.com/context/ignore-files
.cursorignore
.cursorindexingignore
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# 📚 Docstóteles - ALWAYS UP-TO-DATE AI (Web Scraping + RAG)

Turn any documentation into an up-to-date AI assistant!
Build a chat that answers questions about any technology, using smart scraping and RAG, with 100% free tools.

## ✨ What is Docstóteles?

Docstóteles is an application that combines smart web scraping (Firecrawl) with RAG (LangChain + Groq) to create an AI assistant that knows any documentation on the web.
You paste the link to a documentation site (Django, React, Vue, etc.), and the app downloads everything, indexes it, and creates a chat for fresh, up-to-date questions and answers.
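
Under the hood, the whole pipeline boils down to the two service classes documented later in this README. A minimal end-to-end sketch, assuming the `.env` keys from the Installation section are set (run it from the project root with `PYTHONPATH=docstoteles` so the `service` package resolves; the collection name is just an example):

```python
from dotenv import load_dotenv
from service.scraping import ScrapingService
from service.rag import RAGService

load_dotenv()  # loads GROQ_API_KEY, FIRECRAWL_API_KEY, FIRECRAWL_API_URL

# 1. Scrape a docs site into data/collections/streamlit-docs/
print(ScrapingService().scrape_website("https://docs.streamlit.io", "streamlit-docs"))

# 2. Index the collection and ask a question about it
rag = RAGService()
if rag.load_collection("streamlit-docs"):
    print(rag.ask_question("How do I create a button in Streamlit?"))
```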

## 🚀 Technologies Used

- [Streamlit](https://streamlit.io/) — Graphical interface
- [Firecrawl](https://firecrawl.dev/) — Smart web scraping
- [Groq API](https://console.groq.com/) — Free LLM
- [LangChain](https://python.langchain.com/) — RAG orchestration
- [Hugging Face](https://huggingface.co/) — Embeddings
- [FAISS](https://github.com/facebookresearch/faiss) — Vector store

## 🛠️ Installation

1. **Clone the repository:**
```bash
git clone https://github.com/asimov-academy/video-docstoteles-material.git
cd video-docstoteles-material
```

2. **Create and activate a virtual environment (recommended):**
```bash
python3 -m venv .venv
source .venv/bin/activate
```
> On Windows, use: `.venv\Scripts\activate`

3. **Install the dependencies:**
```bash
pip install -r requirements.txt
```

4. **Configure the API keys:**
- Create a `.env` file at the project root and fill in your keys:
```
GROQ_API_KEY=your_groq_key
FIRECRAWL_API_KEY=your_firecrawl_key
FIRECRAWL_API_URL=firecrawl_url
```

5. **Create the required folders:**
```bash
mkdir -p data/collections
```

## 🏃‍♂️ How to run

```bash
streamlit run docstoteles/app.py
```

Open the app in your browser using the link shown in the terminal.

---

## 📝 How to use

### 1. Scraping

- Switch to "Scraping" mode in the sidebar.
- Paste the documentation URL (e.g. https://docs.streamlit.io).
- Give the collection a name.
- Click "Start Scraping".
- Wait for the files to download.

### 2. Chat

- Select "Chat" mode in the sidebar.
- Pick the collection you created.
- Ask anything about the documentation!

---

## 🌐 Suggested sites to try

- https://docs.streamlit.io
- https://python.langchain.com/docs
- https://docs.python.org/3/tutorial

---

## 📦 Project Structure

```
docstoteles/
  app.py
  presentation/
    scraping.py
    chat.py
  service/
    scraping.py
    rag.py
data/
  collections/
requirements.txt
README.md
.env (you must create this)
```

---

## 💡 Tips

- The project is a starting point: you can extend it, hook up other models, add file uploads, etc.
- Firecrawl and Groq are free (Groq doesn't ask for a credit card).
- The scraper downloads up to 10 pages by default (adjust it in the code if you want more; see the sketch below).
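
Concretely, the limit is the `[:10]` slice applied to the link list inside `scrape_website` (`docstoteles/service/scraping.py`). A sketch of the edit; `MAX_PAGES` is an illustrative name, not something that already exists in the code:

```python
MAX_PAGES = 25  # replaces the hard-coded 10

if hasattr(map_result, 'links'):
    links = map_result.links[:MAX_PAGES]
elif hasattr(map_result, 'data') and hasattr(map_result.data, 'links'):
    links = map_result.data.links[:MAX_PAGES]
else:
    links = getattr(map_result, 'links', [])[:MAX_PAGES]
```

More pages means more scraping and a longer indexing step, so raise the limit gradually.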

---

## 🧑‍💻 Contribute!

Suggestions, issues, and PRs are welcome!

---

# 🚦 Docstóteles Step by Step

## 1️⃣ Basic Setup

### 1.1 Install the dependencies

```bash
pip install streamlit python-dotenv groq firecrawl langchain langchain-community langchain-groq faiss-cpu sentence-transformers
```

### 1.2 Create the `.env` file

```env
GROQ_API_KEY=your_groq_key
FIRECRAWL_API_KEY=your_firecrawl_key
FIRECRAWL_API_URL=firecrawl_url
```

### 1.3 Main App Structure (`docstoteles/app.py`)

```python
import streamlit as st
import os
from dotenv import load_dotenv
from presentation import scraping
from presentation import chat

load_dotenv()

st.set_page_config(page_title="Docstóteles", page_icon="📚", layout="wide")
st.title("📚 Docstóteles - Simple RAG")

# Sidebar for mode and collection selection
with st.sidebar:
    st.header("Collections")
    mode = st.radio("Mode:", ["Chat", "Scraping"])
    st.divider()
    st.subheader("Available Collections")
    collections_dir = "data/collections"
    if os.path.exists(collections_dir):
        collections = [d for d in os.listdir(collections_dir)
                       if os.path.isdir(os.path.join(collections_dir, d))]
        for collection in collections:
            col1, col2 = st.columns([3, 1])
            with col1:
                st.write(f"📁 {collection}")
            with col2:
                if st.button("Use", key=f"use_{collection}"):
                    st.session_state.collection = collection
                    st.rerun()

# Session state
if "messages" not in st.session_state:
    st.session_state.messages = []
if "collection" not in st.session_state:
    st.session_state.collection = None

# Route to the selected page
if mode == "Scraping":
    scraping.show()
else:
    chat.show()
```

---

## 2️⃣ Scraping System

### 2.1 Directory Structure

```bash
mkdir -p docstoteles/presentation docstoteles/service data/collections
```

### 2.2 Scraping Service (`docstoteles/service/scraping.py`)

```python
import os
from firecrawl import FirecrawlApp

class ScrapingService:
    def __init__(self):
        self.api_key = os.getenv("FIRECRAWL_API_KEY")
        self.api_url = os.getenv("FIRECRAWL_API_URL")
        self.app = FirecrawlApp(api_key=self.api_key, api_url=self.api_url)

    def scrape_website(self, url, collection_name):
        """Run the whole scraping flow in a single function"""
        try:
            # 1. Map the site's URLs (map_url returns a MapResponse object)
            map_result = self.app.map_url(url)
            if hasattr(map_result, 'links'):
                links = map_result.links[:10]
            elif hasattr(map_result, 'data') and hasattr(map_result.data, 'links'):
                links = map_result.data.links[:10]
            else:
                links = getattr(map_result, 'links', [])[:10]
            if not links:
                raise Exception("No links found!")
            print(f"Found {len(links)} links")
            # 2. Scrape the pages (batch_scrape_urls takes just the URL list)
            scrape_result = self.app.batch_scrape_urls(links)
            # 3. Extract the data from the result
            if hasattr(scrape_result, 'data'):
                scraped_data = scrape_result.data
            else:
                scraped_data = scrape_result.get("data", []) if hasattr(scrape_result, 'get') else []
            # 4. Save each page as a Markdown file
            collection_path = f"data/collections/{collection_name}"
            os.makedirs(collection_path, exist_ok=True)
            saved_count = 0
            for i, page in enumerate(scraped_data, 1):
                if hasattr(page, 'markdown') and page.markdown:
                    markdown_content = page.markdown
                elif hasattr(page, 'data') and hasattr(page.data, 'markdown'):
                    markdown_content = page.data.markdown
                elif isinstance(page, dict) and page.get("markdown"):
                    markdown_content = page["markdown"]
                else:
                    continue
                with open(f"{collection_path}/{i}.md", "w", encoding="utf-8") as f:
                    f.write(markdown_content)
                saved_count += 1
            return {"success": True, "files": saved_count}
        except Exception as e:
            print(f"Scraping error: {str(e)}")
            return {"success": False, "error": str(e)}
```
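
After a successful run, a collection is nothing more than a folder of numbered Markdown files under `data/collections/`; that folder is exactly what the RAG service indexes in the next step. An illustrative layout ("streamlit-docs" is just an example name):

```
data/collections/streamlit-docs/
├── 1.md
├── 2.md
└── ... (up to 10.md by default)
```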

### 2.3 Scraping Page (`docstoteles/presentation/scraping.py`)

```python
import streamlit as st
import os
from service.scraping import ScrapingService

def show():
    st.header("🔍 Web Scraping")
    scraper = ScrapingService()
    with st.form("scraping_form"):
        url = st.text_input("Site URL:", placeholder="https://example.com")
        collection_name = st.text_input("Collection name:", placeholder="my-collection")
        submitted = st.form_submit_button("Start Scraping")
    if submitted and url and collection_name:
        with st.spinner("Extracting content..."):
            result = scraper.scrape_website(url, collection_name)
        if result["success"]:
            st.success(f"✅ {result['files']} files saved!")
            if st.button("Go to Chat"):
                st.rerun()
        else:
            st.error(f"❌ Error: {result['error']}")
    st.divider()
    st.subheader("Available Collections")
    collections_dir = "data/collections"
    if os.path.exists(collections_dir):
        collections = [d for d in os.listdir(collections_dir)
                       if os.path.isdir(os.path.join(collections_dir, d))]
        for collection in collections:
            col1, col2 = st.columns([3, 1])
            with col1:
                st.write(f"📁 {collection}")
```

---

## 3️⃣ RAG System

### 3.1 RAG Dependencies

```bash
pip install langchain langchain-community langchain-groq faiss-cpu
```

### 3.2 RAG Service with LangChain (`docstoteles/service/rag.py`)

```python
import os
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

class RAGService:
    def __init__(self):
        self.embeddings = HuggingFaceEmbeddings(
            model_name="all-MiniLM-L6-v2"
        )
        self.llm = ChatGroq(
            groq_api_key=os.getenv("GROQ_API_KEY"),
            model_name="llama3-8b-8192"
        )
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200
        )
        self.vector_store = None
        self.qa_chain = None

    def load_collection(self, collection_name):
        collection_path = f"data/collections/{collection_name}"
        loader = DirectoryLoader(
            collection_path,
            glob="**/*.md",
            loader_cls=TextLoader,
            loader_kwargs={'encoding': 'utf-8'}
        )
        documents = loader.load()
        if not documents:
            return False
        texts = self.text_splitter.split_documents(documents)
        self.vector_store = FAISS.from_documents(texts, self.embeddings)
        template = """
        Use the following documents to answer the question. If you don't know the answer, say you don't know.

        {context}

        Question: {question}
        Answer:
        """
        prompt = PromptTemplate(
            template=template,
            input_variables=["context", "question"]
        )
        self.qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=self.vector_store.as_retriever(search_kwargs={"k": 3}),
            chain_type_kwargs={"prompt": prompt}
        )
        return True

    def ask_question(self, question):
        if not self.qa_chain:
            return "No collection loaded."
        try:
            result = self.qa_chain.run(question)
            return result
        except Exception as e:
            return f"Error while processing the question: {str(e)}"
```
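
Before wiring this into the chat page, it helps to see what the retriever actually hands the LLM: the `k=3` most similar chunks, which the "stuff" chain pastes into the prompt's `{context}`. A small sketch, assuming the `streamlit-docs` collection from earlier exists (run from the project root with `PYTHONPATH=docstoteles` so the import resolves):

```python
from dotenv import load_dotenv
from service.rag import RAGService

load_dotenv()
rag = RAGService()
rag.load_collection("streamlit-docs")

# Peek at the k=3 chunks retrieved for a query, before any LLM call
docs = rag.vector_store.similarity_search("How do I create a button?", k=3)
for d in docs:
    print(d.metadata.get("source"), "->", d.page_content[:80].replace("\n", " "))
```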

### 3.3 Chat Page (`docstoteles/presentation/chat.py`)

```python
import streamlit as st
from service.rag import RAGService

def show():
    st.header("💬 Chat with the Documentation")
    if "collection" not in st.session_state or not st.session_state.collection:
        st.warning("Select a collection in the sidebar to get started.")
        return
    rag = RAGService()
    loaded = rag.load_collection(st.session_state.collection)
    if not loaded:
        st.error("Could not load the selected collection.")
        return
    if "messages" not in st.session_state:
        st.session_state.messages = []
    question = st.text_input("Ask something about the documentation:")
    if st.button("Send") and question:
        with st.spinner("Querying the AI..."):
            answer = rag.ask_question(question)
            st.session_state.messages.append((question, answer))
    st.divider()
    st.subheader("History")
    for q, a in st.session_state.messages[::-1]:
        st.markdown(f"**You:** {q}")
        st.markdown(f"**Docstóteles:** {a}")
```

---
--------------------------------------------------------------------------------