├── .dockerignore ├── src ├── __init__.py ├── email │ ├── __init__.py │ └── send_email.py ├── service │ ├── __init__.py │ └── main.py ├── etls │ ├── boa │ │ ├── __init__.py │ │ ├── defs.py │ │ ├── metadata.py │ │ ├── load.py │ │ └── scrapper.py │ ├── bocm │ │ ├── __init__.py │ │ ├── defs.py │ │ ├── README.md │ │ ├── metadata.py │ │ ├── load.py │ │ ├── utils.py │ │ └── scrapper.py │ ├── boe │ │ ├── __init__.py │ │ ├── defs.py │ │ ├── metadata.py │ │ ├── load.py │ │ ├── loading │ │ │ ├── documents.py │ │ │ └── defs_id_largos.py │ │ └── scrapper.py │ ├── boja │ │ ├── __init__.py │ │ ├── defs.py │ │ ├── README.md │ │ ├── utils.py │ │ ├── metadata.py │ │ ├── load.py │ │ └── scrapper.py │ ├── bopv │ │ ├── __init__.py │ │ ├── defs.py │ │ ├── README.md │ │ ├── metadata.py │ │ ├── load.py │ │ └── scrapper.py │ ├── template │ │ ├── __init__.py │ │ ├── defs.py │ │ ├── metadata.py │ │ ├── scrapper.py │ │ ├── README.md │ │ └── load.py │ ├── bopz │ │ ├── defs.py │ │ ├── README.md │ │ ├── utils.py │ │ ├── metadata.py │ │ ├── load.py │ │ └── scrapper.py │ ├── common │ │ ├── metadata.py │ │ ├── scrapper.py │ │ ├── etl.py │ │ └── utils.py │ ├── jobs.py │ └── utils.py ├── utils.py └── initialize.py ├── benchmark ├── requirements.txt ├── output │ ├── load_test_sync_failures.csv │ ├── load_test_async_exceptions.csv │ ├── load_test_sync_exceptions.csv │ ├── response_time_async_failures.csv │ ├── response_time_sync_failures.csv │ ├── response_time_async_exceptions.csv │ ├── response_time_sync_exceptions.csv │ ├── load_test_async_failures.csv │ ├── response_time_async_stats.csv │ ├── response_time_sync_stats.csv │ ├── load_test_sync_stats.csv │ ├── load_test_async_stats.csv │ ├── response_time_async_stats_history.csv │ └── response_time_sync_stats_history.csv ├── benchmark.py └── README.md ├── bin ├── build └── run ├── Makefile ├── config ├── example_qdrant_local.yaml ├── qlora.yaml └── config.yaml ├── render.yaml ├── Dockerfile ├── doc ├── cron_etl_daily_public.sh ├── supabase │ ├── query.sql │ └── starting.md ├── crontab_e.sh ├── qdrant │ └── queries.json └── deployment_guide.md ├── evaluation └── embeddings │ ├── eval.py │ ├── README.md │ ├── defs.py │ └── questions.py ├── requirements.txt ├── research └── fine-tuning-embedding-model │ ├── README.md │ └── 1.5-CheckDataset.ipynb ├── LICENSE ├── .github └── workflows │ └── bandit.yml ├── .gitignore └── README.md /.dockerignore: -------------------------------------------------------------------------------- 1 | .git -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/email/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/service/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/etls/boa/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/etls/bocm/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/etls/boe/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/etls/boja/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/etls/bopv/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/etls/template/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /benchmark/requirements.txt: -------------------------------------------------------------------------------- 1 | locust==2.16.1 -------------------------------------------------------------------------------- /src/etls/boa/defs.py: -------------------------------------------------------------------------------- 1 | COLLECTION_NAME = "boa" -------------------------------------------------------------------------------- /src/etls/boja/defs.py: -------------------------------------------------------------------------------- 1 | COLLECTION_NAME = "boja" -------------------------------------------------------------------------------- /src/etls/bopv/defs.py: -------------------------------------------------------------------------------- 1 | COLLECTION_NAME = "bopv" -------------------------------------------------------------------------------- /src/etls/bocm/defs.py: -------------------------------------------------------------------------------- 1 | COLLECTION_NAME = "bocm" 2 | -------------------------------------------------------------------------------- /src/etls/boe/defs.py: -------------------------------------------------------------------------------- 1 | COLLECTION_NAME = "justicio" 2 | -------------------------------------------------------------------------------- /src/etls/bopz/defs.py: -------------------------------------------------------------------------------- 1 | COLLECTION_NAME = "bopz" 2 | -------------------------------------------------------------------------------- /src/etls/template/defs.py: -------------------------------------------------------------------------------- 1 | COLLECTION_NAME = "template" 2 | -------------------------------------------------------------------------------- /benchmark/output/load_test_sync_failures.csv: -------------------------------------------------------------------------------- 1 | Method,Name,Error,Occurrences 2 | -------------------------------------------------------------------------------- /benchmark/output/load_test_async_exceptions.csv: -------------------------------------------------------------------------------- 1 | Count,Message,Traceback,Nodes 2 | -------------------------------------------------------------------------------- /benchmark/output/load_test_sync_exceptions.csv: -------------------------------------------------------------------------------- 1 | Count,Message,Traceback,Nodes 2 | -------------------------------------------------------------------------------- /benchmark/output/response_time_async_failures.csv: -------------------------------------------------------------------------------- 1 | Method,Name,Error,Occurrences 2 | -------------------------------------------------------------------------------- /benchmark/output/response_time_sync_failures.csv: 
-------------------------------------------------------------------------------- 1 | Method,Name,Error,Occurrences 2 | -------------------------------------------------------------------------------- /benchmark/output/response_time_async_exceptions.csv: -------------------------------------------------------------------------------- 1 | Count,Message,Traceback,Nodes 2 | -------------------------------------------------------------------------------- /benchmark/output/response_time_sync_exceptions.csv: -------------------------------------------------------------------------------- 1 | Count,Message,Traceback,Nodes 2 | -------------------------------------------------------------------------------- /bin/build: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -xeuo pipefail 4 | 5 | docker build -t ia-boe:latest . -------------------------------------------------------------------------------- /src/etls/common/metadata.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | 4 | class MetadataDocument(BaseModel): 5 | # Source 6 | source_name: str 7 | source_type: str 8 | -------------------------------------------------------------------------------- /bin/run: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -euo pipefail 4 | 5 | docker run -v $(pwd):/usr/app \ 6 | -e PINECONE_API_KEY \ 7 | -e PINECONE_ENV \ 8 | ia-boe:latest 9 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | init: 2 | pip install -r requirements.txt 3 | 4 | isort: 5 | isort --check-only src evaluation 6 | 7 | format: isort 8 | black src evaluation 9 | 10 | isort-fix: 11 | isort src evaluation 12 | -------------------------------------------------------------------------------- /benchmark/output/load_test_async_failures.csv: -------------------------------------------------------------------------------- 1 | Method,Name,Error,Occurrences 2 | GET,/aqa,"ConnectionResetError(54, 'Connection reset by peer')",16 3 | GET,/aqa,HTTPError('500 Server Error: Internal Server Error for url: /aqa'),33 4 | -------------------------------------------------------------------------------- /src/etls/template/metadata.py: -------------------------------------------------------------------------------- 1 | """ 2 | Define the metadata to be stored in the embedding database. 
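Subclass MetadataDocument (src/etls/common/metadata.py) and declare the fields your source needs; see src/etls/boe/metadata.py for a complete example.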
3 | """
4 | 
5 | from src.etls.common.metadata import MetadataDocument
6 | 
7 | 
8 | class TemplateMetadataDocument(MetadataDocument):
9 |     pass
10 | 
--------------------------------------------------------------------------------
/config/example_qdrant_local.yaml:
--------------------------------------------------------------------------------
1 | # You can create your own config file
2 | # You can generate your keys using your CLI:
3 | # openssl rand -hex 32
4 | service:
5 |   api_key: 823e071f67c198cc05c73f8bd4580865e6a8819a1f3fe57d2cd49b5c892a5233
6 |   read_only_api_key: d1aab4f05ae4fd7f4e4b8d9e5924469494ebb7897aed46cf2b0df0915410e0b0
7 | 
--------------------------------------------------------------------------------
/render.yaml:
--------------------------------------------------------------------------------
1 | previewsEnabled: true
2 | services:
3 |   # A Docker web service
4 |   - type: web
5 |     name: ia-boe
6 |     runtime: python
7 |     plan: free
8 |     autoDeploy: false
9 |     buildCommand: pip install -r requirements.txt
10 |     startCommand: uvicorn src.service.main:app --host 0.0.0.0 --port 10000 --workers 1 --timeout-keep-alive 125 --log-level info
11 | 
--------------------------------------------------------------------------------
/src/etls/boja/README.md:
--------------------------------------------------------------------------------
1 | 
2 | # Web principal
3 | 
4 | [Web principal del BOJA](https://www.juntadeandalucia.es/eboja.html)
5 | 
6 | 
7 | # Portal de búsqueda avanzada
8 | 
9 | [Portal de búsqueda avanzada](https://www.juntadeandalucia.es/eboja/index.html)
10 | 
11 | # Ejemplo de documentos scrapeados
12 | 
13 | [Doc1](https://www.juntadeandalucia.es/eboja/2024/9/index.html)
14 | [Doc2](https://www.juntadeandalucia.es/eboja/2024/39/index.html)
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.11-slim
2 | 
3 | RUN pip install --upgrade pip
4 | 
5 | WORKDIR /usr/app
6 | 
7 | COPY requirements.txt requirements.txt
8 | 
9 | RUN pip install -r requirements.txt
10 | 
11 | COPY . .
12 | 
13 | ENV APP_PATH="/usr/app"
14 | ENV PYTHONPATH "${PYTHONPATH}:${APP_PATH}"
15 | 
16 | CMD ["uvicorn", "src.service.main:APP", "--host", "0.0.0.0", "--port", "5000", "--workers", "2", "--timeout-keep-alive", "125", "--log-level", "info"]
17 | 
--------------------------------------------------------------------------------
/doc/cron_etl_daily_public.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | export APP_PATH="."
4 | export SENDGRID_API_KEY="" 5 | export OPENAI_API_KEY="" 6 | export TOKENIZERS_PARALLELISM=false 7 | export TAVILY_API_KEY="" 8 | export QDRANT_API_KEY="" 9 | export QDRANT_API_URL="" 10 | 11 | 12 | cd ia-boe/ 13 | source venv3.9/bin/activate 14 | pip install -r requirements.txt 15 | python -m src.etls.boe.load.daily 16 | -------------------------------------------------------------------------------- /src/etls/boja/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | def mes_a_numero(mes): 4 | meses = { 5 | "enero": 1, "febrero": 2, "marzo": 3, "abril": 4, 6 | "mayo": 5, "junio": 6, "julio": 7, "agosto": 8, 7 | "septiembre": 9, "octubre": 10, "noviembre": 11, "diciembre": 12 8 | } 9 | return meses.get(mes.lower(), 0) 10 | 11 | def clean_text(text: str) -> str: 12 | cleaned = re.sub(r"(\xa0|\t+|\n+)", " ", text, flags=re.MULTILINE) 13 | return cleaned -------------------------------------------------------------------------------- /src/etls/bopv/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Web principal 3 | 4 | [Web principal del BOPV](https://www.euskadi.eus/web01-bopv/es/bopv2/datos/Ultimo.shtml) 5 | 6 | 7 | # Portal de búsqueda avanzada 8 | 9 | [Portal de búsqueda avanzada](https://www.euskadi.eus/web01-bopv/es/p43aBOPVWebWar/buscarAvanzada.do?idioma=es&tipoBusqueda=2) 10 | 11 | # Ejemplo de documentos scrapeados 12 | 13 | [Doc1] (https://www.euskadi.eus/web01-bopv/es/bopv2/datos/2024/02/2400757a.shtml) 14 | [Doc2] (https://www.euskadi.eus/web01-bopv/es/bopv2/datos/2024/02/2400759a.shtml) -------------------------------------------------------------------------------- /src/etls/bopz/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Web principal 3 | 4 | [Web principal del BOPZ](http://bop.dpz.es/BOPZ/) 5 | 6 | # Normativa 7 | 8 | [Normativa del BOPZ](http://bop.dpz.es/BOPZ/portal/normativa.pdf) 9 | 10 | # Portal de búsqueda 11 | 12 | [Portal de búsqueda](https://gestiona.comunidad.madrid/wleg_pub/secure/busquedaAvanzada/buscador.jsf?id=1) 13 | 14 | # Ejemplo de documentos scrapeados 15 | 16 | [Doc1] (http://bop.dpz.es/BOPZ/obtenerContenidoEdicto.do?idEdicto=729066) 17 | [Doc2] (http://bop.dpz.es/BOPZ/obtenerContenidoEdicto.do?idEdicto=729162) -------------------------------------------------------------------------------- /benchmark/output/response_time_async_stats.csv: -------------------------------------------------------------------------------- 1 | Type,Name,Request Count,Failure Count,Median Response Time,Average Response Time,Min Response Time,Max Response Time,Average Content Size,Requests/s,Failures/s,50%,66%,75%,80%,90%,95%,98%,99%,99.9%,99.99%,100% 2 | GET,/aqa,18,0,5300.0,5349.600280277776,5259.814042,5824.726167,13780.0,0.1849910355610521,0.0,5300,5300,5400,5400,5500,5800,5800,5800,5800,5800,5800 3 | ,Aggregated,18,0,5300.0,5349.600280277776,5259.814042,5824.726167,13780.0,0.1849910355610521,0.0,5300,5300,5400,5400,5500,5800,5800,5800,5800,5800,5800 4 | -------------------------------------------------------------------------------- /benchmark/output/response_time_sync_stats.csv: -------------------------------------------------------------------------------- 1 | Type,Name,Request Count,Failure Count,Median Response Time,Average Response Time,Min Response Time,Max Response Time,Average Content Size,Requests/s,Failures/s,50%,66%,75%,80%,90%,95%,98%,99%,99.9%,99.99%,100% 2 | 
GET,/qa,18,0,5400.0,5396.147113222223,5283.173665999996,5591.686541000001,13780.0,0.18341122514186337,0.0,5400,5400,5400,5400,5500,5600,5600,5600,5600,5600,5600 3 | ,Aggregated,18,0,5400.0,5396.147113222223,5283.173665999996,5591.686541000001,13780.0,0.18341122514186337,0.0,5400,5400,5400,5400,5500,5600,5600,5600,5600,5600,5600 4 | -------------------------------------------------------------------------------- /benchmark/output/load_test_sync_stats.csv: -------------------------------------------------------------------------------- 1 | Type,Name,Request Count,Failure Count,Median Response Time,Average Response Time,Min Response Time,Max Response Time,Average Content Size,Requests/s,Failures/s,50%,66%,75%,80%,90%,95%,98%,99%,99.9%,99.99%,100% 2 | GET,/qa,110,0,80000.0,110124.43718445454,5342.831416,228389.20708300004,13780.0,0.36826503149218837,0.0,84000,155000,176000,193000,202000,214000,220000,224000,228000,228000,228000 3 | ,Aggregated,110,0,80000.0,110124.43718445454,5342.831416,228389.20708300004,13780.0,0.36826503149218837,0.0,84000,155000,176000,193000,202000,214000,220000,224000,228000,228000,228000 4 | -------------------------------------------------------------------------------- /evaluation/embeddings/eval.py: -------------------------------------------------------------------------------- 1 | from evaluation.embeddings.questions import QUERIES 2 | from src.initialize import initialize_app 3 | 4 | INIT_OBJECTS = initialize_app() 5 | 6 | 7 | success = 0 8 | for boe_id, question in QUERIES: 9 | docs = INIT_OBJECTS.vector_store.similarity_search_with_score( 10 | query=question, k=INIT_OBJECTS.config_loader["top_k_results"] 11 | ) 12 | for doc in docs: 13 | if doc[0].metadata["identificador"] == boe_id: 14 | success += 1 15 | # break 16 | 17 | 18 | print(f"Len queries: {len(QUERIES)}") 19 | print(f"Success answers: {success}") 20 | -------------------------------------------------------------------------------- /benchmark/output/load_test_async_stats.csv: -------------------------------------------------------------------------------- 1 | Type,Name,Request Count,Failure Count,Median Response Time,Average Response Time,Min Response Time,Max Response Time,Average Content Size,Requests/s,Failures/s,50%,66%,75%,80%,90%,95%,98%,99%,99.9%,99.99%,100% 2 | GET,/aqa,3525,49,11000.0,11458.114285116599,8.411416999990706,32071.805625000023,13588.644822695036,12.065553609404489,0.16771975230094183,11000,13000,14000,17000,21000,23000,26000,26000,32000,32000,32000 3 | ,Aggregated,3525,49,11000.0,11458.114285116599,8.411416999990706,32071.805625000023,13588.644822695036,12.065553609404489,0.16771975230094183,11000,13000,14000,17000,21000,23000,26000,26000,32000,32000,32000 4 | -------------------------------------------------------------------------------- /evaluation/embeddings/README.md: -------------------------------------------------------------------------------- 1 | We want to evaluate some parameters for the system: 2 | 3 | * chunk_size (600, 1200, 1800) 4 | * chunk_overlap (50, 100) 5 | * k number of chunks as context (4, 6, 8) 6 | * Search params 7 | * https://qdrant.tech/documentation/concepts/search/ 8 | * https://qdrant.tech/documentation/tutorials/optimize/ 9 | 10 | 11 | More info: https://www.anyscale.com/blog/a-comprehensive-guide-for-building-rag-based-llm-applications-part-1 12 | 13 | ********************************************* 14 | 15 | We load a subset (`defs.py`) of BOE documents into different Qdrant databases (tier-free) and run `eval.py` against them. 
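
A minimal sketch of how the sweep could be organised (assumptions: one Qdrant collection is loaded per chunking configuration, the `justicio_<chunk_size>_<overlap>` naming scheme is hypothetical, and `retrieve` wraps the same `similarity_search_with_score` call used in `eval.py`, returning the ranked BOE ids):

```
import itertools
import typing as tp

Retriever = tp.Callable[[str, int], tp.List[str]]  # (question, k) -> ranked BOE ids


def hit_rate(queries: tp.List[tp.Tuple[str, str]], retrieve: Retriever, k: int) -> float:
    """Fraction of (boe_id, question) pairs whose boe_id appears in the top-k chunks."""
    hits = sum(1 for boe_id, question in queries if boe_id in retrieve(question, k))
    return hits / len(queries)


def sweep(queries: tp.List[tp.Tuple[str, str]], retriever_for_collection) -> None:
    """retriever_for_collection: collection name -> Retriever bound to that collection."""
    for chunk_size, overlap, k in itertools.product((600, 1200, 1800), (50, 100), (4, 6, 8)):
        collection = f"justicio_{chunk_size}_{overlap}"  # hypothetical naming scheme
        score = hit_rate(queries, retriever_for_collection(collection), k)
        print(f"chunk_size={chunk_size} overlap={overlap} k={k} hit_rate={score:.3f}")
```

Unlike the current loop in `eval.py`, this counts at most one hit per question, so the score is directly comparable across configurations.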
16 | 17 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | PyYAML==6.0 2 | 3 | fastapi==0.103.2 4 | uvicorn==0.23.2 5 | 6 | requests==2.31.0 7 | beautifulsoup4==4.12.2 8 | lxml==4.9.2 9 | pydantic==2.4.2 10 | 11 | retry==0.9.2 12 | 13 | typer==0.9.0 14 | schedule==1.2.1 15 | 16 | langchain==0.2.1 17 | # langchainplus-sdk==0.0.20 18 | # langsmith==0.1.65 19 | langchain-openai==0.1.8 20 | langchain-core==0.2.3 21 | langchain-community==0.2.1 22 | langtrace_python_sdk==2.1.26 23 | 24 | qdrant-client==1.9.2 25 | 26 | sentence_transformers==2.2.2 27 | openai==1.30.5 28 | tavily-python==0.3.3 29 | 30 | sendgrid==6.10.0 31 | 32 | # Clean code tools 33 | black==23.9.1 34 | isort==5.12.0 35 | 36 | # Evaluation 37 | # ragas==0.1.0rc1 38 | -------------------------------------------------------------------------------- /src/etls/template/scrapper.py: -------------------------------------------------------------------------------- 1 | """ 2 | Define a class with some methods (download_day, download_document) to scrape the information. 3 | """ 4 | 5 | import typing as tp 6 | from datetime import date 7 | 8 | from src.etls.common.scrapper import BaseScrapper 9 | from src.etls.template.metadata import TemplateMetadataDocument 10 | from src.initialize import initialize_logging 11 | 12 | initialize_logging() 13 | 14 | 15 | class TemplateScrapper(BaseScrapper): 16 | def download_day(self, day: date) -> tp.List[TemplateMetadataDocument]: 17 | """ 18 | Define how to navigate between documents for a single day 19 | """ 20 | pass 21 | 22 | def download_document(self, url: str) -> TemplateMetadataDocument: 23 | """ 24 | Define how a single document is scrapped 25 | """ 26 | pass 27 | -------------------------------------------------------------------------------- /src/email/send_email.py: -------------------------------------------------------------------------------- 1 | import logging as lg 2 | import os 3 | 4 | from sendgrid import SendGridAPIClient 5 | from sendgrid.helpers.mail import Content, Email, Mail, To 6 | 7 | from src.initialize import initialize_logging 8 | 9 | initialize_logging() 10 | 11 | 12 | def send_email(config_loader, subject: str, content: str) -> None: 13 | logger = lg.getLogger(send_email.__name__) 14 | logger.info("Sending email") 15 | sg = SendGridAPIClient(api_key=os.environ.get("SENDGRID_API_KEY")) 16 | from_email = Email(config_loader["admin_email"]) 17 | to_email = To(config_loader["admin_email"]) 18 | content = Content("text/plain", content) 19 | mail = Mail(from_email, to_email, subject, content) 20 | response = sg.client.mail.send.post(request_body=mail.get()) 21 | logger.info("Sent email with status %s", response.status_code) 22 | -------------------------------------------------------------------------------- /research/fine-tuning-embedding-model/README.md: -------------------------------------------------------------------------------- 1 | # Fine-tuning bge-m3-es-legal 2 | 3 | ### Introduction 4 | 5 | Customize the embedding model (BAAI/bge-m3) for a specific domain (Legal) and language (Spanish). 6 | 7 | ### Steps 8 | 9 | 1. Create a dataset to fine-tuning 10 | 2. Fine-tuning the model using `BAAI/bge-m3` as baseline. 11 | 12 | ### Notes 13 | 14 | Run in Runpod with 1 x RTX A6000. About 90 seconds per epoch. So, about 6 epochs -> 10 minutes. 
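
A rough illustration of step 2 (a sketch only, not the exact training script used: the dataset column names `question` and `context` are assumptions, and a recent `sentence-transformers` + `datasets` install is expected):

```
from datasets import load_dataset
from sentence_transformers import InputExample, SentenceTransformer, losses
from torch.utils.data import DataLoader

# Assumed schema: one question and one positive passage per row.
dataset = load_dataset("dariolopez/justicio-rag-embedding-qa", split="train")
train_examples = [InputExample(texts=[row["question"], row["context"]]) for row in dataset]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

model = SentenceTransformer("BAAI/bge-m3")
# In-batch negatives: every other passage in the batch acts as a negative for a question.
train_loss = losses.MultipleNegativesRankingLoss(model)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=6,
    warmup_steps=100,
    output_path="bge-m3-es-legal",
)
```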
15 | 16 | ### Based on: 17 | 18 | - https://www.philschmid.de/sagemaker-train-deploy-embedding-models 19 | - https://github.com/virattt/financial-datasets/ 20 | - https://github.com/virattt/financial-datasets/blob/main/financial_datasets/prompts.py 21 | 22 | ### Results 23 | 24 | - Dataset: https://huggingface.co/datasets/dariolopez/justicio-rag-embedding-qa 25 | - Model: https://huggingface.co/datasets/dariolopez/bge-m3-es-legal 26 | -------------------------------------------------------------------------------- /src/etls/bocm/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Estructura del BOCM 3 | 4 | [Estructura del BOCM](https://www.bocm.es/estructura) 5 | 6 | # Normativa 7 | 8 | [Normativa del BOCM](https://www.bocm.es/normativa-reguladora) 9 | 10 | 11 | # Portal de Legislación 12 | 13 | [wleg_pub](https://gestiona.comunidad.madrid/wleg_pub/secure/busquedaAvanzada/buscador.jsf?id=1) 14 | 15 | # Clasificación de campos que contiene cada sección del BOCM 16 | 17 | 18 | SECCIÓN | SUBSECCIÓN | APARTADO | TIPO | ANUNCIANTE | RANGO | ORGANO 19 | :---:|:---:|:---:|:---:|:---:|:---:|:---: 20 | 1 | x | | x | | x | x 21 | 2 | x | | | | | | 22 | 3 | x| x | | | | | 23 | 4 | x| | | | | | 24 | 5 | | | | x | | | 25 | 26 | > NOTA ACLARATORIA: (x = Tiene ese campo) 27 | 28 | # IDEAS para la mejora de los metadatos del scrapeo 29 | 30 | - Añadir una sección de análisis en los metadatos, cruzando info con el portal de legislación 31 | -------------------------------------------------------------------------------- /src/etls/bopv/metadata.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import Optional 3 | 4 | from src.etls.common.metadata import MetadataDocument 5 | 6 | 7 | class BOPVMetadataDocument(MetadataDocument): 8 | """Class for keeping metadata of a BOPV Document scrapped.""" 9 | 10 | # Text 11 | filepath: str 12 | 13 | # Source 14 | source_name: str = "BOPV" 15 | source_type: str = "Boletin" 16 | 17 | # Metadatos 18 | identificador: str 19 | departamento: Optional[str] = None 20 | tipologia: str 21 | 22 | # Links 23 | titulo: Optional[str] = None 24 | url_pdf: str # pdf_link 25 | url_html: Optional[str] = None 26 | url_boletin: Optional[str] = None 27 | 28 | fecha_disposicion: str = "" 29 | anio: Optional[str] = None 30 | mes: Optional[str] = None 31 | dia: Optional[str] = None 32 | 33 | datetime_insert: str = datetime.utcnow().isoformat() 34 | 35 | -------------------------------------------------------------------------------- /src/etls/boja/metadata.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import Optional 3 | 4 | from src.etls.common.metadata import MetadataDocument 5 | 6 | 7 | 8 | 9 | class BOJAMetadataDocument(MetadataDocument): 10 | """Class for keeping metadata of a BOJA Document scrapped.""" 11 | 12 | # Text 13 | filepath: str 14 | 15 | # Source 16 | source_name: str = "BOJA" 17 | source_type: str = "Boletin" 18 | 19 | # Metadatos 20 | identificador: str 21 | departamento: str 22 | tipologia: str 23 | 24 | # Links 25 | titulo: Optional[str] = None 26 | url_pdf: str # pdf_link 27 | url_html: Optional[str] = None 28 | url_boletin: Optional[str] = None 29 | 30 | fecha_disposicion: Optional[str] = None 31 | anio: Optional[str] = None 32 | mes: Optional[str] = None 33 | dia: Optional[str] = None 34 | 35 | datetime_insert: str = datetime.utcnow().isoformat() 36 | 
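# NOTE: the default above is evaluated once, at class-definition time, so every
# document created by a long-running process shares the same timestamp. Use
# pydantic.Field(default_factory=lambda: datetime.utcnow().isoformat()) if a
# per-instance insert time is needed.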
37 | -------------------------------------------------------------------------------- /src/etls/bopz/utils.py: -------------------------------------------------------------------------------- 1 | # POST request data to filter retrieved BOPZ documents 2 | 3 | DATA_POST = { 4 | "numPag": "", 5 | "idProcedente": "8610", 6 | "idPortador": "", 7 | "hProcedente": " AYUNTAMIENTO DE ZARAGOZA", 8 | "hPortador": "", 9 | "idPagadora": "8610", 10 | "hPagadora": " AYUNTAMIENTO DE ZARAGOZA", 11 | "primeraVez": "N", 12 | "ficheroDoc": "N", 13 | "numRegistroInf": "", 14 | "numRegistroSup": "", 15 | "esPortalSN": "S", 16 | "numBoletinInf": "", 17 | "numRegistroNumInf": "", 18 | "numRegistroAnyoInf": "", 19 | "numRegistroNumSup": "", 20 | "numRegistroAnyoSup": "", 21 | "numBoletinAuxInf": "", 22 | "anyoBoletinInf": "", 23 | "numBoletinSup": "", 24 | "anyoBoletinSup": "", 25 | "fechaPubInf": "", 26 | "fechaPubSup": "", 27 | "procedente": " AYUNTAMIENTO DE ZARAGOZA", 28 | "tematica": "", 29 | "titulo": "", 30 | "contenido": "", 31 | } 32 | -------------------------------------------------------------------------------- /doc/supabase/query.sql: -------------------------------------------------------------------------------- 1 | -- Query the number of vectors 2 | select count(*) from documents; 3 | 4 | -- Query number of vectors by day 5 | -- TODO: add sundays 6 | select 7 | count(*) as counted_vector, 8 | metadata->>'fecha_publicacion' as date 9 | from documents 10 | group by date 11 | order by date 12 | 13 | -- Query max(day) 14 | select max(metadata->>'fecha_publicacion') as date from documents; 15 | 16 | -- Query min(day) 17 | select min(metadata->>'fecha_publicacion') as date from documents; 18 | 19 | -- Query metadata by identificador 20 | SELECT count(*) FROM documents WHERE metadata @> '{"identificador": "BOE-A-2023-38"}'; 21 | 22 | -- Query metadata by fecha_publicacion 23 | SELECT count(*) FROM documents WHERE metadata @> '{"fecha_publicacion": "2023-01-01"}'; 24 | 25 | -- TODO: Query to detect the duplicated (text/embeddings) 26 | 27 | 28 | -- metadata: https://medium.com/hackernoon/how-to-query-jsonb-beginner-sheet-cheat-4da3aa5082a3 29 | -------------------------------------------------------------------------------- /doc/crontab_e.sh: -------------------------------------------------------------------------------- 1 | # To automatize the daily run: 2 | # 1. Rename cron_etl_daily_public.sh to cron_etl_daily.sh 3 | # mv cron_etl_daily_public.sh cron_etl_daily.sh 4 | # 2. Fill the api keys 5 | # 3. Provide permissions to file: 6 | # chmod +x cron_etl_daily.sh 7 | # 4. 
Copy and paste this file on Crontab 8 | # crontab -e 9 | # Note: Using Ubuntu 22.04 as host probably you find errors like: requests.exceptions.SSLError: HTTPSConnectionPool(host='www.boe.es', port=443): Max retries exceeded with url: /boe/dias/2023/08/02 (Caused by SSLError(SSLError(1, '[SSL: UNSAFE_LEGACY_RENEGOTIATION_DISABLED] unsafe legacy renegotiation disabled (_ssl.c:1129)'))) 10 | # Solution (Vulnerable to the Man-in-the-Middle): https://stackoverflow.com/questions/71603314/ssl-error-unsafe-legacy-renegotiation-disabled 11 | 12 | 13 | SHELL=/usr/bin/bash 14 | CRON_TZ=UTC 15 | PROJECT_DIR=/home/ubuntu/ia-boe 16 | 17 | 20 07 * * * $PROJECT_DIR/cron_etl_daily.sh >> $PROJECT_DIR/logs/ingest_cron.out 2>> $PROJECT_DIR/logs/ingest_cron.err 18 | -------------------------------------------------------------------------------- /benchmark/benchmark.py: -------------------------------------------------------------------------------- 1 | import locust 2 | 3 | 4 | class ApiUser(locust.HttpUser): 5 | wait_time = locust.constant_pacing(1) 6 | input_data = ( 7 | "¿Es de aplicación la ley de garantía integral de la libertad sexual a niños (varones) menores de edad " 8 | "víctimas de violencias sexuales o solo a niñas y mujeres?" 9 | ) 10 | 11 | def health_check(self): 12 | self.client.get("/healthcheck") 13 | 14 | def endpoint404(self): 15 | with self.client.get( 16 | "/url_does_not_exist", catch_response=True 17 | ) as response: 18 | if response.status_code == 404: 19 | response.success() 20 | 21 | 22 | class ApiAsyncUser(ApiUser): 23 | 24 | url = "/aqa" 25 | 26 | @locust.task 27 | def aqa(self): 28 | self.client.get(self.url, json=self.input_data) 29 | 30 | 31 | class ApiSyncUser(ApiUser): 32 | 33 | url = "/qa" 34 | 35 | @locust.task 36 | def qa(self): 37 | self.client.get(self.url, json=self.input_data) 38 | -------------------------------------------------------------------------------- /src/etls/jobs.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import schedule 4 | 5 | from src.etls.boe.load import today as boe_today 6 | from src.etls.bopz.load import today as bopz_today 7 | from src.etls.bocm.load import today as bocm_today 8 | from src.etls.bopv.load import today as bopv_today 9 | from src.etls.boja.load import today as boja_today 10 | from src.etls.boa.load import today as boa_today 11 | from src.initialize import initialize_app 12 | 13 | 14 | INIT_OBJECTS = initialize_app() 15 | 16 | 17 | schedule.every().day.at("11:00").do(boe_today, init_objects=INIT_OBJECTS) 18 | schedule.every().day.at("11:05").do(bopz_today, init_objects=INIT_OBJECTS) 19 | schedule.every().day.at("11:10").do(bocm_today, init_objects=INIT_OBJECTS) 20 | schedule.every().day.at("11:15").do(bopv_today, init_objects=INIT_OBJECTS) 21 | schedule.every().day.at("11:20").do(boja_today, init_objects=INIT_OBJECTS) 22 | schedule.every().day.at("11:25").do(boa_today, init_objects=INIT_OBJECTS) 23 | # TODO: monthly jobs 24 | 25 | while True: 26 | schedule.run_pending() 27 | time.sleep(1) 28 | -------------------------------------------------------------------------------- /src/etls/boa/metadata.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import Optional 3 | 4 | from src.etls.common.metadata import MetadataDocument 5 | 6 | 7 | class BOAMetadataDocument(MetadataDocument): 8 | """Class for keeping metadata of a BOA Document scrapped.""" 9 | 10 | # Text 11 | filepath: str 12 | 13 | # 
Source 14 | source_name: str = "BOA" 15 | source_type: str = "Boletin" 16 | 17 | # Metadatos 18 | numero_boletin: str 19 | identificador: str # DOCN 20 | departamento: Optional[str] = None 21 | seccion: Optional[str] = None 22 | subseccion: Optional[str] = None 23 | rango: Optional[str] = None 24 | codigo_materia: Optional[str] = None 25 | 26 | # Links 27 | titulo: Optional[str] = None 28 | url_pdf: str 29 | url_boletin: Optional[str] = None 30 | 31 | fecha_disposicion: str = "" 32 | fecha_publicacion: str = "" 33 | anio: Optional[str] = None 34 | mes: Optional[str] = None 35 | dia: Optional[str] = None 36 | 37 | datetime_insert: str = datetime.utcnow().isoformat() 38 | 39 | -------------------------------------------------------------------------------- /src/etls/bopz/metadata.py: -------------------------------------------------------------------------------- 1 | import typing as tp 2 | from datetime import datetime 3 | 4 | from pydantic import field_validator 5 | 6 | from src.etls.common.metadata import MetadataDocument 7 | 8 | 9 | class BOPZMetadataDocument(MetadataDocument): 10 | """Class for keeping metadata of a BOPZ Document scrapped.""" 11 | 12 | # Text 13 | filepath: str 14 | 15 | # Source 16 | source_name: str = "BOPZ" 17 | source_type: str = "Boletin" 18 | 19 | # Metadatos 20 | identificador: str 21 | numero_oficial: str = "" 22 | departamento: str 23 | titulo: str = "" 24 | url_pdf: str 25 | url_html: str 26 | fecha_publicacion: str 27 | fecha_disposicion: str = "" 28 | anio: str 29 | mes: str 30 | dia: str 31 | 32 | # Analisis 33 | materia: tp.List[str] 34 | 35 | datetime_insert: str = datetime.utcnow().isoformat() 36 | 37 | @field_validator("fecha_publicacion", "fecha_disposicion") 38 | @classmethod 39 | def isoformat(cls, v): 40 | if v: 41 | datetime.strptime(v, "%Y-%m-%d") 42 | return v 43 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Darío López Padial 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /src/etls/utils.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | import requests 4 | from requests.adapters import HTTPAdapter 5 | from urllib3.util.retry import Retry 6 | import schedule 7 | 8 | 9 | def catch_exceptions(cancel_on_failure=False): 10 | def catch_exceptions_decorator(job_func): 11 | @functools.wraps(job_func) 12 | def wrapper(*args, **kwargs): 13 | try: 14 | return job_func(*args, **kwargs) 15 | except: 16 | import traceback 17 | print(traceback.format_exc()) 18 | if cancel_on_failure: 19 | return schedule.CancelJob 20 | return wrapper 21 | return catch_exceptions_decorator 22 | 23 | 24 | def create_retry_session(retries, backoff_factor=1, status_forcelist=[500, 502, 503, 504]): 25 | session = requests.Session() 26 | retry = Retry( 27 | total=retries, 28 | read=retries, 29 | connect=retries, 30 | backoff_factor=backoff_factor, 31 | status_forcelist=status_forcelist, 32 | ) 33 | adapter = HTTPAdapter(max_retries=retry) 34 | session.mount('http://', adapter) 35 | session.mount('https://', adapter) 36 | return session 37 | -------------------------------------------------------------------------------- /src/etls/common/scrapper.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import logging as lg 3 | import typing as tp 4 | from abc import ABC, abstractmethod 5 | from datetime import date, timedelta 6 | 7 | from src.etls.common.metadata import MetadataDocument 8 | from src.initialize import initialize_logging 9 | 10 | initialize_logging() 11 | 12 | 13 | class BaseScrapper(ABC): 14 | def download_days(self, date_start: date, date_end: date) -> tp.List[MetadataDocument]: 15 | """Download all the documents between two dates (from date_start to date_end)""" 16 | logger = lg.getLogger(self.download_days.__name__) 17 | logger.info("Downloading content from day %s to %s", date_start, date_end) 18 | delta = timedelta(days=1) 19 | docs = [] 20 | date_start_aux = copy.copy(date_start) 21 | while date_start_aux <= date_end: 22 | docs += self.download_day(date_start_aux) 23 | date_start_aux += delta 24 | logger.info("Downloaded content from day %s to %s", date_start, date_end) 25 | return docs 26 | 27 | @abstractmethod 28 | def download_day(self, day: date) -> tp.List[MetadataDocument]: 29 | """Download all the documents for a specific date.""" 30 | pass 31 | 32 | @abstractmethod 33 | def download_document(self, url: str) -> MetadataDocument: 34 | """Get text and metadata from url document.""" 35 | pass 36 | -------------------------------------------------------------------------------- /doc/supabase/starting.md: -------------------------------------------------------------------------------- 1 | # 1. 
Create table and function 2 | 3 | -- Enable the pgvector extension to work with embedding vectors 4 | create extension vector; 5 | 6 | -- Create a table to store your documents 7 | create table documents ( 8 | id bigserial primary key, 9 | content text, -- corresponds to Document.pageContent 10 | metadata jsonb, -- corresponds to Document.metadata 11 | embedding vector(768) -- 768 works for OpenAI embeddings, change if needed 12 | ); 13 | 14 | -- Create a function to do queries 15 | CREATE FUNCTION match_documents(query_embedding vector(768), match_count int) 16 | RETURNS TABLE( 17 | id text, 18 | content text, 19 | metadata jsonb, 20 | -- we return matched vectors to enable maximal marginal relevance searches 21 | embedding vector(768), 22 | similarity float) 23 | LANGUAGE plpgsql 24 | AS $$ 25 | # variable_conflict use_column 26 | BEGIN 27 | RETURN query 28 | SELECT 29 | id, 30 | content, 31 | metadata, 32 | embedding, 33 | 1 -(documents.embedding <=> query_embedding) AS similarity 34 | FROM 35 | documents 36 | ORDER BY 37 | documents.embedding <=> query_embedding 38 | LIMIT match_count; 39 | END; 40 | $$; 41 | 42 | 43 | # 2. Edit the type of `id` column from `documents` table from int8 to text. 44 | 45 | Using the Supabase UI 46 | 47 | -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | import logging as lg 2 | import time 3 | import typing as tp 4 | from functools import wraps 5 | 6 | from langchain.schema import Document 7 | from langchain.vectorstores import SupabaseVectorStore 8 | from pydantic import BaseModel 9 | from fastapi import Request 10 | from opentelemetry import baggage, context 11 | from langtrace_python_sdk.constants.instrumentation.common import ( 12 | LANGTRACE_ADDITIONAL_SPAN_ATTRIBUTES_KEY, 13 | ) 14 | 15 | 16 | class QAResponsePayloadModel(BaseModel): 17 | scoring_id: str 18 | context: tp.List[tp.Tuple[Document, float]] 19 | answer: str 20 | 21 | 22 | def timeit(func): 23 | @wraps(func) 24 | async def wrapper(*args, **kwargs): 25 | logger = lg.getLogger(func.__name__) 26 | logger.info("<<< Starting >>>") 27 | start_time = time.time() 28 | result = await func(*args, **kwargs) 29 | end_time = time.time() 30 | delta = end_time - start_time 31 | msg = f"{delta:2.2f}s" if delta > 1 else f"{1000 * delta:2.1f}ms" 32 | logger.info("<<< Completed >>> in %s", msg) 33 | return result 34 | 35 | return wrapper 36 | 37 | 38 | async def inject_additional_attributes(fn, attributes=None): 39 | if attributes: 40 | new_ctx = baggage.set_baggage( 41 | LANGTRACE_ADDITIONAL_SPAN_ATTRIBUTES_KEY, attributes 42 | ) 43 | context.attach(new_ctx) 44 | 45 | return await fn() 46 | -------------------------------------------------------------------------------- /evaluation/embeddings/defs.py: -------------------------------------------------------------------------------- 1 | boe_ids = [ 2 | "BOE-A-2023-11022", 3 | "BOE-A-2023-11077", 4 | "BOE-A-2023-11560", 5 | "BOE-A-2023-11959", 6 | "BOE-A-2023-12667", 7 | "BOE-A-2023-13811", 8 | "BOE-A-2023-14427", 9 | "BOE-A-2023-14713", 10 | "BOE-A-2023-14733", 11 | "BOE-A-2023-1776", 12 | "BOE-A-2023-2098", 13 | "BOE-A-2023-2980", 14 | "BOE-A-2023-3297", 15 | "BOE-A-2023-3346", 16 | "BOE-A-2023-3511", 17 | "BOE-A-2023-353", 18 | "BOE-A-2023-3847", 19 | "BOE-A-2023-4120", 20 | "BOE-A-2023-4324", 21 | "BOE-A-2023-4385", 22 | "BOE-A-2023-4514", 23 | "BOE-A-2023-4952", 24 | "BOE-A-2023-4994", 25 | "BOE-A-2023-5091", 26 | 
"BOE-A-2023-5093", 27 | "BOE-A-2023-5367", 28 | "BOE-A-2023-545", 29 | "BOE-A-2023-5452", 30 | "BOE-A-2023-5482", 31 | "BOE-A-2023-5582", 32 | "BOE-A-2023-5704", 33 | "BOE-A-2023-5961", 34 | "BOE-A-2023-6382", 35 | "BOE-A-2023-6721", 36 | "BOE-A-2023-7053", 37 | "BOE-A-2023-7343", 38 | "BOE-A-2023-7355", 39 | "BOE-A-2023-755", 40 | "BOE-A-2023-8110", 41 | "BOE-A-2023-8164", 42 | "BOE-A-2023-8315", 43 | "BOE-A-2023-8318", 44 | "BOE-A-2023-9030", 45 | "BOE-A-2023-9069", 46 | "BOE-A-2023-9428", 47 | "BOE-A-2023-9429", 48 | "BOE-A-2023-9719", 49 | "BOE-A-2023-9827", 50 | "BOE-A-2022-14630", # garantía integral de la libertad sexual 51 | "BOE-A-2023-12203", # derecho a la vivienda 52 | "BOE-A-2023-16889", # Sistema de Formación Profesional 53 | ] 54 | -------------------------------------------------------------------------------- /src/etls/bocm/metadata.py: -------------------------------------------------------------------------------- 1 | import typing as tp 2 | from datetime import datetime 3 | 4 | from pydantic import BaseModel, field_validator, Field 5 | import re 6 | 7 | from src.etls.common.metadata import MetadataDocument 8 | 9 | 10 | # REGEX 11 | CVE_REGEX = r"^BOCM-\d{8}-\d{1,3}$" # TODO: regex demasiado laxa 12 | 13 | 14 | class BOCMMetadataDocument(MetadataDocument): 15 | """Class for keeping metadata of a BOCM Document scrapped.""" 16 | 17 | # Text 18 | filepath: str 19 | 20 | # Source 21 | source_name: str = "BOCM" 22 | source_type: str = "Boletin" 23 | 24 | # Metadatos 25 | identificador: str = Field(pattern=CVE_REGEX, examples=["BOCM-20240129-24"]) 26 | numero_oficial: str = "" # Número de boletín 27 | paginas: str 28 | departamento: str # órgano (excepto sección 4, que no tiene) 29 | 30 | seccion_normalizada: str 31 | seccion: str 32 | subseccion: str 33 | tipo: str = "" 34 | apartado: str = "" 35 | rango: str = "" 36 | 37 | # Links 38 | titulo: str # title 39 | url_pdf: str # pdf_link 40 | url_html: str # html_link 41 | 42 | fecha_publicacion: str 43 | fecha_disposicion: str = "" 44 | anio: str 45 | mes: str 46 | dia: str 47 | 48 | datetime_insert: str = datetime.utcnow().isoformat() 49 | 50 | @field_validator("fecha_publicacion", "fecha_disposicion") 51 | @classmethod 52 | def isoformat(cls, v): 53 | if v: 54 | return datetime.strptime(v, "%Y-%m-%d").strftime("%Y-%m-%d") 55 | return v 56 | -------------------------------------------------------------------------------- /config/qlora.yaml: -------------------------------------------------------------------------------- 1 | # RTX 3090 on Runpod - approx 7 hours 2 | 3 | base_model: NousResearch/Llama-2-7b-chat-hf 4 | base_model_config: NousResearch/Llama-2-7b-chat-hf 5 | model_type: LlamaForCausalLM 6 | tokenizer_type: LlamaTokenizer 7 | is_llama_derived_model: true 8 | hub_model_id: Llama-2-databricks-dolly-oasst1-es-axolotl 9 | 10 | load_in_8bit: false 11 | load_in_4bit: true 12 | strict: false 13 | 14 | datasets: 15 | - path: dariolopez/Llama-2-databricks-dolly-oasst1-es 16 | type: completion 17 | dataset_prepared_path: last_run_prepared 18 | val_set_size: 0.01 19 | output_dir: ./qlora-out 20 | 21 | adapter: qlora 22 | lora_model_dir: 23 | 24 | sequence_len: 2048 # 4096 25 | sample_packing: true 26 | 27 | lora_r: 32 28 | lora_alpha: 16 29 | lora_dropout: 0.05 30 | lora_target_modules: 31 | lora_target_linear: true 32 | lora_fan_in_fan_out: 33 | 34 | wandb_project: axolotl 35 | wandb_entity: 36 | wandb_watch: 37 | wandb_run_id: 38 | wandb_log_model: 39 | 40 | 41 | gradient_accumulation_steps: 4 42 | micro_batch_size: 2 43 | 
num_epochs: 3 44 | optimizer: paged_adamw_32bit 45 | lr_scheduler: cosine 46 | learning_rate: 0.0002 47 | 48 | train_on_inputs: false 49 | group_by_length: false 50 | bf16: true 51 | fp16: false 52 | tf32: false 53 | 54 | gradient_checkpointing: true 55 | early_stopping_patience: 56 | resume_from_checkpoint: 57 | local_rank: 58 | logging_steps: 1 59 | xformers_attention: 60 | flash_attention: false 61 | 62 | warmup_steps: 100 63 | eval_steps: 0.01 64 | save_steps: 65 | debug: 66 | deepspeed: 67 | weight_decay: 0.0 68 | fsdp: 69 | fsdp_config: 70 | special_tokens: 71 | bos_token: "" 72 | eos_token: "" 73 | unk_token: "" 74 | -------------------------------------------------------------------------------- /doc/qdrant/queries.json: -------------------------------------------------------------------------------- 1 | // List all collections 2 | GET collections 3 | 4 | // Get collection info 5 | GET collections/justicio 6 | 7 | // List points in a collection, using filter by metadata 8 | POST collections/justicio/points/scroll 9 | { 10 | "limit": 40000, 11 | "filter": { 12 | "must": [ 13 | { 14 | "key": "metadata.anio", 15 | "match": { 16 | "value": "2018" 17 | } 18 | } 19 | ] 20 | } 21 | } 22 | 23 | // Count points in a collection, using filter by metadata 24 | POST collections/justicio/points/count 25 | { 26 | "filter": { 27 | "must": [ 28 | { 29 | "key": "metadata.anio", 30 | "match": { 31 | "value": "2018" 32 | } 33 | } 34 | ] 35 | } 36 | } 37 | 38 | // Count points in a collection, using filter by multiple metadata 39 | POST collections/justicio/points/count 40 | { 41 | "filter": { 42 | "must": [ 43 | { 44 | "key": "metadata.anio", 45 | "match": { 46 | "value": "2024" 47 | } 48 | }, 49 | { 50 | "key": "metadata.mes", 51 | "match": { 52 | "value": "02" 53 | } 54 | }, 55 | { 56 | "key": "metadata.dia", 57 | "match": { 58 | "value": "20" 59 | } 60 | } 61 | ] 62 | } 63 | } 64 | 65 | // Delete points in a collection, using filter by metadata 66 | POST collections/justicio/points/delete 67 | { 68 | "filter": { 69 | "must": [ 70 | { 71 | "key": "metadata.anio", 72 | "match": { 73 | "value": "2018" 74 | } 75 | } 76 | ] 77 | } 78 | } 79 | 80 | // https://qdrant.tech/documentation/concepts/filtering/ 81 | // https://qdrant.tech/documentation/concepts/points/ 82 | -------------------------------------------------------------------------------- /src/etls/boe/metadata.py: -------------------------------------------------------------------------------- 1 | import typing as tp 2 | from datetime import datetime 3 | 4 | from pydantic import BaseModel, field_validator 5 | 6 | from src.etls.common.metadata import MetadataDocument 7 | 8 | 9 | class BOEMetadataReferencia(BaseModel): 10 | id: str 11 | palabra: str 12 | texto: str 13 | 14 | 15 | class BOEMetadataDocument(MetadataDocument): 16 | """Class for keeping metadata of a BOE Document scrapped.""" 17 | 18 | # Text 19 | filepath: str 20 | 21 | # Source 22 | source_name: str = "BOE" 23 | source_type: str = "Boletin" 24 | 25 | # Metadatos 26 | identificador: str 27 | diario: str 28 | numero_oficial: str = "" 29 | departamento: str 30 | rango: str = "" 31 | titulo: str 32 | url_pdf: str 33 | origen_legislativo: str = "" 34 | fecha_publicacion: str 35 | fecha_disposicion: str = "" 36 | anio: str 37 | mes: str 38 | dia: str 39 | 40 | # Analisis 41 | observaciones: str = "" 42 | ambito_geografico: str = "" 43 | modalidad: str = "" 44 | tipo: str = "" 45 | materias: tp.List[str] 46 | alertas: tp.List[str] 47 | notas: tp.List[str] 48 | ref_posteriores: 
tp.List[BOEMetadataReferencia] 49 | ref_anteriores: tp.List[BOEMetadataReferencia] 50 | 51 | datetime_insert: str = datetime.utcnow().isoformat() 52 | 53 | @field_validator("ref_posteriores") 54 | @classmethod 55 | def ref_posteriores_to_json(cls, validators): 56 | return [v.json() for v in validators] 57 | 58 | @field_validator("ref_anteriores") 59 | @classmethod 60 | def ref_anteriores_to_json(cls, validators): 61 | return [v.json() for v in validators] 62 | 63 | @field_validator("fecha_publicacion", "fecha_disposicion") 64 | @classmethod 65 | def isoformat(cls, v): 66 | if v: 67 | return datetime.strptime(v, "%Y%m%d").strftime("%Y-%m-%d") 68 | return v 69 | -------------------------------------------------------------------------------- /src/etls/template/README.md: -------------------------------------------------------------------------------- 1 | This is a template module to load gazettes (e.g.: BOE) and/or single documents (e.g.: sentencias judiciales) into the embedding database. 2 | 3 | # Gazettes 4 | 5 | A gazette in this project has some requirements: 6 | 7 | * A gazette is divided into days. 8 | * Each day has many documents. 9 | 10 | To define an ETL for your gazette, you need to fill some files: 11 | 12 | 1. `metadata.py` Define the metadata to be stored in the embedding database. 13 | 2. `scrapper.py` Define a class with some methods to scrape the information. 14 | 3. `load.py` You can define the different scripts to load the data. 15 | 16 | ### Batch/Historical Load 17 | 18 | If you want to do a batch/historical load: 19 | 20 | ```sh 21 | python -m src.etls.template.load dates 22 | ``` 23 | 24 | Note: You should update the end/start dates in the `config/config.py' file. 25 | 26 | ### Daily (today) load 27 | 28 | Most likely, your Gazette will be updated every day, so you will need to run a daily ETL script. Take a look at src.etls.template.load.py for inspiration. 29 | 30 | ```sh 31 | python -m src.etls.template.load today 32 | ``` 33 | 34 | You will probably also want to schedule a daily job to update your embedding database. Then take a look at `src/etls/template/schedule.py`. 35 | 36 | **Note:** For a complete example of a gazette configuration, you can take a look at the BOE `src/etls/boe`. 37 | 38 | # Documents 39 | 40 | If you want to load a single document into the embedding database. 41 | 42 | ...In progress... 43 | 44 | 45 | # Want to develop your own module? 46 | 47 | You are welcome! Please contact us to discuss your requirements: 48 | 49 | * [Darío López](https://www.linkedin.com/in/dar%C3%ADo-l%C3%B3pez-padial-45269150/) 50 | * [Alex Dantart](https://www.linkedin.com/in/dantart/) 51 | * [Jorge Iliarte](https://www.linkedin.com/in/jorge-iliarte-llop/) 52 | * [Jorge Barrachina](https://www.linkedin.com/in/jorgebarrachina/) 53 | -------------------------------------------------------------------------------- /doc/deployment_guide.md: -------------------------------------------------------------------------------- 1 | # How to deploy the service in local 2 | 3 | ## 1. Prepare your vector database in local 4 | 5 | At this moment, we are working with Qdrant as vector database. 
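
Once the container from the commands below is running and the environment variables from step 2 are exported, connectivity can be checked with a few lines of Python (a minimal sketch using the `qdrant-client` package pinned in `requirements.txt`):

```
import os

from qdrant_client import QdrantClient

client = QdrantClient(
    url=os.environ.get("QDRANT_API_URL", "http://localhost:6333"),
    api_key=os.environ["QDRANT_API_KEY"],
)
# Lists the collections created by the ETL (empty right after the first start).
print(client.get_collections())
```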
6 | 7 | Official doc: https://qdrant.tech/documentation/quick-start/ 8 | 9 | ### Download the latest Qdrant image from Dockerhub: 10 | 11 | ``` 12 | docker pull qdrant/qdrant 13 | ``` 14 | 15 | ### Run the service: 16 | 17 | ``` 18 | docker run -p 6333:6333 -p 6334:6334 -v $(pwd)/config/example_qdrant_local.yaml:/qdrant/config/production.yaml -v $(pwd)/qdrant_storage:/qdrant/storage:z qdrant/qdrant 19 | ``` 20 | 21 | * REST API: localhost:6333 22 | * Web UI: localhost:6333/dashboard 23 | 24 | ## 2. Prepare Justicio 25 | 26 | ### Clone the code: 27 | 28 | ``` 29 | git clone git@github.com:bukosabino/justicio.git 30 | ``` 31 | 32 | ### Install the requirements: 33 | 34 | ``` 35 | sudo apt install python3-virtualenv 36 | virtualenv -p python3 venv3.10 37 | source venv3.10/bin/activate 38 | pip install -r requirements.txt 39 | ``` 40 | 41 | ### Export environment variables: 42 | 43 | Note: You need to get an API key for OpenAI and another for Sendgrid. 44 | 45 | ``` 46 | export APP_PATH="." 47 | export SENDGRID_API_KEY= 48 | export OPENAI_API_KEY= 49 | export TOKENIZERS_PARALLELISM=false 50 | export TAVILY_API_KEY="" 51 | export QDRANT_API_KEY="823e071f67c198cc05c73f8bd4580865e6a8819a1f3fe57d2cd49b5c892a5233" 52 | export QDRANT_API_URL="http://localhost:6333" 53 | ``` 54 | 55 | ### Add some vector to the vector database 56 | 57 | Load BOE documents into your vector database (depending on the selected data, may take a few minutes). 58 | 59 | ``` 60 | python -m src.etls.boe.load dates 2024/01/01 2024/01/07 61 | ``` 62 | 63 | ## 3. Run Justicio in local 64 | 65 | ``` 66 | uvicorn src.service.main:APP --host=0.0.0.0 --port=5001 --workers=1 --timeout-keep-alive=125 --log-level=info 67 | ``` 68 | 69 | In the browser 70 | 71 | ``` 72 | http://:5001/docs 73 | ``` 74 | -------------------------------------------------------------------------------- /src/etls/template/load.py: -------------------------------------------------------------------------------- 1 | from datetime import date, datetime 2 | 3 | import typer 4 | 5 | from src.email.send_email import send_email 6 | from src.etls.template.scrapper import TemplateScrapper 7 | from src.etls.common.etl import ETL 8 | from src.etls.template.defs import COLLECTION_NAME 9 | from src.initialize import initialize_app 10 | 11 | 12 | app = typer.Typer() 13 | 14 | 15 | @app.command() 16 | def today(init_objects=None): 17 | if init_objects is None: 18 | init_objects = initialize_app() 19 | etl_job = ETL(config_loader=init_objects.config_loader, vector_store=init_objects.vector_store[COLLECTION_NAME]) 20 | boe_scrapper = TemplateScrapper() 21 | day = date.today() 22 | docs = boe_scrapper.download_day(day) 23 | if docs: 24 | etl_job.run(docs) 25 | 26 | subject = "Today ETL executed" 27 | content = f""" 28 | Today ETL executed 29 | - Date: {day} 30 | - Documents loaded: {len(docs)} 31 | - Database used: {init_objects.config_loader['vector_store']} 32 | """ 33 | send_email(init_objects.config_loader, subject, content) 34 | 35 | 36 | @app.command() 37 | def dates(date_start: str, date_end: str, init_objects=None): 38 | if init_objects is None: 39 | init_objects = initialize_app() 40 | etl_job = ETL(config_loader=init_objects.config_loader, vector_store=init_objects.vector_store[COLLECTION_NAME]) 41 | scrapper = TemplateScrapper() 42 | docs = scrapper.download_days( 43 | date_start=datetime.strptime(date_start, "%Y/%m/%d").date(), 44 | date_end=datetime.strptime(date_end, "%Y/%m/%d").date(), 45 | ) 46 | if docs: 47 | etl_job.run(docs) 48 | 49 | subject = "Load ETL 
executed" 50 | content = f""" 51 | Load ETL executed 52 | - Date start: {date_start} 53 | - Date end: {date_end} 54 | - Documents loaded: {len(docs)} 55 | - Database used: {init_objects.config_loader['vector_store']} 56 | """ 57 | send_email(init_objects.config_loader, subject, content) 58 | 59 | 60 | if __name__ == "__main__": 61 | app() 62 | -------------------------------------------------------------------------------- /benchmark/README.md: -------------------------------------------------------------------------------- 1 | # Load test using Locust 2 | 3 | We will try to compare our service working synchronously and asynchronously. 4 | 5 | ## Constant Incremental Load Test 6 | 7 | We will test how the application behaves in high traffic times and we will find the architectures breaking points. 8 | 9 | The test will submit several request (from 1 to 300) every second during 300 seconds. 10 | 11 | ### Sync 12 | 13 | ``` 14 | $ locust -f benchmark.py --host http://localhost:5001 --headless --users 300 --spawn-rate 1 --run-time 300s --html output/load_test_sync.html --csv=output/load_test_sync ApiSyncUser 15 | ``` 16 | 17 | Captura de pantalla 2023-09-30 a las 13 00 51 18 | 19 | As you can see, 0.4 appears to be the maximum number of simultaneous requests per second that our synchronous architecture can properly manage. 20 | 21 | At this point, the median response time is 9600ms. 22 | 23 | If you had more than 0.4 requests per second, you would increase the response time, but you would have the same number of requests per second. 24 | 25 | ### Async 26 | 27 | ``` 28 | $ locust -f benchmark.py --host http://localhost:5001 --headless --users 300 --spawn-rate 1 --run-time 300s --html output/load_test_async.html --csv=output/load_test_async ApiAsyncUser 29 | ``` 30 | 31 | Captura de pantalla 2023-09-30 a las 13 01 16 32 | 33 | As you can see, 12.1 seems to be the maximum number of simultaneous requests per second that our asynchronous architecture can properly manage. 34 | 35 | At this point, the median response time is 5400ms. 36 | 37 | If you have more requests than 12.1 per second, then you would increase the response time, but you will have the same number of requests per second. 38 | 39 | 40 | **Notes:** 41 | 42 | You can check the results on the `output` folder. 43 | Based on [this example](https://github.com/bukosabino/scoring-handler/tree/main/benchmark/experiment3-benchmarking-locust) 44 | -------------------------------------------------------------------------------- /config/config.yaml: -------------------------------------------------------------------------------- 1 | # RecursiveCharacterTextSplitter 2 | chunk_size: 1200 3 | chunk_overlap: 100 4 | 5 | admin_email: bukosabino@gmail.com 6 | 7 | embeddings_model_name: dariolopez/roberta-base-bne-finetuned-msmarco-qa-es-mnrl-mn 8 | embeddings_model_size: 768 9 | 10 | vector_store: 'qdrant' # {'qdrant', 'pinecone', 'supabase'} 11 | top_k_results: 10 12 | distance_type: 'Cosine' # {'Cosine', 'Euclid', 'Dot'} 13 | 14 | # Prompts 15 | prompt_system: | 16 | Como un experto en derecho y leyes españolas, tu tarea es responder preguntas sobre el Boletín Oficial del Estado (BOE) de España. Para ello, debes tener en cuenta y utilizar el contexto proporcionado para responder de forma precisa a la pregunta del usuario. 17 | Asegúrate de responder siempre en español. Si no conoces la respuesta o no tienes suficiente información para responderla, simplemente admítelo; no intentes inventar una respuesta. 
18 | Deberás proporcionar detalles claros y precisos en tus respuestas, asegurándote de referenciar adecuadamente cualquier ley o reglamento pertinente. Tu objetivo es proporcionar respuestas útiles y precisas para ayudar a los usuarios a entender mejor el BOE y cómo se aplica a sus preguntas. 19 | 20 | prompt_system_context: | 21 | El contexto tiene un formato de lista, donde cada elemento será un diccionario con dos claves: 22 | [{'context': 'contexto necesario para contestar la pregunta', 'score': 0.8}] 23 | La clave 'context' contendrá la información necesaria para contestar a la pregunta y la clave 'score' será una puntuación de entre 0.0 y 1.0. Deberás dar más importancia al contexto cuanto mayor sea el score. 24 | En la respuesta no menciones nada sobre el contexto o los scores. 25 | 26 | # Qdrant 27 | collections: 28 | - justicio 29 | - boe 30 | - bocm 31 | - bopz 32 | - bopv 33 | - boja 34 | - boa 35 | 36 | # Openai 37 | llm_model_name: 'gpt-3.5-turbo-0125' # 'gpt-3.5-turbo-1106', 'gpt-4-1106-preview' 38 | temperature: 0 39 | seed: 42 40 | max_tokens: 1024 41 | 42 | # Not used 43 | ## Pinecone 44 | vector_store_index_name: justicio 45 | ## Supabase 46 | table_name: 'documents' 47 | query_name: 'match_documents' 48 | -------------------------------------------------------------------------------- /src/etls/boe/load.py: -------------------------------------------------------------------------------- 1 | from datetime import date, datetime 2 | 3 | import typer 4 | 5 | from src.email.send_email import send_email 6 | from src.etls.boe.scrapper import BOEScrapper 7 | from src.etls.common.etl import ETL 8 | from src.etls.utils import catch_exceptions 9 | from src.etls.boe.defs import COLLECTION_NAME 10 | from src.initialize import initialize_app 11 | 12 | 13 | app = typer.Typer() 14 | 15 | 16 | @app.command() 17 | @catch_exceptions(cancel_on_failure=True) 18 | def today(init_objects=None): 19 | if init_objects is None: 20 | init_objects = initialize_app() 21 | etl_job = ETL(config_loader=init_objects.config_loader, vector_store=init_objects.vector_store[COLLECTION_NAME]) 22 | boe_scrapper = BOEScrapper() 23 | day = date.today() 24 | docs = boe_scrapper.download_day(day) 25 | if docs: 26 | etl_job.run(docs) 27 | 28 | subject = "[BOE] Daily ETL executed" 29 | content = f""" 30 | Daily ETL executed 31 | - Date: {day} 32 | - Documents loaded: {len(docs)} 33 | - Database used: {init_objects.config_loader['vector_store']} 34 | """ 35 | send_email(init_objects.config_loader, subject, content) 36 | 37 | 38 | @app.command() 39 | def dates(date_start: str, date_end: str, init_objects=None): 40 | if init_objects is None: 41 | init_objects = initialize_app() 42 | etl_job = ETL(config_loader=init_objects.config_loader, vector_store=init_objects.vector_store[COLLECTION_NAME]) 43 | boe_scrapper = BOEScrapper() 44 | docs = boe_scrapper.download_days( 45 | date_start=datetime.strptime(date_start, "%Y/%m/%d").date(), 46 | date_end=datetime.strptime(date_end, "%Y/%m/%d").date(), 47 | ) 48 | if docs: 49 | etl_job.run(docs) 50 | 51 | subject = "[BOE] Load ETL executed" 52 | content = f""" 53 | Load ETL executed 54 | - Date start: {date_start} 55 | - Date end: {date_end} 56 | - Documents loaded: {len(docs)} 57 | - Database used: {init_objects.config_loader['vector_store']} 58 | """ 59 | send_email(init_objects.config_loader, subject, content) 60 | 61 | 62 | if __name__ == "__main__": 63 | app() 64 | -------------------------------------------------------------------------------- /src/etls/boja/load.py: 
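Every `load.py` module under `src/etls/` (BOE above, BOJA below, and the rest) exposes the same Typer CLI with a `today` and a `dates` command. As a minimal sketch of driving one of them from Python rather than from the shell — it assumes the environment variables from the deployment guide (`APP_PATH`, `QDRANT_API_URL`, `QDRANT_API_KEY`, `OPENAI_API_KEY`, `SENDGRID_API_KEY`) are already exported:

```python
# Sketch: programmatic equivalent of `python -m src.etls.boe.load dates 2024/01/01 2024/01/07`.
# Typer's @app.command() leaves the underlying function callable, so it can be reused directly.
from src.etls.boe.load import dates, today

# Backfill one week of BOE documents into the "justicio" collection.
dates(date_start="2024/01/01", date_end="2024/01/07")

# Or ingest only today's gazette, as the daily ETL does.
today()
```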
-------------------------------------------------------------------------------- 1 | from datetime import date, datetime 2 | 3 | import typer 4 | 5 | from src.email.send_email import send_email 6 | from src.etls.boja.scrapper import BOJAScrapper 7 | from src.etls.utils import catch_exceptions 8 | from src.etls.boja.defs import COLLECTION_NAME 9 | from src.etls.common.etl import ETL 10 | from src.initialize import initialize_app 11 | 12 | app = typer.Typer() 13 | 14 | @app.command() 15 | @catch_exceptions(cancel_on_failure=True) 16 | def today(init_objects=None): 17 | if init_objects is None: 18 | init_objects = initialize_app() 19 | etl_job = ETL(config_loader=init_objects.config_loader, vector_store=init_objects.vector_store[COLLECTION_NAME]) 20 | bopv_scrapper = BOJAScrapper() 21 | day = date.today() 22 | docs = bopv_scrapper.download_day(day) 23 | if docs: 24 | etl_job.run(docs) 25 | subject = "[BOJA] Daily ETL executed" 26 | content = f""" 27 | Daily ETL executed 28 | - Date: {day} 29 | - Documents loaded: {len(docs)} 30 | - Database used: {init_objects.config_loader['vector_store']} 31 | """ 32 | send_email(init_objects.config_loader, subject, content) 33 | 34 | @app.command() 35 | def dates(date_start: str, date_end: str, init_objects=None): 36 | if init_objects is None: 37 | init_objects = initialize_app() 38 | etl_job = ETL(config_loader=init_objects.config_loader, vector_store=init_objects.vector_store[COLLECTION_NAME]) 39 | bopv_scrapper = BOJAScrapper() 40 | docs = bopv_scrapper.download_days( 41 | date_start=datetime.strptime(date_start, "%Y/%m/%d").date(), 42 | date_end=datetime.strptime(date_end, "%Y/%m/%d").date() 43 | ) 44 | if docs: 45 | etl_job.run(docs) 46 | 47 | subject = "[BOJA] Load ETL executed" 48 | content = f""" 49 | Load ETL executed 50 | - Date start: {date_start} 51 | - Date end: {date_end} 52 | - Documents loaded: {len(docs)} 53 | - Database used: {init_objects.config_loader['vector_store']} 54 | """ 55 | send_email(init_objects.config_loader, subject, content) 56 | 57 | 58 | if __name__ == "__main__": 59 | app() -------------------------------------------------------------------------------- /src/etls/boa/load.py: -------------------------------------------------------------------------------- 1 | from datetime import date, datetime 2 | import json 3 | 4 | import typer 5 | 6 | from src.email.send_email import send_email 7 | from src.etls.boa.scrapper import BOAScrapper 8 | from src.etls.utils import catch_exceptions 9 | from src.etls.boa.defs import COLLECTION_NAME 10 | from src.etls.common.etl import ETL 11 | from src.initialize import initialize_app 12 | 13 | app = typer.Typer() 14 | 15 | @app.command() 16 | @catch_exceptions(cancel_on_failure=True) 17 | def today(init_objects=None): 18 | if init_objects is None: 19 | init_objects = initialize_app() 20 | etl_job = ETL(config_loader=init_objects.config_loader, vector_store=init_objects.vector_store[COLLECTION_NAME]) 21 | boa_scrapper = BOAScrapper() 22 | day = date.today() 23 | docs = boa_scrapper.download_day(day) 24 | if docs: 25 | etl_job.run(docs) 26 | subject = "[BOA] Daily ETL executed" 27 | content = f""" 28 | Daily ETL executed 29 | - Date: {day} 30 | - Documents loaded: {len(docs)} 31 | - Database used: {init_objects.config_loader['vector_store']} 32 | """ 33 | send_email(init_objects.config_loader, subject, content) 34 | 35 | @app.command() 36 | def dates(date_start: str, date_end: str, init_objects=None): 37 | if init_objects is None: 38 | init_objects = initialize_app() 39 | etl_job = 
ETL(config_loader=init_objects.config_loader, vector_store=init_objects.vector_store[COLLECTION_NAME]) 40 | boa_scrapper = BOAScrapper() 41 | docs = boa_scrapper.download_days( 42 | date_start=datetime.strptime(date_start, "%Y/%m/%d").date(), 43 | date_end=datetime.strptime(date_end, "%Y/%m/%d").date() 44 | ) 45 | if docs: 46 | etl_job.run(docs) 47 | 48 | subject = "[BOA] Load ETL executed" 49 | content = f""" 50 | Load ETL executed 51 | - Date start: {date_start} 52 | - Date end: {date_end} 53 | - Documents loaded: {len(docs)} 54 | - Database used: {init_objects.config_loader['vector_store']} 55 | """ 56 | send_email(init_objects.config_loader, subject, content) 57 | 58 | 59 | if __name__ == "__main__": 60 | app() -------------------------------------------------------------------------------- /src/etls/bocm/load.py: -------------------------------------------------------------------------------- 1 | from datetime import date, datetime 2 | 3 | import typer 4 | 5 | from src.email.send_email import send_email 6 | from src.etls.bocm.scrapper import BOCMScrapper 7 | from src.etls.common.etl import ETL 8 | from src.etls.utils import catch_exceptions 9 | from src.etls.bocm.defs import COLLECTION_NAME 10 | from src.initialize import initialize_app 11 | 12 | 13 | app = typer.Typer() 14 | 15 | 16 | @app.command() 17 | @catch_exceptions(cancel_on_failure=True) 18 | def today(init_objects=None): 19 | if init_objects is None: 20 | init_objects = initialize_app() 21 | etl_job = ETL(config_loader=init_objects.config_loader, vector_store=init_objects.vector_store[COLLECTION_NAME]) 22 | bocm_scrapper = BOCMScrapper() 23 | day = date.today() 24 | docs = bocm_scrapper.download_day(day) 25 | if docs: 26 | etl_job.run(docs) 27 | 28 | subject = "[BOCM] Daily ETL executed" 29 | content = f""" 30 | Daily ETL executed 31 | - Date: {day} 32 | - Documents loaded: {len(docs)} 33 | - Database used: {init_objects.config_loader['vector_store']} 34 | """ 35 | send_email(init_objects.config_loader, subject, content) 36 | 37 | 38 | @app.command() 39 | def dates(date_start: str, date_end: str, init_objects=None): 40 | if init_objects is None: 41 | init_objects = initialize_app() 42 | etl_job = ETL(config_loader=init_objects.config_loader, vector_store=init_objects.vector_store[COLLECTION_NAME]) 43 | bocm_scrapper = BOCMScrapper() 44 | docs = bocm_scrapper.download_days( 45 | date_start=datetime.strptime(date_start, "%Y/%m/%d").date(), 46 | date_end=datetime.strptime(date_end, "%Y/%m/%d").date(), 47 | ) 48 | if docs: 49 | etl_job.run(docs) 50 | 51 | subject = "[BOCM] Load ETL executed" 52 | content = f""" 53 | Load ETL executed 54 | - Date start: {date_start} 55 | - Date end: {date_end} 56 | - Documents loaded: {len(docs)} 57 | - Database used: {init_objects.config_loader['vector_store']} 58 | """ 59 | send_email(init_objects.config_loader, subject, content) 60 | 61 | 62 | if __name__ == "__main__": 63 | app() 64 | -------------------------------------------------------------------------------- /src/etls/bopz/load.py: -------------------------------------------------------------------------------- 1 | from datetime import date, datetime 2 | 3 | import typer 4 | 5 | from src.email.send_email import send_email 6 | from src.etls.bopz.scrapper import BOPZScrapper 7 | from src.etls.bopz.defs import COLLECTION_NAME 8 | from src.etls.utils import catch_exceptions 9 | from src.etls.common.etl import ETL 10 | from src.initialize import initialize_app 11 | 12 | 13 | app = typer.Typer() 14 | 15 | 16 | @app.command() 17 | 
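# Assumption (the decorator below is defined in src/etls/utils.py, which is not shown here): it follows
# the usual `schedule`-library recipe, logging any exception raised by the job and, with
# cancel_on_failure=True, returning schedule.CancelJob so a failing run is dropped instead of crashing the scheduler.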
@catch_exceptions(cancel_on_failure=True) 18 | def today(init_objects=None): 19 | if init_objects is None: 20 | init_objects = initialize_app() 21 | etl_job = ETL(config_loader=init_objects.config_loader, vector_store=init_objects.vector_store[COLLECTION_NAME]) 22 | bopz_scrapper = BOPZScrapper() 23 | day = date.today() 24 | docs = bopz_scrapper.download_day(day) 25 | if docs: 26 | etl_job.run(docs) 27 | 28 | subject = "[BOPZ] Daily ETL executed" 29 | content = f""" 30 | Daily ETL executed 31 | - Date: {day} 32 | - Documents loaded: {len(docs)} 33 | - Database used: {init_objects.config_loader['vector_store']} 34 | """ 35 | send_email(init_objects.config_loader, subject, content) 36 | 37 | 38 | @app.command() 39 | def dates(date_start: str, date_end: str, init_objects=None): 40 | if init_objects is None: 41 | init_objects = initialize_app() 42 | etl_job = ETL(config_loader=init_objects.config_loader, vector_store=init_objects.vector_store[COLLECTION_NAME]) 43 | bopz_scrapper = BOPZScrapper() 44 | docs = bopz_scrapper.download_days( 45 | date_start=datetime.strptime(date_start, "%Y/%m/%d").date(), 46 | date_end=datetime.strptime(date_end, "%Y/%m/%d").date(), 47 | ) 48 | if docs: 49 | etl_job.run(docs) 50 | 51 | subject = "[BOPZ] Load ETL executed" 52 | content = f""" 53 | Load ETL executed 54 | - Date start: {date_start} 55 | - Date end: {date_end} 56 | - Documents loaded: {len(docs)} 57 | - Database used: {init_objects.config_loader['vector_store']} 58 | """ 59 | send_email(init_objects.config_loader, subject, content) 60 | 61 | 62 | if __name__ == "__main__": 63 | app() 64 | -------------------------------------------------------------------------------- /src/etls/bopv/load.py: -------------------------------------------------------------------------------- 1 | from datetime import date, datetime 2 | import json 3 | 4 | import typer 5 | 6 | from src.email.send_email import send_email 7 | from src.etls.bopv.scrapper import BOPVScrapper 8 | from src.etls.bopv.defs import COLLECTION_NAME 9 | from src.etls.utils import catch_exceptions 10 | from src.etls.common.etl import ETL 11 | from src.initialize import initialize_app 12 | 13 | app = typer.Typer() 14 | 15 | @app.command() 16 | @catch_exceptions(cancel_on_failure=True) 17 | def today(init_objects=None): 18 | if init_objects is None: 19 | init_objects = initialize_app() 20 | etl_job = ETL(config_loader=init_objects.config_loader, vector_store=init_objects.vector_store[COLLECTION_NAME]) 21 | bopv_scrapper = BOPVScrapper() 22 | day = date.today() 23 | docs = bopv_scrapper.download_day(day) 24 | if docs: 25 | etl_job.run(docs) 26 | subject = "[BOPV] Daily ETL executed" 27 | content = f""" 28 | Daily ETL executed 29 | - Date: {day} 30 | - Documents loaded: {len(docs)} 31 | - Database used: {init_objects.config_loader['vector_store']} 32 | """ 33 | send_email(init_objects.config_loader, subject, content) 34 | 35 | @app.command() 36 | def dates(date_start: str, date_end: str, init_objects=None): 37 | if init_objects is None: 38 | init_objects = initialize_app() 39 | etl_job = ETL(config_loader=init_objects.config_loader, vector_store=init_objects.vector_store[COLLECTION_NAME]) 40 | bopv_scrapper = BOPVScrapper() 41 | docs = bopv_scrapper.download_days( 42 | date_start=datetime.strptime(date_start, "%Y/%m/%d").date(), 43 | date_end=datetime.strptime(date_end, "%Y/%m/%d").date() 44 | ) 45 | if docs: 46 | etl_job.run(docs) 47 | 48 | subject = "[BOPV] Load ETL executed" 49 | content = f""" 50 | Load ETL executed 51 | - Date start: {date_start} 52 | - 
Date end: {date_end} 53 | - Documents loaded: {len(docs)} 54 | - Database used: {init_objects.config_loader['vector_store']} 55 | """ 56 | send_email(init_objects.config_loader, subject, content) 57 | 58 | 59 | if __name__ == "__main__": 60 | app() -------------------------------------------------------------------------------- /src/etls/common/etl.py: -------------------------------------------------------------------------------- 1 | import logging as lg 2 | import os 3 | import typing as tp 4 | 5 | import pinecone 6 | from langchain.schema import Document 7 | from langchain.text_splitter import RecursiveCharacterTextSplitter 8 | from retry import retry 9 | 10 | from src.etls.common.metadata import MetadataDocument 11 | from src.etls.common.utils import TextLoader 12 | from src.initialize import initialize_logging 13 | 14 | initialize_logging() 15 | 16 | 17 | class ETL: 18 | def __init__(self, config_loader, vector_store): 19 | self._config_loader = config_loader 20 | self._vector_store = vector_store 21 | 22 | def run(self, docs: tp.List[MetadataDocument]): 23 | chunks = self._split_documents(docs) 24 | self._load_database(chunks) 25 | # self._log_database_stats() 26 | 27 | def _split_documents(self, docs: tp.List[MetadataDocument]) -> tp.List[Document]: 28 | """Split documents by chunks 29 | 30 | :param docs: 31 | :return: 32 | """ 33 | logger = lg.getLogger(self._split_documents.__name__) 34 | logger.info("Splitting in chunks %s documents", len(docs)) 35 | docs_chunks = [] 36 | for doc in docs: 37 | loader = TextLoader(file_path=doc.filepath, metadata=doc.dict()) 38 | documents = loader.load() 39 | text_splitter = RecursiveCharacterTextSplitter( 40 | chunk_size=self._config_loader["chunk_size"], 41 | chunk_overlap=self._config_loader["chunk_overlap"], 42 | ) 43 | docs_chunks += text_splitter.split_documents(documents) 44 | 45 | logger.info("Removing file %s", doc.filepath) 46 | os.remove(doc.filepath) 47 | logger.info("Splitted %s documents in %s chunks", len(docs), len(docs_chunks)) 48 | return docs_chunks 49 | 50 | @retry(tries=3, delay=2) 51 | def _load_database(self, docs_chunks: tp.List[Document]) -> None: 52 | logger = lg.getLogger(self._load_database.__name__) 53 | logger.info("Loading %s embeddings to database", len(docs_chunks)) 54 | self._vector_store.add_documents(docs_chunks) 55 | logger.info("Loaded %s embeddings to database", len(docs_chunks)) 56 | 57 | def _log_database_stats(self) -> None: 58 | logger = lg.getLogger(self._log_database_stats.__name__) 59 | index_name = self._config_loader["vector_store_index_name"] 60 | logger.info(pinecone.describe_index(index_name)) 61 | index = pinecone.Index(index_name) 62 | logger.info(index.describe_index_stats()) 63 | -------------------------------------------------------------------------------- /.github/workflows/bandit.yml: -------------------------------------------------------------------------------- 1 | # This workflow uses actions that are not certified by GitHub. 2 | # They are provided by a third-party and are governed by 3 | # separate terms of service, privacy policy, and support 4 | # documentation. 5 | 6 | # Bandit is a security linter designed to find common security issues in Python code. 7 | # This action will run Bandit on your codebase. 8 | # The results of the scan will be found under the Security tab of your repository. 
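# The same scan can be reproduced locally before pushing (assumption: Bandit installed with `pip install bandit`):
#
#   bandit -r src -ll               # recurse into src/, report medium severity findings and above
#   bandit -r src -f txt -o bandit_report.txt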
9 | 10 | # https://github.com/marketplace/actions/bandit-scan is ISC licensed, by abirismyname 11 | # https://pypi.org/project/bandit/ is Apache v2.0 licensed, by PyCQA 12 | 13 | name: Bandit 14 | on: 15 | push: 16 | branches: [ "main" ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ "main" ] 20 | schedule: 21 | - cron: '25 4 * * 6' 22 | 23 | jobs: 24 | bandit: 25 | permissions: 26 | contents: read # for actions/checkout to fetch code 27 | security-events: write # for github/codeql-action/upload-sarif to upload SARIF results 28 | actions: read # only required for a private repository by github/codeql-action/upload-sarif to get the Action run status 29 | 30 | runs-on: ubuntu-latest 31 | steps: 32 | - uses: actions/checkout@v2 33 | - name: Bandit Scan 34 | uses: shundor/python-bandit-scan@9cc5aa4a006482b8a7f91134412df6772dbda22c 35 | with: # optional arguments 36 | # exit with 0, even with results found 37 | exit_zero: true # optional, default is DEFAULT 38 | # Github token of the repository (automatically created by Github) 39 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Needed to get PR information. 40 | # File or directory to run bandit on 41 | # path: # optional, default is . 42 | # Report only issues of a given severity level or higher. Can be LOW, MEDIUM or HIGH. Default is UNDEFINED (everything) 43 | # level: # optional, default is UNDEFINED 44 | # Report only issues of a given confidence level or higher. Can be LOW, MEDIUM or HIGH. Default is UNDEFINED (everything) 45 | # confidence: # optional, default is UNDEFINED 46 | # comma-separated list of paths (glob patterns supported) to exclude from scan (note that these are in addition to the excluded paths provided in the config file) (default: .svn,CVS,.bzr,.hg,.git,__pycache__,.tox,.eggs,*.egg) 47 | # excluded_paths: # optional, default is DEFAULT 48 | # comma-separated list of test IDs to skip 49 | # skips: # optional, default is DEFAULT 50 | # path to a .bandit file that supplies command line arguments 51 | # ini_path: # optional, default is DEFAULT 52 | 53 | -------------------------------------------------------------------------------- /evaluation/embeddings/questions.py: -------------------------------------------------------------------------------- 1 | QUERIES = ( 2 | # Solo sí es sí 3 | ( 4 | "BOE-A-2022-14630", 5 | ( 6 | "¿Es de aplicación la ley de garantía integral de la libertad sexual a niños (varones) " 7 | "menores de edad víctimas de violencias sexuales o solo a niñas y mujeres?" 8 | ), 9 | ), 10 | ( 11 | "BOE-A-2022-14630", 12 | ( 13 | "¿Se aplica la ley a niños (varones) menores de edad víctimas de violencias sexuales o solo a niñas y mujeres?" 14 | ), 15 | ), 16 | ( 17 | "BOE-A-2022-14630", 18 | ( 19 | "¿A qué ayudas económicas pueden acceder las mujeres víctimas de violencias sexuales? ¿Son compatibles con " 20 | "otras ayudas económicas?" 21 | ), 22 | ), 23 | ( 24 | "BOE-A-2022-14630", 25 | ( 26 | "¿Gozarán de derechos las víctimas de violencia sexual que estén en situación administrativa irregular?" 27 | ), 28 | ), 29 | ("BOE-A-2022-14630", ("¿Qué Leyes hablan sobre la igualdad de género?")), 30 | # Vivienda 31 | ( 32 | "BOE-A-2023-12203", 33 | ( 34 | "Según la ley de vivienda de 2023, ¿quién paga los honorarios de la inmobiliaria? 
Contesta detallando qué " 35 | "ocurre en caso de compra y de alquiler" 36 | ), 37 | ), 38 | ( 39 | "BOE-A-2023-12203", 40 | ( 41 | "¿Quién paga los honorarios de la inmobiliaria en caso de alquiler de una vivienda, arrendatario o arrendador?" 42 | ), 43 | ), 44 | ( 45 | "BOE-A-2023-12203", 46 | ( 47 | "¿Cuantos años necesita estar una vivienda sin uso para ser considerada 'vivienda vacía'?" 48 | ), 49 | ), 50 | ( 51 | "BOE-A-2023-12203", 52 | ("¿Se considera gran tenedor a aquella persona con 6 inmuebles?"), 53 | ), 54 | ( 55 | "BOE-A-2023-12203", 56 | ( 57 | "¿Se considera gran tenedor a aquella persona con 7 inmuebles en diferentes zonas tensionadas y no tensionadas?" 58 | ), 59 | ), 60 | ( 61 | "BOE-A-2023-12203", 62 | ( 63 | "¿Se considera gran tenedor a aquella persona con 7 inmuebles en una misma zona tensionada?" 64 | ), 65 | ), 66 | ("BOE-A-2023-12203", ("¿Cuántos años durará el carácter de 'zona tensionada'?")), 67 | ( 68 | "BOE-A-2023-12203", 69 | ( 70 | "¿Qué porcentaje sobre los ingresos es necesario para que una zona sea considerada tensionada?" 71 | ), 72 | ), 73 | ( 74 | "BOE-A-2023-12203", 75 | ( 76 | "¿Qué porcentaje sobre los ingresos es necesario para que una zona sea considerada tensionada?" 77 | ), 78 | ), 79 | # FP 80 | ( 81 | "BOE-A-2023-16889", 82 | ( 83 | "¿Los expertos o expertos senior necesitarán acreditar algún título como el máster de profesorado para " 84 | "impartir clase de Formación Profesional?" 85 | ), 86 | ), 87 | ) 88 | -------------------------------------------------------------------------------- /src/etls/boe/loading/documents.py: -------------------------------------------------------------------------------- 1 | import os 2 | import typing as tp 3 | import logging as lg 4 | 5 | from requests.exceptions import HTTPError 6 | from qdrant_client.http.models import Filter, FieldCondition, MatchValue 7 | from qdrant_client import QdrantClient 8 | import numpy as np 9 | 10 | from src.email.send_email import send_email 11 | from src.etls.boe.scrapper import BOEScrapper 12 | from src.etls.boe.loading.defs_id_largos import BOE_IDS 13 | from src.etls.common.etl import ETL 14 | from src.etls.boe.defs import COLLECTION_NAME 15 | from src.initialize import initialize_app, initialize_logging 16 | 17 | initialize_logging() 18 | 19 | QDRANT_CLIENT = QdrantClient(url=os.environ["QDRANT_API_URL"], api_key=os.environ["QDRANT_API_KEY"], timeout=1000) 20 | 21 | 22 | def load_important_ids(filename): 23 | with open(filename) as f: 24 | lines = f.read().splitlines() 25 | return lines 26 | 27 | 28 | def filter_documents_by_year(documents: tp.List[str]) -> tp.List[str]: 29 | documents_filtered = [] 30 | for document_id in documents: 31 | id_split = document_id.split("-") 32 | if id_split[0] != "BOE" or int(id_split[2]) < 2000: 33 | documents_filtered.append(document_id) 34 | return documents_filtered 35 | 36 | 37 | def filter_documents_loaded(documents: tp.List[str]) -> tp.List[str]: 38 | """Filters a list of document IDs that are not loaded on Embedding database.""" 39 | logger = lg.getLogger(filter_documents_loaded.__name__) 40 | query_vector = np.random.rand(768) 41 | documents_filtered = [] 42 | for document_id in documents: 43 | logger.info("Checking if document id is already loaded: %s", document_id) 44 | search_result = QDRANT_CLIENT.search( 45 | collection_name="justicio", 46 | query_vector=query_vector, 47 | query_filter=Filter( 48 | must=[FieldCondition(key="metadata.identificador", match=MatchValue(value=document_id))] 49 | ), 50 | limit=1, 51 | ) 52 | if not 
search_result: 53 | documents_filtered.append(document_id) 54 | logger.info("Document id: %s is added", document_id) 55 | 56 | return documents_filtered 57 | 58 | 59 | if __name__ == "__main__": 60 | logger = lg.getLogger("__main__") 61 | INIT_OBJECTS = initialize_app() 62 | etl_job = ETL(config_loader=INIT_OBJECTS.config_loader, vector_store=INIT_OBJECTS.vector_store[COLLECTION_NAME]) 63 | boe_scrapper = BOEScrapper() 64 | 65 | documents = load_important_ids("src/etls/boe/loading/defs_ids_importantes.txt") 66 | documents += BOE_IDS 67 | logger.info("Documents size: %s", len(documents)) 68 | documents_filtered = list(set(documents)) 69 | logger.info("Documents filtered by unique: %s", len(documents_filtered)) 70 | documents_filtered = filter_documents_by_year(documents_filtered) 71 | logger.info("Documents filtered by year: %s", len(documents_filtered)) 72 | logger.info(documents_filtered) 73 | # documents_filtered = filter_documents_loaded(documents_filtered) 74 | # logger.info('Documents filtered size: %s', len(documents_filtered)) 75 | 76 | docs = [] 77 | for boe_id in documents_filtered: 78 | logger.info("Loading BOE Id: %s", boe_id) 79 | url = f"https://www.boe.es/diario_boe/xml.php?id={boe_id}" 80 | try: 81 | meta_document = boe_scrapper.download_document(url) 82 | docs.append(meta_document) 83 | except HTTPError: 84 | logger.error("Not scrapped document %s", url) 85 | except AttributeError: 86 | logger.error("Not scrapped document %s", url) 87 | if docs: 88 | etl_job.run(docs) 89 | 90 | subject = "[BOE] Documents ETL executed" 91 | content = f""" 92 | Documents ETL executed 93 | - Documents loaded (BOE ids): {len(documents_filtered)} 94 | - Documents loaded: {len(docs)} 95 | - Database used: {INIT_OBJECTS.config_loader['vector_store']} 96 | """ 97 | send_email(INIT_OBJECTS.config_loader, subject, content) 98 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Nix 2 | shell.nix 3 | 4 | # Justicio 5 | qdrant_storage/ 6 | logs/ 7 | .vscode/ 8 | 9 | # Byte-compiled / optimized / DLL files 10 | __pycache__/ 11 | *.py[cod] 12 | *$py.class 13 | 14 | # C extensions 15 | *.so 16 | 17 | # Distribution / packaging 18 | .Python 19 | build/ 20 | develop-eggs/ 21 | dist/ 22 | downloads/ 23 | eggs/ 24 | .eggs/ 25 | lib/ 26 | lib64/ 27 | parts/ 28 | sdist/ 29 | var/ 30 | wheels/ 31 | share/python-wheels/ 32 | *.egg-info/ 33 | .installed.cfg 34 | *.egg 35 | MANIFEST 36 | 37 | # PyInstaller 38 | # Usually these files are written by a python script from a template 39 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
40 | *.manifest 41 | *.spec 42 | 43 | # Installer logs 44 | pip-log.txt 45 | pip-delete-this-directory.txt 46 | 47 | # Unit test / coverage reports 48 | htmlcov/ 49 | .tox/ 50 | .nox/ 51 | .coverage 52 | .coverage.* 53 | .cache 54 | nosetests.xml 55 | coverage.xml 56 | *.cover 57 | *.py,cover 58 | .hypothesis/ 59 | .pytest_cache/ 60 | cover/ 61 | 62 | # Translations 63 | *.mo 64 | *.pot 65 | 66 | # Django stuff: 67 | *.log 68 | local_settings.py 69 | db.sqlite3 70 | db.sqlite3-journal 71 | 72 | # Flask stuff: 73 | instance/ 74 | .webassets-cache 75 | 76 | # Scrapy stuff: 77 | .scrapy 78 | 79 | # Sphinx documentation 80 | docs/_build/ 81 | 82 | # PyBuilder 83 | .pybuilder/ 84 | target/ 85 | 86 | # Jupyter Notebook 87 | .ipynb_checkpoints 88 | 89 | # IPython 90 | profile_default/ 91 | ipython_config.py 92 | 93 | # pyenv 94 | # For a library or package, you might want to ignore these files since the code is 95 | # intended to run in multiple environments; otherwise, check them in: 96 | # .python-version 97 | 98 | # pipenv 99 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 100 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 101 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 102 | # install all needed dependencies. 103 | #Pipfile.lock 104 | 105 | # poetry 106 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 107 | # This is especially recommended for binary packages to ensure reproducibility, and is more 108 | # commonly ignored for libraries. 109 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 110 | #poetry.lock 111 | 112 | # pdm 113 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 114 | #pdm.lock 115 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 116 | # in version control. 117 | # https://pdm.fming.dev/#use-with-ide 118 | .pdm.toml 119 | 120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 121 | __pypackages__/ 122 | 123 | # Celery stuff 124 | celerybeat-schedule 125 | celerybeat.pid 126 | 127 | # SageMath parsed files 128 | *.sage.py 129 | 130 | # Environments 131 | .env 132 | .venv 133 | env/ 134 | venv/ 135 | ENV/ 136 | env.bak/ 137 | venv.bak/ 138 | 139 | # Spyder project settings 140 | .spyderproject 141 | .spyproject 142 | 143 | # Rope project settings 144 | .ropeproject 145 | 146 | # mkdocs documentation 147 | /site 148 | 149 | # mypy 150 | .mypy_cache/ 151 | .dmypy.json 152 | dmypy.json 153 | 154 | # Pyre type checker 155 | .pyre/ 156 | 157 | # pytype static type analyzer 158 | .pytype/ 159 | 160 | # Cython debug symbols 161 | cython_debug/ 162 | 163 | # PyCharm 164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 166 | # and can be added to the global gitignore or merged into this file. For a more nuclear 167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
168 | .idea/ 169 | 170 | .secrets 171 | .continous_deployment.md 172 | .qdrant.yaml 173 | cron_etl_daily.sh 174 | cron_etl_initial.sh 175 | 176 | *.pem 177 | 178 | xmls/ 179 | 180 | .DS_Store 181 | 182 | # Qdrant 183 | 184 | qdrant_data/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Justicio 2 | 3 | Justicio is a Question/Answering Assistant that generates answers from user questions about the official state gazette of Spain: 4 | Boletín Oficial del Estado (BOE). 5 | 6 | [Spanish link](https://www.boe.es) 7 | 8 | [English link](https://www.boe.es/index.php?lang=en) 9 | 10 | **TL;DR:** All BOE articles are embedded in vectors and stored in a vector database. When a question is asked, the question 11 | is embedded in the same latent space and the most relevant text is retrieved from the vector database by performing a 12 | query using the embedded question. The retrieved pieces of text are then sent to the LLM to construct an answer. 13 | 14 | # Service 15 | 16 | At this moment we are running a user-free service: [Justicio](https://justicio.es/) 17 | 18 | You can test it without charge! Please, give us your feedback if you have! 19 | 20 | # How it works under the hood 21 | 22 | ![image (4)](https://github.com/bukosabino/ia-boe/assets/4375209/bb2ad4ce-f90a-40bf-a77f-bc1443b9896e) 23 | 24 | ## Flow 25 | 26 | 0. All BOE articles are embedded as embeddings and stored in an embedding database. This process is run at startup and every day. 27 | 1. The user writes (using natural language) any question related to the BOE as input to the system. 28 | 2. The backend service processes the input request (user question), transforms the question into an embedding, and sends the generated embedding as a query to the embedding database. 29 | 3. The embedding database returns documents that most closely match the query. 30 | 4. The most similar documents returned by the embedding database are added to the input query as context. Then a request with all the information is sent to the LLM API model. 31 | 5. The LLM API model returns a natural language answer to the user's question. 32 | 6. The user receives an AI-generated response output. 33 | 34 | ## Components 35 | 36 | ### Backend service 37 | 38 | It is the web service, and it is a central component for the whole system, doing most of the tasks: 39 | 40 | * Process the input requests from the user. 41 | * Transform the input text into embeddings. 42 | * Send requests to the embeddings database to get the most similar embeddings. 43 | * Send requests to the LLM API model to generate the response. 44 | * Save the traces. 45 | * Handle input/output exceptions. 46 | 47 | ### Embedding/Vector database 48 | 49 | #### Loading data 50 | 51 | We download the BOE documents and break them into small chunks of text (e.g. 1200 characters). Each text chunk is transformed into an embedding (e.g. a numerically dense vector of 768 sizes). Some additional metadata is also stored with the vectors so that we can pre- or post-filter the search results. [Check the code](https://github.com/bukosabino/ia-boe/blob/main/src/etls/boe/load.py) 52 | 53 | The BOE is updated every day, so we need to run an ETL job every day to retrieve the new documents, transform them into embeddings, link the metadata, and store them in the embedding database. 
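As a rough sketch of that daily refresh (the real deployment triggers it from cron — see `doc/cron_etl_daily_public.sh` and `doc/crontab_e.sh` — so the scheduler library and the time below are only assumptions):

```python
# Illustrative only: run the `today` command of the BOE ETL once per day.
import time

import schedule

from src.etls.boe.load import today

schedule.every().day.at("08:00").do(today)  # hypothetical schedule; the project uses cron instead

while True:
    schedule.run_pending()
    time.sleep(60)
```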
[Check the code](https://github.com/bukosabino/ia-boe/blob/main/src/etls/boe/load.py) 54 | 55 | #### Reading data 56 | 57 | It implements APIs to transform the input question into a vector, and to perform ANN (Approximate Nearest Neighbour) against all the vectors in the database. [Check the code](https://github.com/bukosabino/ia-boe/blob/main/src/service/main.py) 58 | 59 | There are different types of search (semantic search, keyword search, or hybrid search). 60 | 61 | There are different types of ANNs (cosine similarity, Euclidean distance, or dot product). 62 | 63 | #### Embedding Model 64 | 65 | The text in BOE is written in Spanish, so we need a sentence transformer model that is fine-tuned using Spanish 66 | datasets. We are experimenting with [these models](https://github.com/bukosabino/sbert-spanish). 67 | 68 | More info: https://www.newsletter.swirlai.com/p/sai-notes-07-what-is-a-vector-database 69 | 70 | ### LLM API Model 71 | 72 | It is a Large Language Model (LLM) which generates answers for the user questions based on the context, which is 73 | the most similar documents returned by the embedding database. 74 | 75 | ## Tools 76 | 77 | - Langchain 78 | - FastAPI 79 | - Qdrant 80 | - [Fine tuned Spanish SBert model](https://github.com/bukosabino/sbert-spanish) 81 | - BeautifulSoup 82 | 83 | # Deploy your own service 84 | 85 | Check [doc/deployment_guide.md](https://github.com/bukosabino/justicio/blob/main/doc/deployment_guide.md) file 86 | 87 | # Want to help develop the project? 88 | 89 | You are welcome! Please, contact us to see how you can help. 90 | 91 | * [Darío López](https://www.linkedin.com/in/dar%C3%ADo-l%C3%B3pez-padial-45269150/) 92 | * [Alex Dantart](https://www.linkedin.com/in/dantart/) 93 | * [Jorge Iliarte](https://www.linkedin.com/in/jorge-iliarte-llop/) 94 | * [Jorge Barrachina](https://www.linkedin.com/in/jorgebarrachina/) 95 | -------------------------------------------------------------------------------- /src/etls/common/utils.py: -------------------------------------------------------------------------------- 1 | import typing as tp 2 | from requests.exceptions import Timeout 3 | import random 4 | from bs4 import BeautifulSoup 5 | 6 | from langchain.docstore.document import Document 7 | from langchain.document_loaders.base import BaseLoader 8 | 9 | from src.etls.utils import create_retry_session 10 | 11 | 12 | class TextLoader(BaseLoader): 13 | """Load text files.""" 14 | 15 | def __init__( 16 | self, 17 | file_path: str, 18 | encoding: tp.Optional[str] = None, 19 | metadata: tp.Optional[dict] = None, 20 | ): 21 | """Initialize with file path.""" 22 | self.file_path = file_path 23 | self.encoding = encoding 24 | self.metadata = metadata 25 | 26 | def load(self) -> tp.List[Document]: 27 | """Load from file path.""" 28 | with open(self.file_path, encoding=self.encoding) as f: 29 | text = f.read() 30 | return [Document(page_content=text, metadata=self.metadata)] 31 | 32 | 33 | class ScrapperError(Exception): 34 | """ 35 | Custom exception for scraping errors. 36 | """ 37 | 38 | def __init__(self, message="Error durante el proceso de scraping", *args, **kwargs): 39 | """ 40 | Initializes the exception with a custom error message. 41 | 42 | :param message: Error message describing the failure. 43 | :param args: Additional positional arguments. 44 | :param kwargs: Additional keyword arguments. 
45 | """ 46 | super().__init__(message, *args, **kwargs) 47 | self.message = message 48 | 49 | def __str__(self): 50 | """ 51 | Returns a string representation of the exception, including the error message. 52 | """ 53 | return f"ScrapperError: {self.message}" 54 | 55 | 56 | class HTTPRequestException(Exception): 57 | """ 58 | Exception for errors occurring during HTTP requests made by HTTPRequester. 59 | """ 60 | def __init__(self, message="HTTP request error", *args): 61 | super().__init__(message, *args) 62 | 63 | 64 | class HTTPRequester: 65 | user_agents = [ 66 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36", 67 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36", 68 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", 69 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", 70 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", 71 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15", 72 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15" 73 | ] 74 | 75 | default_headers = { 76 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", 77 | "Accept-Language": "es-ES,es;q=0.9,en;q=0.8", 78 | "Connection": "keep-alive", 79 | } 80 | 81 | @classmethod 82 | def get_random_user_agent(cls): 83 | """ 84 | Selects and returns a random User-Agent from the list of user_agents. 85 | """ 86 | return random.choice(cls.user_agents) 87 | 88 | @classmethod 89 | def get_headers(cls): 90 | """ 91 | Generates and returns headers including a random User-Agent. 92 | """ 93 | headers = cls.default_headers.copy() 94 | headers["User-Agent"] = cls.get_random_user_agent() 95 | return headers 96 | 97 | @staticmethod 98 | def get_soup(url, timeout=30, markup='html.parser'): 99 | """ 100 | Performs an HTTP GET request to the provided URL, using random headers, and returns a BeautifulSoup 101 | object if the response is successful. If there is an error or timeout, it throws HTTPRequestException. 
102 | """ 103 | headers = HTTPRequester.get_headers() 104 | try: 105 | session = create_retry_session(retries=5) 106 | response = session.get(url, headers=headers, timeout=timeout) 107 | response.raise_for_status() 108 | return BeautifulSoup(response.content, markup) 109 | except Timeout as e: 110 | raise HTTPRequestException(f"HTTP request timed out: {e}") 111 | except requests.RequestException as e: 112 | raise HTTPRequestException(f"HTTP request failed: {e}") 113 | -------------------------------------------------------------------------------- /src/etls/bocm/utils.py: -------------------------------------------------------------------------------- 1 | import typing as tp 2 | import logging as lg 3 | import re 4 | from src.initialize import initialize_logging 5 | 6 | BOCM_PREFIX = "https://www.bocm.es" 7 | 8 | 9 | initialize_logging() 10 | 11 | 12 | def _get_url_from_cve(cve: str) -> str: 13 | return f"{BOCM_PREFIX}/{cve.lower()}" 14 | 15 | 16 | # Metadata from head tags 17 | def metadata_from_head_tags(soup) -> tp.List[str]: 18 | # extract cve from meta[name="TituloGSA"] 19 | cve = soup.select_one('meta[name="TituloGSA"]')["content"] 20 | fecha = cve.split("-")[1:2][0] 21 | fecha_publicacion = f'{fecha[:4]}-{fecha[4:6]}-{fecha[6:8]}' 22 | 23 | html_link = soup.select_one('meta[property="og:url"]')["content"] 24 | 25 | return [fecha_publicacion, cve, html_link] 26 | 27 | 28 | # Metadata from document header 29 | def metadata_from_doc(soup, seccion: str, cve: str) -> tp.List[str]: 30 | logger = lg.getLogger(metadata_from_doc.__name__) 31 | 32 | # Set defaults 33 | apartado, tipo, anunciante, organo, rango = ["", "", "", "", ""] 34 | 35 | # get headers 36 | paras = [f.get_text().strip().upper() for f in soup.select("#cabeceras p")][:3] 37 | 38 | # Metadata from article description 39 | desc_attempt = soup.select_one('meta[name="description"]') 40 | if (desc_attempt is not None): 41 | desc = desc_attempt["content"] 42 | else: 43 | desc = '' 44 | num_art = re.sub(r"BOCM-\d{8}-(\d{1,3})", r"\1", cve) 45 | 46 | try: 47 | if seccion == "1": 48 | subseccion_letter = ["A", "B", "C", "D"][int(seccion) - 1] 49 | subseccion_name = paras[0] 50 | organo = paras[2] 51 | # Some articles don't have filled description needed for rango field extraction 52 | if len(desc) > 10: 53 | rango = re.sub(r"^(\b[^\s]+\b)(.*)", r"\1", desc.split(num_art)[1], flags=re.ASCII).upper() 54 | if seccion == "2": 55 | subseccion_name = "DISPOSICIONES Y ANUNCIOS DEL ESTADO" 56 | organo = paras[0] 57 | if seccion == "3": 58 | paras_num = len(paras) 59 | subseccion_name = "ADMINISTRACIÓN LOCAL AYUNTAMIENTOS" 60 | if paras_num == 3: 61 | apartado, organo = paras[1:3] 62 | elif paras_num == 2: 63 | organo, apartado = paras[0:2] 64 | else: 65 | apartado = "MANCOMUNIDADES" 66 | organo = paras[0] 67 | 68 | if seccion == "4": 69 | subseccion_name = "ADMINISTRACIÓN DE JUSTICIA" 70 | if seccion == "5": 71 | subseccion_name = "OTROS ANUNCIOS" 72 | anunciante = paras[0] 73 | 74 | except: 75 | logger.error("Problem on section clasification for [%s]", cve) 76 | logger.error("Please review [%s]", _get_url_from_cve(cve)) 77 | 78 | return [subseccion_name, apartado, tipo, organo, anunciante, rango] 79 | 80 | 81 | def metadata_from_doc_header(soup) -> tp.List[str]: 82 | logger = lg.getLogger(metadata_from_doc_header.__name__) 83 | 84 | numero_oficial = soup.select_one(".cabecera_popup h1 strong").get_text().split("-")[1].strip().split(" ")[1].strip() 85 | s_field_a, cve_a, pags_a, *permalink = [str.get_text().split(":") for str in 
soup.select("#titulo_cabecera h2")] 86 | seccion_normalizada = s_field_a[0].strip().split(" ")[1] 87 | paginas = pags_a[1].strip() # Should I convert to int?? 88 | pdf_link = soup.select_one("#titulo_cabecera a")["href"] 89 | 90 | return [numero_oficial, seccion_normalizada, paginas, pdf_link] 91 | 92 | 93 | def select_section_from_id(soup, filtered_section: str) -> tp.List[str]: 94 | logger = lg.getLogger(select_section_from_id.__name__) 95 | 96 | section_links = [] 97 | section, subsection = filtered_section.split("-") 98 | section_container = soup.select_one(f'div[id*="secciones-seccion_{section}"]') 99 | if section_container is not None: 100 | if len(subsection) == 1: 101 | if section == "1": 102 | header_selector = ".view-grouping-header h3" 103 | content_selector = ".view-grouping-content" 104 | else: 105 | header_selector = ".view-content h3" 106 | content_selector = ".view-content" 107 | subsections = section_container.select(".view-grouping") 108 | for group in subsections: 109 | title = group.select_one(header_selector).text 110 | subsection_fix = f"{subsection}\)" 111 | if re.search(subsection_fix, title): 112 | links = [f'{BOCM_PREFIX}{a["href"]}' for a in group.select(f'{content_selector} a[href*="bocm-"]')] 113 | section_links += links 114 | else: 115 | links = [f'{BOCM_PREFIX}{a["href"]}' for a in section_container.select('a[href*="bocm-"]')] 116 | section_links += links 117 | logger.info(f"Captured {len(section_links)} docs from section [{section}]") 118 | return section_links 119 | 120 | 121 | def filter_links_by_section(soup, sections_filter_list: tp.List[str]) -> tp.List[str]: 122 | logger = lg.getLogger(filter_links_by_section.__name__) 123 | 124 | selected = [] 125 | for section_id in sections_filter_list: 126 | links = select_section_from_id(soup, section_id) 127 | selected += links 128 | 129 | logger.info("Retrieved [%s] links for current day", len(selected)) 130 | return selected 131 | 132 | 133 | def clean_text(text: str) -> str: 134 | cleaned = re.sub(r"(\xa0|\t+|\n+)", " ", text, flags=re.MULTILINE) 135 | return cleaned 136 | -------------------------------------------------------------------------------- /src/initialize.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import logging as lg 3 | import os 4 | 5 | import yaml 6 | from langchain.chains import RetrievalQA 7 | from langchain.chat_models import ChatOpenAI 8 | from langchain.embeddings import HuggingFaceEmbeddings 9 | from langchain.prompts import ( 10 | ChatPromptTemplate, 11 | HumanMessagePromptTemplate, 12 | SystemMessagePromptTemplate, 13 | ) 14 | from langchain.vectorstores.qdrant import Qdrant 15 | from openai import AsyncOpenAI 16 | from qdrant_client import QdrantClient 17 | from qdrant_client.models import VectorParams 18 | from tavily import TavilyClient 19 | 20 | 21 | def initialize_logging(): 22 | logger = lg.getLogger() 23 | logger.info("Initializing logging") 24 | logger.handlers = [] 25 | handler = lg.StreamHandler() 26 | formatter = lg.Formatter("[%(asctime)s] [%(process)d] [%(levelname)s] [%(name)s] %(message)s") 27 | handler.setFormatter(formatter) 28 | logger.addHandler(handler) 29 | logger.setLevel(lg.INFO) 30 | logger.info("Initialized logging") 31 | lg.getLogger("uvicorn.error").handlers = logger.handlers 32 | 33 | 34 | def initialize_app(): 35 | """Initializes the application""" 36 | logger = lg.getLogger(initialize_app.__name__) 37 | logger.info("Initializing application") 38 | config_loader = _init_config() 39 | 
vector_store = _init_vector_store(config_loader) 40 | openai_client = _init_openai_client() 41 | tavily_client = TavilyClient(api_key=os.environ["TAVILY_API_KEY"]) 42 | # retrieval_qa = _init_retrieval_qa_llm(vector_store, config_loader) 43 | logger.info("Initialized application") 44 | init_objects = collections.namedtuple( 45 | "init_objects", ["config_loader", "vector_store", "openai_client", "tavily_client"] 46 | ) 47 | return init_objects(config_loader, vector_store, openai_client, tavily_client) 48 | 49 | 50 | def _init_config(): 51 | yaml_config_path = os.path.join(os.environ["APP_PATH"], "config", "config.yaml") 52 | with open(yaml_config_path, "r") as stream: 53 | config_loader = yaml.safe_load(stream) 54 | return config_loader 55 | 56 | 57 | def _init_vector_store(config_loader): 58 | logger = lg.getLogger(_init_vector_store.__name__) 59 | logger.info("Initializing vector store") 60 | if config_loader["vector_store"] == "qdrant": 61 | vector_store = _init_vector_stores_qdrant(config_loader) 62 | else: 63 | raise ValueError("Vector Database not configured") 64 | return vector_store 65 | 66 | 67 | def _init_vector_stores_qdrant(config_loader): 68 | logger = lg.getLogger(_init_vector_stores_qdrant.__name__) 69 | logger.info("Initializing vector stores") 70 | qdrant_client = QdrantClient( 71 | url=os.environ["QDRANT_API_URL"], 72 | api_key=os.environ["QDRANT_API_KEY"], 73 | prefer_grpc=True, 74 | ) 75 | embeddings = HuggingFaceEmbeddings( 76 | model_name=config_loader["embeddings_model_name"], 77 | model_kwargs={"device": "cpu"}, 78 | ) 79 | vector_stores = {} 80 | for collection_name in config_loader["collections"]: 81 | if not _exists_collection(qdrant_client, collection_name): 82 | logger.info("Creating collection for vector store") 83 | qdrant_client.recreate_collection( 84 | collection_name=collection_name, 85 | vectors_config=VectorParams( 86 | size=config_loader["embeddings_model_size"], distance=config_loader["distance_type"] 87 | ), 88 | on_disk_payload=True, 89 | ) 90 | logger.info("Created collection [%s] for vector store", collection_name) 91 | vector_stores[collection_name] = Qdrant(qdrant_client, collection_name, embeddings) 92 | logger.info("Initialized vector store for collection [%s]", collection_name) 93 | return vector_stores 94 | 95 | 96 | def _init_openai_client(): 97 | logger = lg.getLogger(_init_openai_client.__name__) 98 | logger.info("Initializing OpenAI client") 99 | client = AsyncOpenAI( 100 | api_key=os.environ.get("OPENAI_API_KEY"), 101 | ) 102 | logger.info("Initialized OpenAI client") 103 | return client 104 | 105 | 106 | def _exists_collection(qdrant_client, collection_name): 107 | logger = lg.getLogger(_exists_collection.__name__) 108 | try: 109 | qdrant_client.get_collection(collection_name=collection_name) 110 | return True 111 | except: 112 | logger.warn("Collection [%s] doesn't exist", collection_name) 113 | return False 114 | 115 | 116 | def _init_retrieval_qa_llm(vector_store, config_loader): 117 | # DEPRECATED 118 | logger = lg.getLogger(_init_retrieval_qa_llm.__name__) 119 | logger.info("Initializing RetrievalQA LLM") 120 | retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": config_loader["top_k_results"]}) 121 | system_template = f"{config_loader['prompt_system']}----------------\n{{context}}" 122 | messages = [ 123 | SystemMessagePromptTemplate.from_template(system_template), 124 | HumanMessagePromptTemplate.from_template("{question}"), 125 | ] 126 | retrieval_qa = RetrievalQA.from_chain_type( 127 | 
llm=ChatOpenAI( 128 | model_name=config_loader["llm_model_name"], 129 | temperature=config_loader["temperature"], 130 | max_tokens=config_loader["max_tokens"], 131 | ), 132 | chain_type="stuff", 133 | return_source_documents=True, 134 | retriever=retriever, 135 | chain_type_kwargs={"prompt": ChatPromptTemplate.from_messages(messages)}, 136 | ) 137 | logger.info(retrieval_qa.combine_documents_chain.llm_chain.prompt.format) 138 | logger.info("Initialized RetrievalQA LLM") 139 | return retrieval_qa 140 | -------------------------------------------------------------------------------- /src/etls/bopz/scrapper.py: -------------------------------------------------------------------------------- 1 | import re 2 | import logging as lg 3 | import tempfile 4 | import typing as tp 5 | from datetime import date, datetime 6 | 7 | import requests 8 | from bs4 import BeautifulSoup 9 | from requests.exceptions import HTTPError 10 | 11 | from src.etls.bopz.utils import DATA_POST 12 | from src.etls.bopz.metadata import BOPZMetadataDocument 13 | from src.etls.common.scrapper import BaseScrapper 14 | from src.initialize import initialize_logging 15 | from src.etls.utils import create_retry_session 16 | 17 | initialize_logging() 18 | 19 | 20 | def _extract_span_text(row: BeautifulSoup, regex: str) -> str: 21 | """ 22 | Extracts the text of the next sibling for a span element that contains the specified text_label. 23 | 24 | :param row: The BeautifulSoup row element to search within. 25 | :param regex: The regular expresion to search for within the span element. 26 | :return: The stripped text of the next sibling if found, otherwise an empty string. 27 | """ 28 | span_element = row.find("span", string=lambda t: re.search(regex, t)) 29 | return span_element.next_sibling.strip() if span_element and span_element.next_sibling else None 30 | 31 | 32 | def _extract_metadata(soup) -> tp.Dict: 33 | metadata_dict = {} 34 | 35 | # Metadatos 36 | if numero_registro := _extract_span_text(soup, r"N.\. Reg:"): 37 | metadata_dict["numero_oficial"] = numero_registro.split("/")[0] 38 | metadata_dict["titulo"] = f"BOPZ-{numero_registro.replace('/', '-')}" 39 | 40 | if departamento := _extract_span_text(soup, r"Publicador:"): 41 | metadata_dict["departamento"] = departamento 42 | 43 | if materia := _extract_span_text(soup, r"Materia"): 44 | metadata_dict["materia"] = [materia] 45 | 46 | if fecha_publicacion := _extract_span_text(soup, r"Fecha Pub:"): 47 | fecha_publicacion = datetime.strptime(fecha_publicacion, "%d/%m/%Y").strftime("%Y-%m-%d") 48 | metadata_dict["fecha_publicacion"] = fecha_publicacion 49 | metadata_dict["fecha_disposicion"] = fecha_publicacion 50 | metadata_dict["anio"] = str(datetime.strptime(fecha_publicacion, "%Y-%m-%d").year) 51 | metadata_dict["mes"] = str(datetime.strptime(fecha_publicacion, "%Y-%m-%d").month) 52 | metadata_dict["dia"] = str(datetime.strptime(fecha_publicacion, "%Y-%m-%d").day) 53 | 54 | href = soup.find("a", class_="adjunto")["href"][1:] 55 | metadata_dict["url_pdf"] = f"{'http://bop.dpz.es/BOPZ'}{href}" 56 | 57 | return metadata_dict 58 | 59 | 60 | def _list_links_day(url: str, day_str: str) -> tp.List[BeautifulSoup]: 61 | """Get a list of documents listed in a BOPZ url day 62 | 63 | :param url: url base link. 
Example: 'http://bop.dpz.es/BOPZ/portalBuscarEdictos.do' 64 | :param day_str: str date to scrap 65 | :return: list of documents to explore (BeatifullSoup objects) 66 | """ 67 | logger = lg.getLogger(_list_links_day.__name__) 68 | logger.info("Scrapping day: %s", day_str) 69 | DATA_POST["fechaPubInf"] = day_str 70 | DATA_POST["fechaPubSup"] = day_str 71 | response = requests.post(url, data=DATA_POST) 72 | response.raise_for_status() 73 | soup = BeautifulSoup(response.content, "html.parser") 74 | # Find all the rows in the response which correspond to published documents 75 | id_links = [ 76 | id_link 77 | for id_link in soup.find_all("div", class_="row listadoEdictos") 78 | if (href := id_link.find("a", class_="adjunto").get("href", "")) 79 | and "UploadServlet?ruta=Boletines" in href 80 | and href.endswith(".pdf") 81 | ] 82 | logger.info("Scrapped day successfully %s (%s BOPZ documents)", url, len(id_links)) 83 | return id_links 84 | 85 | 86 | class BOPZScrapper(BaseScrapper): 87 | def download_day(self, day: date) -> tp.List[BOPZMetadataDocument]: 88 | """Download all the documents for a specific date.""" 89 | logger = lg.getLogger(self.download_day.__name__) 90 | logger.info("Downloading BOPZ content for day %s", day) 91 | day_str = day.strftime("%d/%m/%Y") 92 | metadata_documents = [] 93 | try: 94 | id_links = _list_links_day("http://bop.dpz.es/BOPZ/portalBuscarEdictos.do", day_str) 95 | for id_link in id_links: 96 | try: 97 | onclick_div = id_link.find("div", onclick=True) 98 | if onclick_div: 99 | onclick_content = onclick_div["onclick"] 100 | start = onclick_content.find("'") + 1 101 | end = onclick_content.find("'", start) 102 | idEdicto = onclick_content[start:end] 103 | url_document = f"http://bop.dpz.es/BOPZ/obtenerContenidoEdicto.do?idEdicto={idEdicto}" 104 | metadata_doc = self.download_document(url_document, id_link) 105 | metadata_documents.append(metadata_doc) 106 | except HTTPError: 107 | logger.error("Not scrapped document %s on day %s", url_document, day_str) 108 | except AttributeError: 109 | logger.error("Not scrapped document %s on day %s", url_document, day_str) 110 | except HTTPError: 111 | logger.error("Not scrapped document on day %s", day_str) 112 | logger.info("Downloaded BOPZ content for day %s", day_str) 113 | return metadata_documents 114 | 115 | def download_document(self, url: str, metadata: BeautifulSoup) -> BOPZMetadataDocument: 116 | """Get text and metadata from a BOPZ document. 117 | 118 | :param url: document url link. Examples: 119 | * http://bop.dpz.es/BOPZ/obtenerContenidoEdicto.do?idEdicto=729066 120 | * http://bop.dpz.es/BOPZ/obtenerContenidoEdicto.do?idEdicto=729162 121 | :metadata BeautifulSoup: document metadata associated with url download link. 
122 | :return: document with metadata and filepath with text content 123 | """ 124 | logger = lg.getLogger(self.download_document.__name__) 125 | logger.info("Scrapping document: %s", url) 126 | session = create_retry_session(retries=5) 127 | response = session.get(url, timeout=10) 128 | response.raise_for_status() 129 | soup = BeautifulSoup(response.text, "lxml") 130 | with tempfile.NamedTemporaryFile("w", delete=False) as fn: 131 | text = soup.find("div", class_="medium-12 panel").get_text(strip=True, separator="\n") 132 | fn.write(text) 133 | metadata_dict = _extract_metadata(metadata) 134 | metadata_dict["identificador"] = url.split("=")[1] 135 | metadata_dict["url_html"] = url 136 | metadata_doc = BOPZMetadataDocument(filepath=fn.name, **metadata_dict) 137 | logger.info("Scrapped document successfully %s", url) 138 | return metadata_doc 139 | -------------------------------------------------------------------------------- /src/etls/boe/scrapper.py: -------------------------------------------------------------------------------- 1 | import logging as lg 2 | import tempfile 3 | import typing as tp 4 | from datetime import date, datetime 5 | 6 | from bs4 import BeautifulSoup 7 | from requests.exceptions import HTTPError 8 | 9 | from src.etls.boe.metadata import BOEMetadataDocument, BOEMetadataReferencia 10 | from src.etls.common.scrapper import BaseScrapper 11 | from src.initialize import initialize_logging 12 | from src.etls.utils import create_retry_session 13 | 14 | initialize_logging() 15 | 16 | 17 | def _extract_metadata(soup) -> tp.Dict: 18 | metadata_dict = {} 19 | 20 | # Metadatos 21 | if identificador := soup.documento.metadatos.identificador: 22 | metadata_dict["identificador"] = identificador.get_text() 23 | 24 | if diario := soup.documento.metadatos.diario: 25 | metadata_dict["diario"] = diario.get_text() 26 | 27 | if numero_oficial := soup.documento.metadatos.numero_oficial: 28 | metadata_dict["numero_oficial"] = numero_oficial.get_text() 29 | 30 | if departamento := soup.documento.metadatos.departamento: 31 | metadata_dict["departamento"] = departamento.get_text() 32 | 33 | if rango := soup.documento.metadatos.rango: 34 | metadata_dict["rango"] = rango.get_text() 35 | 36 | if titulo := soup.documento.metadatos.titulo: 37 | metadata_dict["titulo"] = titulo.get_text() 38 | 39 | if url_pdf := soup.documento.metadatos.url_pdf: 40 | metadata_dict["url_pdf"] = url_pdf.get_text() 41 | 42 | if origen_legislativo := soup.documento.metadatos.origen_legislativo: 43 | metadata_dict["origen_legislativo"] = origen_legislativo.get_text() 44 | 45 | if fecha_publicacion := soup.documento.metadatos.fecha_publicacion: 46 | metadata_dict["fecha_publicacion"] = fecha_publicacion.get_text() 47 | 48 | if fecha_disposicion := soup.documento.metadatos.fecha_disposicion: 49 | metadata_dict["fecha_disposicion"] = fecha_disposicion.get_text() 50 | 51 | metadata_dict["anio"] = datetime.strptime(fecha_publicacion.get_text(), "%Y%m%d").strftime("%Y") 52 | 53 | metadata_dict["mes"] = datetime.strptime(fecha_publicacion.get_text(), "%Y%m%d").strftime("%m") 54 | 55 | metadata_dict["dia"] = datetime.strptime(fecha_publicacion.get_text(), "%Y%m%d").strftime("%d") 56 | 57 | # Analisis 58 | if observaciones := soup.documento.analisis.observaciones: 59 | metadata_dict["observaciones"] = observaciones.get_text() 60 | 61 | if ambito_geografico := soup.documento.analisis.ambito_geografico: 62 | metadata_dict["ambito_geografico"] = ambito_geografico.get_text() 63 | 64 | if modalidad := 
soup.documento.analisis.modalidad: 65 | metadata_dict["modalidad"] = modalidad.get_text() 66 | 67 | if tipo := soup.documento.analisis.tipo: 68 | metadata_dict["tipo"] = tipo.get_text() 69 | 70 | metadata_dict["materias"] = [ 71 | materia.get_text() for materia in soup.select("documento > analisis > materias > materia") 72 | ] 73 | metadata_dict["alertas"] = [alerta.get_text() for alerta in soup.select("documento > analisis > alertas > alerta")] 74 | metadata_dict["notas"] = [nota.get_text() for nota in soup.select("documento > analisis > notas > nota")] 75 | metadata_dict["ref_posteriores"] = [ 76 | BOEMetadataReferencia( 77 | id=ref["referencia"], 78 | palabra=ref.palabra.get_text(), 79 | texto=ref.texto.get_text(), 80 | ) 81 | for ref in soup.select("documento > analisis > referencias > posteriores > posterior") 82 | ] 83 | metadata_dict["ref_anteriores"] = [ 84 | BOEMetadataReferencia( 85 | id=ref["referencia"], 86 | palabra=ref.palabra.get_text(), 87 | texto=ref.texto.get_text(), 88 | ) 89 | for ref in soup.select("documento > analisis > referencias > anteriores > anterior") 90 | ] 91 | return metadata_dict 92 | 93 | 94 | def _list_links_day(url: str) -> tp.List[str]: 95 | """Get a list of links in a BOE url day filtering by Seccion 1 and Seccion T. 96 | 97 | :param url: day url link. Example: https://www.boe.es/diario_boe/xml.php?id=BOE-S-20230817 98 | :return: list of id documents to explore (links) 99 | """ 100 | logger = lg.getLogger(_list_links_day.__name__) 101 | logger.info("Scrapping day: %s", url) 102 | session = create_retry_session(retries=5) 103 | response = session.get(url, timeout=10) 104 | response.raise_for_status() 105 | soup = BeautifulSoup(response.text, "lxml") 106 | id_links = [ 107 | url.text.split("?id=")[-1] 108 | for section in soup.find_all( 109 | lambda tag: tag.name == "seccion" 110 | and "num" in tag.attrs 111 | and (tag.attrs["num"] == "1" or tag.attrs["num"] == "T") # Note: Sección 1 and Tribunal Supremo 112 | ) 113 | for url in section.find_all("urlxml") 114 | ] 115 | logger.info("Scrapped day successfully %s (%s BOE documents)", url, len(id_links)) 116 | return id_links 117 | 118 | 119 | class BOEScrapper(BaseScrapper): 120 | def download_day(self, day: date) -> tp.List[BOEMetadataDocument]: 121 | """Download all the documents for a specific date.""" 122 | logger = lg.getLogger(self.download_day.__name__) 123 | logger.info("Downloading BOE content for day %s", day) 124 | day_str = day.strftime("%Y%m%d") 125 | day_url = f"https://www.boe.es/diario_boe/xml.php?id=BOE-S-{day_str}" 126 | metadata_documents = [] 127 | try: 128 | id_links = _list_links_day(day_url) 129 | for id_link in id_links: 130 | url_document = f"https://www.boe.es/diario_boe/xml.php?id={id_link}" 131 | try: 132 | metadata_doc = self.download_document(url_document) 133 | metadata_documents.append(metadata_doc) 134 | except HTTPError: 135 | logger.error("Not scrapped document %s on day %s", url_document, day_url) 136 | except AttributeError: 137 | logger.error("Not scrapped document %s on day %s", url_document, day_url) 138 | except HTTPError: 139 | logger.error("Not scrapped document on day %s", day_url) 140 | logger.info("Downloaded BOE content for day %s", day) 141 | return metadata_documents 142 | 143 | def download_document(self, url: str) -> BOEMetadataDocument: 144 | """Get text and metadata from a BOE xml url document. 145 | 146 | :param url: document url link. 
Examples: 147 | * https://www.boe.es/diario_boe/xml.php?id=BOE-A-2022-14630 148 | * https://www.boe.es/diario_boe/xml.php?id=BOE-A-2023-12203 149 | :return: document with metadata and filepath with text content 150 | """ 151 | logger = lg.getLogger(self.download_document.__name__) 152 | logger.info("Scrapping document: %s", url) 153 | session = create_retry_session(retries=5) 154 | response = session.get(url, timeout=10) 155 | response.raise_for_status() 156 | soup = BeautifulSoup(response.text, "lxml") 157 | with tempfile.NamedTemporaryFile("w", delete=False) as fn: 158 | text = soup.select_one("documento > texto").get_text() 159 | fn.write(text) 160 | metadata_doc = BOEMetadataDocument(filepath=fn.name, **_extract_metadata(soup)) 161 | logger.info("Scrapped document successfully %s", url) 162 | return metadata_doc 163 | -------------------------------------------------------------------------------- /src/etls/bocm/scrapper.py: -------------------------------------------------------------------------------- 1 | import logging as lg 2 | import tempfile 3 | import typing as tp 4 | from datetime import date, datetime 5 | import re 6 | 7 | import requests 8 | from bs4 import BeautifulSoup 9 | from requests.exceptions import HTTPError 10 | 11 | from src.etls.bocm.metadata import BOCMMetadataDocument 12 | from src.etls.bocm.utils import * 13 | from src.etls.common.scrapper import BaseScrapper 14 | from src.initialize import initialize_logging 15 | from src.etls.utils import create_retry_session 16 | 17 | 18 | initialize_logging() 19 | 20 | 21 | # transformation from url retrieve from redirection to one pointing to complete summary 22 | def _adapt_link_to_complete_summary(url: str) -> str: 23 | """Get complete summary url transforming the url received by param 24 | 25 | :url: url to transform. Example : https://www.bocm.es/boletin/bocm-20240126-22 26 | :return: summary of the day url. Example: https://www.bocm.es/boletin-completo/BOCM-20240126/22 27 | """ 28 | tmp_str = url.replace("boletin", "boletin-completo").replace("/bocm", "/BOCM") 29 | res = re.sub(r"(\d)-(\d+)", r"\1/\2", tmp_str) 30 | return res 31 | 32 | 33 | # get url from response redirection 34 | def _get_summary_link_from_date(day: date) -> str: 35 | """Get summary url from response redirection 36 | 37 | :day: day format for request param: '%d/%m/%Y' 38 | :return: summary of the day url 39 | """ 40 | logger = lg.getLogger(_get_summary_link_from_date.__name__) 41 | 42 | search_url = "https://www.bocm.es/search-day-month" 43 | 44 | try: 45 | response = requests.post(search_url, data={"field_date[date]": day}) 46 | response.raise_for_status() 47 | link = response.headers["Link"].split(";")[0].replace("<", "").replace(">", "") 48 | if re.search("search-day-month", link): 49 | raise ValueError("No link published") 50 | else: 51 | final_url = _adapt_link_to_complete_summary(link) 52 | 53 | except HTTPError: 54 | logger.error("No link got on day %s", day) 55 | final_url = None 56 | 57 | except ValueError as err: 58 | logger.error("%s for day %s. 
Skipping...", err.args[0], day)
59 |         final_url = None
60 | 
61 |     return final_url
62 | 
63 | 
64 | def _extract_metadata(soup) -> tp.Dict:
65 |     metadata_dict = {}
66 | 
67 |     # Metadata from head tags
68 |     fecha_publicacion, cve, html_link = metadata_from_head_tags(soup)
69 | 
70 |     # Metadata from the document header
71 |     numero_oficial, seccion_normalizada, paginas, pdf_link = metadata_from_doc_header(soup)
72 | 
73 |     # Metadata from document
74 |     seccion = seccion_normalizada.split(".")[0]
75 |     subseccion, apartado, tipo, organo, anunciante, rango = metadata_from_doc(soup, seccion, cve)
76 | 
77 |     metadata_dict["rango"] = rango
78 |     metadata_dict["identificador"] = cve
79 |     metadata_dict["numero_oficial"] = numero_oficial
80 |     metadata_dict["paginas"] = paginas
81 | 
82 |     # departamento always matches organo
83 |     metadata_dict["departamento"] = organo
84 | 
85 |     metadata_dict["seccion_normalizada"] = seccion_normalizada
86 |     metadata_dict["seccion"] = seccion.upper()
87 |     metadata_dict["subseccion"] = subseccion
88 |     metadata_dict["tipo"] = tipo
89 |     metadata_dict["apartado"] = apartado
90 | 
91 |     metadata_dict["titulo"] = cve
92 |     metadata_dict["url_pdf"] = pdf_link
93 |     metadata_dict["url_html"] = html_link
94 | 
95 |     metadata_dict["fecha_publicacion"] = fecha_publicacion
96 |     metadata_dict["fecha_disposicion"] = fecha_publicacion
97 | 
98 |     metadata_dict["anio"] = datetime.strptime(fecha_publicacion, "%Y-%m-%d").strftime("%Y")
99 | 
100 |     metadata_dict["mes"] = datetime.strptime(fecha_publicacion, "%Y-%m-%d").strftime("%m")
101 | 
102 |     metadata_dict["dia"] = datetime.strptime(fecha_publicacion, "%Y-%m-%d").strftime("%d")
103 | 
104 |     return metadata_dict
105 | 
106 | 
107 | def _list_links_day(url: str) -> tp.List[str]:
108 |     """Get a list of links in a BOCM day summary url, filtering by Seccion 1-A, 3 and 4.
109 | 
110 |     :param url: summary url link.
Example: https://www.bocm.es/boletin-completo/BOCM-20240103/2 111 | :return: list of urls filtered by sections to download 112 | """ 113 | logger = lg.getLogger(_list_links_day.__name__) 114 | 115 | logger.info("Scrapping day: %s", url) 116 | response = requests.get(url) 117 | response.raise_for_status() 118 | soup = BeautifulSoup(response.text, features="lxml") 119 | 120 | # filter by sections 121 | sections_to_filter = ["1-A", "3-", "4-"] 122 | filtered_links = filter_links_by_section(soup, sections_to_filter) 123 | logger.info("Scrapped day successfully %s (%s BOCM documents)", url, len(filtered_links)) 124 | 125 | return filtered_links 126 | 127 | 128 | class BOCMScrapper(BaseScrapper): 129 | def download_day(self, day: date) -> tp.List[BOCMMetadataDocument]: 130 | """Download all the documents for a specific date.""" 131 | logger = lg.getLogger(self.download_day.__name__) 132 | logger.info("Downloading BOCM content for day %s", day) 133 | day_str = day.strftime("%d/%m/%Y") 134 | 135 | summary_url = _get_summary_link_from_date(day_str) 136 | 137 | metadata_documents = [] 138 | if summary_url is not None: 139 | logger.info("Got summary url for day %s", day) 140 | logger.info("URL: [%s] for selected day [%s]", summary_url, day) 141 | 142 | try: 143 | list_urls = _list_links_day(summary_url) 144 | for url in list_urls: 145 | try: 146 | # Skip urls that contains in the path 'boletin' 147 | if not re.search("boletin", url): 148 | metadata_doc = self.download_document(url) 149 | metadata_documents.append(metadata_doc) 150 | except HTTPError: 151 | logger.error("Not scrapped document %s on day %s", url, day) 152 | except AttributeError: 153 | logger.error("Not scrapped document %s on day %s", url, day) 154 | except HTTPError: 155 | logger.error("Not scrapped document %s on day %s", url, day) 156 | logger.info("Downloaded all BOCM docs for day %s", day) 157 | return metadata_documents 158 | 159 | def download_document(self, url: str) -> BOCMMetadataDocument: 160 | """Get text and metadata from BOCM summary html url document. 161 | 162 | :param url: document url link. 
Examples: 163 | * https://www.bocm.es/bocm-20240123-76 164 | * https://www.bocm.es/bocm-20240123-98 165 | :return: document with metadata and filepath with text content 166 | """ 167 | logger = lg.getLogger(self.download_document.__name__) 168 | logger.info("Scrapping document: %s", url) 169 | session = create_retry_session(retries=5) 170 | response = session.get(url, timeout=10) 171 | response.raise_for_status() 172 | soup = BeautifulSoup(response.text, features="lxml") 173 | with tempfile.NamedTemporaryFile("w", delete=False) as fn: 174 | text = soup.select_one("#main").get_text() 175 | text_cleaned = clean_text(text) 176 | fn.write(text_cleaned) 177 | metadata_doc = BOCMMetadataDocument(filepath=fn.name, **_extract_metadata(soup)) 178 | logger.info("Scrapped document successfully %s", url) 179 | return metadata_doc 180 | -------------------------------------------------------------------------------- /src/etls/boa/scrapper.py: -------------------------------------------------------------------------------- 1 | import logging as lg 2 | import tempfile 3 | import typing as tp 4 | from datetime import date, datetime 5 | import random 6 | import json 7 | from lxml import etree 8 | 9 | import requests 10 | 11 | from src.etls.boa.metadata import BOAMetadataDocument 12 | from src.etls.common.metadata import MetadataDocument 13 | from src.etls.common.scrapper import BaseScrapper 14 | from src.etls.common.utils import ScrapperError 15 | from src.initialize import initialize_logging 16 | from src.etls.utils import create_retry_session 17 | 18 | 19 | initialize_logging() 20 | 21 | 22 | def _remove_html_tags(text: str) -> str: 23 | parser = etree.HTMLParser() 24 | tree = etree.fromstring(text, parser) 25 | clean_text = etree.tostring(tree, encoding="unicode", method='text') 26 | return clean_text.strip() 27 | 28 | 29 | def _extract_metadata(doc: dict) -> tp.Dict: 30 | metadata_dict = {} 31 | 32 | try: 33 | metadata_dict["identificador"] = doc["DOCN"] 34 | except KeyError: 35 | pass 36 | 37 | try: 38 | metadata_dict["numero_boletin"] = doc["Numeroboletin"] 39 | except KeyError: 40 | pass 41 | 42 | try: 43 | metadata_dict["departamento"] = doc["Emisor"].capitalize() 44 | except KeyError: 45 | pass 46 | 47 | try: 48 | metadata_dict["url_pdf"] = doc["UrlPdf"].split('´`')[0][1:] 49 | except KeyError: 50 | pass 51 | 52 | try: 53 | metadata_dict["url_boletin"] = doc["UrlBCOM"].split('´`')[0][1:] 54 | except KeyError: 55 | pass 56 | 57 | try: 58 | metadata_dict["seccion"] = doc["Seccion"] 59 | except KeyError: 60 | pass 61 | 62 | try: 63 | metadata_dict["titulo"] = doc["Titulo"] 64 | except KeyError: 65 | pass 66 | 67 | try: 68 | metadata_dict["subseccion"] = doc["Subseccion"] 69 | except KeyError: 70 | pass 71 | 72 | try: 73 | metadata_dict["codigo_materia"] = doc["CodigoMateria"] 74 | except KeyError: 75 | pass 76 | 77 | try: 78 | metadata_dict["rango"] = doc["Rango"].capitalize() 79 | except KeyError: 80 | pass 81 | 82 | try: 83 | fecha_disposicion = datetime.strptime(doc["Fechadisposicion"], "%Y%m%d").strftime("%Y-%m-%d") 84 | metadata_dict["fecha_disposicion"] = fecha_disposicion 85 | except KeyError: 86 | pass 87 | 88 | return metadata_dict 89 | 90 | 91 | class BOAScrapper(BaseScrapper): 92 | def __init__(self): 93 | self.base_url = "https://www.boa.aragon.es/cgi-bin/EBOA/BRSCGI" 94 | self.user_agents = [ 95 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36", 96 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 
(KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36", 97 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", 98 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", 99 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", 100 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15", 101 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15" 102 | ] 103 | self.headers = { 104 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", 105 | "Accept-Language": "es-ES,es;q=0.9,en;q=0.8", 106 | "Connection": "keep-alive", 107 | "User-Agent": random.choice(self.user_agents), 108 | } 109 | 110 | 111 | def download_day(self, day: date) -> tp.List[BOAMetadataDocument]: 112 | """Download all the documents for a specific date.""" 113 | try: 114 | logger = lg.getLogger(self.download_day.__name__) 115 | logger.info("Downloading BOA content for day %s", day) 116 | params ={'CMD': 'VERLST', 117 | 'BASE': 'BZHT', 118 | 'DOCS': '1-250', 119 | 'SEC': 'OPENDATABOAJSONAPP', 120 | 'OUTPUTMODE': 'JSON', 121 | 'SEPARADOR':'', 122 | 'PUBL-C': day.strftime("%Y%m%d"), 123 | 'SECC-C':'BOA%2Bo%2BDisposiciones%2Bo%2BJusticia' 124 | # versión completa (todas las secciones, incluyendo personal, etc): 125 | # 'SECC-C':'BOA%2Bo%2BDisposiciones%2Bo%2BPersonal%2Bo%2BAcuerdos%2Bo%2BJusticia%2Bo%2BAnuncios' 126 | } 127 | session = create_retry_session(retries=5) 128 | response = session.get(self.base_url, params=params, timeout=10) 129 | raw_result = response.text 130 | if 'No se han recuperado documentos' in raw_result: 131 | logger.info(f"No hay contenido disponible para el día {day}") 132 | return [] 133 | if response.status_code != 200: 134 | response.raise_for_status() 135 | raw_result = raw_result.replace('\\', '\\\\') 136 | result_json = json.loads(raw_result) 137 | disposiciones = [] 138 | for doc in result_json: 139 | metadata_doc = self.download_document(json.dumps(doc)) 140 | fecha_publicacion_atributos = { 141 | "fecha_publicacion": day.strftime("%Y-%m-%d"), 142 | "anio": day.strftime("%Y"), 143 | "mes": day.strftime("%m"), 144 | "dia": day.strftime("%d"), 145 | } 146 | for atributo, valor in fecha_publicacion_atributos.items(): 147 | setattr(metadata_doc, atributo, valor) 148 | disposiciones.append(metadata_doc) 149 | return disposiciones 150 | except requests.exceptions.RequestException as e: 151 | raise Exception(f"Error de red o HTTP al intentar acceder a {self.base_url}: {e}") 152 | except Exception as e: 153 | raise Exception(f"Error inesperado: {e}") 154 | 155 | 156 | def download_document(self, url: str) -> MetadataDocument: 157 | ''' 158 | En BOAScrapper, a partir de la url diaria (en la función download_day), 159 | se recibe directamente el contenido de todos los boletines. Por lo tanto, 160 | no hace falta scrapear cada uno de las publicaciones a partir de su url. 
161 | 162 | Para ser consistentes con el resto de scrappers, se mantiene el método 163 | download_document, pero en este caso en vez de la url se le pasará una 164 | string con el contenido y los metadatos, tal y como se recibe de la base_url 165 | ''' 166 | 167 | logger = lg.getLogger(self.download_document.__name__) 168 | doc = json.loads(url) 169 | url_pdf_raw = doc['UrlPdf'] 170 | url_pdf = url_pdf_raw.split('´`')[0][1:] 171 | logger.info("Scrapping document: %s", url_pdf) 172 | content = doc['Texto'] 173 | clean_text = _remove_html_tags(content) 174 | with tempfile.NamedTemporaryFile("w", delete=False, encoding='utf-8') as fn: 175 | fn.write(clean_text) 176 | try: 177 | metadata_doc = BOAMetadataDocument(filepath=fn.name,**_extract_metadata(doc)) 178 | except: 179 | raise ScrapperError("No se pudo encontrar alguno de los elementos requeridos.") 180 | logger.info("Scrapped document successfully %s", url_pdf) 181 | return metadata_doc 182 | -------------------------------------------------------------------------------- /src/service/main.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging as lg 3 | import time 4 | import uuid 5 | import os 6 | import typing as tp 7 | import ipaddress 8 | 9 | import httpx 10 | from fastapi import FastAPI 11 | 12 | from src.initialize import initialize_app, initialize_logging 13 | from src.utils import inject_additional_attributes, timeit 14 | from langtrace_python_sdk import SendUserFeedback, langtrace 15 | from langtrace_python_sdk.utils.with_root_span import with_langtrace_root_span 16 | 17 | langtrace.init(api_key=os.environ.get('LANGTRACE_API_KEY')) 18 | initialize_logging() 19 | 20 | APP = FastAPI() 21 | 22 | INIT_OBJECTS = initialize_app() 23 | 24 | DEFAULT_INPUT_QUERY = ( 25 | "¿Es de aplicación la ley de garantía integral de la libertad sexual a niños (varones) menores de edad " 26 | "víctimas de violencias sexuales o solo a niñas y mujeres?" 
27 | ) 28 | DEFAULT_COLLECTION_NAME = "justicio" 29 | 30 | 31 | @with_langtrace_root_span() 32 | async def call_llm_api(span_id, trace_id, model_name: str, messages: tp.List[tp.Dict[str, str]]): 33 | response = await INIT_OBJECTS.openai_client.chat.completions.create( 34 | model=model_name, 35 | messages=messages, 36 | temperature=INIT_OBJECTS.config_loader["temperature"], 37 | seed=INIT_OBJECTS.config_loader["seed"], 38 | max_tokens=INIT_OBJECTS.config_loader["max_tokens"], 39 | ) 40 | return response, span_id, trace_id 41 | 42 | 43 | @APP.get("/healthcheck") 44 | @timeit 45 | async def healthcheck(): 46 | """Asynchronous Health Check""" 47 | # TODO: healthcheck with embeddings db api and llm api 48 | return {"status": "OK"} 49 | 50 | 51 | @APP.get("/semantic_search") 52 | @timeit 53 | async def semantic_search(input_query: str = DEFAULT_INPUT_QUERY, collection_name: str = DEFAULT_COLLECTION_NAME): 54 | logger = lg.getLogger(semantic_search.__name__) 55 | logger.info(input_query) 56 | docs = await INIT_OBJECTS.vector_store[collection_name].asimilarity_search_with_score( 57 | query=input_query, k=INIT_OBJECTS.config_loader["top_k_results"] 58 | ) 59 | logger.info(docs) 60 | return docs 61 | 62 | 63 | @APP.get("/semantic_search_tavily") 64 | @timeit 65 | async def semantic_search_tavily(input_query: str = DEFAULT_INPUT_QUERY): 66 | logger = lg.getLogger(semantic_search_tavily.__name__) 67 | logger.info(input_query) 68 | docs = INIT_OBJECTS.tavily_client.search( 69 | query=input_query, 70 | search_depth="advanced", 71 | include_domains=["https://www.boe.es/"], 72 | max_results=10, 73 | topic="general", 74 | include_raw_content=False, 75 | include_answer=False, 76 | ) 77 | logger.info(docs) 78 | return docs 79 | 80 | 81 | async def a_request_get(url): 82 | """Requests for sync/async load tests""" 83 | async with httpx.AsyncClient(timeout=10.0) as client: 84 | response = await client.get(url) 85 | return response.text 86 | 87 | 88 | @APP.get("/qa_feedback") 89 | @with_langtrace_root_span("Feedback") 90 | @timeit 91 | async def qa_feedback(span_id: str, trace_id: str, user_score: int): 92 | data = { 93 | "spanId": span_id, "traceId": trace_id, "userScore": user_score, "userId": None 94 | } 95 | SendUserFeedback().evaluate(data=data) 96 | return {"feedback": "OK"} 97 | 98 | 99 | @APP.get("/qa") 100 | @with_langtrace_root_span("RAG Justicio") 101 | @timeit 102 | async def qa( 103 | input_query: str = DEFAULT_INPUT_QUERY, 104 | collection_name: str = DEFAULT_COLLECTION_NAME, 105 | model_name: str = INIT_OBJECTS.config_loader["llm_model_name"], 106 | input_original_query: str | None = None, 107 | ip_request_client: ipaddress.IPv4Address | None = None, 108 | ): 109 | logger = lg.getLogger(qa.__name__) 110 | logger.info(input_query) 111 | 112 | # Getting context from embedding database (Qdrant) 113 | docs = await INIT_OBJECTS.vector_store[collection_name].asimilarity_search_with_score( 114 | query=input_query, k=INIT_OBJECTS.config_loader["top_k_results"] 115 | ) 116 | 117 | # Generate response using a LLM (OpenAI) 118 | context_preprocessed = [{"context": doc[0].page_content, "score": doc[1]} for doc in docs] 119 | messages = [ 120 | {"role": "system", "content": INIT_OBJECTS.config_loader["prompt_system"]}, 121 | { 122 | "role": "system", 123 | "content": INIT_OBJECTS.config_loader["prompt_system_context"], 124 | }, 125 | {"role": "system", "content": "A continuación se proporciona el contexto:"}, 126 | {"role": "system", "content": str(context_preprocessed)}, 127 | { 128 | "role": "system", 129 | 
"content": "A continuación se proporciona la pregunta del usuario:", 130 | }, 131 | {"role": "user", "content": input_query}, 132 | ] 133 | # logger.info(messages) 134 | additional_attributes = { 135 | "db.collection.name": collection_name, 136 | "service.ip": ip_request_client, 137 | "llm.original_query": input_original_query 138 | } 139 | response, span_id, trace_id = await inject_additional_attributes( 140 | lambda: call_llm_api(model_name=model_name, messages=messages), additional_attributes 141 | ) 142 | answer = response.choices[0].message.content 143 | logger.info(answer) 144 | logger.info(response.usage) 145 | 146 | response_payload = dict( 147 | scoring_id=str(uuid.uuid4()), 148 | context=docs, 149 | answer=answer, 150 | span_id=str(span_id), 151 | trace_id=str(trace_id), 152 | ) 153 | return response_payload 154 | 155 | 156 | @APP.get("/qa_tavily") 157 | @timeit 158 | async def qa_tavily(input_query: str = DEFAULT_INPUT_QUERY): 159 | logger = lg.getLogger(qa_tavily.__name__) 160 | logger.info(input_query) 161 | 162 | # Getting context from internet browser (Tavily) 163 | docs = INIT_OBJECTS.tavily_client.search( 164 | query=input_query, 165 | search_depth="advanced", 166 | include_domains=["https://www.boe.es/"], 167 | max_results=10, 168 | topic="general", 169 | include_raw_content=False, 170 | include_answer=False, 171 | ) 172 | 173 | # Generate response using a LLM (OpenAI) 174 | context_preprocessed = [{"context": doc["content"], "score": doc["score"]} for doc in docs["results"]] 175 | 176 | response = await INIT_OBJECTS.openai_client.chat.completions.create( 177 | model=INIT_OBJECTS.config_loader["llm_model_name"], 178 | messages=[ 179 | {"role": "system", "content": INIT_OBJECTS.config_loader["prompt_system"]}, 180 | { 181 | "role": "system", 182 | "content": INIT_OBJECTS.config_loader["prompt_system_context"], 183 | }, 184 | {"role": "system", "content": "A continuación se proporciona el contexto:"}, 185 | {"role": "system", "content": str(context_preprocessed)}, 186 | { 187 | "role": "system", 188 | "content": "A continuación se proporciona la pregunta del usuario:", 189 | }, 190 | {"role": "user", "content": input_query}, 191 | ], 192 | temperature=INIT_OBJECTS.config_loader["temperature"], 193 | seed=INIT_OBJECTS.config_loader["seed"], 194 | max_tokens=INIT_OBJECTS.config_loader["max_tokens"], 195 | ) 196 | answer = response.choices[0].message.content 197 | logger.info(answer) 198 | logger.info(response.usage) 199 | 200 | response_payload = dict( 201 | scoring_id=str(uuid.uuid4()), 202 | context=docs, 203 | answer=answer, 204 | ) 205 | return response_payload 206 | 207 | 208 | @APP.get("/sleep") 209 | @timeit 210 | async def sleep(): 211 | time.sleep(5) 212 | return {"status": "OK"} 213 | 214 | 215 | @APP.get("/asleep") 216 | @timeit 217 | async def asleep(): 218 | await asyncio.sleep(5) 219 | return {"status": "OK"} 220 | -------------------------------------------------------------------------------- /src/etls/bopv/scrapper.py: -------------------------------------------------------------------------------- 1 | import logging as lg 2 | import tempfile 3 | import typing as tp 4 | from datetime import date 5 | import re 6 | import random 7 | 8 | import requests 9 | from bs4 import BeautifulSoup 10 | from urllib.parse import urljoin 11 | 12 | from src.etls.bopv.metadata import BOPVMetadataDocument 13 | from src.etls.common.scrapper import BaseScrapper 14 | from src.etls.common.utils import ScrapperError 15 | from src.initialize import initialize_logging 16 | from 
src.etls.utils import create_retry_session 17 | 18 | 19 | initialize_logging() 20 | 21 | def clean_text(text: str) -> str: 22 | cleaned = re.sub(r"(\xa0|\t+|\n+)", " ", text, flags=re.MULTILINE) 23 | return cleaned 24 | 25 | 26 | class BOPVScrapper(BaseScrapper): 27 | def __init__(self): 28 | self.base_url = "https://www.euskadi.eus/bopv2/datos/" 29 | self.boletin_url_base = "https://www.euskadi.eus/web01-bopv/es/bopv2/datos/" 30 | self.user_agents = [ 31 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36", 32 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36", 33 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", 34 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", 35 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", 36 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15", 37 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15" 38 | ] 39 | self.headers = { 40 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", 41 | "Accept-Language": "es-ES,es;q=0.9,en;q=0.8", 42 | "Connection": "keep-alive", 43 | "User-Agent": random.choice(self.user_agents), 44 | } 45 | 46 | def _get_boletin_url(self, date: date, enlace_dia: str) -> str: 47 | """Generates a bulletin URL for a given date and link day.""" 48 | return f"{self.boletin_url_base}{date.year}/{date.strftime('%m')}/{enlace_dia}" 49 | 50 | def _get_monthly_url(self, date: date) -> str: 51 | """Generates a monthly URL for a given date.""" 52 | month_year = date.strftime("%m%Y") 53 | return f"{self.boletin_url_base}{month_year}.shtml" 54 | 55 | def _get_summary_link_from_date(self, requested_date: date): 56 | url = self._get_monthly_url(requested_date) 57 | try: 58 | response = requests.get(url, headers=self.headers) 59 | response.raise_for_status() 60 | html = response.text 61 | dias_habilitados_pattern = re.compile(r"var diasHabilitados = (\[.*?\]);") 62 | enlaces_pattern = re.compile(r"var enlaces = (\[.*?\]);") 63 | dias_habilitados_match = dias_habilitados_pattern.search(html) 64 | enlaces_match = enlaces_pattern.search(html) 65 | 66 | if dias_habilitados_match and enlaces_match: 67 | dias_habilitados = eval(dias_habilitados_match.group(1)) 68 | enlaces = eval(enlaces_match.group(1)) 69 | requested_date_formatted = requested_date.strftime("%Y%m%d") 70 | if requested_date_formatted in dias_habilitados: 71 | index = dias_habilitados.index(requested_date_formatted) 72 | enlace = enlaces[index] 73 | if isinstance(enlace, list): 74 | enlace = enlace[0] 75 | final_url = self._get_boletin_url(requested_date, enlace) 76 | return final_url 77 | else: 78 | return None 79 | except requests.HTTPError as err: 80 | raise ValueError(f"Error en la solicitud HTTP: {err}") 81 | except ValueError as err: 82 | raise ValueError(f"Error en la solicitud HTTP: {err}") 83 | 84 | def download_day(self, day: date) -> tp.List[BOPVMetadataDocument]: 85 | """Download all the documents for a specific date.""" 86 | try: 87 | logger = lg.getLogger(self.download_day.__name__) 88 | logger.info("Downloading BOPV content for 
day %s", day)
89 |             summary_link = self._get_summary_link_from_date(day)
90 |             if summary_link is None:
91 |                 logger.info(f"No hay contenido disponible para el día {day}")
92 |                 return []
93 |             response = requests.get(summary_link)
94 |             if response.status_code != 200:
95 |                 response.raise_for_status()
96 |             disposiciones = []
97 |             soup = BeautifulSoup(response.content, 'html.parser')
98 |             txt_blocks = soup.find_all('div', class_='txtBloque')
99 |             for block in txt_blocks:
100 |                 titulo = block.find('p', class_='BOPVSumarioTitulo')
101 |                 if not titulo or not titulo.find('a'):
102 |                     raise ScrapperError("No se pudo encontrar el título o el enlace en uno de los bloques.")
103 |                 href = titulo.find('a')['href']
104 |                 url_disposicion = summary_link.rsplit('/', 1)[0] + '/' + href
105 |                 document_data = self.download_document(url_disposicion)
106 |                 if document_data:
107 |                     disposition_summary = {
108 |                         "titulo": titulo.text.strip(),
109 |                         "url_html": url_disposicion,
110 |                         "url_boletin": summary_link,
111 |                         "fecha_disposicion": day.strftime("%Y-%m-%d"),
112 |                         "anio": str(day.year),
113 |                         "mes": str(day.month),
114 |                         "dia": str(day.day),
115 |                     }
116 |                     for atributo, valor in disposition_summary.items():
117 |                         setattr(document_data, atributo, valor)
118 |                     disposiciones.append(document_data)
119 |             return disposiciones
120 |         except requests.exceptions.RequestException as e:
121 |             raise Exception(f"Error de red o HTTP al intentar acceder a {summary_link}: {e}")
122 |         except Exception as e:
123 |             raise Exception(f"Error inesperado: {e}")
124 | 
125 |     def download_document(self, url: str) -> BOPVMetadataDocument:
126 |         """
127 |         Extracts the content, the issuing body, and the PDF URL of a specific disposition from BOPV given its URL.
128 | 
129 |         :param url: The full URL of the disposition from which the content and PDF URL are to be extracted.
130 |             Example: "https://www.euskadi.eus/web01-bopv/es/bopv2/datos/2024/01/2400001a.shtml"
131 |         :return: A BOPVMetadataDocument containing the content of the disposition, the name of the issuing body, and the PDF URL.
132 |             If the content, the issuing body, or the PDF URL is not found, it returns empty strings for those values.
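        Example usage (an illustrative sketch, assuming the example URL above is still
        published and that the module is importable as src.etls.bopv.scrapper):

            scrapper = BOPVScrapper()
            doc = scrapper.download_document(
                "https://www.euskadi.eus/web01-bopv/es/bopv2/datos/2024/01/2400001a.shtml"
            )
            # doc is a BOPVMetadataDocument, or None when the disposition is outside the
            # 'DISPOSICIONES GENERALES' / 'OTRAS DISPOSICIONES' sections.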
133 | """ 134 | logger = lg.getLogger(self.download_document.__name__) 135 | logger.info("Scrapping document: %s", url) 136 | try: 137 | session = create_retry_session(retries=5) 138 | response = session.get(url, headers=self.headers, timeout=10) 139 | if response.status_code != 200: 140 | response.raise_for_status() 141 | soup = BeautifulSoup(response.content, "html.parser") 142 | seccion_tag = soup.find("h4", class_="BOPVSeccion") 143 | if not seccion_tag: 144 | raise ScrapperError("No se pudo encontrar la sección requerida.") 145 | 146 | seccion_text = seccion_tag.get_text(strip=True).upper() 147 | if seccion_text not in ['DISPOSICIONES GENERALES', 'OTRAS DISPOSICIONES']: 148 | return 149 | tipologia = seccion_tag.get_text(strip=True) 150 | organismo_tag = soup.find("h5", class_="BOPVOrganismo") 151 | content_block = soup.find("div", class_="colCentralinterior") 152 | pdf_link_tag = soup.find("li", class_="formatoPdf").find('a') 153 | 154 | if not organismo_tag or not content_block or not pdf_link_tag: 155 | raise ScrapperError("No se pudo encontrar algunos de los elementos requeridos.") 156 | 157 | organismo = organismo_tag.get_text(strip=True) if organismo_tag else "" 158 | base_url = url.rsplit('/', 1)[0] + '/' 159 | pdf_href = pdf_link_tag.get('href') if pdf_link_tag else "" 160 | pdf_url = urljoin(base_url, pdf_href) 161 | paragraphs = content_block.find_all("p", class_=re.compile(r"BOPV(Detalle|Titulo|FirmaLugFec|FirmaPuesto|FirmaNombre)")) 162 | content_paragraphs = [p.get_text(strip=True) for p in paragraphs] 163 | additional_elements = content_block.find_all(["h5", "div"], class_=re.compile(r"BOPV(Titulo|FirmaLugFec|FirmaPuesto|FirmaNombre)")) 164 | content_additional = [elem.get_text(strip=True) for elem in additional_elements] 165 | content = "\n".join(content_paragraphs + content_additional) 166 | 167 | with tempfile.NamedTemporaryFile("w", delete=False) as fn: 168 | text_cleaned = clean_text(content) 169 | fn.write(text_cleaned) 170 | metadata_doc = BOPVMetadataDocument(**{"filepath": fn.name, 171 | "identificador": '/'.join(url.split('.')[-2].split("/")[-3:]), 172 | "departamento": organismo, 173 | "url_pdf": pdf_url, 174 | "tipologia": tipologia, 175 | }) 176 | logger.info("Scrapped document successfully %s", url) 177 | return metadata_doc 178 | 179 | except requests.exceptions.RequestException as e: 180 | raise Exception(f"Error de red o HTTP al intentar acceder a {url}: {e}") 181 | except Exception as e: 182 | raise Exception(f"Error de red o HTTP al intentar acceder a {url}: {e}") -------------------------------------------------------------------------------- /src/etls/boja/scrapper.py: -------------------------------------------------------------------------------- 1 | import logging as lg 2 | import tempfile 3 | import typing as tp 4 | from datetime import date 5 | import re 6 | 7 | from src.etls.boja.metadata import BOJAMetadataDocument 8 | from src.etls.common.scrapper import BaseScrapper 9 | from src.etls.common.utils import ScrapperError, HTTPRequester 10 | from src.etls.boja.utils import mes_a_numero, clean_text 11 | from src.initialize import initialize_logging 12 | 13 | 14 | initialize_logging() 15 | 16 | def clean_text(text: str) -> str: 17 | cleaned = re.sub(r"(\xa0|\t+|\n+)", " ", text, flags=re.MULTILINE) 18 | return cleaned 19 | 20 | 21 | class BOJAScrapper(BaseScrapper): 22 | def __init__(self): 23 | self.base_url = "https://www.juntadeandalucia.es/" 24 | 25 | @staticmethod 26 | def check_extraordinary_boja(url): 27 | return re.match(r".*/\d{8}\.html$", 
url) is not None 28 | 29 | @staticmethod 30 | def extract_bojas_from_extraordinary(url): 31 | urls_bojas = [] 32 | soup = HTTPRequester.get_soup(url) 33 | try: 34 | uls = soup.find_all('ul', class_="mt-4 pl-3") 35 | for ul in uls: 36 | links = ul.find_all('a', href=True) 37 | for link in links: 38 | href = link.get('href') 39 | es_extraordinario = "extraordinario" in link.text.lower() 40 | urls_bojas.append((href, es_extraordinario)) 41 | 42 | return urls_bojas 43 | except Exception as e: 44 | raise Exception(f"Error inesperado: {e}") 45 | 46 | @staticmethod 47 | def find_disposiciones(url_boletin): 48 | enlaces_html = [] 49 | enlaces_finales = [] 50 | soup = HTTPRequester.get_soup(url_boletin) 51 | try: 52 | listado_principal = soup.find('ol', class_='listado_ordenado_boja raiz') 53 | if not listado_principal: 54 | listado_principal = soup.find(['ol', 'ul'], class_=['listado_ordenado_boja','listado_ordenado']) 55 | if listado_principal: 56 | items_a = listado_principal.find_all('a') 57 | for item in items_a: 58 | if re.search(r'\b(Disposiciones Generales|Otras Disposiciones)\b', item.text, re.IGNORECASE): 59 | enlaces_html.append(item.get('href')) 60 | for enlace in enlaces_html: 61 | soup_intermedio = HTTPRequester.get_soup(enlace) 62 | enlaces_intermedios = soup_intermedio.find_all('a', class_='item_html', title=re.compile("Versión HTML CVE")) 63 | enlaces_intermedios += soup_intermedio.find_all('a', title="Ver disposición") 64 | for enlace_final in enlaces_intermedios: 65 | enlaces_finales.append(enlace_final.get('href')) 66 | else: 67 | raise ScrapperError("No se encontró el listado ordenado con las clases especificadas.") 68 | except Exception as e: 69 | raise Exception(f"Error inesperado: {e}") 70 | return enlaces_finales 71 | 72 | def _get_summary_link_from_date(self, fecha_busqueda): 73 | url = f"{self.base_url}/{'eboja' if fecha_busqueda.year >= 2012 else 'boja'}/{fecha_busqueda.year}" 74 | soup = HTTPRequester.get_soup(url) 75 | try: 76 | tablas_calendario = soup.find_all('table', class_='calendario_tabla') 77 | for tabla in tablas_calendario: 78 | summary_text = tabla.get('summary', '') 79 | mes_año_match = re.search(r"Boletines del mes de (\w+) de (\d{4})", summary_text) 80 | if mes_año_match: 81 | mes = mes_año_match.group(1) 82 | año = mes_año_match.group(2) 83 | enlaces = tabla.find_all('a') 84 | for enlace in enlaces: 85 | href = enlace.get('href') 86 | dia = enlace.text.strip() 87 | fecha_iso = f"{año}-{mes_a_numero(mes):02d}-{int(dia):02d}" 88 | if fecha_iso == fecha_busqueda.strftime('%Y-%m-%d'): 89 | if BOJAScrapper.check_extraordinary_boja(href): 90 | urls_bojas = BOJAScrapper.extract_bojas_from_extraordinary(href) 91 | enlaces_extraordinarios = [] 92 | for url_boja, es_extraordinario in urls_bojas: 93 | enlaces_extraordinarios.append({ 94 | "url": url_boja, 95 | "fecha": fecha_iso, 96 | "extraordinario": es_extraordinario 97 | }) 98 | return enlaces_extraordinarios 99 | else: 100 | return [{ 101 | "url": href, 102 | "fecha": fecha_iso, 103 | "extraordinario": False 104 | }] 105 | except Exception as e: 106 | raise Exception(f"Error inesperado: {e}") 107 | 108 | def download_day(self, day: date) -> tp.List[BOJAMetadataDocument]: 109 | """Download all the documents for a specific date.""" 110 | logger = lg.getLogger(self.download_day.__name__) 111 | logger.info("Downloading BOJA content for day %s", day) 112 | try: 113 | disposiciones = [] 114 | lista_boletines = self._get_summary_link_from_date(day) 115 | if not lista_boletines: 116 | logger.info(f"No hay contenido 
disponible para el día {day}") 117 | return [] #None = para ese dia no hay boletín 118 | for boletin in lista_boletines: # Boletines. Si hay boletin extraordinario esto será 2 119 | for disposicion in BOJAScrapper.find_disposiciones(boletin['url']): 120 | document_data = self.download_document(disposicion) 121 | if document_data: 122 | disposition_summary = { 123 | "url_boletin": boletin['url'], 124 | "url_html": disposicion, 125 | "fecha_disposicion": day.strftime("%Y-%m-%d"), 126 | "anio": str(day.year), 127 | "mes": str(day.month), 128 | "dia": str(day.day), 129 | } 130 | for atributo, valor in disposition_summary.items(): 131 | setattr(document_data, atributo, valor) 132 | disposiciones.append(document_data) 133 | return disposiciones 134 | except Exception as e: 135 | raise Exception(f"Error inesperado descargando dia {day}: {e}") 136 | 137 | def download_document(self, url: str) -> BOJAMetadataDocument: 138 | """ 139 | Extracts the content, the issuing body, and the PDF URL of a specific disposition from BOJA given its URL. 140 | 141 | :param url_disposicion: The full URL of the disposition from which the content and PDF URL are to be extracted. 142 | Example: "https://www.juntadeandalucia.es/eboja/2024/7/s51.html" 143 | :return: A BOJAMMetadataDocument containing the content of the disposition, the name of the issuing body, and the PDF URL. 144 | If the content, the issuing body, or the PDF URL is not found, it returns empty strings for those values. 145 | """ 146 | logger = lg.getLogger(self.download_document.__name__) 147 | logger.info("Scrapping document: %s", url) 148 | texto_completo = "" 149 | soup = HTTPRequester.get_soup(url) 150 | try: 151 | acceso_restringido = soup.find('h1', class_='title', string='Texto de acceso restringido') 152 | if acceso_restringido: 153 | return None 154 | cuerpo = soup.find(id="cuerpo", class_="grid_11 contenidos_nivel3 boja_disposicion") 155 | cabecera = soup.find(class_="punteado_izquierda cabecera_detalle_disposicion") 156 | if not cabecera or not cuerpo: 157 | raise ScrapperError("No se pudo encontrar la cabecera o el cuerpo del documento") 158 | h2 = cabecera.find('h2') 159 | h5 = cabecera.find('h5') 160 | h3 = cabecera.find('h3') 161 | titulo_div = cabecera.find('div', class_="item") 162 | if titulo_div and titulo_div.p: 163 | titulo = titulo_div.p.text.strip() 164 | else: 165 | h4 = cabecera.find('h4') 166 | titulo = h4.text.strip() if h4 else "" 167 | 168 | tipo_disposicion = h2.text.strip() if h2 else "" 169 | organo_disposicion = h5.text.strip() if h5 is not None else (h3.text.strip() if h3 is not None else "") 170 | enlace_pdf = soup.find('a', class_="item_pdf_disposicion").get('href') 171 | parrafos = cuerpo.find_all('p') 172 | 173 | for parrafo in parrafos: 174 | if parrafo.parent.get('class') == ['alerta']: 175 | continue 176 | texto_completo += parrafo.text + "\n" 177 | text_cleaned = clean_text(texto_completo) 178 | with tempfile.NamedTemporaryFile("w", delete=False) as fn: 179 | fn.write(text_cleaned) 180 | logger.info("Scrapped document successfully %s", url) 181 | metadata_doc = BOJAMetadataDocument(**{ "filepath": fn.name, 182 | "identificador": '/'.join(url.split("/")[-3:]), 183 | "titulo": titulo, 184 | "departamento": clean_text(organo_disposicion), 185 | "url_pdf": enlace_pdf, 186 | "tipologia": re.sub(r"^\d+\.\s*", "", tipo_disposicion), 187 | }) 188 | return metadata_doc 189 | except Exception as e: 190 | raise Exception(f"Error inesperado procesando el documento {url}: {e}") 191 | 
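# Illustrative usage sketch: a minimal example of driving BOJAScrapper.download_day,
# assuming network access to juntadeandalucia.es and the package layout shown above.
# The chosen date is arbitrary; days without a published bulletin simply return an empty list.
if __name__ == "__main__":
    example_day = date(2024, 1, 10)
    scrapper = BOJAScrapper()
    # download_day returns one BOJAMetadataDocument per 'Disposiciones Generales' /
    # 'Otras Disposiciones' item; the scraped text is written to a temporary file.
    for doc in scrapper.download_day(example_day):
        print(doc.identificador, doc.titulo, doc.filepath)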
-------------------------------------------------------------------------------- /research/fine-tuning-embedding-model/1.5-CheckDataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 45, 6 | "id": "a1120c32-78d2-41c8-9f27-da9648c8e6c3", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from datasets import load_dataset" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 46, 16 | "id": "9c7ba22e-3bb2-4c36-9a8f-5fcbffa80e4c", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "INPUT_DATASET = \"dariolopez/justicio-rag-embedding-qa-tmp-2\"" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 47, 26 | "id": "34583617-bab8-4405-9478-cc88fef92bf1", 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "data": { 31 | "application/vnd.jupyter.widget-view+json": { 32 | "model_id": "48b3ef4d69a84a10bb19a1a1e144a113", 33 | "version_major": 2, 34 | "version_minor": 0 35 | }, 36 | "text/plain": [ 37 | "Downloading readme: 0%| | 0.00/348 [00:00