├── .dockerignore ├── src ├── __init__.py ├── email │ ├── __init__.py │ └── send_email.py ├── service │ ├── __init__.py │ └── main.py ├── etls │ ├── boa │ │ ├── __init__.py │ │ ├── defs.py │ │ ├── metadata.py │ │ ├── load.py │ │ └── scrapper.py │ ├── bocm │ │ ├── __init__.py │ │ ├── defs.py │ │ ├── README.md │ │ ├── metadata.py │ │ ├── load.py │ │ ├── utils.py │ │ └── scrapper.py │ ├── boe │ │ ├── __init__.py │ │ ├── defs.py │ │ ├── metadata.py │ │ ├── load.py │ │ ├── loading │ │ │ ├── documents.py │ │ │ └── defs_id_largos.py │ │ └── scrapper.py │ ├── boja │ │ ├── __init__.py │ │ ├── defs.py │ │ ├── README.md │ │ ├── utils.py │ │ ├── metadata.py │ │ ├── load.py │ │ └── scrapper.py │ ├── bopv │ │ ├── __init__.py │ │ ├── defs.py │ │ ├── README.md │ │ ├── metadata.py │ │ ├── load.py │ │ └── scrapper.py │ ├── template │ │ ├── __init__.py │ │ ├── defs.py │ │ ├── metadata.py │ │ ├── scrapper.py │ │ ├── README.md │ │ └── load.py │ ├── bopz │ │ ├── defs.py │ │ ├── README.md │ │ ├── utils.py │ │ ├── metadata.py │ │ ├── load.py │ │ └── scrapper.py │ ├── common │ │ ├── metadata.py │ │ ├── scrapper.py │ │ ├── etl.py │ │ └── utils.py │ ├── jobs.py │ └── utils.py ├── utils.py └── initialize.py ├── benchmark ├── requirements.txt ├── output │ ├── load_test_sync_failures.csv │ ├── load_test_async_exceptions.csv │ ├── load_test_sync_exceptions.csv │ ├── response_time_async_failures.csv │ ├── response_time_sync_failures.csv │ ├── response_time_async_exceptions.csv │ ├── response_time_sync_exceptions.csv │ ├── load_test_async_failures.csv │ ├── response_time_async_stats.csv │ ├── response_time_sync_stats.csv │ ├── load_test_sync_stats.csv │ ├── load_test_async_stats.csv │ ├── response_time_async_stats_history.csv │ └── response_time_sync_stats_history.csv ├── benchmark.py └── README.md ├── bin ├── build └── run ├── Makefile ├── config ├── example_qdrant_local.yaml ├── qlora.yaml └── config.yaml ├── render.yaml ├── Dockerfile ├── doc ├── cron_etl_daily_public.sh ├── supabase │ ├── query.sql │ └── starting.md ├── crontab_e.sh ├── qdrant │ └── queries.json └── deployment_guide.md ├── evaluation └── embeddings │ ├── eval.py │ ├── README.md │ ├── defs.py │ └── questions.py ├── requirements.txt ├── research └── fine-tuning-embedding-model │ ├── README.md │ └── 1.5-CheckDataset.ipynb ├── LICENSE ├── .github └── workflows │ └── bandit.yml ├── .gitignore └── README.md /.dockerignore: -------------------------------------------------------------------------------- 1 | .git -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/email/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/service/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/etls/boa/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/etls/bocm/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/etls/boe/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/etls/boja/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/etls/bopv/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/etls/template/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /benchmark/requirements.txt: -------------------------------------------------------------------------------- 1 | locust==2.16.1 -------------------------------------------------------------------------------- /src/etls/boa/defs.py: -------------------------------------------------------------------------------- 1 | COLLECTION_NAME = "boa" -------------------------------------------------------------------------------- /src/etls/boja/defs.py: -------------------------------------------------------------------------------- 1 | COLLECTION_NAME = "boja" -------------------------------------------------------------------------------- /src/etls/bopv/defs.py: -------------------------------------------------------------------------------- 1 | COLLECTION_NAME = "bopv" -------------------------------------------------------------------------------- /src/etls/bocm/defs.py: -------------------------------------------------------------------------------- 1 | COLLECTION_NAME = "bocm" 2 | -------------------------------------------------------------------------------- /src/etls/boe/defs.py: -------------------------------------------------------------------------------- 1 | COLLECTION_NAME = "justicio" 2 | -------------------------------------------------------------------------------- /src/etls/bopz/defs.py: -------------------------------------------------------------------------------- 1 | COLLECTION_NAME = "bopz" 2 | -------------------------------------------------------------------------------- /src/etls/template/defs.py: -------------------------------------------------------------------------------- 1 | COLLECTION_NAME = "template" 2 | -------------------------------------------------------------------------------- /benchmark/output/load_test_sync_failures.csv: -------------------------------------------------------------------------------- 1 | Method,Name,Error,Occurrences 2 | -------------------------------------------------------------------------------- /benchmark/output/load_test_async_exceptions.csv: -------------------------------------------------------------------------------- 1 | Count,Message,Traceback,Nodes 2 | -------------------------------------------------------------------------------- /benchmark/output/load_test_sync_exceptions.csv: -------------------------------------------------------------------------------- 1 | Count,Message,Traceback,Nodes 2 | -------------------------------------------------------------------------------- /benchmark/output/response_time_async_failures.csv: -------------------------------------------------------------------------------- 1 | Method,Name,Error,Occurrences 2 | -------------------------------------------------------------------------------- /benchmark/output/response_time_sync_failures.csv: 
-------------------------------------------------------------------------------- 1 | Method,Name,Error,Occurrences 2 | -------------------------------------------------------------------------------- /benchmark/output/response_time_async_exceptions.csv: -------------------------------------------------------------------------------- 1 | Count,Message,Traceback,Nodes 2 | -------------------------------------------------------------------------------- /benchmark/output/response_time_sync_exceptions.csv: -------------------------------------------------------------------------------- 1 | Count,Message,Traceback,Nodes 2 | -------------------------------------------------------------------------------- /bin/build: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -xeuo pipefail 4 | 5 | docker build -t ia-boe:latest . -------------------------------------------------------------------------------- /src/etls/common/metadata.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | 4 | class MetadataDocument(BaseModel): 5 | # Source 6 | source_name: str 7 | source_type: str 8 | -------------------------------------------------------------------------------- /bin/run: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -euo pipefail 4 | 5 | docker run -v $(pwd):/usr/app \ 6 | -e PINECONE_API_KEY \ 7 | -e PINECONE_ENV \ 8 | ia-boe:latest 9 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | init: 2 | pip install -r requirements.txt 3 | 4 | isort: 5 | isort --check-only src evaluation 6 | 7 | format: isort 8 | black src evaluation 9 | 10 | isort-fix: 11 | isort src evaluation 12 | -------------------------------------------------------------------------------- /benchmark/output/load_test_async_failures.csv: -------------------------------------------------------------------------------- 1 | Method,Name,Error,Occurrences 2 | GET,/aqa,"ConnectionResetError(54, 'Connection reset by peer')",16 3 | GET,/aqa,HTTPError('500 Server Error: Internal Server Error for url: /aqa'),33 4 | -------------------------------------------------------------------------------- /src/etls/template/metadata.py: -------------------------------------------------------------------------------- 1 | """ 2 | Define the metadata to be stored in the embedding database. 
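Subclass MetadataDocument (src/etls/common/metadata.py) and declare the fields your source needs; see src/etls/boe/metadata.py for a complete example.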
3 | """
4 | 
5 | from src.etls.common.metadata import MetadataDocument
6 | 
7 | 
8 | class TemplateMetadataDocument(MetadataDocument):
9 |     pass
10 | 
--------------------------------------------------------------------------------
/config/example_qdrant_local.yaml:
--------------------------------------------------------------------------------
1 | # You can create your own config file
2 | # You can generate your keys using your CLI:
3 | # openssl rand -hex 32
4 | service:
5 |   api_key: 823e071f67c198cc05c73f8bd4580865e6a8819a1f3fe57d2cd49b5c892a5233
6 |   read_only_api_key: d1aab4f05ae4fd7f4e4b8d9e5924469494ebb7897aed46cf2b0df0915410e0b0
7 | 
--------------------------------------------------------------------------------
/render.yaml:
--------------------------------------------------------------------------------
1 | previewsEnabled: true
2 | services:
3 |   # A Docker web service
4 |   - type: web
5 |     name: ia-boe
6 |     runtime: python
7 |     plan: free
8 |     autoDeploy: false
9 |     buildCommand: pip install -r requirements.txt
10 |     startCommand: uvicorn src.service.main:app --host 0.0.0.0 --port 10000 --workers 1 --timeout-keep-alive 125 --log-level info
11 | 
--------------------------------------------------------------------------------
/src/etls/boja/README.md:
--------------------------------------------------------------------------------
1 | 
2 | # Web principal
3 | 
4 | [Web principal del BOJA](https://www.juntadeandalucia.es/eboja.html)
5 | 
6 | 
7 | # Portal de búsqueda avanzada
8 | 
9 | [Portal de búsqueda avanzada](https://www.juntadeandalucia.es/eboja/index.html)
10 | 
11 | # Ejemplo de documentos scrapeados
12 | 
13 | [Doc1](https://www.juntadeandalucia.es/eboja/2024/9/index.html)
14 | [Doc2](https://www.juntadeandalucia.es/eboja/2024/39/index.html)
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.11-slim
2 | 
3 | RUN pip install --upgrade pip
4 | 
5 | WORKDIR /usr/app
6 | 
7 | COPY requirements.txt requirements.txt
8 | 
9 | RUN pip install -r requirements.txt
10 | 
11 | COPY . .
12 | 
13 | ENV APP_PATH="/usr/app"
14 | ENV PYTHONPATH "${PYTHONPATH}:${APP_PATH}"
15 | 
16 | CMD ["uvicorn", "src.service.main:APP", "--host", "0.0.0.0", "--port", "5000", "--workers", "2", "--timeout-keep-alive", "125", "--log-level", "info"]
17 | 
--------------------------------------------------------------------------------
/doc/cron_etl_daily_public.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | export APP_PATH="."
4 | export SENDGRID_API_KEY="" 5 | export OPENAI_API_KEY="" 6 | export TOKENIZERS_PARALLELISM=false 7 | export TAVILY_API_KEY="" 8 | export QDRANT_API_KEY="" 9 | export QDRANT_API_URL="" 10 | 11 | 12 | cd ia-boe/ 13 | source venv3.9/bin/activate 14 | pip install -r requirements.txt 15 | python -m src.etls.boe.load.daily 16 | -------------------------------------------------------------------------------- /src/etls/boja/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | def mes_a_numero(mes): 4 | meses = { 5 | "enero": 1, "febrero": 2, "marzo": 3, "abril": 4, 6 | "mayo": 5, "junio": 6, "julio": 7, "agosto": 8, 7 | "septiembre": 9, "octubre": 10, "noviembre": 11, "diciembre": 12 8 | } 9 | return meses.get(mes.lower(), 0) 10 | 11 | def clean_text(text: str) -> str: 12 | cleaned = re.sub(r"(\xa0|\t+|\n+)", " ", text, flags=re.MULTILINE) 13 | return cleaned -------------------------------------------------------------------------------- /src/etls/bopv/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Web principal 3 | 4 | [Web principal del BOPV](https://www.euskadi.eus/web01-bopv/es/bopv2/datos/Ultimo.shtml) 5 | 6 | 7 | # Portal de búsqueda avanzada 8 | 9 | [Portal de búsqueda avanzada](https://www.euskadi.eus/web01-bopv/es/p43aBOPVWebWar/buscarAvanzada.do?idioma=es&tipoBusqueda=2) 10 | 11 | # Ejemplo de documentos scrapeados 12 | 13 | [Doc1] (https://www.euskadi.eus/web01-bopv/es/bopv2/datos/2024/02/2400757a.shtml) 14 | [Doc2] (https://www.euskadi.eus/web01-bopv/es/bopv2/datos/2024/02/2400759a.shtml) -------------------------------------------------------------------------------- /src/etls/bopz/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Web principal 3 | 4 | [Web principal del BOPZ](http://bop.dpz.es/BOPZ/) 5 | 6 | # Normativa 7 | 8 | [Normativa del BOPZ](http://bop.dpz.es/BOPZ/portal/normativa.pdf) 9 | 10 | # Portal de búsqueda 11 | 12 | [Portal de búsqueda](https://gestiona.comunidad.madrid/wleg_pub/secure/busquedaAvanzada/buscador.jsf?id=1) 13 | 14 | # Ejemplo de documentos scrapeados 15 | 16 | [Doc1] (http://bop.dpz.es/BOPZ/obtenerContenidoEdicto.do?idEdicto=729066) 17 | [Doc2] (http://bop.dpz.es/BOPZ/obtenerContenidoEdicto.do?idEdicto=729162) -------------------------------------------------------------------------------- /benchmark/output/response_time_async_stats.csv: -------------------------------------------------------------------------------- 1 | Type,Name,Request Count,Failure Count,Median Response Time,Average Response Time,Min Response Time,Max Response Time,Average Content Size,Requests/s,Failures/s,50%,66%,75%,80%,90%,95%,98%,99%,99.9%,99.99%,100% 2 | GET,/aqa,18,0,5300.0,5349.600280277776,5259.814042,5824.726167,13780.0,0.1849910355610521,0.0,5300,5300,5400,5400,5500,5800,5800,5800,5800,5800,5800 3 | ,Aggregated,18,0,5300.0,5349.600280277776,5259.814042,5824.726167,13780.0,0.1849910355610521,0.0,5300,5300,5400,5400,5500,5800,5800,5800,5800,5800,5800 4 | -------------------------------------------------------------------------------- /benchmark/output/response_time_sync_stats.csv: -------------------------------------------------------------------------------- 1 | Type,Name,Request Count,Failure Count,Median Response Time,Average Response Time,Min Response Time,Max Response Time,Average Content Size,Requests/s,Failures/s,50%,66%,75%,80%,90%,95%,98%,99%,99.9%,99.99%,100% 2 | 
GET,/qa,18,0,5400.0,5396.147113222223,5283.173665999996,5591.686541000001,13780.0,0.18341122514186337,0.0,5400,5400,5400,5400,5500,5600,5600,5600,5600,5600,5600 3 | ,Aggregated,18,0,5400.0,5396.147113222223,5283.173665999996,5591.686541000001,13780.0,0.18341122514186337,0.0,5400,5400,5400,5400,5500,5600,5600,5600,5600,5600,5600 4 | -------------------------------------------------------------------------------- /benchmark/output/load_test_sync_stats.csv: -------------------------------------------------------------------------------- 1 | Type,Name,Request Count,Failure Count,Median Response Time,Average Response Time,Min Response Time,Max Response Time,Average Content Size,Requests/s,Failures/s,50%,66%,75%,80%,90%,95%,98%,99%,99.9%,99.99%,100% 2 | GET,/qa,110,0,80000.0,110124.43718445454,5342.831416,228389.20708300004,13780.0,0.36826503149218837,0.0,84000,155000,176000,193000,202000,214000,220000,224000,228000,228000,228000 3 | ,Aggregated,110,0,80000.0,110124.43718445454,5342.831416,228389.20708300004,13780.0,0.36826503149218837,0.0,84000,155000,176000,193000,202000,214000,220000,224000,228000,228000,228000 4 | -------------------------------------------------------------------------------- /evaluation/embeddings/eval.py: -------------------------------------------------------------------------------- 1 | from evaluation.embeddings.questions import QUERIES 2 | from src.initialize import initialize_app 3 | 4 | INIT_OBJECTS = initialize_app() 5 | 6 | 7 | success = 0 8 | for boe_id, question in QUERIES: 9 | docs = INIT_OBJECTS.vector_store.similarity_search_with_score( 10 | query=question, k=INIT_OBJECTS.config_loader["top_k_results"] 11 | ) 12 | for doc in docs: 13 | if doc[0].metadata["identificador"] == boe_id: 14 | success += 1 15 | # break 16 | 17 | 18 | print(f"Len queries: {len(QUERIES)}") 19 | print(f"Success answers: {success}") 20 | -------------------------------------------------------------------------------- /benchmark/output/load_test_async_stats.csv: -------------------------------------------------------------------------------- 1 | Type,Name,Request Count,Failure Count,Median Response Time,Average Response Time,Min Response Time,Max Response Time,Average Content Size,Requests/s,Failures/s,50%,66%,75%,80%,90%,95%,98%,99%,99.9%,99.99%,100% 2 | GET,/aqa,3525,49,11000.0,11458.114285116599,8.411416999990706,32071.805625000023,13588.644822695036,12.065553609404489,0.16771975230094183,11000,13000,14000,17000,21000,23000,26000,26000,32000,32000,32000 3 | ,Aggregated,3525,49,11000.0,11458.114285116599,8.411416999990706,32071.805625000023,13588.644822695036,12.065553609404489,0.16771975230094183,11000,13000,14000,17000,21000,23000,26000,26000,32000,32000,32000 4 | -------------------------------------------------------------------------------- /evaluation/embeddings/README.md: -------------------------------------------------------------------------------- 1 | We want to evaluate some parameters for the system: 2 | 3 | * chunk_size (600, 1200, 1800) 4 | * chunk_overlap (50, 100) 5 | * k number of chunks as context (4, 6, 8) 6 | * Search params 7 | * https://qdrant.tech/documentation/concepts/search/ 8 | * https://qdrant.tech/documentation/tutorials/optimize/ 9 | 10 | 11 | More info: https://www.anyscale.com/blog/a-comprehensive-guide-for-building-rag-based-llm-applications-part-1 12 | 13 | ********************************************* 14 | 15 | We load a subset (`defs.py`) of BOE documents into different Qdrant databases (tier-free) and run `eval.py` against them. 
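
A minimal sketch of how the sweep could be organised (assumptions: one Qdrant collection is loaded per chunking configuration, the `justicio_<chunk_size>_<overlap>` naming scheme is hypothetical, and `retrieve` wraps the same `similarity_search_with_score` call used in `eval.py`, returning the ranked BOE ids):

```
import itertools
import typing as tp

Retriever = tp.Callable[[str, int], tp.List[str]]  # (question, k) -> ranked BOE ids


def hit_rate(queries: tp.List[tp.Tuple[str, str]], retrieve: Retriever, k: int) -> float:
    """Fraction of (boe_id, question) pairs whose boe_id appears in the top-k chunks."""
    hits = sum(1 for boe_id, question in queries if boe_id in retrieve(question, k))
    return hits / len(queries)


def sweep(queries: tp.List[tp.Tuple[str, str]], retriever_for_collection) -> None:
    """retriever_for_collection: collection name -> Retriever bound to that collection."""
    for chunk_size, overlap, k in itertools.product((600, 1200, 1800), (50, 100), (4, 6, 8)):
        collection = f"justicio_{chunk_size}_{overlap}"  # hypothetical naming scheme
        score = hit_rate(queries, retriever_for_collection(collection), k)
        print(f"chunk_size={chunk_size} overlap={overlap} k={k} hit_rate={score:.3f}")
```

Unlike the current loop in `eval.py`, this counts at most one hit per question, so the score is directly comparable across configurations.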
16 | 17 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | PyYAML==6.0 2 | 3 | fastapi==0.103.2 4 | uvicorn==0.23.2 5 | 6 | requests==2.31.0 7 | beautifulsoup4==4.12.2 8 | lxml==4.9.2 9 | pydantic==2.4.2 10 | 11 | retry==0.9.2 12 | 13 | typer==0.9.0 14 | schedule==1.2.1 15 | 16 | langchain==0.2.1 17 | # langchainplus-sdk==0.0.20 18 | # langsmith==0.1.65 19 | langchain-openai==0.1.8 20 | langchain-core==0.2.3 21 | langchain-community==0.2.1 22 | langtrace_python_sdk==2.1.26 23 | 24 | qdrant-client==1.9.2 25 | 26 | sentence_transformers==2.2.2 27 | openai==1.30.5 28 | tavily-python==0.3.3 29 | 30 | sendgrid==6.10.0 31 | 32 | # Clean code tools 33 | black==23.9.1 34 | isort==5.12.0 35 | 36 | # Evaluation 37 | # ragas==0.1.0rc1 38 | -------------------------------------------------------------------------------- /src/etls/template/scrapper.py: -------------------------------------------------------------------------------- 1 | """ 2 | Define a class with some methods (download_day, download_document) to scrape the information. 3 | """ 4 | 5 | import typing as tp 6 | from datetime import date 7 | 8 | from src.etls.common.scrapper import BaseScrapper 9 | from src.etls.template.metadata import TemplateMetadataDocument 10 | from src.initialize import initialize_logging 11 | 12 | initialize_logging() 13 | 14 | 15 | class TemplateScrapper(BaseScrapper): 16 | def download_day(self, day: date) -> tp.List[TemplateMetadataDocument]: 17 | """ 18 | Define how to navigate between documents for a single day 19 | """ 20 | pass 21 | 22 | def download_document(self, url: str) -> TemplateMetadataDocument: 23 | """ 24 | Define how a single document is scrapped 25 | """ 26 | pass 27 | -------------------------------------------------------------------------------- /src/email/send_email.py: -------------------------------------------------------------------------------- 1 | import logging as lg 2 | import os 3 | 4 | from sendgrid import SendGridAPIClient 5 | from sendgrid.helpers.mail import Content, Email, Mail, To 6 | 7 | from src.initialize import initialize_logging 8 | 9 | initialize_logging() 10 | 11 | 12 | def send_email(config_loader, subject: str, content: str) -> None: 13 | logger = lg.getLogger(send_email.__name__) 14 | logger.info("Sending email") 15 | sg = SendGridAPIClient(api_key=os.environ.get("SENDGRID_API_KEY")) 16 | from_email = Email(config_loader["admin_email"]) 17 | to_email = To(config_loader["admin_email"]) 18 | content = Content("text/plain", content) 19 | mail = Mail(from_email, to_email, subject, content) 20 | response = sg.client.mail.send.post(request_body=mail.get()) 21 | logger.info("Sent email with status %s", response.status_code) 22 | -------------------------------------------------------------------------------- /research/fine-tuning-embedding-model/README.md: -------------------------------------------------------------------------------- 1 | # Fine-tuning bge-m3-es-legal 2 | 3 | ### Introduction 4 | 5 | Customize the embedding model (BAAI/bge-m3) for a specific domain (Legal) and language (Spanish). 6 | 7 | ### Steps 8 | 9 | 1. Create a dataset to fine-tuning 10 | 2. Fine-tuning the model using `BAAI/bge-m3` as baseline. 11 | 12 | ### Notes 13 | 14 | Run in Runpod with 1 x RTX A6000. About 90 seconds per epoch. So, about 6 epochs -> 10 minutes. 
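
A rough illustration of step 2 (a sketch only, not the exact training script used: the dataset column names `question` and `context` are assumptions, and a recent `sentence-transformers` + `datasets` install is expected):

```
from datasets import load_dataset
from sentence_transformers import InputExample, SentenceTransformer, losses
from torch.utils.data import DataLoader

# Assumed schema: one question and one positive passage per row.
dataset = load_dataset("dariolopez/justicio-rag-embedding-qa", split="train")
train_examples = [InputExample(texts=[row["question"], row["context"]]) for row in dataset]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

model = SentenceTransformer("BAAI/bge-m3")
# In-batch negatives: every other passage in the batch acts as a negative for a question.
train_loss = losses.MultipleNegativesRankingLoss(model)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=6,
    warmup_steps=100,
    output_path="bge-m3-es-legal",
)
```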
15 | 16 | ### Based on: 17 | 18 | - https://www.philschmid.de/sagemaker-train-deploy-embedding-models 19 | - https://github.com/virattt/financial-datasets/ 20 | - https://github.com/virattt/financial-datasets/blob/main/financial_datasets/prompts.py 21 | 22 | ### Results 23 | 24 | - Dataset: https://huggingface.co/datasets/dariolopez/justicio-rag-embedding-qa 25 | - Model: https://huggingface.co/datasets/dariolopez/bge-m3-es-legal 26 | -------------------------------------------------------------------------------- /src/etls/bocm/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Estructura del BOCM 3 | 4 | [Estructura del BOCM](https://www.bocm.es/estructura) 5 | 6 | # Normativa 7 | 8 | [Normativa del BOCM](https://www.bocm.es/normativa-reguladora) 9 | 10 | 11 | # Portal de Legislación 12 | 13 | [wleg_pub](https://gestiona.comunidad.madrid/wleg_pub/secure/busquedaAvanzada/buscador.jsf?id=1) 14 | 15 | # Clasificación de campos que contiene cada sección del BOCM 16 | 17 | 18 | SECCIÓN | SUBSECCIÓN | APARTADO | TIPO | ANUNCIANTE | RANGO | ORGANO 19 | :---:|:---:|:---:|:---:|:---:|:---:|:---: 20 | 1 | x | | x | | x | x 21 | 2 | x | | | | | | 22 | 3 | x| x | | | | | 23 | 4 | x| | | | | | 24 | 5 | | | | x | | | 25 | 26 | > NOTA ACLARATORIA: (x = Tiene ese campo) 27 | 28 | # IDEAS para la mejora de los metadatos del scrapeo 29 | 30 | - Añadir una sección de análisis en los metadatos, cruzando info con el portal de legislación 31 | -------------------------------------------------------------------------------- /src/etls/bopv/metadata.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import Optional 3 | 4 | from src.etls.common.metadata import MetadataDocument 5 | 6 | 7 | class BOPVMetadataDocument(MetadataDocument): 8 | """Class for keeping metadata of a BOPV Document scrapped.""" 9 | 10 | # Text 11 | filepath: str 12 | 13 | # Source 14 | source_name: str = "BOPV" 15 | source_type: str = "Boletin" 16 | 17 | # Metadatos 18 | identificador: str 19 | departamento: Optional[str] = None 20 | tipologia: str 21 | 22 | # Links 23 | titulo: Optional[str] = None 24 | url_pdf: str # pdf_link 25 | url_html: Optional[str] = None 26 | url_boletin: Optional[str] = None 27 | 28 | fecha_disposicion: str = "" 29 | anio: Optional[str] = None 30 | mes: Optional[str] = None 31 | dia: Optional[str] = None 32 | 33 | datetime_insert: str = datetime.utcnow().isoformat() 34 | 35 | -------------------------------------------------------------------------------- /src/etls/boja/metadata.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import Optional 3 | 4 | from src.etls.common.metadata import MetadataDocument 5 | 6 | 7 | 8 | 9 | class BOJAMetadataDocument(MetadataDocument): 10 | """Class for keeping metadata of a BOJA Document scrapped.""" 11 | 12 | # Text 13 | filepath: str 14 | 15 | # Source 16 | source_name: str = "BOJA" 17 | source_type: str = "Boletin" 18 | 19 | # Metadatos 20 | identificador: str 21 | departamento: str 22 | tipologia: str 23 | 24 | # Links 25 | titulo: Optional[str] = None 26 | url_pdf: str # pdf_link 27 | url_html: Optional[str] = None 28 | url_boletin: Optional[str] = None 29 | 30 | fecha_disposicion: Optional[str] = None 31 | anio: Optional[str] = None 32 | mes: Optional[str] = None 33 | dia: Optional[str] = None 34 | 35 | datetime_insert: str = datetime.utcnow().isoformat() 36 | 
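# NOTE: the default above is evaluated once, at class-definition time, so every
# document created by a long-running process shares the same timestamp. Use
# pydantic.Field(default_factory=lambda: datetime.utcnow().isoformat()) if a
# per-instance insert time is needed.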
37 | -------------------------------------------------------------------------------- /src/etls/bopz/utils.py: -------------------------------------------------------------------------------- 1 | # POST request data to filter retrieved BOPZ documents 2 | 3 | DATA_POST = { 4 | "numPag": "", 5 | "idProcedente": "8610", 6 | "idPortador": "", 7 | "hProcedente": " AYUNTAMIENTO DE ZARAGOZA", 8 | "hPortador": "", 9 | "idPagadora": "8610", 10 | "hPagadora": " AYUNTAMIENTO DE ZARAGOZA", 11 | "primeraVez": "N", 12 | "ficheroDoc": "N", 13 | "numRegistroInf": "", 14 | "numRegistroSup": "", 15 | "esPortalSN": "S", 16 | "numBoletinInf": "", 17 | "numRegistroNumInf": "", 18 | "numRegistroAnyoInf": "", 19 | "numRegistroNumSup": "", 20 | "numRegistroAnyoSup": "", 21 | "numBoletinAuxInf": "", 22 | "anyoBoletinInf": "", 23 | "numBoletinSup": "", 24 | "anyoBoletinSup": "", 25 | "fechaPubInf": "", 26 | "fechaPubSup": "", 27 | "procedente": " AYUNTAMIENTO DE ZARAGOZA", 28 | "tematica": "", 29 | "titulo": "", 30 | "contenido": "", 31 | } 32 | -------------------------------------------------------------------------------- /doc/supabase/query.sql: -------------------------------------------------------------------------------- 1 | -- Query the number of vectors 2 | select count(*) from documents; 3 | 4 | -- Query number of vectors by day 5 | -- TODO: add sundays 6 | select 7 | count(*) as counted_vector, 8 | metadata->>'fecha_publicacion' as date 9 | from documents 10 | group by date 11 | order by date 12 | 13 | -- Query max(day) 14 | select max(metadata->>'fecha_publicacion') as date from documents; 15 | 16 | -- Query min(day) 17 | select min(metadata->>'fecha_publicacion') as date from documents; 18 | 19 | -- Query metadata by identificador 20 | SELECT count(*) FROM documents WHERE metadata @> '{"identificador": "BOE-A-2023-38"}'; 21 | 22 | -- Query metadata by fecha_publicacion 23 | SELECT count(*) FROM documents WHERE metadata @> '{"fecha_publicacion": "2023-01-01"}'; 24 | 25 | -- TODO: Query to detect the duplicated (text/embeddings) 26 | 27 | 28 | -- metadata: https://medium.com/hackernoon/how-to-query-jsonb-beginner-sheet-cheat-4da3aa5082a3 29 | -------------------------------------------------------------------------------- /doc/crontab_e.sh: -------------------------------------------------------------------------------- 1 | # To automatize the daily run: 2 | # 1. Rename cron_etl_daily_public.sh to cron_etl_daily.sh 3 | # mv cron_etl_daily_public.sh cron_etl_daily.sh 4 | # 2. Fill the api keys 5 | # 3. Provide permissions to file: 6 | # chmod +x cron_etl_daily.sh 7 | # 4. 
Copy and paste this file on Crontab 8 | # crontab -e 9 | # Note: Using Ubuntu 22.04 as host probably you find errors like: requests.exceptions.SSLError: HTTPSConnectionPool(host='www.boe.es', port=443): Max retries exceeded with url: /boe/dias/2023/08/02 (Caused by SSLError(SSLError(1, '[SSL: UNSAFE_LEGACY_RENEGOTIATION_DISABLED] unsafe legacy renegotiation disabled (_ssl.c:1129)'))) 10 | # Solution (Vulnerable to the Man-in-the-Middle): https://stackoverflow.com/questions/71603314/ssl-error-unsafe-legacy-renegotiation-disabled 11 | 12 | 13 | SHELL=/usr/bin/bash 14 | CRON_TZ=UTC 15 | PROJECT_DIR=/home/ubuntu/ia-boe 16 | 17 | 20 07 * * * $PROJECT_DIR/cron_etl_daily.sh >> $PROJECT_DIR/logs/ingest_cron.out 2>> $PROJECT_DIR/logs/ingest_cron.err 18 | -------------------------------------------------------------------------------- /benchmark/benchmark.py: -------------------------------------------------------------------------------- 1 | import locust 2 | 3 | 4 | class ApiUser(locust.HttpUser): 5 | wait_time = locust.constant_pacing(1) 6 | input_data = ( 7 | "¿Es de aplicación la ley de garantía integral de la libertad sexual a niños (varones) menores de edad " 8 | "víctimas de violencias sexuales o solo a niñas y mujeres?" 9 | ) 10 | 11 | def health_check(self): 12 | self.client.get("/healthcheck") 13 | 14 | def endpoint404(self): 15 | with self.client.get( 16 | "/url_does_not_exist", catch_response=True 17 | ) as response: 18 | if response.status_code == 404: 19 | response.success() 20 | 21 | 22 | class ApiAsyncUser(ApiUser): 23 | 24 | url = "/aqa" 25 | 26 | @locust.task 27 | def aqa(self): 28 | self.client.get(self.url, json=self.input_data) 29 | 30 | 31 | class ApiSyncUser(ApiUser): 32 | 33 | url = "/qa" 34 | 35 | @locust.task 36 | def qa(self): 37 | self.client.get(self.url, json=self.input_data) 38 | -------------------------------------------------------------------------------- /src/etls/jobs.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import schedule 4 | 5 | from src.etls.boe.load import today as boe_today 6 | from src.etls.bopz.load import today as bopz_today 7 | from src.etls.bocm.load import today as bocm_today 8 | from src.etls.bopv.load import today as bopv_today 9 | from src.etls.boja.load import today as boja_today 10 | from src.etls.boa.load import today as boa_today 11 | from src.initialize import initialize_app 12 | 13 | 14 | INIT_OBJECTS = initialize_app() 15 | 16 | 17 | schedule.every().day.at("11:00").do(boe_today, init_objects=INIT_OBJECTS) 18 | schedule.every().day.at("11:05").do(bopz_today, init_objects=INIT_OBJECTS) 19 | schedule.every().day.at("11:10").do(bocm_today, init_objects=INIT_OBJECTS) 20 | schedule.every().day.at("11:15").do(bopv_today, init_objects=INIT_OBJECTS) 21 | schedule.every().day.at("11:20").do(boja_today, init_objects=INIT_OBJECTS) 22 | schedule.every().day.at("11:25").do(boa_today, init_objects=INIT_OBJECTS) 23 | # TODO: monthly jobs 24 | 25 | while True: 26 | schedule.run_pending() 27 | time.sleep(1) 28 | -------------------------------------------------------------------------------- /src/etls/boa/metadata.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import Optional 3 | 4 | from src.etls.common.metadata import MetadataDocument 5 | 6 | 7 | class BOAMetadataDocument(MetadataDocument): 8 | """Class for keeping metadata of a BOA Document scrapped.""" 9 | 10 | # Text 11 | filepath: str 12 | 13 | # 
Source 14 | source_name: str = "BOA" 15 | source_type: str = "Boletin" 16 | 17 | # Metadatos 18 | numero_boletin: str 19 | identificador: str # DOCN 20 | departamento: Optional[str] = None 21 | seccion: Optional[str] = None 22 | subseccion: Optional[str] = None 23 | rango: Optional[str] = None 24 | codigo_materia: Optional[str] = None 25 | 26 | # Links 27 | titulo: Optional[str] = None 28 | url_pdf: str 29 | url_boletin: Optional[str] = None 30 | 31 | fecha_disposicion: str = "" 32 | fecha_publicacion: str = "" 33 | anio: Optional[str] = None 34 | mes: Optional[str] = None 35 | dia: Optional[str] = None 36 | 37 | datetime_insert: str = datetime.utcnow().isoformat() 38 | 39 | -------------------------------------------------------------------------------- /src/etls/bopz/metadata.py: -------------------------------------------------------------------------------- 1 | import typing as tp 2 | from datetime import datetime 3 | 4 | from pydantic import field_validator 5 | 6 | from src.etls.common.metadata import MetadataDocument 7 | 8 | 9 | class BOPZMetadataDocument(MetadataDocument): 10 | """Class for keeping metadata of a BOPZ Document scrapped.""" 11 | 12 | # Text 13 | filepath: str 14 | 15 | # Source 16 | source_name: str = "BOPZ" 17 | source_type: str = "Boletin" 18 | 19 | # Metadatos 20 | identificador: str 21 | numero_oficial: str = "" 22 | departamento: str 23 | titulo: str = "" 24 | url_pdf: str 25 | url_html: str 26 | fecha_publicacion: str 27 | fecha_disposicion: str = "" 28 | anio: str 29 | mes: str 30 | dia: str 31 | 32 | # Analisis 33 | materia: tp.List[str] 34 | 35 | datetime_insert: str = datetime.utcnow().isoformat() 36 | 37 | @field_validator("fecha_publicacion", "fecha_disposicion") 38 | @classmethod 39 | def isoformat(cls, v): 40 | if v: 41 | datetime.strptime(v, "%Y-%m-%d") 42 | return v 43 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Darío López Padial 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /src/etls/utils.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | import requests 4 | from requests.adapters import HTTPAdapter 5 | from urllib3.util.retry import Retry 6 | import schedule 7 | 8 | 9 | def catch_exceptions(cancel_on_failure=False): 10 | def catch_exceptions_decorator(job_func): 11 | @functools.wraps(job_func) 12 | def wrapper(*args, **kwargs): 13 | try: 14 | return job_func(*args, **kwargs) 15 | except: 16 | import traceback 17 | print(traceback.format_exc()) 18 | if cancel_on_failure: 19 | return schedule.CancelJob 20 | return wrapper 21 | return catch_exceptions_decorator 22 | 23 | 24 | def create_retry_session(retries, backoff_factor=1, status_forcelist=[500, 502, 503, 504]): 25 | session = requests.Session() 26 | retry = Retry( 27 | total=retries, 28 | read=retries, 29 | connect=retries, 30 | backoff_factor=backoff_factor, 31 | status_forcelist=status_forcelist, 32 | ) 33 | adapter = HTTPAdapter(max_retries=retry) 34 | session.mount('http://', adapter) 35 | session.mount('https://', adapter) 36 | return session 37 | -------------------------------------------------------------------------------- /src/etls/common/scrapper.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import logging as lg 3 | import typing as tp 4 | from abc import ABC, abstractmethod 5 | from datetime import date, timedelta 6 | 7 | from src.etls.common.metadata import MetadataDocument 8 | from src.initialize import initialize_logging 9 | 10 | initialize_logging() 11 | 12 | 13 | class BaseScrapper(ABC): 14 | def download_days(self, date_start: date, date_end: date) -> tp.List[MetadataDocument]: 15 | """Download all the documents between two dates (from date_start to date_end)""" 16 | logger = lg.getLogger(self.download_days.__name__) 17 | logger.info("Downloading content from day %s to %s", date_start, date_end) 18 | delta = timedelta(days=1) 19 | docs = [] 20 | date_start_aux = copy.copy(date_start) 21 | while date_start_aux <= date_end: 22 | docs += self.download_day(date_start_aux) 23 | date_start_aux += delta 24 | logger.info("Downloaded content from day %s to %s", date_start, date_end) 25 | return docs 26 | 27 | @abstractmethod 28 | def download_day(self, day: date) -> tp.List[MetadataDocument]: 29 | """Download all the documents for a specific date.""" 30 | pass 31 | 32 | @abstractmethod 33 | def download_document(self, url: str) -> MetadataDocument: 34 | """Get text and metadata from url document.""" 35 | pass 36 | -------------------------------------------------------------------------------- /doc/supabase/starting.md: -------------------------------------------------------------------------------- 1 | # 1. 
Create table and function 2 | 3 | -- Enable the pgvector extension to work with embedding vectors 4 | create extension vector; 5 | 6 | -- Create a table to store your documents 7 | create table documents ( 8 | id bigserial primary key, 9 | content text, -- corresponds to Document.pageContent 10 | metadata jsonb, -- corresponds to Document.metadata 11 | embedding vector(768) -- 768 works for OpenAI embeddings, change if needed 12 | ); 13 | 14 | -- Create a function to do queries 15 | CREATE FUNCTION match_documents(query_embedding vector(768), match_count int) 16 | RETURNS TABLE( 17 | id text, 18 | content text, 19 | metadata jsonb, 20 | -- we return matched vectors to enable maximal marginal relevance searches 21 | embedding vector(768), 22 | similarity float) 23 | LANGUAGE plpgsql 24 | AS $$ 25 | # variable_conflict use_column 26 | BEGIN 27 | RETURN query 28 | SELECT 29 | id, 30 | content, 31 | metadata, 32 | embedding, 33 | 1 -(documents.embedding <=> query_embedding) AS similarity 34 | FROM 35 | documents 36 | ORDER BY 37 | documents.embedding <=> query_embedding 38 | LIMIT match_count; 39 | END; 40 | $$; 41 | 42 | 43 | # 2. Edit the type of `id` column from `documents` table from int8 to text. 44 | 45 | Using the Supabase UI 46 | 47 | -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | import logging as lg 2 | import time 3 | import typing as tp 4 | from functools import wraps 5 | 6 | from langchain.schema import Document 7 | from langchain.vectorstores import SupabaseVectorStore 8 | from pydantic import BaseModel 9 | from fastapi import Request 10 | from opentelemetry import baggage, context 11 | from langtrace_python_sdk.constants.instrumentation.common import ( 12 | LANGTRACE_ADDITIONAL_SPAN_ATTRIBUTES_KEY, 13 | ) 14 | 15 | 16 | class QAResponsePayloadModel(BaseModel): 17 | scoring_id: str 18 | context: tp.List[tp.Tuple[Document, float]] 19 | answer: str 20 | 21 | 22 | def timeit(func): 23 | @wraps(func) 24 | async def wrapper(*args, **kwargs): 25 | logger = lg.getLogger(func.__name__) 26 | logger.info("<<< Starting >>>") 27 | start_time = time.time() 28 | result = await func(*args, **kwargs) 29 | end_time = time.time() 30 | delta = end_time - start_time 31 | msg = f"{delta:2.2f}s" if delta > 1 else f"{1000 * delta:2.1f}ms" 32 | logger.info("<<< Completed >>> in %s", msg) 33 | return result 34 | 35 | return wrapper 36 | 37 | 38 | async def inject_additional_attributes(fn, attributes=None): 39 | if attributes: 40 | new_ctx = baggage.set_baggage( 41 | LANGTRACE_ADDITIONAL_SPAN_ATTRIBUTES_KEY, attributes 42 | ) 43 | context.attach(new_ctx) 44 | 45 | return await fn() 46 | -------------------------------------------------------------------------------- /evaluation/embeddings/defs.py: -------------------------------------------------------------------------------- 1 | boe_ids = [ 2 | "BOE-A-2023-11022", 3 | "BOE-A-2023-11077", 4 | "BOE-A-2023-11560", 5 | "BOE-A-2023-11959", 6 | "BOE-A-2023-12667", 7 | "BOE-A-2023-13811", 8 | "BOE-A-2023-14427", 9 | "BOE-A-2023-14713", 10 | "BOE-A-2023-14733", 11 | "BOE-A-2023-1776", 12 | "BOE-A-2023-2098", 13 | "BOE-A-2023-2980", 14 | "BOE-A-2023-3297", 15 | "BOE-A-2023-3346", 16 | "BOE-A-2023-3511", 17 | "BOE-A-2023-353", 18 | "BOE-A-2023-3847", 19 | "BOE-A-2023-4120", 20 | "BOE-A-2023-4324", 21 | "BOE-A-2023-4385", 22 | "BOE-A-2023-4514", 23 | "BOE-A-2023-4952", 24 | "BOE-A-2023-4994", 25 | "BOE-A-2023-5091", 26 | 
"BOE-A-2023-5093", 27 | "BOE-A-2023-5367", 28 | "BOE-A-2023-545", 29 | "BOE-A-2023-5452", 30 | "BOE-A-2023-5482", 31 | "BOE-A-2023-5582", 32 | "BOE-A-2023-5704", 33 | "BOE-A-2023-5961", 34 | "BOE-A-2023-6382", 35 | "BOE-A-2023-6721", 36 | "BOE-A-2023-7053", 37 | "BOE-A-2023-7343", 38 | "BOE-A-2023-7355", 39 | "BOE-A-2023-755", 40 | "BOE-A-2023-8110", 41 | "BOE-A-2023-8164", 42 | "BOE-A-2023-8315", 43 | "BOE-A-2023-8318", 44 | "BOE-A-2023-9030", 45 | "BOE-A-2023-9069", 46 | "BOE-A-2023-9428", 47 | "BOE-A-2023-9429", 48 | "BOE-A-2023-9719", 49 | "BOE-A-2023-9827", 50 | "BOE-A-2022-14630", # garantía integral de la libertad sexual 51 | "BOE-A-2023-12203", # derecho a la vivienda 52 | "BOE-A-2023-16889", # Sistema de Formación Profesional 53 | ] 54 | -------------------------------------------------------------------------------- /src/etls/bocm/metadata.py: -------------------------------------------------------------------------------- 1 | import typing as tp 2 | from datetime import datetime 3 | 4 | from pydantic import BaseModel, field_validator, Field 5 | import re 6 | 7 | from src.etls.common.metadata import MetadataDocument 8 | 9 | 10 | # REGEX 11 | CVE_REGEX = r"^BOCM-\d{8}-\d{1,3}$" # TODO: regex demasiado laxa 12 | 13 | 14 | class BOCMMetadataDocument(MetadataDocument): 15 | """Class for keeping metadata of a BOCM Document scrapped.""" 16 | 17 | # Text 18 | filepath: str 19 | 20 | # Source 21 | source_name: str = "BOCM" 22 | source_type: str = "Boletin" 23 | 24 | # Metadatos 25 | identificador: str = Field(pattern=CVE_REGEX, examples=["BOCM-20240129-24"]) 26 | numero_oficial: str = "" # Número de boletín 27 | paginas: str 28 | departamento: str # órgano (excepto sección 4, que no tiene) 29 | 30 | seccion_normalizada: str 31 | seccion: str 32 | subseccion: str 33 | tipo: str = "" 34 | apartado: str = "" 35 | rango: str = "" 36 | 37 | # Links 38 | titulo: str # title 39 | url_pdf: str # pdf_link 40 | url_html: str # html_link 41 | 42 | fecha_publicacion: str 43 | fecha_disposicion: str = "" 44 | anio: str 45 | mes: str 46 | dia: str 47 | 48 | datetime_insert: str = datetime.utcnow().isoformat() 49 | 50 | @field_validator("fecha_publicacion", "fecha_disposicion") 51 | @classmethod 52 | def isoformat(cls, v): 53 | if v: 54 | return datetime.strptime(v, "%Y-%m-%d").strftime("%Y-%m-%d") 55 | return v 56 | -------------------------------------------------------------------------------- /config/qlora.yaml: -------------------------------------------------------------------------------- 1 | # RTX 3090 on Runpod - approx 7 hours 2 | 3 | base_model: NousResearch/Llama-2-7b-chat-hf 4 | base_model_config: NousResearch/Llama-2-7b-chat-hf 5 | model_type: LlamaForCausalLM 6 | tokenizer_type: LlamaTokenizer 7 | is_llama_derived_model: true 8 | hub_model_id: Llama-2-databricks-dolly-oasst1-es-axolotl 9 | 10 | load_in_8bit: false 11 | load_in_4bit: true 12 | strict: false 13 | 14 | datasets: 15 | - path: dariolopez/Llama-2-databricks-dolly-oasst1-es 16 | type: completion 17 | dataset_prepared_path: last_run_prepared 18 | val_set_size: 0.01 19 | output_dir: ./qlora-out 20 | 21 | adapter: qlora 22 | lora_model_dir: 23 | 24 | sequence_len: 2048 # 4096 25 | sample_packing: true 26 | 27 | lora_r: 32 28 | lora_alpha: 16 29 | lora_dropout: 0.05 30 | lora_target_modules: 31 | lora_target_linear: true 32 | lora_fan_in_fan_out: 33 | 34 | wandb_project: axolotl 35 | wandb_entity: 36 | wandb_watch: 37 | wandb_run_id: 38 | wandb_log_model: 39 | 40 | 41 | gradient_accumulation_steps: 4 42 | micro_batch_size: 2 43 | 
num_epochs: 3 44 | optimizer: paged_adamw_32bit 45 | lr_scheduler: cosine 46 | learning_rate: 0.0002 47 | 48 | train_on_inputs: false 49 | group_by_length: false 50 | bf16: true 51 | fp16: false 52 | tf32: false 53 | 54 | gradient_checkpointing: true 55 | early_stopping_patience: 56 | resume_from_checkpoint: 57 | local_rank: 58 | logging_steps: 1 59 | xformers_attention: 60 | flash_attention: false 61 | 62 | warmup_steps: 100 63 | eval_steps: 0.01 64 | save_steps: 65 | debug: 66 | deepspeed: 67 | weight_decay: 0.0 68 | fsdp: 69 | fsdp_config: 70 | special_tokens: 71 | bos_token: "" 72 | eos_token: "" 73 | unk_token: "" 74 | -------------------------------------------------------------------------------- /doc/qdrant/queries.json: -------------------------------------------------------------------------------- 1 | // List all collections 2 | GET collections 3 | 4 | // Get collection info 5 | GET collections/justicio 6 | 7 | // List points in a collection, using filter by metadata 8 | POST collections/justicio/points/scroll 9 | { 10 | "limit": 40000, 11 | "filter": { 12 | "must": [ 13 | { 14 | "key": "metadata.anio", 15 | "match": { 16 | "value": "2018" 17 | } 18 | } 19 | ] 20 | } 21 | } 22 | 23 | // Count points in a collection, using filter by metadata 24 | POST collections/justicio/points/count 25 | { 26 | "filter": { 27 | "must": [ 28 | { 29 | "key": "metadata.anio", 30 | "match": { 31 | "value": "2018" 32 | } 33 | } 34 | ] 35 | } 36 | } 37 | 38 | // Count points in a collection, using filter by multiple metadata 39 | POST collections/justicio/points/count 40 | { 41 | "filter": { 42 | "must": [ 43 | { 44 | "key": "metadata.anio", 45 | "match": { 46 | "value": "2024" 47 | } 48 | }, 49 | { 50 | "key": "metadata.mes", 51 | "match": { 52 | "value": "02" 53 | } 54 | }, 55 | { 56 | "key": "metadata.dia", 57 | "match": { 58 | "value": "20" 59 | } 60 | } 61 | ] 62 | } 63 | } 64 | 65 | // Delete points in a collection, using filter by metadata 66 | POST collections/justicio/points/delete 67 | { 68 | "filter": { 69 | "must": [ 70 | { 71 | "key": "metadata.anio", 72 | "match": { 73 | "value": "2018" 74 | } 75 | } 76 | ] 77 | } 78 | } 79 | 80 | // https://qdrant.tech/documentation/concepts/filtering/ 81 | // https://qdrant.tech/documentation/concepts/points/ 82 | -------------------------------------------------------------------------------- /src/etls/boe/metadata.py: -------------------------------------------------------------------------------- 1 | import typing as tp 2 | from datetime import datetime 3 | 4 | from pydantic import BaseModel, field_validator 5 | 6 | from src.etls.common.metadata import MetadataDocument 7 | 8 | 9 | class BOEMetadataReferencia(BaseModel): 10 | id: str 11 | palabra: str 12 | texto: str 13 | 14 | 15 | class BOEMetadataDocument(MetadataDocument): 16 | """Class for keeping metadata of a BOE Document scrapped.""" 17 | 18 | # Text 19 | filepath: str 20 | 21 | # Source 22 | source_name: str = "BOE" 23 | source_type: str = "Boletin" 24 | 25 | # Metadatos 26 | identificador: str 27 | diario: str 28 | numero_oficial: str = "" 29 | departamento: str 30 | rango: str = "" 31 | titulo: str 32 | url_pdf: str 33 | origen_legislativo: str = "" 34 | fecha_publicacion: str 35 | fecha_disposicion: str = "" 36 | anio: str 37 | mes: str 38 | dia: str 39 | 40 | # Analisis 41 | observaciones: str = "" 42 | ambito_geografico: str = "" 43 | modalidad: str = "" 44 | tipo: str = "" 45 | materias: tp.List[str] 46 | alertas: tp.List[str] 47 | notas: tp.List[str] 48 | ref_posteriores: 
tp.List[BOEMetadataReferencia] 49 | ref_anteriores: tp.List[BOEMetadataReferencia] 50 | 51 | datetime_insert: str = datetime.utcnow().isoformat() 52 | 53 | @field_validator("ref_posteriores") 54 | @classmethod 55 | def ref_posteriores_to_json(cls, validators): 56 | return [v.json() for v in validators] 57 | 58 | @field_validator("ref_anteriores") 59 | @classmethod 60 | def ref_anteriores_to_json(cls, validators): 61 | return [v.json() for v in validators] 62 | 63 | @field_validator("fecha_publicacion", "fecha_disposicion") 64 | @classmethod 65 | def isoformat(cls, v): 66 | if v: 67 | return datetime.strptime(v, "%Y%m%d").strftime("%Y-%m-%d") 68 | return v 69 | -------------------------------------------------------------------------------- /src/etls/template/README.md: -------------------------------------------------------------------------------- 1 | This is a template module to load gazettes (e.g.: BOE) and/or single documents (e.g.: sentencias judiciales) into the embedding database. 2 | 3 | # Gazettes 4 | 5 | A gazette in this project has some requirements: 6 | 7 | * A gazette is divided into days. 8 | * Each day has many documents. 9 | 10 | To define an ETL for your gazette, you need to fill some files: 11 | 12 | 1. `metadata.py` Define the metadata to be stored in the embedding database. 13 | 2. `scrapper.py` Define a class with some methods to scrape the information. 14 | 3. `load.py` You can define the different scripts to load the data. 15 | 16 | ### Batch/Historical Load 17 | 18 | If you want to do a batch/historical load: 19 | 20 | ```sh 21 | python -m src.etls.template.load dates 22 | ``` 23 | 24 | Note: You should update the end/start dates in the `config/config.py' file. 25 | 26 | ### Daily (today) load 27 | 28 | Most likely, your Gazette will be updated every day, so you will need to run a daily ETL script. Take a look at src.etls.template.load.py for inspiration. 29 | 30 | ```sh 31 | python -m src.etls.template.load today 32 | ``` 33 | 34 | You will probably also want to schedule a daily job to update your embedding database. Then take a look at `src/etls/template/schedule.py`. 35 | 36 | **Note:** For a complete example of a gazette configuration, you can take a look at the BOE `src/etls/boe`. 37 | 38 | # Documents 39 | 40 | If you want to load a single document into the embedding database. 41 | 42 | ...In progress... 43 | 44 | 45 | # Want to develop your own module? 46 | 47 | You are welcome! Please contact us to discuss your requirements: 48 | 49 | * [Darío López](https://www.linkedin.com/in/dar%C3%ADo-l%C3%B3pez-padial-45269150/) 50 | * [Alex Dantart](https://www.linkedin.com/in/dantart/) 51 | * [Jorge Iliarte](https://www.linkedin.com/in/jorge-iliarte-llop/) 52 | * [Jorge Barrachina](https://www.linkedin.com/in/jorgebarrachina/) 53 | -------------------------------------------------------------------------------- /doc/deployment_guide.md: -------------------------------------------------------------------------------- 1 | # How to deploy the service in local 2 | 3 | ## 1. Prepare your vector database in local 4 | 5 | At this moment, we are working with Qdrant as vector database. 
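
Once the container from the commands below is running and the environment variables from step 2 are exported, connectivity can be checked with a few lines of Python (a minimal sketch using the `qdrant-client` package pinned in `requirements.txt`):

```
import os

from qdrant_client import QdrantClient

client = QdrantClient(
    url=os.environ.get("QDRANT_API_URL", "http://localhost:6333"),
    api_key=os.environ["QDRANT_API_KEY"],
)
# Lists the collections created by the ETL (empty right after the first start).
print(client.get_collections())
```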
6 | 7 | Official doc: https://qdrant.tech/documentation/quick-start/ 8 | 9 | ### Download the latest Qdrant image from Dockerhub: 10 | 11 | ``` 12 | docker pull qdrant/qdrant 13 | ``` 14 | 15 | ### Run the service: 16 | 17 | ``` 18 | docker run -p 6333:6333 -p 6334:6334 -v $(pwd)/config/example_qdrant_local.yaml:/qdrant/config/production.yaml -v $(pwd)/qdrant_storage:/qdrant/storage:z qdrant/qdrant 19 | ``` 20 | 21 | * REST API: localhost:6333 22 | * Web UI: localhost:6333/dashboard 23 | 24 | ## 2. Prepare Justicio 25 | 26 | ### Clone the code: 27 | 28 | ``` 29 | git clone git@github.com:bukosabino/justicio.git 30 | ``` 31 | 32 | ### Install the requirements: 33 | 34 | ``` 35 | sudo apt install python3-virtualenv 36 | virtualenv -p python3 venv3.10 37 | source venv3.10/bin/activate 38 | pip install -r requirements.txt 39 | ``` 40 | 41 | ### Export environment variables: 42 | 43 | Note: You need to get an API key for OpenAI and another for Sendgrid. 44 | 45 | ``` 46 | export APP_PATH="." 47 | export SENDGRID_API_KEY= 48 | export OPENAI_API_KEY= 49 | export TOKENIZERS_PARALLELISM=false 50 | export TAVILY_API_KEY="" 51 | export QDRANT_API_KEY="823e071f67c198cc05c73f8bd4580865e6a8819a1f3fe57d2cd49b5c892a5233" 52 | export QDRANT_API_URL="http://localhost:6333" 53 | ``` 54 | 55 | ### Add some vector to the vector database 56 | 57 | Load BOE documents into your vector database (depending on the selected data, may take a few minutes). 58 | 59 | ``` 60 | python -m src.etls.boe.load dates 2024/01/01 2024/01/07 61 | ``` 62 | 63 | ## 3. Run Justicio in local 64 | 65 | ``` 66 | uvicorn src.service.main:APP --host=0.0.0.0 --port=5001 --workers=1 --timeout-keep-alive=125 --log-level=info 67 | ``` 68 | 69 | In the browser 70 | 71 | ``` 72 | http://:5001/docs 73 | ``` 74 | -------------------------------------------------------------------------------- /src/etls/template/load.py: -------------------------------------------------------------------------------- 1 | from datetime import date, datetime 2 | 3 | import typer 4 | 5 | from src.email.send_email import send_email 6 | from src.etls.template.scrapper import TemplateScrapper 7 | from src.etls.common.etl import ETL 8 | from src.etls.template.defs import COLLECTION_NAME 9 | from src.initialize import initialize_app 10 | 11 | 12 | app = typer.Typer() 13 | 14 | 15 | @app.command() 16 | def today(init_objects=None): 17 | if init_objects is None: 18 | init_objects = initialize_app() 19 | etl_job = ETL(config_loader=init_objects.config_loader, vector_store=init_objects.vector_store[COLLECTION_NAME]) 20 | boe_scrapper = TemplateScrapper() 21 | day = date.today() 22 | docs = boe_scrapper.download_day(day) 23 | if docs: 24 | etl_job.run(docs) 25 | 26 | subject = "Today ETL executed" 27 | content = f""" 28 | Today ETL executed 29 | - Date: {day} 30 | - Documents loaded: {len(docs)} 31 | - Database used: {init_objects.config_loader['vector_store']} 32 | """ 33 | send_email(init_objects.config_loader, subject, content) 34 | 35 | 36 | @app.command() 37 | def dates(date_start: str, date_end: str, init_objects=None): 38 | if init_objects is None: 39 | init_objects = initialize_app() 40 | etl_job = ETL(config_loader=init_objects.config_loader, vector_store=init_objects.vector_store[COLLECTION_NAME]) 41 | scrapper = TemplateScrapper() 42 | docs = scrapper.download_days( 43 | date_start=datetime.strptime(date_start, "%Y/%m/%d").date(), 44 | date_end=datetime.strptime(date_end, "%Y/%m/%d").date(), 45 | ) 46 | if docs: 47 | etl_job.run(docs) 48 | 49 | subject = "Load ETL 
executed" 50 | content = f""" 51 | Load ETL executed 52 | - Date start: {date_start} 53 | - Date end: {date_end} 54 | - Documents loaded: {len(docs)} 55 | - Database used: {init_objects.config_loader['vector_store']} 56 | """ 57 | send_email(init_objects.config_loader, subject, content) 58 | 59 | 60 | if __name__ == "__main__": 61 | app() 62 | -------------------------------------------------------------------------------- /benchmark/README.md: -------------------------------------------------------------------------------- 1 | # Load test using Locust 2 | 3 | We will try to compare our service working synchronously and asynchronously. 4 | 5 | ## Constant Incremental Load Test 6 | 7 | We will test how the application behaves in high traffic times and we will find the architectures breaking points. 8 | 9 | The test will submit several request (from 1 to 300) every second during 300 seconds. 10 | 11 | ### Sync 12 | 13 | ``` 14 | $ locust -f benchmark.py --host http://localhost:5001 --headless --users 300 --spawn-rate 1 --run-time 300s --html output/load_test_sync.html --csv=output/load_test_sync ApiSyncUser 15 | ``` 16 | 17 | Captura de pantalla 2023-09-30 a las 13 00 51 18 | 19 | As you can see, 0.4 appears to be the maximum number of simultaneous requests per second that our synchronous architecture can properly manage. 20 | 21 | At this point, the median response time is 9600ms. 22 | 23 | If you had more than 0.4 requests per second, you would increase the response time, but you would have the same number of requests per second. 24 | 25 | ### Async 26 | 27 | ``` 28 | $ locust -f benchmark.py --host http://localhost:5001 --headless --users 300 --spawn-rate 1 --run-time 300s --html output/load_test_async.html --csv=output/load_test_async ApiAsyncUser 29 | ``` 30 | 31 | Captura de pantalla 2023-09-30 a las 13 01 16 32 | 33 | As you can see, 12.1 seems to be the maximum number of simultaneous requests per second that our asynchronous architecture can properly manage. 34 | 35 | At this point, the median response time is 5400ms. 36 | 37 | If you have more requests than 12.1 per second, then you would increase the response time, but you will have the same number of requests per second. 38 | 39 | 40 | **Notes:** 41 | 42 | You can check the results on the `output` folder. 43 | Based on [this example](https://github.com/bukosabino/scoring-handler/tree/main/benchmark/experiment3-benchmarking-locust) 44 | -------------------------------------------------------------------------------- /config/config.yaml: -------------------------------------------------------------------------------- 1 | # RecursiveCharacterTextSplitter 2 | chunk_size: 1200 3 | chunk_overlap: 100 4 | 5 | admin_email: bukosabino@gmail.com 6 | 7 | embeddings_model_name: dariolopez/roberta-base-bne-finetuned-msmarco-qa-es-mnrl-mn 8 | embeddings_model_size: 768 9 | 10 | vector_store: 'qdrant' # {'qdrant', 'pinecone', 'supabase'} 11 | top_k_results: 10 12 | distance_type: 'Cosine' # {'Cosine', 'Euclid', 'Dot'} 13 | 14 | # Prompts 15 | prompt_system: | 16 | Como un experto en derecho y leyes españolas, tu tarea es responder preguntas sobre el Boletín Oficial del Estado (BOE) de España. Para ello, debes tener en cuenta y utilizar el contexto proporcionado para responder de forma precisa a la pregunta del usuario. 17 | Asegúrate de responder siempre en español. Si no conoces la respuesta o no tienes suficiente información para responderla, simplemente admítelo; no intentes inventar una respuesta. 
18 | Deberás proporcionar detalles claros y precisos en tus respuestas, asegurándote de referenciar adecuadamente cualquier ley o reglamento pertinente. Tu objetivo es proporcionar respuestas útiles y precisas para ayudar a los usuarios a entender mejor el BOE y cómo se aplica a sus preguntas. 19 | 20 | prompt_system_context: | 21 | El contexto tiene un formato de lista, donde cada elemento será un diccionario con dos claves: 22 | [{'context': 'contexto necesario para contestar la pregunta', 'score': 0.8}] 23 | La clave 'context' contendrá la información necesaria para contestar a la pregunta y la clave 'score' será una puntuación de entre 0.0 y 1.0. Deberás dar más importancia al contexto cuanto mayor sea el score. 24 | En la respuesta no menciones nada sobre el contexto o los scores. 25 | 26 | # Qdrant 27 | collections: 28 | - justicio 29 | - boe 30 | - bocm 31 | - bopz 32 | - bopv 33 | - boja 34 | - boa 35 | 36 | # Openai 37 | llm_model_name: 'gpt-3.5-turbo-0125' # 'gpt-3.5-turbo-1106', 'gpt-4-1106-preview' 38 | temperature: 0 39 | seed: 42 40 | max_tokens: 1024 41 | 42 | # Not used 43 | ## Pinecone 44 | vector_store_index_name: justicio 45 | ## Supabase 46 | table_name: 'documents' 47 | query_name: 'match_documents' 48 | -------------------------------------------------------------------------------- /src/etls/boe/load.py: -------------------------------------------------------------------------------- 1 | from datetime import date, datetime 2 | 3 | import typer 4 | 5 | from src.email.send_email import send_email 6 | from src.etls.boe.scrapper import BOEScrapper 7 | from src.etls.common.etl import ETL 8 | from src.etls.utils import catch_exceptions 9 | from src.etls.boe.defs import COLLECTION_NAME 10 | from src.initialize import initialize_app 11 | 12 | 13 | app = typer.Typer() 14 | 15 | 16 | @app.command() 17 | @catch_exceptions(cancel_on_failure=True) 18 | def today(init_objects=None): 19 | if init_objects is None: 20 | init_objects = initialize_app() 21 | etl_job = ETL(config_loader=init_objects.config_loader, vector_store=init_objects.vector_store[COLLECTION_NAME]) 22 | boe_scrapper = BOEScrapper() 23 | day = date.today() 24 | docs = boe_scrapper.download_day(day) 25 | if docs: 26 | etl_job.run(docs) 27 | 28 | subject = "[BOE] Daily ETL executed" 29 | content = f""" 30 | Daily ETL executed 31 | - Date: {day} 32 | - Documents loaded: {len(docs)} 33 | - Database used: {init_objects.config_loader['vector_store']} 34 | """ 35 | send_email(init_objects.config_loader, subject, content) 36 | 37 | 38 | @app.command() 39 | def dates(date_start: str, date_end: str, init_objects=None): 40 | if init_objects is None: 41 | init_objects = initialize_app() 42 | etl_job = ETL(config_loader=init_objects.config_loader, vector_store=init_objects.vector_store[COLLECTION_NAME]) 43 | boe_scrapper = BOEScrapper() 44 | docs = boe_scrapper.download_days( 45 | date_start=datetime.strptime(date_start, "%Y/%m/%d").date(), 46 | date_end=datetime.strptime(date_end, "%Y/%m/%d").date(), 47 | ) 48 | if docs: 49 | etl_job.run(docs) 50 | 51 | subject = "[BOE] Load ETL executed" 52 | content = f""" 53 | Load ETL executed 54 | - Date start: {date_start} 55 | - Date end: {date_end} 56 | - Documents loaded: {len(docs)} 57 | - Database used: {init_objects.config_loader['vector_store']} 58 | """ 59 | send_email(init_objects.config_loader, subject, content) 60 | 61 | 62 | if __name__ == "__main__": 63 | app() 64 | -------------------------------------------------------------------------------- /src/etls/boja/load.py: 
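Every `load.py` module under `src/etls/` (BOE above, BOJA below, and the rest) exposes the same Typer CLI with a `today` and a `dates` command. As a minimal sketch of driving one of them from Python rather than from the shell — it assumes the environment variables from the deployment guide (`APP_PATH`, `QDRANT_API_URL`, `QDRANT_API_KEY`, `OPENAI_API_KEY`, `SENDGRID_API_KEY`) are already exported:

```python
# Sketch: programmatic equivalent of `python -m src.etls.boe.load dates 2024/01/01 2024/01/07`.
# Typer's @app.command() leaves the underlying function callable, so it can be reused directly.
from src.etls.boe.load import dates, today

# Backfill one week of BOE documents into the "justicio" collection.
dates(date_start="2024/01/01", date_end="2024/01/07")

# Or ingest only today's gazette, as the daily ETL does.
today()
```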
-------------------------------------------------------------------------------- 1 | from datetime import date, datetime 2 | 3 | import typer 4 | 5 | from src.email.send_email import send_email 6 | from src.etls.boja.scrapper import BOJAScrapper 7 | from src.etls.utils import catch_exceptions 8 | from src.etls.boja.defs import COLLECTION_NAME 9 | from src.etls.common.etl import ETL 10 | from src.initialize import initialize_app 11 | 12 | app = typer.Typer() 13 | 14 | @app.command() 15 | @catch_exceptions(cancel_on_failure=True) 16 | def today(init_objects=None): 17 | if init_objects is None: 18 | init_objects = initialize_app() 19 | etl_job = ETL(config_loader=init_objects.config_loader, vector_store=init_objects.vector_store[COLLECTION_NAME]) 20 | bopv_scrapper = BOJAScrapper() 21 | day = date.today() 22 | docs = bopv_scrapper.download_day(day) 23 | if docs: 24 | etl_job.run(docs) 25 | subject = "[BOJA] Daily ETL executed" 26 | content = f""" 27 | Daily ETL executed 28 | - Date: {day} 29 | - Documents loaded: {len(docs)} 30 | - Database used: {init_objects.config_loader['vector_store']} 31 | """ 32 | send_email(init_objects.config_loader, subject, content) 33 | 34 | @app.command() 35 | def dates(date_start: str, date_end: str, init_objects=None): 36 | if init_objects is None: 37 | init_objects = initialize_app() 38 | etl_job = ETL(config_loader=init_objects.config_loader, vector_store=init_objects.vector_store[COLLECTION_NAME]) 39 | bopv_scrapper = BOJAScrapper() 40 | docs = bopv_scrapper.download_days( 41 | date_start=datetime.strptime(date_start, "%Y/%m/%d").date(), 42 | date_end=datetime.strptime(date_end, "%Y/%m/%d").date() 43 | ) 44 | if docs: 45 | etl_job.run(docs) 46 | 47 | subject = "[BOJA] Load ETL executed" 48 | content = f""" 49 | Load ETL executed 50 | - Date start: {date_start} 51 | - Date end: {date_end} 52 | - Documents loaded: {len(docs)} 53 | - Database used: {init_objects.config_loader['vector_store']} 54 | """ 55 | send_email(init_objects.config_loader, subject, content) 56 | 57 | 58 | if __name__ == "__main__": 59 | app() -------------------------------------------------------------------------------- /src/etls/boa/load.py: -------------------------------------------------------------------------------- 1 | from datetime import date, datetime 2 | import json 3 | 4 | import typer 5 | 6 | from src.email.send_email import send_email 7 | from src.etls.boa.scrapper import BOAScrapper 8 | from src.etls.utils import catch_exceptions 9 | from src.etls.boa.defs import COLLECTION_NAME 10 | from src.etls.common.etl import ETL 11 | from src.initialize import initialize_app 12 | 13 | app = typer.Typer() 14 | 15 | @app.command() 16 | @catch_exceptions(cancel_on_failure=True) 17 | def today(init_objects=None): 18 | if init_objects is None: 19 | init_objects = initialize_app() 20 | etl_job = ETL(config_loader=init_objects.config_loader, vector_store=init_objects.vector_store[COLLECTION_NAME]) 21 | boa_scrapper = BOAScrapper() 22 | day = date.today() 23 | docs = boa_scrapper.download_day(day) 24 | if docs: 25 | etl_job.run(docs) 26 | subject = "[BOA] Daily ETL executed" 27 | content = f""" 28 | Daily ETL executed 29 | - Date: {day} 30 | - Documents loaded: {len(docs)} 31 | - Database used: {init_objects.config_loader['vector_store']} 32 | """ 33 | send_email(init_objects.config_loader, subject, content) 34 | 35 | @app.command() 36 | def dates(date_start: str, date_end: str, init_objects=None): 37 | if init_objects is None: 38 | init_objects = initialize_app() 39 | etl_job = 
ETL(config_loader=init_objects.config_loader, vector_store=init_objects.vector_store[COLLECTION_NAME]) 40 | boa_scrapper = BOAScrapper() 41 | docs = boa_scrapper.download_days( 42 | date_start=datetime.strptime(date_start, "%Y/%m/%d").date(), 43 | date_end=datetime.strptime(date_end, "%Y/%m/%d").date() 44 | ) 45 | if docs: 46 | etl_job.run(docs) 47 | 48 | subject = "[BOA] Load ETL executed" 49 | content = f""" 50 | Load ETL executed 51 | - Date start: {date_start} 52 | - Date end: {date_end} 53 | - Documents loaded: {len(docs)} 54 | - Database used: {init_objects.config_loader['vector_store']} 55 | """ 56 | send_email(init_objects.config_loader, subject, content) 57 | 58 | 59 | if __name__ == "__main__": 60 | app() -------------------------------------------------------------------------------- /src/etls/bocm/load.py: -------------------------------------------------------------------------------- 1 | from datetime import date, datetime 2 | 3 | import typer 4 | 5 | from src.email.send_email import send_email 6 | from src.etls.bocm.scrapper import BOCMScrapper 7 | from src.etls.common.etl import ETL 8 | from src.etls.utils import catch_exceptions 9 | from src.etls.bocm.defs import COLLECTION_NAME 10 | from src.initialize import initialize_app 11 | 12 | 13 | app = typer.Typer() 14 | 15 | 16 | @app.command() 17 | @catch_exceptions(cancel_on_failure=True) 18 | def today(init_objects=None): 19 | if init_objects is None: 20 | init_objects = initialize_app() 21 | etl_job = ETL(config_loader=init_objects.config_loader, vector_store=init_objects.vector_store[COLLECTION_NAME]) 22 | bocm_scrapper = BOCMScrapper() 23 | day = date.today() 24 | docs = bocm_scrapper.download_day(day) 25 | if docs: 26 | etl_job.run(docs) 27 | 28 | subject = "[BOCM] Daily ETL executed" 29 | content = f""" 30 | Daily ETL executed 31 | - Date: {day} 32 | - Documents loaded: {len(docs)} 33 | - Database used: {init_objects.config_loader['vector_store']} 34 | """ 35 | send_email(init_objects.config_loader, subject, content) 36 | 37 | 38 | @app.command() 39 | def dates(date_start: str, date_end: str, init_objects=None): 40 | if init_objects is None: 41 | init_objects = initialize_app() 42 | etl_job = ETL(config_loader=init_objects.config_loader, vector_store=init_objects.vector_store[COLLECTION_NAME]) 43 | bocm_scrapper = BOCMScrapper() 44 | docs = bocm_scrapper.download_days( 45 | date_start=datetime.strptime(date_start, "%Y/%m/%d").date(), 46 | date_end=datetime.strptime(date_end, "%Y/%m/%d").date(), 47 | ) 48 | if docs: 49 | etl_job.run(docs) 50 | 51 | subject = "[BOCM] Load ETL executed" 52 | content = f""" 53 | Load ETL executed 54 | - Date start: {date_start} 55 | - Date end: {date_end} 56 | - Documents loaded: {len(docs)} 57 | - Database used: {init_objects.config_loader['vector_store']} 58 | """ 59 | send_email(init_objects.config_loader, subject, content) 60 | 61 | 62 | if __name__ == "__main__": 63 | app() 64 | -------------------------------------------------------------------------------- /src/etls/bopz/load.py: -------------------------------------------------------------------------------- 1 | from datetime import date, datetime 2 | 3 | import typer 4 | 5 | from src.email.send_email import send_email 6 | from src.etls.bopz.scrapper import BOPZScrapper 7 | from src.etls.bopz.defs import COLLECTION_NAME 8 | from src.etls.utils import catch_exceptions 9 | from src.etls.common.etl import ETL 10 | from src.initialize import initialize_app 11 | 12 | 13 | app = typer.Typer() 14 | 15 | 16 | @app.command() 17 | 
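# Assumption (the decorator below is defined in src/etls/utils.py, which is not shown here): it follows
# the usual `schedule`-library recipe, logging any exception raised by the job and, with
# cancel_on_failure=True, returning schedule.CancelJob so a failing run is dropped instead of crashing the scheduler.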
@catch_exceptions(cancel_on_failure=True) 18 | def today(init_objects=None): 19 | if init_objects is None: 20 | init_objects = initialize_app() 21 | etl_job = ETL(config_loader=init_objects.config_loader, vector_store=init_objects.vector_store[COLLECTION_NAME]) 22 | bopz_scrapper = BOPZScrapper() 23 | day = date.today() 24 | docs = bopz_scrapper.download_day(day) 25 | if docs: 26 | etl_job.run(docs) 27 | 28 | subject = "[BOPZ] Daily ETL executed" 29 | content = f""" 30 | Daily ETL executed 31 | - Date: {day} 32 | - Documents loaded: {len(docs)} 33 | - Database used: {init_objects.config_loader['vector_store']} 34 | """ 35 | send_email(init_objects.config_loader, subject, content) 36 | 37 | 38 | @app.command() 39 | def dates(date_start: str, date_end: str, init_objects=None): 40 | if init_objects is None: 41 | init_objects = initialize_app() 42 | etl_job = ETL(config_loader=init_objects.config_loader, vector_store=init_objects.vector_store[COLLECTION_NAME]) 43 | bopz_scrapper = BOPZScrapper() 44 | docs = bopz_scrapper.download_days( 45 | date_start=datetime.strptime(date_start, "%Y/%m/%d").date(), 46 | date_end=datetime.strptime(date_end, "%Y/%m/%d").date(), 47 | ) 48 | if docs: 49 | etl_job.run(docs) 50 | 51 | subject = "[BOPZ] Load ETL executed" 52 | content = f""" 53 | Load ETL executed 54 | - Date start: {date_start} 55 | - Date end: {date_end} 56 | - Documents loaded: {len(docs)} 57 | - Database used: {init_objects.config_loader['vector_store']} 58 | """ 59 | send_email(init_objects.config_loader, subject, content) 60 | 61 | 62 | if __name__ == "__main__": 63 | app() 64 | -------------------------------------------------------------------------------- /src/etls/bopv/load.py: -------------------------------------------------------------------------------- 1 | from datetime import date, datetime 2 | import json 3 | 4 | import typer 5 | 6 | from src.email.send_email import send_email 7 | from src.etls.bopv.scrapper import BOPVScrapper 8 | from src.etls.bopv.defs import COLLECTION_NAME 9 | from src.etls.utils import catch_exceptions 10 | from src.etls.common.etl import ETL 11 | from src.initialize import initialize_app 12 | 13 | app = typer.Typer() 14 | 15 | @app.command() 16 | @catch_exceptions(cancel_on_failure=True) 17 | def today(init_objects=None): 18 | if init_objects is None: 19 | init_objects = initialize_app() 20 | etl_job = ETL(config_loader=init_objects.config_loader, vector_store=init_objects.vector_store[COLLECTION_NAME]) 21 | bopv_scrapper = BOPVScrapper() 22 | day = date.today() 23 | docs = bopv_scrapper.download_day(day) 24 | if docs: 25 | etl_job.run(docs) 26 | subject = "[BOPV] Daily ETL executed" 27 | content = f""" 28 | Daily ETL executed 29 | - Date: {day} 30 | - Documents loaded: {len(docs)} 31 | - Database used: {init_objects.config_loader['vector_store']} 32 | """ 33 | send_email(init_objects.config_loader, subject, content) 34 | 35 | @app.command() 36 | def dates(date_start: str, date_end: str, init_objects=None): 37 | if init_objects is None: 38 | init_objects = initialize_app() 39 | etl_job = ETL(config_loader=init_objects.config_loader, vector_store=init_objects.vector_store[COLLECTION_NAME]) 40 | bopv_scrapper = BOPVScrapper() 41 | docs = bopv_scrapper.download_days( 42 | date_start=datetime.strptime(date_start, "%Y/%m/%d").date(), 43 | date_end=datetime.strptime(date_end, "%Y/%m/%d").date() 44 | ) 45 | if docs: 46 | etl_job.run(docs) 47 | 48 | subject = "[BOPV] Load ETL executed" 49 | content = f""" 50 | Load ETL executed 51 | - Date start: {date_start} 52 | - 
Date end: {date_end} 53 | - Documents loaded: {len(docs)} 54 | - Database used: {init_objects.config_loader['vector_store']} 55 | """ 56 | send_email(init_objects.config_loader, subject, content) 57 | 58 | 59 | if __name__ == "__main__": 60 | app() -------------------------------------------------------------------------------- /src/etls/common/etl.py: -------------------------------------------------------------------------------- 1 | import logging as lg 2 | import os 3 | import typing as tp 4 | 5 | import pinecone 6 | from langchain.schema import Document 7 | from langchain.text_splitter import RecursiveCharacterTextSplitter 8 | from retry import retry 9 | 10 | from src.etls.common.metadata import MetadataDocument 11 | from src.etls.common.utils import TextLoader 12 | from src.initialize import initialize_logging 13 | 14 | initialize_logging() 15 | 16 | 17 | class ETL: 18 | def __init__(self, config_loader, vector_store): 19 | self._config_loader = config_loader 20 | self._vector_store = vector_store 21 | 22 | def run(self, docs: tp.List[MetadataDocument]): 23 | chunks = self._split_documents(docs) 24 | self._load_database(chunks) 25 | # self._log_database_stats() 26 | 27 | def _split_documents(self, docs: tp.List[MetadataDocument]) -> tp.List[Document]: 28 | """Split documents by chunks 29 | 30 | :param docs: 31 | :return: 32 | """ 33 | logger = lg.getLogger(self._split_documents.__name__) 34 | logger.info("Splitting in chunks %s documents", len(docs)) 35 | docs_chunks = [] 36 | for doc in docs: 37 | loader = TextLoader(file_path=doc.filepath, metadata=doc.dict()) 38 | documents = loader.load() 39 | text_splitter = RecursiveCharacterTextSplitter( 40 | chunk_size=self._config_loader["chunk_size"], 41 | chunk_overlap=self._config_loader["chunk_overlap"], 42 | ) 43 | docs_chunks += text_splitter.split_documents(documents) 44 | 45 | logger.info("Removing file %s", doc.filepath) 46 | os.remove(doc.filepath) 47 | logger.info("Splitted %s documents in %s chunks", len(docs), len(docs_chunks)) 48 | return docs_chunks 49 | 50 | @retry(tries=3, delay=2) 51 | def _load_database(self, docs_chunks: tp.List[Document]) -> None: 52 | logger = lg.getLogger(self._load_database.__name__) 53 | logger.info("Loading %s embeddings to database", len(docs_chunks)) 54 | self._vector_store.add_documents(docs_chunks) 55 | logger.info("Loaded %s embeddings to database", len(docs_chunks)) 56 | 57 | def _log_database_stats(self) -> None: 58 | logger = lg.getLogger(self._log_database_stats.__name__) 59 | index_name = self._config_loader["vector_store_index_name"] 60 | logger.info(pinecone.describe_index(index_name)) 61 | index = pinecone.Index(index_name) 62 | logger.info(index.describe_index_stats()) 63 | -------------------------------------------------------------------------------- /.github/workflows/bandit.yml: -------------------------------------------------------------------------------- 1 | # This workflow uses actions that are not certified by GitHub. 2 | # They are provided by a third-party and are governed by 3 | # separate terms of service, privacy policy, and support 4 | # documentation. 5 | 6 | # Bandit is a security linter designed to find common security issues in Python code. 7 | # This action will run Bandit on your codebase. 8 | # The results of the scan will be found under the Security tab of your repository. 
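# The same scan can be reproduced locally before pushing (assumption: Bandit installed with `pip install bandit`):
#
#   bandit -r src -ll               # recurse into src/, report medium severity findings and above
#   bandit -r src -f txt -o bandit_report.txt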
9 | 10 | # https://github.com/marketplace/actions/bandit-scan is ISC licensed, by abirismyname 11 | # https://pypi.org/project/bandit/ is Apache v2.0 licensed, by PyCQA 12 | 13 | name: Bandit 14 | on: 15 | push: 16 | branches: [ "main" ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ "main" ] 20 | schedule: 21 | - cron: '25 4 * * 6' 22 | 23 | jobs: 24 | bandit: 25 | permissions: 26 | contents: read # for actions/checkout to fetch code 27 | security-events: write # for github/codeql-action/upload-sarif to upload SARIF results 28 | actions: read # only required for a private repository by github/codeql-action/upload-sarif to get the Action run status 29 | 30 | runs-on: ubuntu-latest 31 | steps: 32 | - uses: actions/checkout@v2 33 | - name: Bandit Scan 34 | uses: shundor/python-bandit-scan@9cc5aa4a006482b8a7f91134412df6772dbda22c 35 | with: # optional arguments 36 | # exit with 0, even with results found 37 | exit_zero: true # optional, default is DEFAULT 38 | # Github token of the repository (automatically created by Github) 39 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Needed to get PR information. 40 | # File or directory to run bandit on 41 | # path: # optional, default is . 42 | # Report only issues of a given severity level or higher. Can be LOW, MEDIUM or HIGH. Default is UNDEFINED (everything) 43 | # level: # optional, default is UNDEFINED 44 | # Report only issues of a given confidence level or higher. Can be LOW, MEDIUM or HIGH. Default is UNDEFINED (everything) 45 | # confidence: # optional, default is UNDEFINED 46 | # comma-separated list of paths (glob patterns supported) to exclude from scan (note that these are in addition to the excluded paths provided in the config file) (default: .svn,CVS,.bzr,.hg,.git,__pycache__,.tox,.eggs,*.egg) 47 | # excluded_paths: # optional, default is DEFAULT 48 | # comma-separated list of test IDs to skip 49 | # skips: # optional, default is DEFAULT 50 | # path to a .bandit file that supplies command line arguments 51 | # ini_path: # optional, default is DEFAULT 52 | 53 | -------------------------------------------------------------------------------- /evaluation/embeddings/questions.py: -------------------------------------------------------------------------------- 1 | QUERIES = ( 2 | # Solo sí es sí 3 | ( 4 | "BOE-A-2022-14630", 5 | ( 6 | "¿Es de aplicación la ley de garantía integral de la libertad sexual a niños (varones) " 7 | "menores de edad víctimas de violencias sexuales o solo a niñas y mujeres?" 8 | ), 9 | ), 10 | ( 11 | "BOE-A-2022-14630", 12 | ( 13 | "¿Se aplica la ley a niños (varones) menores de edad víctimas de violencias sexuales o solo a niñas y mujeres?" 14 | ), 15 | ), 16 | ( 17 | "BOE-A-2022-14630", 18 | ( 19 | "¿A qué ayudas económicas pueden acceder las mujeres víctimas de violencias sexuales? ¿Son compatibles con " 20 | "otras ayudas económicas?" 21 | ), 22 | ), 23 | ( 24 | "BOE-A-2022-14630", 25 | ( 26 | "¿Gozarán de derechos las víctimas de violencia sexual que estén en situación administrativa irregular?" 27 | ), 28 | ), 29 | ("BOE-A-2022-14630", ("¿Qué Leyes hablan sobre la igualdad de género?")), 30 | # Vivienda 31 | ( 32 | "BOE-A-2023-12203", 33 | ( 34 | "Según la ley de vivienda de 2023, ¿quién paga los honorarios de la inmobiliaria? 
Contesta detallando qué " 35 | "ocurre en caso de compra y de alquiler" 36 | ), 37 | ), 38 | ( 39 | "BOE-A-2023-12203", 40 | ( 41 | "¿Quién paga los honorarios de la inmobiliaria en caso de alquiler de una vivienda, arrendatario o arrendador?" 42 | ), 43 | ), 44 | ( 45 | "BOE-A-2023-12203", 46 | ( 47 | "¿Cuantos años necesita estar una vivienda sin uso para ser considerada 'vivienda vacía'?" 48 | ), 49 | ), 50 | ( 51 | "BOE-A-2023-12203", 52 | ("¿Se considera gran tenedor a aquella persona con 6 inmuebles?"), 53 | ), 54 | ( 55 | "BOE-A-2023-12203", 56 | ( 57 | "¿Se considera gran tenedor a aquella persona con 7 inmuebles en diferentes zonas tensionadas y no tensionadas?" 58 | ), 59 | ), 60 | ( 61 | "BOE-A-2023-12203", 62 | ( 63 | "¿Se considera gran tenedor a aquella persona con 7 inmuebles en una misma zona tensionada?" 64 | ), 65 | ), 66 | ("BOE-A-2023-12203", ("¿Cuántos años durará el carácter de 'zona tensionada'?")), 67 | ( 68 | "BOE-A-2023-12203", 69 | ( 70 | "¿Qué porcentaje sobre los ingresos es necesario para que una zona sea considerada tensionada?" 71 | ), 72 | ), 73 | ( 74 | "BOE-A-2023-12203", 75 | ( 76 | "¿Qué porcentaje sobre los ingresos es necesario para que una zona sea considerada tensionada?" 77 | ), 78 | ), 79 | # FP 80 | ( 81 | "BOE-A-2023-16889", 82 | ( 83 | "¿Los expertos o expertos senior necesitarán acreditar algún título como el máster de profesorado para " 84 | "impartir clase de Formación Profesional?" 85 | ), 86 | ), 87 | ) 88 | -------------------------------------------------------------------------------- /src/etls/boe/loading/documents.py: -------------------------------------------------------------------------------- 1 | import os 2 | import typing as tp 3 | import logging as lg 4 | 5 | from requests.exceptions import HTTPError 6 | from qdrant_client.http.models import Filter, FieldCondition, MatchValue 7 | from qdrant_client import QdrantClient 8 | import numpy as np 9 | 10 | from src.email.send_email import send_email 11 | from src.etls.boe.scrapper import BOEScrapper 12 | from src.etls.boe.loading.defs_id_largos import BOE_IDS 13 | from src.etls.common.etl import ETL 14 | from src.etls.boe.defs import COLLECTION_NAME 15 | from src.initialize import initialize_app, initialize_logging 16 | 17 | initialize_logging() 18 | 19 | QDRANT_CLIENT = QdrantClient(url=os.environ["QDRANT_API_URL"], api_key=os.environ["QDRANT_API_KEY"], timeout=1000) 20 | 21 | 22 | def load_important_ids(filename): 23 | with open(filename) as f: 24 | lines = f.read().splitlines() 25 | return lines 26 | 27 | 28 | def filter_documents_by_year(documents: tp.List[str]) -> tp.List[str]: 29 | documents_filtered = [] 30 | for document_id in documents: 31 | id_split = document_id.split("-") 32 | if id_split[0] != "BOE" or int(id_split[2]) < 2000: 33 | documents_filtered.append(document_id) 34 | return documents_filtered 35 | 36 | 37 | def filter_documents_loaded(documents: tp.List[str]) -> tp.List[str]: 38 | """Filters a list of document IDs that are not loaded on Embedding database.""" 39 | logger = lg.getLogger(filter_documents_loaded.__name__) 40 | query_vector = np.random.rand(768) 41 | documents_filtered = [] 42 | for document_id in documents: 43 | logger.info("Checking if document id is already loaded: %s", document_id) 44 | search_result = QDRANT_CLIENT.search( 45 | collection_name="justicio", 46 | query_vector=query_vector, 47 | query_filter=Filter( 48 | must=[FieldCondition(key="metadata.identificador", match=MatchValue(value=document_id))] 49 | ), 50 | limit=1, 51 | ) 52 | if not 
search_result: 53 | documents_filtered.append(document_id) 54 | logger.info("Document id: %s is added", document_id) 55 | 56 | return documents_filtered 57 | 58 | 59 | if __name__ == "__main__": 60 | logger = lg.getLogger("__main__") 61 | INIT_OBJECTS = initialize_app() 62 | etl_job = ETL(config_loader=INIT_OBJECTS.config_loader, vector_store=INIT_OBJECTS.vector_store[COLLECTION_NAME]) 63 | boe_scrapper = BOEScrapper() 64 | 65 | documents = load_important_ids("src/etls/boe/loading/defs_ids_importantes.txt") 66 | documents += BOE_IDS 67 | logger.info("Documents size: %s", len(documents)) 68 | documents_filtered = list(set(documents)) 69 | logger.info("Documents filtered by unique: %s", len(documents_filtered)) 70 | documents_filtered = filter_documents_by_year(documents_filtered) 71 | logger.info("Documents filtered by year: %s", len(documents_filtered)) 72 | logger.info(documents_filtered) 73 | # documents_filtered = filter_documents_loaded(documents_filtered) 74 | # logger.info('Documents filtered size: %s', len(documents_filtered)) 75 | 76 | docs = [] 77 | for boe_id in documents_filtered: 78 | logger.info("Loading BOE Id: %s", boe_id) 79 | url = f"https://www.boe.es/diario_boe/xml.php?id={boe_id}" 80 | try: 81 | meta_document = boe_scrapper.download_document(url) 82 | docs.append(meta_document) 83 | except HTTPError: 84 | logger.error("Not scrapped document %s", url) 85 | except AttributeError: 86 | logger.error("Not scrapped document %s", url) 87 | if docs: 88 | etl_job.run(docs) 89 | 90 | subject = "[BOE] Documents ETL executed" 91 | content = f""" 92 | Documents ETL executed 93 | - Documents loaded (BOE ids): {len(documents_filtered)} 94 | - Documents loaded: {len(docs)} 95 | - Database used: {INIT_OBJECTS.config_loader['vector_store']} 96 | """ 97 | send_email(INIT_OBJECTS.config_loader, subject, content) 98 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Nix 2 | shell.nix 3 | 4 | # Justicio 5 | qdrant_storage/ 6 | logs/ 7 | .vscode/ 8 | 9 | # Byte-compiled / optimized / DLL files 10 | __pycache__/ 11 | *.py[cod] 12 | *$py.class 13 | 14 | # C extensions 15 | *.so 16 | 17 | # Distribution / packaging 18 | .Python 19 | build/ 20 | develop-eggs/ 21 | dist/ 22 | downloads/ 23 | eggs/ 24 | .eggs/ 25 | lib/ 26 | lib64/ 27 | parts/ 28 | sdist/ 29 | var/ 30 | wheels/ 31 | share/python-wheels/ 32 | *.egg-info/ 33 | .installed.cfg 34 | *.egg 35 | MANIFEST 36 | 37 | # PyInstaller 38 | # Usually these files are written by a python script from a template 39 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
40 | *.manifest 41 | *.spec 42 | 43 | # Installer logs 44 | pip-log.txt 45 | pip-delete-this-directory.txt 46 | 47 | # Unit test / coverage reports 48 | htmlcov/ 49 | .tox/ 50 | .nox/ 51 | .coverage 52 | .coverage.* 53 | .cache 54 | nosetests.xml 55 | coverage.xml 56 | *.cover 57 | *.py,cover 58 | .hypothesis/ 59 | .pytest_cache/ 60 | cover/ 61 | 62 | # Translations 63 | *.mo 64 | *.pot 65 | 66 | # Django stuff: 67 | *.log 68 | local_settings.py 69 | db.sqlite3 70 | db.sqlite3-journal 71 | 72 | # Flask stuff: 73 | instance/ 74 | .webassets-cache 75 | 76 | # Scrapy stuff: 77 | .scrapy 78 | 79 | # Sphinx documentation 80 | docs/_build/ 81 | 82 | # PyBuilder 83 | .pybuilder/ 84 | target/ 85 | 86 | # Jupyter Notebook 87 | .ipynb_checkpoints 88 | 89 | # IPython 90 | profile_default/ 91 | ipython_config.py 92 | 93 | # pyenv 94 | # For a library or package, you might want to ignore these files since the code is 95 | # intended to run in multiple environments; otherwise, check them in: 96 | # .python-version 97 | 98 | # pipenv 99 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 100 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 101 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 102 | # install all needed dependencies. 103 | #Pipfile.lock 104 | 105 | # poetry 106 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 107 | # This is especially recommended for binary packages to ensure reproducibility, and is more 108 | # commonly ignored for libraries. 109 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 110 | #poetry.lock 111 | 112 | # pdm 113 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 114 | #pdm.lock 115 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 116 | # in version control. 117 | # https://pdm.fming.dev/#use-with-ide 118 | .pdm.toml 119 | 120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 121 | __pypackages__/ 122 | 123 | # Celery stuff 124 | celerybeat-schedule 125 | celerybeat.pid 126 | 127 | # SageMath parsed files 128 | *.sage.py 129 | 130 | # Environments 131 | .env 132 | .venv 133 | env/ 134 | venv/ 135 | ENV/ 136 | env.bak/ 137 | venv.bak/ 138 | 139 | # Spyder project settings 140 | .spyderproject 141 | .spyproject 142 | 143 | # Rope project settings 144 | .ropeproject 145 | 146 | # mkdocs documentation 147 | /site 148 | 149 | # mypy 150 | .mypy_cache/ 151 | .dmypy.json 152 | dmypy.json 153 | 154 | # Pyre type checker 155 | .pyre/ 156 | 157 | # pytype static type analyzer 158 | .pytype/ 159 | 160 | # Cython debug symbols 161 | cython_debug/ 162 | 163 | # PyCharm 164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 166 | # and can be added to the global gitignore or merged into this file. For a more nuclear 167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
168 | .idea/ 169 | 170 | .secrets 171 | .continous_deployment.md 172 | .qdrant.yaml 173 | cron_etl_daily.sh 174 | cron_etl_initial.sh 175 | 176 | *.pem 177 | 178 | xmls/ 179 | 180 | .DS_Store 181 | 182 | # Qdrant 183 | 184 | qdrant_data/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Justicio 2 | 3 | Justicio is a Question/Answering Assistant that generates answers from user questions about the official state gazette of Spain: 4 | Boletín Oficial del Estado (BOE). 5 | 6 | [Spanish link](https://www.boe.es) 7 | 8 | [English link](https://www.boe.es/index.php?lang=en) 9 | 10 | **TL;DR:** All BOE articles are embedded in vectors and stored in a vector database. When a question is asked, the question 11 | is embedded in the same latent space and the most relevant text is retrieved from the vector database by performing a 12 | query using the embedded question. The retrieved pieces of text are then sent to the LLM to construct an answer. 13 | 14 | # Service 15 | 16 | At this moment we are running a user-free service: [Justicio](https://justicio.es/) 17 | 18 | You can test it without charge! Please, give us your feedback if you have! 19 | 20 | # How it works under the hood 21 | 22 | ![image (4)](https://github.com/bukosabino/ia-boe/assets/4375209/bb2ad4ce-f90a-40bf-a77f-bc1443b9896e) 23 | 24 | ## Flow 25 | 26 | 0. All BOE articles are embedded as embeddings and stored in an embedding database. This process is run at startup and every day. 27 | 1. The user writes (using natural language) any question related to the BOE as input to the system. 28 | 2. The backend service processes the input request (user question), transforms the question into an embedding, and sends the generated embedding as a query to the embedding database. 29 | 3. The embedding database returns documents that most closely match the query. 30 | 4. The most similar documents returned by the embedding database are added to the input query as context. Then a request with all the information is sent to the LLM API model. 31 | 5. The LLM API model returns a natural language answer to the user's question. 32 | 6. The user receives an AI-generated response output. 33 | 34 | ## Components 35 | 36 | ### Backend service 37 | 38 | It is the web service, and it is a central component for the whole system, doing most of the tasks: 39 | 40 | * Process the input requests from the user. 41 | * Transform the input text into embeddings. 42 | * Send requests to the embeddings database to get the most similar embeddings. 43 | * Send requests to the LLM API model to generate the response. 44 | * Save the traces. 45 | * Handle input/output exceptions. 46 | 47 | ### Embedding/Vector database 48 | 49 | #### Loading data 50 | 51 | We download the BOE documents and break them into small chunks of text (e.g. 1200 characters). Each text chunk is transformed into an embedding (e.g. a numerically dense vector of 768 sizes). Some additional metadata is also stored with the vectors so that we can pre- or post-filter the search results. [Check the code](https://github.com/bukosabino/ia-boe/blob/main/src/etls/boe/load.py) 52 | 53 | The BOE is updated every day, so we need to run an ETL job every day to retrieve the new documents, transform them into embeddings, link the metadata, and store them in the embedding database. 
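As a rough sketch of that daily refresh (the real deployment triggers it from cron — see `doc/cron_etl_daily_public.sh` and `doc/crontab_e.sh` — so the scheduler library and the time below are only assumptions):

```python
# Illustrative only: run the `today` command of the BOE ETL once per day.
import time

import schedule

from src.etls.boe.load import today

schedule.every().day.at("08:00").do(today)  # hypothetical schedule; the project uses cron instead

while True:
    schedule.run_pending()
    time.sleep(60)
```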
[Check the code](https://github.com/bukosabino/ia-boe/blob/main/src/etls/boe/load.py) 54 | 55 | #### Reading data 56 | 57 | It implements APIs to transform the input question into a vector, and to perform ANN (Approximate Nearest Neighbour) against all the vectors in the database. [Check the code](https://github.com/bukosabino/ia-boe/blob/main/src/service/main.py) 58 | 59 | There are different types of search (semantic search, keyword search, or hybrid search). 60 | 61 | There are different types of ANNs (cosine similarity, Euclidean distance, or dot product). 62 | 63 | #### Embedding Model 64 | 65 | The text in BOE is written in Spanish, so we need a sentence transformer model that is fine-tuned using Spanish 66 | datasets. We are experimenting with [these models](https://github.com/bukosabino/sbert-spanish). 67 | 68 | More info: https://www.newsletter.swirlai.com/p/sai-notes-07-what-is-a-vector-database 69 | 70 | ### LLM API Model 71 | 72 | It is a Large Language Model (LLM) which generates answers for the user questions based on the context, which is 73 | the most similar documents returned by the embedding database. 74 | 75 | ## Tools 76 | 77 | - Langchain 78 | - FastAPI 79 | - Qdrant 80 | - [Fine tuned Spanish SBert model](https://github.com/bukosabino/sbert-spanish) 81 | - BeautifulSoup 82 | 83 | # Deploy your own service 84 | 85 | Check [doc/deployment_guide.md](https://github.com/bukosabino/justicio/blob/main/doc/deployment_guide.md) file 86 | 87 | # Want to help develop the project? 88 | 89 | You are welcome! Please, contact us to see how you can help. 90 | 91 | * [Darío López](https://www.linkedin.com/in/dar%C3%ADo-l%C3%B3pez-padial-45269150/) 92 | * [Alex Dantart](https://www.linkedin.com/in/dantart/) 93 | * [Jorge Iliarte](https://www.linkedin.com/in/jorge-iliarte-llop/) 94 | * [Jorge Barrachina](https://www.linkedin.com/in/jorgebarrachina/) 95 | -------------------------------------------------------------------------------- /src/etls/common/utils.py: -------------------------------------------------------------------------------- 1 | import typing as tp 2 | from requests.exceptions import Timeout 3 | import random 4 | from bs4 import BeautifulSoup 5 | 6 | from langchain.docstore.document import Document 7 | from langchain.document_loaders.base import BaseLoader 8 | 9 | from src.etls.utils import create_retry_session 10 | 11 | 12 | class TextLoader(BaseLoader): 13 | """Load text files.""" 14 | 15 | def __init__( 16 | self, 17 | file_path: str, 18 | encoding: tp.Optional[str] = None, 19 | metadata: tp.Optional[dict] = None, 20 | ): 21 | """Initialize with file path.""" 22 | self.file_path = file_path 23 | self.encoding = encoding 24 | self.metadata = metadata 25 | 26 | def load(self) -> tp.List[Document]: 27 | """Load from file path.""" 28 | with open(self.file_path, encoding=self.encoding) as f: 29 | text = f.read() 30 | return [Document(page_content=text, metadata=self.metadata)] 31 | 32 | 33 | class ScrapperError(Exception): 34 | """ 35 | Custom exception for scraping errors. 36 | """ 37 | 38 | def __init__(self, message="Error durante el proceso de scraping", *args, **kwargs): 39 | """ 40 | Initializes the exception with a custom error message. 41 | 42 | :param message: Error message describing the failure. 43 | :param args: Additional positional arguments. 44 | :param kwargs: Additional keyword arguments. 
45 | """ 46 | super().__init__(message, *args, **kwargs) 47 | self.message = message 48 | 49 | def __str__(self): 50 | """ 51 | Returns a string representation of the exception, including the error message. 52 | """ 53 | return f"ScrapperError: {self.message}" 54 | 55 | 56 | class HTTPRequestException(Exception): 57 | """ 58 | Exception for errors occurring during HTTP requests made by HTTPRequester. 59 | """ 60 | def __init__(self, message="HTTP request error", *args): 61 | super().__init__(message, *args) 62 | 63 | 64 | class HTTPRequester: 65 | user_agents = [ 66 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36", 67 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36", 68 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", 69 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", 70 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", 71 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15", 72 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15" 73 | ] 74 | 75 | default_headers = { 76 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", 77 | "Accept-Language": "es-ES,es;q=0.9,en;q=0.8", 78 | "Connection": "keep-alive", 79 | } 80 | 81 | @classmethod 82 | def get_random_user_agent(cls): 83 | """ 84 | Selects and returns a random User-Agent from the list of user_agents. 85 | """ 86 | return random.choice(cls.user_agents) 87 | 88 | @classmethod 89 | def get_headers(cls): 90 | """ 91 | Generates and returns headers including a random User-Agent. 92 | """ 93 | headers = cls.default_headers.copy() 94 | headers["User-Agent"] = cls.get_random_user_agent() 95 | return headers 96 | 97 | @staticmethod 98 | def get_soup(url, timeout=30, markup='html.parser'): 99 | """ 100 | Performs an HTTP GET request to the provided URL, using random headers, and returns a BeautifulSoup 101 | object if the response is successful. If there is an error or timeout, it throws HTTPRequestException. 
102 | """ 103 | headers = HTTPRequester.get_headers() 104 | try: 105 | session = create_retry_session(retries=5) 106 | response = session.get(url, headers=headers, timeout=timeout) 107 | response.raise_for_status() 108 | return BeautifulSoup(response.content, markup) 109 | except Timeout as e: 110 | raise HTTPRequestException(f"HTTP request timed out: {e}") 111 | except requests.RequestException as e: 112 | raise HTTPRequestException(f"HTTP request failed: {e}") 113 | -------------------------------------------------------------------------------- /src/etls/bocm/utils.py: -------------------------------------------------------------------------------- 1 | import typing as tp 2 | import logging as lg 3 | import re 4 | from src.initialize import initialize_logging 5 | 6 | BOCM_PREFIX = "https://www.bocm.es" 7 | 8 | 9 | initialize_logging() 10 | 11 | 12 | def _get_url_from_cve(cve: str) -> str: 13 | return f"{BOCM_PREFIX}/{cve.lower()}" 14 | 15 | 16 | # Metadata from head tags 17 | def metadata_from_head_tags(soup) -> tp.List[str]: 18 | # extract cve from meta[name="TituloGSA"] 19 | cve = soup.select_one('meta[name="TituloGSA"]')["content"] 20 | fecha = cve.split("-")[1:2][0] 21 | fecha_publicacion = f'{fecha[:4]}-{fecha[4:6]}-{fecha[6:8]}' 22 | 23 | html_link = soup.select_one('meta[property="og:url"]')["content"] 24 | 25 | return [fecha_publicacion, cve, html_link] 26 | 27 | 28 | # Metadata from document header 29 | def metadata_from_doc(soup, seccion: str, cve: str) -> tp.List[str]: 30 | logger = lg.getLogger(metadata_from_doc.__name__) 31 | 32 | # Set defaults 33 | apartado, tipo, anunciante, organo, rango = ["", "", "", "", ""] 34 | 35 | # get headers 36 | paras = [f.get_text().strip().upper() for f in soup.select("#cabeceras p")][:3] 37 | 38 | # Metadata from article description 39 | desc_attempt = soup.select_one('meta[name="description"]') 40 | if (desc_attempt is not None): 41 | desc = desc_attempt["content"] 42 | else: 43 | desc = '' 44 | num_art = re.sub(r"BOCM-\d{8}-(\d{1,3})", r"\1", cve) 45 | 46 | try: 47 | if seccion == "1": 48 | subseccion_letter = ["A", "B", "C", "D"][int(seccion) - 1] 49 | subseccion_name = paras[0] 50 | organo = paras[2] 51 | # Some articles don't have filled description needed for rango field extraction 52 | if len(desc) > 10: 53 | rango = re.sub(r"^(\b[^\s]+\b)(.*)", r"\1", desc.split(num_art)[1], flags=re.ASCII).upper() 54 | if seccion == "2": 55 | subseccion_name = "DISPOSICIONES Y ANUNCIOS DEL ESTADO" 56 | organo = paras[0] 57 | if seccion == "3": 58 | paras_num = len(paras) 59 | subseccion_name = "ADMINISTRACIÓN LOCAL AYUNTAMIENTOS" 60 | if paras_num == 3: 61 | apartado, organo = paras[1:3] 62 | elif paras_num == 2: 63 | organo, apartado = paras[0:2] 64 | else: 65 | apartado = "MANCOMUNIDADES" 66 | organo = paras[0] 67 | 68 | if seccion == "4": 69 | subseccion_name = "ADMINISTRACIÓN DE JUSTICIA" 70 | if seccion == "5": 71 | subseccion_name = "OTROS ANUNCIOS" 72 | anunciante = paras[0] 73 | 74 | except: 75 | logger.error("Problem on section clasification for [%s]", cve) 76 | logger.error("Please review [%s]", _get_url_from_cve(cve)) 77 | 78 | return [subseccion_name, apartado, tipo, organo, anunciante, rango] 79 | 80 | 81 | def metadata_from_doc_header(soup) -> tp.List[str]: 82 | logger = lg.getLogger(metadata_from_doc_header.__name__) 83 | 84 | numero_oficial = soup.select_one(".cabecera_popup h1 strong").get_text().split("-")[1].strip().split(" ")[1].strip() 85 | s_field_a, cve_a, pags_a, *permalink = [str.get_text().split(":") for str in 
soup.select("#titulo_cabecera h2")] 86 | seccion_normalizada = s_field_a[0].strip().split(" ")[1] 87 | paginas = pags_a[1].strip() # Should I convert to int?? 88 | pdf_link = soup.select_one("#titulo_cabecera a")["href"] 89 | 90 | return [numero_oficial, seccion_normalizada, paginas, pdf_link] 91 | 92 | 93 | def select_section_from_id(soup, filtered_section: str) -> tp.List[str]: 94 | logger = lg.getLogger(select_section_from_id.__name__) 95 | 96 | section_links = [] 97 | section, subsection = filtered_section.split("-") 98 | section_container = soup.select_one(f'div[id*="secciones-seccion_{section}"]') 99 | if section_container is not None: 100 | if len(subsection) == 1: 101 | if section == "1": 102 | header_selector = ".view-grouping-header h3" 103 | content_selector = ".view-grouping-content" 104 | else: 105 | header_selector = ".view-content h3" 106 | content_selector = ".view-content" 107 | subsections = section_container.select(".view-grouping") 108 | for group in subsections: 109 | title = group.select_one(header_selector).text 110 | subsection_fix = f"{subsection}\)" 111 | if re.search(subsection_fix, title): 112 | links = [f'{BOCM_PREFIX}{a["href"]}' for a in group.select(f'{content_selector} a[href*="bocm-"]')] 113 | section_links += links 114 | else: 115 | links = [f'{BOCM_PREFIX}{a["href"]}' for a in section_container.select('a[href*="bocm-"]')] 116 | section_links += links 117 | logger.info(f"Captured {len(section_links)} docs from section [{section}]") 118 | return section_links 119 | 120 | 121 | def filter_links_by_section(soup, sections_filter_list: tp.List[str]) -> tp.List[str]: 122 | logger = lg.getLogger(filter_links_by_section.__name__) 123 | 124 | selected = [] 125 | for section_id in sections_filter_list: 126 | links = select_section_from_id(soup, section_id) 127 | selected += links 128 | 129 | logger.info("Retrieved [%s] links for current day", len(selected)) 130 | return selected 131 | 132 | 133 | def clean_text(text: str) -> str: 134 | cleaned = re.sub(r"(\xa0|\t+|\n+)", " ", text, flags=re.MULTILINE) 135 | return cleaned 136 | -------------------------------------------------------------------------------- /src/initialize.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import logging as lg 3 | import os 4 | 5 | import yaml 6 | from langchain.chains import RetrievalQA 7 | from langchain.chat_models import ChatOpenAI 8 | from langchain.embeddings import HuggingFaceEmbeddings 9 | from langchain.prompts import ( 10 | ChatPromptTemplate, 11 | HumanMessagePromptTemplate, 12 | SystemMessagePromptTemplate, 13 | ) 14 | from langchain.vectorstores.qdrant import Qdrant 15 | from openai import AsyncOpenAI 16 | from qdrant_client import QdrantClient 17 | from qdrant_client.models import VectorParams 18 | from tavily import TavilyClient 19 | 20 | 21 | def initialize_logging(): 22 | logger = lg.getLogger() 23 | logger.info("Initializing logging") 24 | logger.handlers = [] 25 | handler = lg.StreamHandler() 26 | formatter = lg.Formatter("[%(asctime)s] [%(process)d] [%(levelname)s] [%(name)s] %(message)s") 27 | handler.setFormatter(formatter) 28 | logger.addHandler(handler) 29 | logger.setLevel(lg.INFO) 30 | logger.info("Initialized logging") 31 | lg.getLogger("uvicorn.error").handlers = logger.handlers 32 | 33 | 34 | def initialize_app(): 35 | """Initializes the application""" 36 | logger = lg.getLogger(initialize_app.__name__) 37 | logger.info("Initializing application") 38 | config_loader = _init_config() 39 | 
vector_store = _init_vector_store(config_loader) 40 | openai_client = _init_openai_client() 41 | tavily_client = TavilyClient(api_key=os.environ["TAVILY_API_KEY"]) 42 | # retrieval_qa = _init_retrieval_qa_llm(vector_store, config_loader) 43 | logger.info("Initialized application") 44 | init_objects = collections.namedtuple( 45 | "init_objects", ["config_loader", "vector_store", "openai_client", "tavily_client"] 46 | ) 47 | return init_objects(config_loader, vector_store, openai_client, tavily_client) 48 | 49 | 50 | def _init_config(): 51 | yaml_config_path = os.path.join(os.environ["APP_PATH"], "config", "config.yaml") 52 | with open(yaml_config_path, "r") as stream: 53 | config_loader = yaml.safe_load(stream) 54 | return config_loader 55 | 56 | 57 | def _init_vector_store(config_loader): 58 | logger = lg.getLogger(_init_vector_store.__name__) 59 | logger.info("Initializing vector store") 60 | if config_loader["vector_store"] == "qdrant": 61 | vector_store = _init_vector_stores_qdrant(config_loader) 62 | else: 63 | raise ValueError("Vector Database not configured") 64 | return vector_store 65 | 66 | 67 | def _init_vector_stores_qdrant(config_loader): 68 | logger = lg.getLogger(_init_vector_stores_qdrant.__name__) 69 | logger.info("Initializing vector stores") 70 | qdrant_client = QdrantClient( 71 | url=os.environ["QDRANT_API_URL"], 72 | api_key=os.environ["QDRANT_API_KEY"], 73 | prefer_grpc=True, 74 | ) 75 | embeddings = HuggingFaceEmbeddings( 76 | model_name=config_loader["embeddings_model_name"], 77 | model_kwargs={"device": "cpu"}, 78 | ) 79 | vector_stores = {} 80 | for collection_name in config_loader["collections"]: 81 | if not _exists_collection(qdrant_client, collection_name): 82 | logger.info("Creating collection for vector store") 83 | qdrant_client.recreate_collection( 84 | collection_name=collection_name, 85 | vectors_config=VectorParams( 86 | size=config_loader["embeddings_model_size"], distance=config_loader["distance_type"] 87 | ), 88 | on_disk_payload=True, 89 | ) 90 | logger.info("Created collection [%s] for vector store", collection_name) 91 | vector_stores[collection_name] = Qdrant(qdrant_client, collection_name, embeddings) 92 | logger.info("Initialized vector store for collection [%s]", collection_name) 93 | return vector_stores 94 | 95 | 96 | def _init_openai_client(): 97 | logger = lg.getLogger(_init_openai_client.__name__) 98 | logger.info("Initializing OpenAI client") 99 | client = AsyncOpenAI( 100 | api_key=os.environ.get("OPENAI_API_KEY"), 101 | ) 102 | logger.info("Initialized OpenAI client") 103 | return client 104 | 105 | 106 | def _exists_collection(qdrant_client, collection_name): 107 | logger = lg.getLogger(_exists_collection.__name__) 108 | try: 109 | qdrant_client.get_collection(collection_name=collection_name) 110 | return True 111 | except: 112 | logger.warn("Collection [%s] doesn't exist", collection_name) 113 | return False 114 | 115 | 116 | def _init_retrieval_qa_llm(vector_store, config_loader): 117 | # DEPRECATED 118 | logger = lg.getLogger(_init_retrieval_qa_llm.__name__) 119 | logger.info("Initializing RetrievalQA LLM") 120 | retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": config_loader["top_k_results"]}) 121 | system_template = f"{config_loader['prompt_system']}----------------\n{{context}}" 122 | messages = [ 123 | SystemMessagePromptTemplate.from_template(system_template), 124 | HumanMessagePromptTemplate.from_template("{question}"), 125 | ] 126 | retrieval_qa = RetrievalQA.from_chain_type( 127 | 
llm=ChatOpenAI( 128 | model_name=config_loader["llm_model_name"], 129 | temperature=config_loader["temperature"], 130 | max_tokens=config_loader["max_tokens"], 131 | ), 132 | chain_type="stuff", 133 | return_source_documents=True, 134 | retriever=retriever, 135 | chain_type_kwargs={"prompt": ChatPromptTemplate.from_messages(messages)}, 136 | ) 137 | logger.info(retrieval_qa.combine_documents_chain.llm_chain.prompt.format) 138 | logger.info("Initialized RetrievalQA LLM") 139 | return retrieval_qa 140 | -------------------------------------------------------------------------------- /src/etls/bopz/scrapper.py: -------------------------------------------------------------------------------- 1 | import re 2 | import logging as lg 3 | import tempfile 4 | import typing as tp 5 | from datetime import date, datetime 6 | 7 | import requests 8 | from bs4 import BeautifulSoup 9 | from requests.exceptions import HTTPError 10 | 11 | from src.etls.bopz.utils import DATA_POST 12 | from src.etls.bopz.metadata import BOPZMetadataDocument 13 | from src.etls.common.scrapper import BaseScrapper 14 | from src.initialize import initialize_logging 15 | from src.etls.utils import create_retry_session 16 | 17 | initialize_logging() 18 | 19 | 20 | def _extract_span_text(row: BeautifulSoup, regex: str) -> str: 21 | """ 22 | Extracts the text of the next sibling for a span element that contains the specified text_label. 23 | 24 | :param row: The BeautifulSoup row element to search within. 25 | :param regex: The regular expresion to search for within the span element. 26 | :return: The stripped text of the next sibling if found, otherwise an empty string. 27 | """ 28 | span_element = row.find("span", string=lambda t: re.search(regex, t)) 29 | return span_element.next_sibling.strip() if span_element and span_element.next_sibling else None 30 | 31 | 32 | def _extract_metadata(soup) -> tp.Dict: 33 | metadata_dict = {} 34 | 35 | # Metadatos 36 | if numero_registro := _extract_span_text(soup, r"N.\. Reg:"): 37 | metadata_dict["numero_oficial"] = numero_registro.split("/")[0] 38 | metadata_dict["titulo"] = f"BOPZ-{numero_registro.replace('/', '-')}" 39 | 40 | if departamento := _extract_span_text(soup, r"Publicador:"): 41 | metadata_dict["departamento"] = departamento 42 | 43 | if materia := _extract_span_text(soup, r"Materia"): 44 | metadata_dict["materia"] = [materia] 45 | 46 | if fecha_publicacion := _extract_span_text(soup, r"Fecha Pub:"): 47 | fecha_publicacion = datetime.strptime(fecha_publicacion, "%d/%m/%Y").strftime("%Y-%m-%d") 48 | metadata_dict["fecha_publicacion"] = fecha_publicacion 49 | metadata_dict["fecha_disposicion"] = fecha_publicacion 50 | metadata_dict["anio"] = str(datetime.strptime(fecha_publicacion, "%Y-%m-%d").year) 51 | metadata_dict["mes"] = str(datetime.strptime(fecha_publicacion, "%Y-%m-%d").month) 52 | metadata_dict["dia"] = str(datetime.strptime(fecha_publicacion, "%Y-%m-%d").day) 53 | 54 | href = soup.find("a", class_="adjunto")["href"][1:] 55 | metadata_dict["url_pdf"] = f"{'http://bop.dpz.es/BOPZ'}{href}" 56 | 57 | return metadata_dict 58 | 59 | 60 | def _list_links_day(url: str, day_str: str) -> tp.List[BeautifulSoup]: 61 | """Get a list of documents listed in a BOPZ url day 62 | 63 | :param url: url base link. 
Example: 'http://bop.dpz.es/BOPZ/portalBuscarEdictos.do' 64 | :param day_str: str date to scrap 65 | :return: list of documents to explore (BeatifullSoup objects) 66 | """ 67 | logger = lg.getLogger(_list_links_day.__name__) 68 | logger.info("Scrapping day: %s", day_str) 69 | DATA_POST["fechaPubInf"] = day_str 70 | DATA_POST["fechaPubSup"] = day_str 71 | response = requests.post(url, data=DATA_POST) 72 | response.raise_for_status() 73 | soup = BeautifulSoup(response.content, "html.parser") 74 | # Find all the rows in the response which correspond to published documents 75 | id_links = [ 76 | id_link 77 | for id_link in soup.find_all("div", class_="row listadoEdictos") 78 | if (href := id_link.find("a", class_="adjunto").get("href", "")) 79 | and "UploadServlet?ruta=Boletines" in href 80 | and href.endswith(".pdf") 81 | ] 82 | logger.info("Scrapped day successfully %s (%s BOPZ documents)", url, len(id_links)) 83 | return id_links 84 | 85 | 86 | class BOPZScrapper(BaseScrapper): 87 | def download_day(self, day: date) -> tp.List[BOPZMetadataDocument]: 88 | """Download all the documents for a specific date.""" 89 | logger = lg.getLogger(self.download_day.__name__) 90 | logger.info("Downloading BOPZ content for day %s", day) 91 | day_str = day.strftime("%d/%m/%Y") 92 | metadata_documents = [] 93 | try: 94 | id_links = _list_links_day("http://bop.dpz.es/BOPZ/portalBuscarEdictos.do", day_str) 95 | for id_link in id_links: 96 | try: 97 | onclick_div = id_link.find("div", onclick=True) 98 | if onclick_div: 99 | onclick_content = onclick_div["onclick"] 100 | start = onclick_content.find("'") + 1 101 | end = onclick_content.find("'", start) 102 | idEdicto = onclick_content[start:end] 103 | url_document = f"http://bop.dpz.es/BOPZ/obtenerContenidoEdicto.do?idEdicto={idEdicto}" 104 | metadata_doc = self.download_document(url_document, id_link) 105 | metadata_documents.append(metadata_doc) 106 | except HTTPError: 107 | logger.error("Not scrapped document %s on day %s", url_document, day_str) 108 | except AttributeError: 109 | logger.error("Not scrapped document %s on day %s", url_document, day_str) 110 | except HTTPError: 111 | logger.error("Not scrapped document on day %s", day_str) 112 | logger.info("Downloaded BOPZ content for day %s", day_str) 113 | return metadata_documents 114 | 115 | def download_document(self, url: str, metadata: BeautifulSoup) -> BOPZMetadataDocument: 116 | """Get text and metadata from a BOPZ document. 117 | 118 | :param url: document url link. Examples: 119 | * http://bop.dpz.es/BOPZ/obtenerContenidoEdicto.do?idEdicto=729066 120 | * http://bop.dpz.es/BOPZ/obtenerContenidoEdicto.do?idEdicto=729162 121 | :metadata BeautifulSoup: document metadata associated with url download link. 
122 | :return: document with metadata and filepath with text content 123 | """ 124 | logger = lg.getLogger(self.download_document.__name__) 125 | logger.info("Scrapping document: %s", url) 126 | session = create_retry_session(retries=5) 127 | response = session.get(url, timeout=10) 128 | response.raise_for_status() 129 | soup = BeautifulSoup(response.text, "lxml") 130 | with tempfile.NamedTemporaryFile("w", delete=False) as fn: 131 | text = soup.find("div", class_="medium-12 panel").get_text(strip=True, separator="\n") 132 | fn.write(text) 133 | metadata_dict = _extract_metadata(metadata) 134 | metadata_dict["identificador"] = url.split("=")[1] 135 | metadata_dict["url_html"] = url 136 | metadata_doc = BOPZMetadataDocument(filepath=fn.name, **metadata_dict) 137 | logger.info("Scrapped document successfully %s", url) 138 | return metadata_doc 139 | -------------------------------------------------------------------------------- /src/etls/boe/scrapper.py: -------------------------------------------------------------------------------- 1 | import logging as lg 2 | import tempfile 3 | import typing as tp 4 | from datetime import date, datetime 5 | 6 | from bs4 import BeautifulSoup 7 | from requests.exceptions import HTTPError 8 | 9 | from src.etls.boe.metadata import BOEMetadataDocument, BOEMetadataReferencia 10 | from src.etls.common.scrapper import BaseScrapper 11 | from src.initialize import initialize_logging 12 | from src.etls.utils import create_retry_session 13 | 14 | initialize_logging() 15 | 16 | 17 | def _extract_metadata(soup) -> tp.Dict: 18 | metadata_dict = {} 19 | 20 | # Metadatos 21 | if identificador := soup.documento.metadatos.identificador: 22 | metadata_dict["identificador"] = identificador.get_text() 23 | 24 | if diario := soup.documento.metadatos.diario: 25 | metadata_dict["diario"] = diario.get_text() 26 | 27 | if numero_oficial := soup.documento.metadatos.numero_oficial: 28 | metadata_dict["numero_oficial"] = numero_oficial.get_text() 29 | 30 | if departamento := soup.documento.metadatos.departamento: 31 | metadata_dict["departamento"] = departamento.get_text() 32 | 33 | if rango := soup.documento.metadatos.rango: 34 | metadata_dict["rango"] = rango.get_text() 35 | 36 | if titulo := soup.documento.metadatos.titulo: 37 | metadata_dict["titulo"] = titulo.get_text() 38 | 39 | if url_pdf := soup.documento.metadatos.url_pdf: 40 | metadata_dict["url_pdf"] = url_pdf.get_text() 41 | 42 | if origen_legislativo := soup.documento.metadatos.origen_legislativo: 43 | metadata_dict["origen_legislativo"] = origen_legislativo.get_text() 44 | 45 | if fecha_publicacion := soup.documento.metadatos.fecha_publicacion: 46 | metadata_dict["fecha_publicacion"] = fecha_publicacion.get_text() 47 | 48 | if fecha_disposicion := soup.documento.metadatos.fecha_disposicion: 49 | metadata_dict["fecha_disposicion"] = fecha_disposicion.get_text() 50 | 51 | metadata_dict["anio"] = datetime.strptime(fecha_publicacion.get_text(), "%Y%m%d").strftime("%Y") 52 | 53 | metadata_dict["mes"] = datetime.strptime(fecha_publicacion.get_text(), "%Y%m%d").strftime("%m") 54 | 55 | metadata_dict["dia"] = datetime.strptime(fecha_publicacion.get_text(), "%Y%m%d").strftime("%d") 56 | 57 | # Analisis 58 | if observaciones := soup.documento.analisis.observaciones: 59 | metadata_dict["observaciones"] = observaciones.get_text() 60 | 61 | if ambito_geografico := soup.documento.analisis.ambito_geografico: 62 | metadata_dict["ambito_geografico"] = ambito_geografico.get_text() 63 | 64 | if modalidad := 
soup.documento.analisis.modalidad: 65 | metadata_dict["modalidad"] = modalidad.get_text() 66 | 67 | if tipo := soup.documento.analisis.tipo: 68 | metadata_dict["tipo"] = tipo.get_text() 69 | 70 | metadata_dict["materias"] = [ 71 | materia.get_text() for materia in soup.select("documento > analisis > materias > materia") 72 | ] 73 | metadata_dict["alertas"] = [alerta.get_text() for alerta in soup.select("documento > analisis > alertas > alerta")] 74 | metadata_dict["notas"] = [nota.get_text() for nota in soup.select("documento > analisis > notas > nota")] 75 | metadata_dict["ref_posteriores"] = [ 76 | BOEMetadataReferencia( 77 | id=ref["referencia"], 78 | palabra=ref.palabra.get_text(), 79 | texto=ref.texto.get_text(), 80 | ) 81 | for ref in soup.select("documento > analisis > referencias > posteriores > posterior") 82 | ] 83 | metadata_dict["ref_anteriores"] = [ 84 | BOEMetadataReferencia( 85 | id=ref["referencia"], 86 | palabra=ref.palabra.get_text(), 87 | texto=ref.texto.get_text(), 88 | ) 89 | for ref in soup.select("documento > analisis > referencias > anteriores > anterior") 90 | ] 91 | return metadata_dict 92 | 93 | 94 | def _list_links_day(url: str) -> tp.List[str]: 95 | """Get a list of links in a BOE url day filtering by Seccion 1 and Seccion T. 96 | 97 | :param url: day url link. Example: https://www.boe.es/diario_boe/xml.php?id=BOE-S-20230817 98 | :return: list of id documents to explore (links) 99 | """ 100 | logger = lg.getLogger(_list_links_day.__name__) 101 | logger.info("Scrapping day: %s", url) 102 | session = create_retry_session(retries=5) 103 | response = session.get(url, timeout=10) 104 | response.raise_for_status() 105 | soup = BeautifulSoup(response.text, "lxml") 106 | id_links = [ 107 | url.text.split("?id=")[-1] 108 | for section in soup.find_all( 109 | lambda tag: tag.name == "seccion" 110 | and "num" in tag.attrs 111 | and (tag.attrs["num"] == "1" or tag.attrs["num"] == "T") # Note: Sección 1 and Tribunal Supremo 112 | ) 113 | for url in section.find_all("urlxml") 114 | ] 115 | logger.info("Scrapped day successfully %s (%s BOE documents)", url, len(id_links)) 116 | return id_links 117 | 118 | 119 | class BOEScrapper(BaseScrapper): 120 | def download_day(self, day: date) -> tp.List[BOEMetadataDocument]: 121 | """Download all the documents for a specific date.""" 122 | logger = lg.getLogger(self.download_day.__name__) 123 | logger.info("Downloading BOE content for day %s", day) 124 | day_str = day.strftime("%Y%m%d") 125 | day_url = f"https://www.boe.es/diario_boe/xml.php?id=BOE-S-{day_str}" 126 | metadata_documents = [] 127 | try: 128 | id_links = _list_links_day(day_url) 129 | for id_link in id_links: 130 | url_document = f"https://www.boe.es/diario_boe/xml.php?id={id_link}" 131 | try: 132 | metadata_doc = self.download_document(url_document) 133 | metadata_documents.append(metadata_doc) 134 | except HTTPError: 135 | logger.error("Not scrapped document %s on day %s", url_document, day_url) 136 | except AttributeError: 137 | logger.error("Not scrapped document %s on day %s", url_document, day_url) 138 | except HTTPError: 139 | logger.error("Not scrapped document on day %s", day_url) 140 | logger.info("Downloaded BOE content for day %s", day) 141 | return metadata_documents 142 | 143 | def download_document(self, url: str) -> BOEMetadataDocument: 144 | """Get text and metadata from a BOE xml url document. 145 | 146 | :param url: document url link. 
Examples: 147 | * https://www.boe.es/diario_boe/xml.php?id=BOE-A-2022-14630 148 | * https://www.boe.es/diario_boe/xml.php?id=BOE-A-2023-12203 149 | :return: document with metadata and filepath with text content 150 | """ 151 | logger = lg.getLogger(self.download_document.__name__) 152 | logger.info("Scrapping document: %s", url) 153 | session = create_retry_session(retries=5) 154 | response = session.get(url, timeout=10) 155 | response.raise_for_status() 156 | soup = BeautifulSoup(response.text, "lxml") 157 | with tempfile.NamedTemporaryFile("w", delete=False) as fn: 158 | text = soup.select_one("documento > texto").get_text() 159 | fn.write(text) 160 | metadata_doc = BOEMetadataDocument(filepath=fn.name, **_extract_metadata(soup)) 161 | logger.info("Scrapped document successfully %s", url) 162 | return metadata_doc 163 | -------------------------------------------------------------------------------- /src/etls/bocm/scrapper.py: -------------------------------------------------------------------------------- 1 | import logging as lg 2 | import tempfile 3 | import typing as tp 4 | from datetime import date, datetime 5 | import re 6 | 7 | import requests 8 | from bs4 import BeautifulSoup 9 | from requests.exceptions import HTTPError 10 | 11 | from src.etls.bocm.metadata import BOCMMetadataDocument 12 | from src.etls.bocm.utils import * 13 | from src.etls.common.scrapper import BaseScrapper 14 | from src.initialize import initialize_logging 15 | from src.etls.utils import create_retry_session 16 | 17 | 18 | initialize_logging() 19 | 20 | 21 | # transformation from url retrieve from redirection to one pointing to complete summary 22 | def _adapt_link_to_complete_summary(url: str) -> str: 23 | """Get complete summary url transforming the url received by param 24 | 25 | :url: url to transform. Example : https://www.bocm.es/boletin/bocm-20240126-22 26 | :return: summary of the day url. Example: https://www.bocm.es/boletin-completo/BOCM-20240126/22 27 | """ 28 | tmp_str = url.replace("boletin", "boletin-completo").replace("/bocm", "/BOCM") 29 | res = re.sub(r"(\d)-(\d+)", r"\1/\2", tmp_str) 30 | return res 31 | 32 | 33 | # get url from response redirection 34 | def _get_summary_link_from_date(day: date) -> str: 35 | """Get summary url from response redirection 36 | 37 | :day: day format for request param: '%d/%m/%Y' 38 | :return: summary of the day url 39 | """ 40 | logger = lg.getLogger(_get_summary_link_from_date.__name__) 41 | 42 | search_url = "https://www.bocm.es/search-day-month" 43 | 44 | try: 45 | response = requests.post(search_url, data={"field_date[date]": day}) 46 | response.raise_for_status() 47 | link = response.headers["Link"].split(";")[0].replace("<", "").replace(">", "") 48 | if re.search("search-day-month", link): 49 | raise ValueError("No link published") 50 | else: 51 | final_url = _adapt_link_to_complete_summary(link) 52 | 53 | except HTTPError: 54 | logger.error("No link got on day %s", day) 55 | final_url = None 56 | 57 | except ValueError as err: 58 | logger.error("%s for day %s. 
Skipping...", err.args[0], day)
59 |         final_url = None
60 | 
61 |     return final_url
62 | 
63 | 
64 | def _extract_metadata(soup) -> tp.Dict:
65 |     metadata_dict = {}
66 | 
67 |     # Metadata from head tags
68 |     fecha_publicacion, cve, html_link = metadata_from_head_tags(soup)
69 | 
70 |     # Metadata from the document header
71 |     numero_oficial, seccion_normalizada, paginas, pdf_link = metadata_from_doc_header(soup)
72 | 
73 |     # Metadata from document
74 |     seccion = seccion_normalizada.split(".")[0]
75 |     subseccion, apartado, tipo, organo, anunciante, rango = metadata_from_doc(soup, seccion, cve)
76 | 
77 |     metadata_dict["rango"] = rango
78 |     metadata_dict["identificador"] = cve
79 |     metadata_dict["numero_oficial"] = numero_oficial
80 |     metadata_dict["paginas"] = paginas
81 | 
82 |     # departamento always matches organo
83 |     metadata_dict["departamento"] = organo
84 | 
85 |     metadata_dict["seccion_normalizada"] = seccion_normalizada
86 |     metadata_dict["seccion"] = seccion.upper()
87 |     metadata_dict["subseccion"] = subseccion
88 |     metadata_dict["tipo"] = tipo
89 |     metadata_dict["apartado"] = apartado
90 | 
91 |     metadata_dict["titulo"] = cve
92 |     metadata_dict["url_pdf"] = pdf_link
93 |     metadata_dict["url_html"] = html_link
94 | 
95 |     metadata_dict["fecha_publicacion"] = fecha_publicacion
96 |     metadata_dict["fecha_disposicion"] = fecha_publicacion
97 | 
98 |     metadata_dict["anio"] = datetime.strptime(fecha_publicacion, "%Y-%m-%d").strftime("%Y")
99 | 
100 |     metadata_dict["mes"] = datetime.strptime(fecha_publicacion, "%Y-%m-%d").strftime("%m")
101 | 
102 |     metadata_dict["dia"] = datetime.strptime(fecha_publicacion, "%Y-%m-%d").strftime("%d")
103 | 
104 |     return metadata_dict
105 | 
106 | 
107 | def _list_links_day(url: str) -> tp.List[str]:
108 |     """Get a list of links in a BOCM day summary url, filtering by Seccion 1-A, 3 and 4.
109 | 
110 |     :param url: summary url link.
Example: https://www.bocm.es/boletin-completo/BOCM-20240103/2 111 | :return: list of urls filtered by sections to download 112 | """ 113 | logger = lg.getLogger(_list_links_day.__name__) 114 | 115 | logger.info("Scrapping day: %s", url) 116 | response = requests.get(url) 117 | response.raise_for_status() 118 | soup = BeautifulSoup(response.text, features="lxml") 119 | 120 | # filter by sections 121 | sections_to_filter = ["1-A", "3-", "4-"] 122 | filtered_links = filter_links_by_section(soup, sections_to_filter) 123 | logger.info("Scrapped day successfully %s (%s BOCM documents)", url, len(filtered_links)) 124 | 125 | return filtered_links 126 | 127 | 128 | class BOCMScrapper(BaseScrapper): 129 | def download_day(self, day: date) -> tp.List[BOCMMetadataDocument]: 130 | """Download all the documents for a specific date.""" 131 | logger = lg.getLogger(self.download_day.__name__) 132 | logger.info("Downloading BOCM content for day %s", day) 133 | day_str = day.strftime("%d/%m/%Y") 134 | 135 | summary_url = _get_summary_link_from_date(day_str) 136 | 137 | metadata_documents = [] 138 | if summary_url is not None: 139 | logger.info("Got summary url for day %s", day) 140 | logger.info("URL: [%s] for selected day [%s]", summary_url, day) 141 | 142 | try: 143 | list_urls = _list_links_day(summary_url) 144 | for url in list_urls: 145 | try: 146 | # Skip urls that contains in the path 'boletin' 147 | if not re.search("boletin", url): 148 | metadata_doc = self.download_document(url) 149 | metadata_documents.append(metadata_doc) 150 | except HTTPError: 151 | logger.error("Not scrapped document %s on day %s", url, day) 152 | except AttributeError: 153 | logger.error("Not scrapped document %s on day %s", url, day) 154 | except HTTPError: 155 | logger.error("Not scrapped document %s on day %s", url, day) 156 | logger.info("Downloaded all BOCM docs for day %s", day) 157 | return metadata_documents 158 | 159 | def download_document(self, url: str) -> BOCMMetadataDocument: 160 | """Get text and metadata from BOCM summary html url document. 161 | 162 | :param url: document url link. 
Examples: 163 | * https://www.bocm.es/bocm-20240123-76 164 | * https://www.bocm.es/bocm-20240123-98 165 | :return: document with metadata and filepath with text content 166 | """ 167 | logger = lg.getLogger(self.download_document.__name__) 168 | logger.info("Scrapping document: %s", url) 169 | session = create_retry_session(retries=5) 170 | response = session.get(url, timeout=10) 171 | response.raise_for_status() 172 | soup = BeautifulSoup(response.text, features="lxml") 173 | with tempfile.NamedTemporaryFile("w", delete=False) as fn: 174 | text = soup.select_one("#main").get_text() 175 | text_cleaned = clean_text(text) 176 | fn.write(text_cleaned) 177 | metadata_doc = BOCMMetadataDocument(filepath=fn.name, **_extract_metadata(soup)) 178 | logger.info("Scrapped document successfully %s", url) 179 | return metadata_doc 180 | -------------------------------------------------------------------------------- /src/etls/boa/scrapper.py: -------------------------------------------------------------------------------- 1 | import logging as lg 2 | import tempfile 3 | import typing as tp 4 | from datetime import date, datetime 5 | import random 6 | import json 7 | from lxml import etree 8 | 9 | import requests 10 | 11 | from src.etls.boa.metadata import BOAMetadataDocument 12 | from src.etls.common.metadata import MetadataDocument 13 | from src.etls.common.scrapper import BaseScrapper 14 | from src.etls.common.utils import ScrapperError 15 | from src.initialize import initialize_logging 16 | from src.etls.utils import create_retry_session 17 | 18 | 19 | initialize_logging() 20 | 21 | 22 | def _remove_html_tags(text: str) -> str: 23 | parser = etree.HTMLParser() 24 | tree = etree.fromstring(text, parser) 25 | clean_text = etree.tostring(tree, encoding="unicode", method='text') 26 | return clean_text.strip() 27 | 28 | 29 | def _extract_metadata(doc: dict) -> tp.Dict: 30 | metadata_dict = {} 31 | 32 | try: 33 | metadata_dict["identificador"] = doc["DOCN"] 34 | except KeyError: 35 | pass 36 | 37 | try: 38 | metadata_dict["numero_boletin"] = doc["Numeroboletin"] 39 | except KeyError: 40 | pass 41 | 42 | try: 43 | metadata_dict["departamento"] = doc["Emisor"].capitalize() 44 | except KeyError: 45 | pass 46 | 47 | try: 48 | metadata_dict["url_pdf"] = doc["UrlPdf"].split('´`')[0][1:] 49 | except KeyError: 50 | pass 51 | 52 | try: 53 | metadata_dict["url_boletin"] = doc["UrlBCOM"].split('´`')[0][1:] 54 | except KeyError: 55 | pass 56 | 57 | try: 58 | metadata_dict["seccion"] = doc["Seccion"] 59 | except KeyError: 60 | pass 61 | 62 | try: 63 | metadata_dict["titulo"] = doc["Titulo"] 64 | except KeyError: 65 | pass 66 | 67 | try: 68 | metadata_dict["subseccion"] = doc["Subseccion"] 69 | except KeyError: 70 | pass 71 | 72 | try: 73 | metadata_dict["codigo_materia"] = doc["CodigoMateria"] 74 | except KeyError: 75 | pass 76 | 77 | try: 78 | metadata_dict["rango"] = doc["Rango"].capitalize() 79 | except KeyError: 80 | pass 81 | 82 | try: 83 | fecha_disposicion = datetime.strptime(doc["Fechadisposicion"], "%Y%m%d").strftime("%Y-%m-%d") 84 | metadata_dict["fecha_disposicion"] = fecha_disposicion 85 | except KeyError: 86 | pass 87 | 88 | return metadata_dict 89 | 90 | 91 | class BOAScrapper(BaseScrapper): 92 | def __init__(self): 93 | self.base_url = "https://www.boa.aragon.es/cgi-bin/EBOA/BRSCGI" 94 | self.user_agents = [ 95 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36", 96 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 
(KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36", 97 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", 98 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", 99 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", 100 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15", 101 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15" 102 | ] 103 | self.headers = { 104 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", 105 | "Accept-Language": "es-ES,es;q=0.9,en;q=0.8", 106 | "Connection": "keep-alive", 107 | "User-Agent": random.choice(self.user_agents), 108 | } 109 | 110 | 111 | def download_day(self, day: date) -> tp.List[BOAMetadataDocument]: 112 | """Download all the documents for a specific date.""" 113 | try: 114 | logger = lg.getLogger(self.download_day.__name__) 115 | logger.info("Downloading BOA content for day %s", day) 116 | params ={'CMD': 'VERLST', 117 | 'BASE': 'BZHT', 118 | 'DOCS': '1-250', 119 | 'SEC': 'OPENDATABOAJSONAPP', 120 | 'OUTPUTMODE': 'JSON', 121 | 'SEPARADOR':'', 122 | 'PUBL-C': day.strftime("%Y%m%d"), 123 | 'SECC-C':'BOA%2Bo%2BDisposiciones%2Bo%2BJusticia' 124 | # versión completa (todas las secciones, incluyendo personal, etc): 125 | # 'SECC-C':'BOA%2Bo%2BDisposiciones%2Bo%2BPersonal%2Bo%2BAcuerdos%2Bo%2BJusticia%2Bo%2BAnuncios' 126 | } 127 | session = create_retry_session(retries=5) 128 | response = session.get(self.base_url, params=params, timeout=10) 129 | raw_result = response.text 130 | if 'No se han recuperado documentos' in raw_result: 131 | logger.info(f"No hay contenido disponible para el día {day}") 132 | return [] 133 | if response.status_code != 200: 134 | response.raise_for_status() 135 | raw_result = raw_result.replace('\\', '\\\\') 136 | result_json = json.loads(raw_result) 137 | disposiciones = [] 138 | for doc in result_json: 139 | metadata_doc = self.download_document(json.dumps(doc)) 140 | fecha_publicacion_atributos = { 141 | "fecha_publicacion": day.strftime("%Y-%m-%d"), 142 | "anio": day.strftime("%Y"), 143 | "mes": day.strftime("%m"), 144 | "dia": day.strftime("%d"), 145 | } 146 | for atributo, valor in fecha_publicacion_atributos.items(): 147 | setattr(metadata_doc, atributo, valor) 148 | disposiciones.append(metadata_doc) 149 | return disposiciones 150 | except requests.exceptions.RequestException as e: 151 | raise Exception(f"Error de red o HTTP al intentar acceder a {self.base_url}: {e}") 152 | except Exception as e: 153 | raise Exception(f"Error inesperado: {e}") 154 | 155 | 156 | def download_document(self, url: str) -> MetadataDocument: 157 | ''' 158 | En BOAScrapper, a partir de la url diaria (en la función download_day), 159 | se recibe directamente el contenido de todos los boletines. Por lo tanto, 160 | no hace falta scrapear cada uno de las publicaciones a partir de su url. 
161 | 162 | Para ser consistentes con el resto de scrappers, se mantiene el método 163 | download_document, pero en este caso en vez de la url se le pasará una 164 | string con el contenido y los metadatos, tal y como se recibe de la base_url 165 | ''' 166 | 167 | logger = lg.getLogger(self.download_document.__name__) 168 | doc = json.loads(url) 169 | url_pdf_raw = doc['UrlPdf'] 170 | url_pdf = url_pdf_raw.split('´`')[0][1:] 171 | logger.info("Scrapping document: %s", url_pdf) 172 | content = doc['Texto'] 173 | clean_text = _remove_html_tags(content) 174 | with tempfile.NamedTemporaryFile("w", delete=False, encoding='utf-8') as fn: 175 | fn.write(clean_text) 176 | try: 177 | metadata_doc = BOAMetadataDocument(filepath=fn.name,**_extract_metadata(doc)) 178 | except: 179 | raise ScrapperError("No se pudo encontrar alguno de los elementos requeridos.") 180 | logger.info("Scrapped document successfully %s", url_pdf) 181 | return metadata_doc 182 | -------------------------------------------------------------------------------- /src/service/main.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging as lg 3 | import time 4 | import uuid 5 | import os 6 | import typing as tp 7 | import ipaddress 8 | 9 | import httpx 10 | from fastapi import FastAPI 11 | 12 | from src.initialize import initialize_app, initialize_logging 13 | from src.utils import inject_additional_attributes, timeit 14 | from langtrace_python_sdk import SendUserFeedback, langtrace 15 | from langtrace_python_sdk.utils.with_root_span import with_langtrace_root_span 16 | 17 | langtrace.init(api_key=os.environ.get('LANGTRACE_API_KEY')) 18 | initialize_logging() 19 | 20 | APP = FastAPI() 21 | 22 | INIT_OBJECTS = initialize_app() 23 | 24 | DEFAULT_INPUT_QUERY = ( 25 | "¿Es de aplicación la ley de garantía integral de la libertad sexual a niños (varones) menores de edad " 26 | "víctimas de violencias sexuales o solo a niñas y mujeres?" 
27 | ) 28 | DEFAULT_COLLECTION_NAME = "justicio" 29 | 30 | 31 | @with_langtrace_root_span() 32 | async def call_llm_api(span_id, trace_id, model_name: str, messages: tp.List[tp.Dict[str, str]]): 33 | response = await INIT_OBJECTS.openai_client.chat.completions.create( 34 | model=model_name, 35 | messages=messages, 36 | temperature=INIT_OBJECTS.config_loader["temperature"], 37 | seed=INIT_OBJECTS.config_loader["seed"], 38 | max_tokens=INIT_OBJECTS.config_loader["max_tokens"], 39 | ) 40 | return response, span_id, trace_id 41 | 42 | 43 | @APP.get("/healthcheck") 44 | @timeit 45 | async def healthcheck(): 46 | """Asynchronous Health Check""" 47 | # TODO: healthcheck with embeddings db api and llm api 48 | return {"status": "OK"} 49 | 50 | 51 | @APP.get("/semantic_search") 52 | @timeit 53 | async def semantic_search(input_query: str = DEFAULT_INPUT_QUERY, collection_name: str = DEFAULT_COLLECTION_NAME): 54 | logger = lg.getLogger(semantic_search.__name__) 55 | logger.info(input_query) 56 | docs = await INIT_OBJECTS.vector_store[collection_name].asimilarity_search_with_score( 57 | query=input_query, k=INIT_OBJECTS.config_loader["top_k_results"] 58 | ) 59 | logger.info(docs) 60 | return docs 61 | 62 | 63 | @APP.get("/semantic_search_tavily") 64 | @timeit 65 | async def semantic_search_tavily(input_query: str = DEFAULT_INPUT_QUERY): 66 | logger = lg.getLogger(semantic_search_tavily.__name__) 67 | logger.info(input_query) 68 | docs = INIT_OBJECTS.tavily_client.search( 69 | query=input_query, 70 | search_depth="advanced", 71 | include_domains=["https://www.boe.es/"], 72 | max_results=10, 73 | topic="general", 74 | include_raw_content=False, 75 | include_answer=False, 76 | ) 77 | logger.info(docs) 78 | return docs 79 | 80 | 81 | async def a_request_get(url): 82 | """Requests for sync/async load tests""" 83 | async with httpx.AsyncClient(timeout=10.0) as client: 84 | response = await client.get(url) 85 | return response.text 86 | 87 | 88 | @APP.get("/qa_feedback") 89 | @with_langtrace_root_span("Feedback") 90 | @timeit 91 | async def qa_feedback(span_id: str, trace_id: str, user_score: int): 92 | data = { 93 | "spanId": span_id, "traceId": trace_id, "userScore": user_score, "userId": None 94 | } 95 | SendUserFeedback().evaluate(data=data) 96 | return {"feedback": "OK"} 97 | 98 | 99 | @APP.get("/qa") 100 | @with_langtrace_root_span("RAG Justicio") 101 | @timeit 102 | async def qa( 103 | input_query: str = DEFAULT_INPUT_QUERY, 104 | collection_name: str = DEFAULT_COLLECTION_NAME, 105 | model_name: str = INIT_OBJECTS.config_loader["llm_model_name"], 106 | input_original_query: str | None = None, 107 | ip_request_client: ipaddress.IPv4Address | None = None, 108 | ): 109 | logger = lg.getLogger(qa.__name__) 110 | logger.info(input_query) 111 | 112 | # Getting context from embedding database (Qdrant) 113 | docs = await INIT_OBJECTS.vector_store[collection_name].asimilarity_search_with_score( 114 | query=input_query, k=INIT_OBJECTS.config_loader["top_k_results"] 115 | ) 116 | 117 | # Generate response using a LLM (OpenAI) 118 | context_preprocessed = [{"context": doc[0].page_content, "score": doc[1]} for doc in docs] 119 | messages = [ 120 | {"role": "system", "content": INIT_OBJECTS.config_loader["prompt_system"]}, 121 | { 122 | "role": "system", 123 | "content": INIT_OBJECTS.config_loader["prompt_system_context"], 124 | }, 125 | {"role": "system", "content": "A continuación se proporciona el contexto:"}, 126 | {"role": "system", "content": str(context_preprocessed)}, 127 | { 128 | "role": "system", 129 | 
"content": "A continuación se proporciona la pregunta del usuario:", 130 | }, 131 | {"role": "user", "content": input_query}, 132 | ] 133 | # logger.info(messages) 134 | additional_attributes = { 135 | "db.collection.name": collection_name, 136 | "service.ip": ip_request_client, 137 | "llm.original_query": input_original_query 138 | } 139 | response, span_id, trace_id = await inject_additional_attributes( 140 | lambda: call_llm_api(model_name=model_name, messages=messages), additional_attributes 141 | ) 142 | answer = response.choices[0].message.content 143 | logger.info(answer) 144 | logger.info(response.usage) 145 | 146 | response_payload = dict( 147 | scoring_id=str(uuid.uuid4()), 148 | context=docs, 149 | answer=answer, 150 | span_id=str(span_id), 151 | trace_id=str(trace_id), 152 | ) 153 | return response_payload 154 | 155 | 156 | @APP.get("/qa_tavily") 157 | @timeit 158 | async def qa_tavily(input_query: str = DEFAULT_INPUT_QUERY): 159 | logger = lg.getLogger(qa_tavily.__name__) 160 | logger.info(input_query) 161 | 162 | # Getting context from internet browser (Tavily) 163 | docs = INIT_OBJECTS.tavily_client.search( 164 | query=input_query, 165 | search_depth="advanced", 166 | include_domains=["https://www.boe.es/"], 167 | max_results=10, 168 | topic="general", 169 | include_raw_content=False, 170 | include_answer=False, 171 | ) 172 | 173 | # Generate response using a LLM (OpenAI) 174 | context_preprocessed = [{"context": doc["content"], "score": doc["score"]} for doc in docs["results"]] 175 | 176 | response = await INIT_OBJECTS.openai_client.chat.completions.create( 177 | model=INIT_OBJECTS.config_loader["llm_model_name"], 178 | messages=[ 179 | {"role": "system", "content": INIT_OBJECTS.config_loader["prompt_system"]}, 180 | { 181 | "role": "system", 182 | "content": INIT_OBJECTS.config_loader["prompt_system_context"], 183 | }, 184 | {"role": "system", "content": "A continuación se proporciona el contexto:"}, 185 | {"role": "system", "content": str(context_preprocessed)}, 186 | { 187 | "role": "system", 188 | "content": "A continuación se proporciona la pregunta del usuario:", 189 | }, 190 | {"role": "user", "content": input_query}, 191 | ], 192 | temperature=INIT_OBJECTS.config_loader["temperature"], 193 | seed=INIT_OBJECTS.config_loader["seed"], 194 | max_tokens=INIT_OBJECTS.config_loader["max_tokens"], 195 | ) 196 | answer = response.choices[0].message.content 197 | logger.info(answer) 198 | logger.info(response.usage) 199 | 200 | response_payload = dict( 201 | scoring_id=str(uuid.uuid4()), 202 | context=docs, 203 | answer=answer, 204 | ) 205 | return response_payload 206 | 207 | 208 | @APP.get("/sleep") 209 | @timeit 210 | async def sleep(): 211 | time.sleep(5) 212 | return {"status": "OK"} 213 | 214 | 215 | @APP.get("/asleep") 216 | @timeit 217 | async def asleep(): 218 | await asyncio.sleep(5) 219 | return {"status": "OK"} 220 | -------------------------------------------------------------------------------- /src/etls/bopv/scrapper.py: -------------------------------------------------------------------------------- 1 | import logging as lg 2 | import tempfile 3 | import typing as tp 4 | from datetime import date 5 | import re 6 | import random 7 | 8 | import requests 9 | from bs4 import BeautifulSoup 10 | from urllib.parse import urljoin 11 | 12 | from src.etls.bopv.metadata import BOPVMetadataDocument 13 | from src.etls.common.scrapper import BaseScrapper 14 | from src.etls.common.utils import ScrapperError 15 | from src.initialize import initialize_logging 16 | from 
src.etls.utils import create_retry_session 17 | 18 | 19 | initialize_logging() 20 | 21 | def clean_text(text: str) -> str: 22 | cleaned = re.sub(r"(\xa0|\t+|\n+)", " ", text, flags=re.MULTILINE) 23 | return cleaned 24 | 25 | 26 | class BOPVScrapper(BaseScrapper): 27 | def __init__(self): 28 | self.base_url = "https://www.euskadi.eus/bopv2/datos/" 29 | self.boletin_url_base = "https://www.euskadi.eus/web01-bopv/es/bopv2/datos/" 30 | self.user_agents = [ 31 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36", 32 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36", 33 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", 34 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", 35 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", 36 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15", 37 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15" 38 | ] 39 | self.headers = { 40 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", 41 | "Accept-Language": "es-ES,es;q=0.9,en;q=0.8", 42 | "Connection": "keep-alive", 43 | "User-Agent": random.choice(self.user_agents), 44 | } 45 | 46 | def _get_boletin_url(self, date: date, enlace_dia: str) -> str: 47 | """Generates a bulletin URL for a given date and link day.""" 48 | return f"{self.boletin_url_base}{date.year}/{date.strftime('%m')}/{enlace_dia}" 49 | 50 | def _get_monthly_url(self, date: date) -> str: 51 | """Generates a monthly URL for a given date.""" 52 | month_year = date.strftime("%m%Y") 53 | return f"{self.boletin_url_base}{month_year}.shtml" 54 | 55 | def _get_summary_link_from_date(self, requested_date: date): 56 | url = self._get_monthly_url(requested_date) 57 | try: 58 | response = requests.get(url, headers=self.headers) 59 | response.raise_for_status() 60 | html = response.text 61 | dias_habilitados_pattern = re.compile(r"var diasHabilitados = (\[.*?\]);") 62 | enlaces_pattern = re.compile(r"var enlaces = (\[.*?\]);") 63 | dias_habilitados_match = dias_habilitados_pattern.search(html) 64 | enlaces_match = enlaces_pattern.search(html) 65 | 66 | if dias_habilitados_match and enlaces_match: 67 | dias_habilitados = eval(dias_habilitados_match.group(1)) 68 | enlaces = eval(enlaces_match.group(1)) 69 | requested_date_formatted = requested_date.strftime("%Y%m%d") 70 | if requested_date_formatted in dias_habilitados: 71 | index = dias_habilitados.index(requested_date_formatted) 72 | enlace = enlaces[index] 73 | if isinstance(enlace, list): 74 | enlace = enlace[0] 75 | final_url = self._get_boletin_url(requested_date, enlace) 76 | return final_url 77 | else: 78 | return None 79 | except requests.HTTPError as err: 80 | raise ValueError(f"Error en la solicitud HTTP: {err}") 81 | except ValueError as err: 82 | raise ValueError(f"Error en la solicitud HTTP: {err}") 83 | 84 | def download_day(self, day: date) -> tp.List[BOPVMetadataDocument]: 85 | """Download all the documents for a specific date.""" 86 | try: 87 | logger = lg.getLogger(self.download_day.__name__) 88 | logger.info("Downloading BOPV content for 
day %s", day)
89 |             summary_link = self._get_summary_link_from_date(day)
90 |             if summary_link is None:
91 |                 logger.info(f"No hay contenido disponible para el día {day}")
92 |                 return []
93 |             response = requests.get(summary_link)
94 |             if response.status_code != 200:
95 |                 response.raise_for_status()
96 |             disposiciones = []
97 |             soup = BeautifulSoup(response.content, 'html.parser')
98 |             txt_blocks = soup.find_all('div', class_='txtBloque')
99 |             for block in txt_blocks:
100 |                 titulo = block.find('p', class_='BOPVSumarioTitulo')
101 |                 if not titulo or not titulo.find('a'):
102 |                     raise ScrapperError("No se pudo encontrar el título o el enlace en uno de los bloques.")
103 |                 href = titulo.find('a')['href']
104 |                 url_disposicion = summary_link.rsplit('/', 1)[0] + '/' + href
105 |                 document_data = self.download_document(url_disposicion)
106 |                 if document_data:
107 |                     disposition_summary = {
108 |                         "titulo": titulo.text.strip(),
109 |                         "url_html": url_disposicion,
110 |                         "url_boletin": summary_link,
111 |                         "fecha_disposicion": day.strftime("%Y-%m-%d"),
112 |                         "anio": str(day.year),
113 |                         "mes": str(day.month),
114 |                         "dia": str(day.day),
115 |                     }
116 |                     for atributo, valor in disposition_summary.items():
117 |                         setattr(document_data, atributo, valor)
118 |                     disposiciones.append(document_data)
119 |             return disposiciones
120 |         except requests.exceptions.RequestException as e:
121 |             raise Exception(f"Error de red o HTTP al intentar acceder a {summary_link}: {e}")
122 |         except Exception as e:
123 |             raise Exception(f"Error inesperado: {e}")
124 | 
125 |     def download_document(self, url: str) -> BOPVMetadataDocument:
126 |         """
127 |         Extracts the content, the issuing body, and the PDF URL of a specific disposition from BOPV given its URL.
128 | 
129 |         :param url: The full URL of the disposition from which the content and PDF URL are to be extracted.
130 |             Example: "https://www.euskadi.eus/web01-bopv/es/bopv2/datos/2024/01/2400001a.shtml"
131 |         :return: A BOPVMetadataDocument containing the content of the disposition, the name of the issuing body, and the PDF URL.
132 |             If the content, the issuing body, or the PDF URL is not found, it returns empty strings for those values.
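        Example usage (an illustrative sketch, assuming the example URL above is still
        published and that the module is importable as src.etls.bopv.scrapper):

            scrapper = BOPVScrapper()
            doc = scrapper.download_document(
                "https://www.euskadi.eus/web01-bopv/es/bopv2/datos/2024/01/2400001a.shtml"
            )
            # doc is a BOPVMetadataDocument, or None when the disposition is outside the
            # 'DISPOSICIONES GENERALES' / 'OTRAS DISPOSICIONES' sections.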
133 | """ 134 | logger = lg.getLogger(self.download_document.__name__) 135 | logger.info("Scrapping document: %s", url) 136 | try: 137 | session = create_retry_session(retries=5) 138 | response = session.get(url, headers=self.headers, timeout=10) 139 | if response.status_code != 200: 140 | response.raise_for_status() 141 | soup = BeautifulSoup(response.content, "html.parser") 142 | seccion_tag = soup.find("h4", class_="BOPVSeccion") 143 | if not seccion_tag: 144 | raise ScrapperError("No se pudo encontrar la sección requerida.") 145 | 146 | seccion_text = seccion_tag.get_text(strip=True).upper() 147 | if seccion_text not in ['DISPOSICIONES GENERALES', 'OTRAS DISPOSICIONES']: 148 | return 149 | tipologia = seccion_tag.get_text(strip=True) 150 | organismo_tag = soup.find("h5", class_="BOPVOrganismo") 151 | content_block = soup.find("div", class_="colCentralinterior") 152 | pdf_link_tag = soup.find("li", class_="formatoPdf").find('a') 153 | 154 | if not organismo_tag or not content_block or not pdf_link_tag: 155 | raise ScrapperError("No se pudo encontrar algunos de los elementos requeridos.") 156 | 157 | organismo = organismo_tag.get_text(strip=True) if organismo_tag else "" 158 | base_url = url.rsplit('/', 1)[0] + '/' 159 | pdf_href = pdf_link_tag.get('href') if pdf_link_tag else "" 160 | pdf_url = urljoin(base_url, pdf_href) 161 | paragraphs = content_block.find_all("p", class_=re.compile(r"BOPV(Detalle|Titulo|FirmaLugFec|FirmaPuesto|FirmaNombre)")) 162 | content_paragraphs = [p.get_text(strip=True) for p in paragraphs] 163 | additional_elements = content_block.find_all(["h5", "div"], class_=re.compile(r"BOPV(Titulo|FirmaLugFec|FirmaPuesto|FirmaNombre)")) 164 | content_additional = [elem.get_text(strip=True) for elem in additional_elements] 165 | content = "\n".join(content_paragraphs + content_additional) 166 | 167 | with tempfile.NamedTemporaryFile("w", delete=False) as fn: 168 | text_cleaned = clean_text(content) 169 | fn.write(text_cleaned) 170 | metadata_doc = BOPVMetadataDocument(**{"filepath": fn.name, 171 | "identificador": '/'.join(url.split('.')[-2].split("/")[-3:]), 172 | "departamento": organismo, 173 | "url_pdf": pdf_url, 174 | "tipologia": tipologia, 175 | }) 176 | logger.info("Scrapped document successfully %s", url) 177 | return metadata_doc 178 | 179 | except requests.exceptions.RequestException as e: 180 | raise Exception(f"Error de red o HTTP al intentar acceder a {url}: {e}") 181 | except Exception as e: 182 | raise Exception(f"Error de red o HTTP al intentar acceder a {url}: {e}") -------------------------------------------------------------------------------- /src/etls/boja/scrapper.py: -------------------------------------------------------------------------------- 1 | import logging as lg 2 | import tempfile 3 | import typing as tp 4 | from datetime import date 5 | import re 6 | 7 | from src.etls.boja.metadata import BOJAMetadataDocument 8 | from src.etls.common.scrapper import BaseScrapper 9 | from src.etls.common.utils import ScrapperError, HTTPRequester 10 | from src.etls.boja.utils import mes_a_numero, clean_text 11 | from src.initialize import initialize_logging 12 | 13 | 14 | initialize_logging() 15 | 16 | def clean_text(text: str) -> str: 17 | cleaned = re.sub(r"(\xa0|\t+|\n+)", " ", text, flags=re.MULTILINE) 18 | return cleaned 19 | 20 | 21 | class BOJAScrapper(BaseScrapper): 22 | def __init__(self): 23 | self.base_url = "https://www.juntadeandalucia.es/" 24 | 25 | @staticmethod 26 | def check_extraordinary_boja(url): 27 | return re.match(r".*/\d{8}\.html$", 
url) is not None 28 | 29 | @staticmethod 30 | def extract_bojas_from_extraordinary(url): 31 | urls_bojas = [] 32 | soup = HTTPRequester.get_soup(url) 33 | try: 34 | uls = soup.find_all('ul', class_="mt-4 pl-3") 35 | for ul in uls: 36 | links = ul.find_all('a', href=True) 37 | for link in links: 38 | href = link.get('href') 39 | es_extraordinario = "extraordinario" in link.text.lower() 40 | urls_bojas.append((href, es_extraordinario)) 41 | 42 | return urls_bojas 43 | except Exception as e: 44 | raise Exception(f"Error inesperado: {e}") 45 | 46 | @staticmethod 47 | def find_disposiciones(url_boletin): 48 | enlaces_html = [] 49 | enlaces_finales = [] 50 | soup = HTTPRequester.get_soup(url_boletin) 51 | try: 52 | listado_principal = soup.find('ol', class_='listado_ordenado_boja raiz') 53 | if not listado_principal: 54 | listado_principal = soup.find(['ol', 'ul'], class_=['listado_ordenado_boja','listado_ordenado']) 55 | if listado_principal: 56 | items_a = listado_principal.find_all('a') 57 | for item in items_a: 58 | if re.search(r'\b(Disposiciones Generales|Otras Disposiciones)\b', item.text, re.IGNORECASE): 59 | enlaces_html.append(item.get('href')) 60 | for enlace in enlaces_html: 61 | soup_intermedio = HTTPRequester.get_soup(enlace) 62 | enlaces_intermedios = soup_intermedio.find_all('a', class_='item_html', title=re.compile("Versión HTML CVE")) 63 | enlaces_intermedios += soup_intermedio.find_all('a', title="Ver disposición") 64 | for enlace_final in enlaces_intermedios: 65 | enlaces_finales.append(enlace_final.get('href')) 66 | else: 67 | raise ScrapperError("No se encontró el listado ordenado con las clases especificadas.") 68 | except Exception as e: 69 | raise Exception(f"Error inesperado: {e}") 70 | return enlaces_finales 71 | 72 | def _get_summary_link_from_date(self, fecha_busqueda): 73 | url = f"{self.base_url}/{'eboja' if fecha_busqueda.year >= 2012 else 'boja'}/{fecha_busqueda.year}" 74 | soup = HTTPRequester.get_soup(url) 75 | try: 76 | tablas_calendario = soup.find_all('table', class_='calendario_tabla') 77 | for tabla in tablas_calendario: 78 | summary_text = tabla.get('summary', '') 79 | mes_año_match = re.search(r"Boletines del mes de (\w+) de (\d{4})", summary_text) 80 | if mes_año_match: 81 | mes = mes_año_match.group(1) 82 | año = mes_año_match.group(2) 83 | enlaces = tabla.find_all('a') 84 | for enlace in enlaces: 85 | href = enlace.get('href') 86 | dia = enlace.text.strip() 87 | fecha_iso = f"{año}-{mes_a_numero(mes):02d}-{int(dia):02d}" 88 | if fecha_iso == fecha_busqueda.strftime('%Y-%m-%d'): 89 | if BOJAScrapper.check_extraordinary_boja(href): 90 | urls_bojas = BOJAScrapper.extract_bojas_from_extraordinary(href) 91 | enlaces_extraordinarios = [] 92 | for url_boja, es_extraordinario in urls_bojas: 93 | enlaces_extraordinarios.append({ 94 | "url": url_boja, 95 | "fecha": fecha_iso, 96 | "extraordinario": es_extraordinario 97 | }) 98 | return enlaces_extraordinarios 99 | else: 100 | return [{ 101 | "url": href, 102 | "fecha": fecha_iso, 103 | "extraordinario": False 104 | }] 105 | except Exception as e: 106 | raise Exception(f"Error inesperado: {e}") 107 | 108 | def download_day(self, day: date) -> tp.List[BOJAMetadataDocument]: 109 | """Download all the documents for a specific date.""" 110 | logger = lg.getLogger(self.download_day.__name__) 111 | logger.info("Downloading BOJA content for day %s", day) 112 | try: 113 | disposiciones = [] 114 | lista_boletines = self._get_summary_link_from_date(day) 115 | if not lista_boletines: 116 | logger.info(f"No hay contenido 
disponible para el día {day}") 117 | return [] #None = para ese dia no hay boletín 118 | for boletin in lista_boletines: # Boletines. Si hay boletin extraordinario esto será 2 119 | for disposicion in BOJAScrapper.find_disposiciones(boletin['url']): 120 | document_data = self.download_document(disposicion) 121 | if document_data: 122 | disposition_summary = { 123 | "url_boletin": boletin['url'], 124 | "url_html": disposicion, 125 | "fecha_disposicion": day.strftime("%Y-%m-%d"), 126 | "anio": str(day.year), 127 | "mes": str(day.month), 128 | "dia": str(day.day), 129 | } 130 | for atributo, valor in disposition_summary.items(): 131 | setattr(document_data, atributo, valor) 132 | disposiciones.append(document_data) 133 | return disposiciones 134 | except Exception as e: 135 | raise Exception(f"Error inesperado descargando dia {day}: {e}") 136 | 137 | def download_document(self, url: str) -> BOJAMetadataDocument: 138 | """ 139 | Extracts the content, the issuing body, and the PDF URL of a specific disposition from BOJA given its URL. 140 | 141 | :param url_disposicion: The full URL of the disposition from which the content and PDF URL are to be extracted. 142 | Example: "https://www.juntadeandalucia.es/eboja/2024/7/s51.html" 143 | :return: A BOJAMMetadataDocument containing the content of the disposition, the name of the issuing body, and the PDF URL. 144 | If the content, the issuing body, or the PDF URL is not found, it returns empty strings for those values. 145 | """ 146 | logger = lg.getLogger(self.download_document.__name__) 147 | logger.info("Scrapping document: %s", url) 148 | texto_completo = "" 149 | soup = HTTPRequester.get_soup(url) 150 | try: 151 | acceso_restringido = soup.find('h1', class_='title', string='Texto de acceso restringido') 152 | if acceso_restringido: 153 | return None 154 | cuerpo = soup.find(id="cuerpo", class_="grid_11 contenidos_nivel3 boja_disposicion") 155 | cabecera = soup.find(class_="punteado_izquierda cabecera_detalle_disposicion") 156 | if not cabecera or not cuerpo: 157 | raise ScrapperError("No se pudo encontrar la cabecera o el cuerpo del documento") 158 | h2 = cabecera.find('h2') 159 | h5 = cabecera.find('h5') 160 | h3 = cabecera.find('h3') 161 | titulo_div = cabecera.find('div', class_="item") 162 | if titulo_div and titulo_div.p: 163 | titulo = titulo_div.p.text.strip() 164 | else: 165 | h4 = cabecera.find('h4') 166 | titulo = h4.text.strip() if h4 else "" 167 | 168 | tipo_disposicion = h2.text.strip() if h2 else "" 169 | organo_disposicion = h5.text.strip() if h5 is not None else (h3.text.strip() if h3 is not None else "") 170 | enlace_pdf = soup.find('a', class_="item_pdf_disposicion").get('href') 171 | parrafos = cuerpo.find_all('p') 172 | 173 | for parrafo in parrafos: 174 | if parrafo.parent.get('class') == ['alerta']: 175 | continue 176 | texto_completo += parrafo.text + "\n" 177 | text_cleaned = clean_text(texto_completo) 178 | with tempfile.NamedTemporaryFile("w", delete=False) as fn: 179 | fn.write(text_cleaned) 180 | logger.info("Scrapped document successfully %s", url) 181 | metadata_doc = BOJAMetadataDocument(**{ "filepath": fn.name, 182 | "identificador": '/'.join(url.split("/")[-3:]), 183 | "titulo": titulo, 184 | "departamento": clean_text(organo_disposicion), 185 | "url_pdf": enlace_pdf, 186 | "tipologia": re.sub(r"^\d+\.\s*", "", tipo_disposicion), 187 | }) 188 | return metadata_doc 189 | except Exception as e: 190 | raise Exception(f"Error inesperado procesando el documento {url}: {e}") 191 | 
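# Illustrative usage sketch: a minimal example of driving BOJAScrapper.download_day,
# assuming network access to juntadeandalucia.es and the package layout shown above.
# The chosen date is arbitrary; days without a published bulletin simply return an empty list.
if __name__ == "__main__":
    example_day = date(2024, 1, 10)
    scrapper = BOJAScrapper()
    # download_day returns one BOJAMetadataDocument per 'Disposiciones Generales' /
    # 'Otras Disposiciones' item; the scraped text is written to a temporary file.
    for doc in scrapper.download_day(example_day):
        print(doc.identificador, doc.titulo, doc.filepath)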
-------------------------------------------------------------------------------- /research/fine-tuning-embedding-model/1.5-CheckDataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 45, 6 | "id": "a1120c32-78d2-41c8-9f27-da9648c8e6c3", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from datasets import load_dataset" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 46, 16 | "id": "9c7ba22e-3bb2-4c36-9a8f-5fcbffa80e4c", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "INPUT_DATASET = \"dariolopez/justicio-rag-embedding-qa-tmp-2\"" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 47, 26 | "id": "34583617-bab8-4405-9478-cc88fef92bf1", 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "data": { 31 | "application/vnd.jupyter.widget-view+json": { 32 | "model_id": "48b3ef4d69a84a10bb19a1a1e144a113", 33 | "version_major": 2, 34 | "version_minor": 0 35 | }, 36 | "text/plain": [ 37 | "Downloading readme: 0%| | 0.00/348 [00:00