├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── assets └── Flowchart.png ├── docker-compose-s.yml ├── docker-compose.yml ├── main.py ├── requirements.txt ├── src └── wikipedia.py └── start.sh /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 162 | #.idea/ 163 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11 2 | 3 | # Set working directory 4 | WORKDIR /app 5 | 6 | # Copy application code 7 | COPY . /app 8 | 9 | # Set environment variable 10 | ENV PYTHONPATH="/app" 11 | 12 | # Install dependencies 13 | RUN pip install --no-cache-dir -r requirements.txt 14 | 15 | # Expose port 16 | # EXPOSE 8000 17 | 18 | # Start both verba and main.py 19 | CMD ["sh", "-c", "verba start --port 8000 --host 0.0.0.0 & python main.py"] 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Piazza 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Piazza AI: Wikipedia Updater Framework 2 | 3 | Welcome to the **Piazza Updater**, a framework developed by **Piazza AI** that demonstrates the power of **Weaviate vector databases** combined with **real-time data updates**. 
This repository is an open-source demo showcasing how our framework processes Wikipedia data, fetches new information in real-time from the web, and updates a vector database. The goal is to simplify **Large Language Model (LLM) deployments** by leveraging advanced techniques like **Retrieval-Augmented Generation (RAG)**. 4 | 5 | --- 6 | 7 | ## Piazza Workflow 8 | 9 |
10 | ![Piazza Workflow](assets/Flowchart.png) 11 |
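The flowchart above summarizes the two phases of the updater: a one-time bulk ingest of a Wikipedia dump, followed by a polling loop that pushes recent edits through Verba into Weaviate. Once the stack is running (see the steps below), a few quick checks confirm each hop of that pipeline is alive. This is a minimal sketch that assumes the default ports exposed by the compose files in this repository; adjust hosts and ports if you changed them:

```bash
# Weaviate readiness (the same endpoint the compose healthchecks poll)
curl -s http://localhost:8080/v1/.well-known/ready && echo "weaviate ready"

# Collections created as Verba/the updater import documents
curl -s http://localhost:8080/v1/schema

# Models available to Ollama (llama3.2 should appear once it has been pulled)
curl -s http://localhost:11434/api/tags

# Verba web interface
curl -sI http://localhost:8000
```

If the readiness check fails, `docker compose logs weaviate` is the first place to look, since the compose healthchecks gate the updater on that same endpoint.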
12 | 13 | ## How to Run the Demo 14 | 15 | ### Prerequisites 16 | 1. **Docker**: Ensure Docker (with the Compose plugin) is installed on your machine. 17 | 2. **Environment file**: Create a `.env` file containing the `PRODUCTION` flag (see Configuration Options below) and any API keys required by optional modules (e.g., OpenAI, Cohere). Refer to the [Verba repository](https://github.com/weaviate/Verba) for details. 18 | 19 | ### Steps to Run 20 | 1. Clone this repository and install the Python dependencies (only needed if you plan to run `main.py` outside Docker): 21 | ```bash 22 | git clone https://github.com/piazza-tech/Piazza-Updater.git 23 | cd Piazza-Updater 24 | pip install -r requirements.txt 25 | ``` 26 | 27 | 2. Make the `start.sh` script executable: 28 | ```bash 29 | chmod +x start.sh 30 | ``` 31 | 32 | 3. Run the framework: 33 | ```bash 34 | ./start.sh 35 | ``` 36 | 37 | 4. Open your browser and navigate to `http://localhost:8000`. 38 | 39 | 5. In the Verba web interface: 40 | - Choose **Docker Deployment**. 41 | - Select **Documents** to watch Wikipedia data being processed and updated in real time. 42 | 43 | 6. Once the initial Wikipedia dump has been processed: 44 | - The updater starts polling Wikipedia's recent-changes feed for new edits. 45 | - Chat with the LLM using up-to-date information! 46 | 47 | --- 48 | 49 | ### Configuration Options 50 | 1. **Development Mode**: 51 | - Set `PRODUCTION=False` in `.env` for a lightweight demo (processes only a small subset of Wikipedia). 52 | - `start.sh` then uses `docker-compose-s.yml` for minimal resource usage. 53 | 54 | 2. **Production Mode**: 55 | - Set `PRODUCTION=True` in `.env` to process the entire Wikipedia dataset. 56 | - `start.sh` then uses `docker-compose.yml` for a full-scale deployment (requires significantly more time, storage, and GPU resources). 57 | 58 | --- 59 | 60 | ## Technologies Used 61 | - **Weaviate**: Vector database for efficient semantic search and data retrieval. 62 | - **LLMs**: Llama 3.2 served via **Ollama** for natural language understanding. 63 | - **RAG Framework**: Combines vectorized data with real-time updates to enhance LLM answers. 64 | - **Verba**: Web app for seamless user interaction and deployment ([learn more](https://github.com/weaviate/Verba)). 65 | 66 | --- 67 | 68 | ## Use Cases Beyond Wikipedia 69 | While this demo focuses on Wikipedia, the Piazza Updater framework is highly adaptable: 70 | - Integrate with any database or website. 71 | - Fetch and process real-time data for other domains, such as: 72 | - News websites 73 | - E-commerce platforms 74 | - Scientific research databases 75 | 76 | --- 77 | 78 | ## License 79 | This project is open-source under the [MIT License](LICENSE). 80 | 81 | --- 82 | 83 | ## Contributing 84 | We welcome contributions that improve this demo! Feel free to fork the repository, make changes, and submit pull requests. 85 | 86 | --- 87 | 88 | For questions or support, reach out to **Piazza AI** or visit the [Verba repository](https://github.com/weaviate/Verba) for additional deployment details. 89 | 90 | Start exploring the future of real-time LLMs today!
🚀 91 | -------------------------------------------------------------------------------- /assets/Flowchart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Piazza-tech/Piazza-Updater/5064db489257f5f330bae9ce3254028feb77ec13/assets/Flowchart.png -------------------------------------------------------------------------------- /docker-compose-s.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | services: 4 | 5 | weaviate: 6 | command: 7 | - --host 8 | - 0.0.0.0 9 | - --port 10 | - '8080' 11 | - --scheme 12 | - http 13 | image: cr.weaviate.io/semitechnologies/weaviate:1.27.0 14 | ports: 15 | - 8080:8080 16 | - 3000:8080 17 | - 50051:50051 18 | volumes: 19 | - weaviate_data:/var/lib/weaviate 20 | restart: on-failure:0 21 | healthcheck: 22 | test: wget --no-verbose --tries=3 --spider http://127.0.0.1:8080/v1/.well-known/ready || exit 1 23 | interval: 5s 24 | timeout: 10s 25 | retries: 5 26 | start_period: 10s 27 | environment: 28 | QUERY_DEFAULTS_LIMIT: 50 29 | AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true' 30 | PERSISTENCE_DATA_PATH: '/var/lib/weaviate' 31 | DEFAULT_VECTORIZER_MODULE: 'text2vec-ollama' 32 | ENABLE_API_BASED_MODULES: 'true' 33 | ENABLE_MODULES: 'generative-ollama,text2vec-ollama' 34 | CLUSTER_HOSTNAME: 'node1' 35 | 36 | updater: 37 | build: 38 | context: . 39 | dockerfile: Dockerfile 40 | environment: 41 | - WEAVIATE_URL=http://weaviate:8080 42 | - OLLAMA_URL=http://ollama:11434 43 | - OLLAMA_MODEL=llama3.2 44 | ports: 45 | - 8000:8000 46 | depends_on: 47 | weaviate: 48 | condition: service_healthy 49 | volumes: 50 | - updater:/data/ 51 | healthcheck: 52 | test: wget --no-verbose --tries=3 --spider http://127.0.0.1:8000 || exit 1 53 | interval: 5s 54 | timeout: 10s 55 | retries: 5 56 | start_period: 10s 57 | 58 | ollama: 59 | image: ollama/ollama 60 | ports: 61 | - "11434:11434" # Expose the port for Ollama 62 | volumes: 63 | - ollama:/root/.ollama # Persistent storage for Ollama data 64 | deploy: 65 | resources: 66 | reservations: 67 | devices: 68 | - driver: nvidia 69 | capabilities: [gpu] 70 | count: all # Use 'all' for all available GPUs or specify a number 71 | restart: always 72 | 73 | volumes: 74 | weaviate_data: 75 | external: true 76 | ollama: 77 | external: true 78 | updater: 79 | external: true 80 | ... 
-------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | services: 4 | 5 | weaviate: 6 | command: 7 | - --host 8 | - 0.0.0.0 9 | - --port 10 | - '8080' 11 | - --scheme 12 | - http 13 | image: cr.weaviate.io/semitechnologies/weaviate:1.27.0 14 | ports: 15 | - 8080:8080 16 | - 3000:8080 17 | - 50051:50051 18 | volumes: 19 | - weaviate_data:/var/lib/weaviate 20 | restart: on-failure:0 21 | healthcheck: 22 | test: wget --no-verbose --tries=3 --spider http://127.0.0.1:8080/v1/.well-known/ready || exit 1 23 | interval: 5s 24 | timeout: 10s 25 | retries: 5 26 | start_period: 10s 27 | environment: 28 | OPENAI_APIKEY: $OPENAI_API_KEY 29 | COHERE_APIKEY: $COHERE_API_KEY 30 | TRANSFORMERS_INFERENCE_API: 'http://t2v-transformers:8080' 31 | QNA_INFERENCE_API: 'http://qna-transformers:8080' 32 | IMAGE_INFERENCE_API: 'http://i2v-neural:8080' 33 | NER_INFERENCE_API: 'http://ner-transformers:8080' 34 | RERANKER_INFERENCE_API: 'http://reranker-transformers:8080' 35 | QUERY_DEFAULTS_LIMIT: 25 36 | AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true' 37 | PERSISTENCE_DATA_PATH: '/var/lib/weaviate' 38 | DEFAULT_VECTORIZER_MODULE: 'text2vec-ollama' 39 | ENABLE_API_BASED_MODULES: 'true' 40 | ENABLE_MODULES: 'text2vec-ollama,generative-ollama,text2vec-transformers,qna-transformers,ner-transformers,img2vec-neural,reranker-transformers' 41 | CLUSTER_HOSTNAME: 'node1' 42 | 43 | t2v-transformers: 44 | image: cr.weaviate.io/semitechnologies/transformers-inference:sentence-transformers-multi-qa-MiniLM-L6-cos-v1 45 | environment: 46 | ENABLE_CUDA: '1' 47 | NVIDIA_VISIBLE_DEVICES: 'all' 48 | deploy: 49 | resources: 50 | reservations: 51 | devices: 52 | - capabilities: 53 | - 'gpu' 54 | 55 | qna-transformers: 56 | image: cr.weaviate.io/semitechnologies/qna-transformers:distilbert-base-uncased-distilled-squad 57 | environment: 58 | ENABLE_CUDA: '1' 59 | NVIDIA_VISIBLE_DEVICES: 'all' 60 | deploy: 61 | resources: 62 | reservations: 63 | devices: 64 | - capabilities: 65 | - 'gpu' 66 | 67 | ner-transformers: 68 | image: cr.weaviate.io/semitechnologies/ner-transformers:dbmdz-bert-large-cased-finetuned-conll03-english 69 | environment: 70 | ENABLE_CUDA: '1' 71 | NVIDIA_VISIBLE_DEVICES: 'all' 72 | deploy: 73 | resources: 74 | reservations: 75 | devices: 76 | - capabilities: 77 | - 'gpu' 78 | 79 | i2v-neural: 80 | image: cr.weaviate.io/semitechnologies/img2vec-pytorch:resnet50 81 | environment: 82 | ENABLE_CUDA: '1' 83 | NVIDIA_VISIBLE_DEVICES: 'all' 84 | deploy: 85 | resources: 86 | reservations: 87 | devices: 88 | - capabilities: 89 | - 'gpu' 90 | 91 | reranker-transformers: 92 | image: cr.weaviate.io/semitechnologies/reranker-transformers:cross-encoder-ms-marco-MiniLM-L-6-v2 93 | environment: 94 | ENABLE_CUDA: '1' 95 | NVIDIA_VISIBLE_DEVICES: 'all' 96 | deploy: 97 | resources: 98 | reservations: 99 | devices: 100 | - capabilities: 101 | - 'gpu' 102 | 103 | updater: 104 | build: 105 | context: . 
106 | dockerfile: Dockerfile 107 | ports: 108 | - 8000:8000 109 | environment: 110 | - WEAVIATE_URL=http://weaviate:8080 111 | - OPENAI_API_KEY=$OPENAI_API_KEY 112 | - COHERE_API_KEY=$COHERE_API_KEY 113 | - UNSTRUCTURED_API_KEY=$UNSTRUCTURED_API_KEY 114 | - UNSTRUCTURED_API_URL=$UNSTRUCTURED_API_URL 115 | - OLLAMA_URL=http://ollama:11434 116 | - OLLAMA_MODEL=llama3.2 117 | - OLLAMA_EMBED_MODEL=nomic-embed-text 118 | - GITHUB_TOKEN=$GITHUB_TOKEN 119 | depends_on: 120 | weaviate: 121 | condition: service_healthy 122 | volumes: 123 | - updater:/data/ 124 | healthcheck: 125 | test: wget --no-verbose --tries=3 --spider http://127.0.0.1:8000 || exit 1 126 | interval: 5s 127 | timeout: 10s 128 | retries: 5 129 | start_period: 10s 130 | 131 | ollama: 132 | image: ollama/ollama 133 | ports: 134 | - "11434:11434" # Expose the port for Ollama 135 | volumes: 136 | - ollama:/root/.ollama # Persistent storage for Ollama data 137 | deploy: 138 | resources: 139 | reservations: 140 | devices: 141 | - driver: nvidia 142 | capabilities: [gpu] 143 | count: all # Use 'all' for all available GPUs or specify a number 144 | restart: always 145 | 146 | volumes: 147 | weaviate_data: 148 | external: true 149 | ollama: 150 | external: true 151 | updater: 152 | external: true 153 | ... -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import asyncio 3 | import dotenv 4 | from datetime import datetime, timezone 5 | import base64 6 | from typing import Dict, Any 7 | import argparse 8 | from goldenverba.server.api import manager, client_manager 9 | from goldenverba.server.helpers import LoggerManager 10 | from goldenverba.server.types import Credentials, FileConfig, FileStatus 11 | from src.wikipedia import WikipediaDataProcessor 12 | 13 | def create_fileconfig(article_data: dict, rag_config: dict, overwrite: bool=False) -> FileConfig: 14 | """Create a FileConfig object from article data.""" 15 | original_content = article_data['text'].encode('utf-8') 16 | encoded_content = base64.b64encode(original_content).decode('utf-8') 17 | 18 | return FileConfig( 19 | fileID=f"wiki_{article_data['page_id']}", 20 | filename=article_data['title'], 21 | isURL=False, 22 | overwrite=overwrite, 23 | extension="txt", 24 | source=article_data['source_link'], 25 | content=encoded_content, 26 | labels=article_data['labels'], 27 | rag_config=rag_config, 28 | file_size=len(original_content), 29 | status=FileStatus.READY, 30 | metadata=article_data['metadata'], 31 | status_report={} 32 | ) 33 | 34 | async def process_batch(client, batch: list[Dict[str, Any]], rag_config: dict, logger: LoggerManager): 35 | """Process a batch of articles concurrently.""" 36 | tasks = [] 37 | for article_data in batch: 38 | file_config = create_fileconfig(article_data, rag_config) 39 | tasks.append(manager.import_document(client, file_config, logger)) 40 | return await asyncio.gather(*tasks) 41 | 42 | async def process_static_data(data_processor: WikipediaDataProcessor, client, rag_config, logger, batch_size: int = 10): 43 | """ 44 | Process the initial Wikipedia dump data in batches and upload them. 45 | 46 | Args: 47 | data_processor: Instance of WikipediaDataProcessor. 48 | client: Client connection to the server. 49 | rag_config: RAG configuration. 50 | logger: Logger instance. 51 | batch_size: Number of articles to process per batch. 52 | limit: Maximum number of articles to process (for testing). 
53 | """ 54 | current_batch = [] 55 | 56 | for article_data in data_processor.process_data(): 57 | current_batch.append(article_data) 58 | # Process batch when it reaches the desired size 59 | if len(current_batch) >= batch_size: 60 | await process_batch(client, current_batch, rag_config, logger) 61 | current_batch = [] 62 | 63 | # Process any remaining articles in the final batch 64 | if current_batch: 65 | await process_batch(client, current_batch, rag_config, logger) 66 | 67 | print("Finished processing static data.") 68 | 69 | async def fetch_recent_changes_continuously(data_processor: WikipediaDataProcessor, client, rag_config, logger, batch_size: int=10, interval_minutes: int=5): 70 | """ 71 | Fetch recent Wikipedia changes every specified interval and upload them. 72 | 73 | Args: 74 | data_processor: Instance of WikipediaDataProcessor. 75 | client: Client connection to the server. 76 | rag_config: RAG configuration. 77 | logger: Logger instance. 78 | interval_minutes: Minutes to wait between each fetch of recent changes. 79 | """ 80 | while True: 81 | print(f"Fetching recent changes at {datetime.now(timezone.utc).isoformat()}") 82 | 83 | current_batch = [] 84 | 85 | # Fetch recent changes from the last interval 86 | for change in data_processor.get_recent_changes(minutes=interval_minutes): 87 | current_batch.append(change) 88 | # Process batch when it reaches the desired size 89 | if len(current_batch) >= batch_size: 90 | await process_batch(client, current_batch, rag_config, logger) 91 | current_batch = [] 92 | 93 | # Process any remaining articles in the final batch 94 | if current_batch: 95 | await process_batch(client, current_batch, rag_config, logger) 96 | 97 | # Wait for the next interval 98 | await asyncio.sleep(interval_minutes * 60) 99 | 100 | def get_production_mode(args): 101 | # Check if args.production is set and is a boolean 102 | production_mode = getattr(args, 'production', None) 103 | 104 | # If args.production is not set, check the environment variable 105 | if production_mode is None: 106 | env_value = os.getenv('PRODUCTION', 'False') # Default to 'False' as a string 107 | production_mode = env_value.lower() == 'true' # Convert to boolean 108 | 109 | return production_mode 110 | 111 | # Function to parse command-line arguments 112 | def parse_arguments(): 113 | parser = argparse.ArgumentParser(description="Run the data processing script.") 114 | parser.add_argument('--production', action='store_true', help="Specify if running in production mode.") 115 | return parser.parse_args() 116 | 117 | async def main(): 118 | # Parse command-line arguments 119 | args = parse_arguments() 120 | dotenv.load_dotenv() 121 | # Initialize logger 122 | logger = LoggerManager() 123 | 124 | # Set up credentials and connect to the client 125 | credentials = Credentials(deployment="Docker", url="weaviate", key="") 126 | client = await client_manager.connect(credentials) 127 | production = get_production_mode(args) 128 | 129 | try: 130 | # Fetch the rag_config from the server 131 | rag_config = await manager.load_rag_config(client) 132 | if not rag_config: 133 | raise Exception("rag_config is empty. 
Cannot proceed.") 134 | 135 | # Initialize the data processor 136 | data_processor = WikipediaDataProcessor(production=production) 137 | 138 | # Process the static data first 139 | await process_static_data(data_processor, client, rag_config, logger, 1) 140 | 141 | # Start fetching recent changes continuously 142 | await fetch_recent_changes_continuously(data_processor, client, rag_config, logger, 1, 5) 143 | 144 | finally: 145 | await client.close() 146 | 147 | if __name__ == "__main__": 148 | asyncio.run(main()) 149 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | mwxml 3 | mwparserfromhell 4 | tqdm 5 | goldenverba 6 | python-dotenv -------------------------------------------------------------------------------- /src/wikipedia.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | import bz2 4 | import mwxml 5 | import mwparserfromhell 6 | import json 7 | from typing import Generator, Dict, Any 8 | from datetime import datetime, timedelta, timezone 9 | from time import sleep 10 | from tqdm import tqdm 11 | 12 | class WikipediaDataProcessor: 13 | def __init__(self, extract_folder: str = "/data/datasets/wikipedia/extracted_wikipedia", production=False): 14 | self.dump_url_test = "https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles1.xml-p1p41242.bz2" 15 | self.dump_url_prod = "https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles-multistream.xml.bz2" 16 | self.dump_url = self.dump_url_prod if production else self.dump_url_test 17 | self.dump_file = os.path.join(os.path.dirname(extract_folder), "enwiki-sample.xml.bz2") 18 | self.extract_folder = extract_folder 19 | self.production = production 20 | self.api_base_url = "https://en.wikipedia.org/w/api.php" 21 | 22 | # Check and download the dump file if necessary 23 | self.check_and_download_dump() 24 | # Check and extract the dump file if necessary 25 | self.check_and_extract_dump() 26 | 27 | def check_and_download_dump(self): 28 | # Check if the dump file exists 29 | if not os.path.exists(self.dump_file): 30 | # Get the directory for the dump file 31 | os.makedirs(os.path.abspath(os.path.dirname(self.dump_file)), exist_ok=True) 32 | 33 | print(f"Dump file {self.dump_file} not found.") 34 | self.download_wikipedia_dump() 35 | else: 36 | print(f"Dump file {self.dump_file} already exists.") 37 | 38 | def download_wikipedia_dump(self): 39 | response = requests.get(self.dump_url, stream=True) 40 | total_size = int(response.headers.get('content-length', 0)) 41 | print(f"Downloading {total_size / 1048576:.2f} MB from {self.dump_url} ...") 42 | block_size = 1024 * 1024 # 1 MB 43 | 44 | with open(self.dump_file, 'wb') as file: 45 | # Initialize tqdm for progress tracking 46 | with tqdm(total=total_size, unit='MB', unit_scale=True, desc=self.dump_file) as pbar: 47 | for data in response.iter_content(block_size): 48 | file.write(data) 49 | pbar.update(len(data) / (1024 * 1024)) # Update progress bar 50 | 51 | print(f"Downloaded {self.dump_file}") 52 | 53 | def check_and_extract_dump(self): 54 | if not os.path.isdir(self.extract_folder) or not bool(os.listdir(self.extract_folder)): 55 | print(f"Extract folder {self.extract_folder} not found. 
Extracting...") 56 | os.makedirs(self.extract_folder, exist_ok=True) 57 | self.extract_dump() 58 | else: 59 | print(f"Extract folder {self.extract_folder} already exists.") 60 | 61 | def extract_dump(self): 62 | # Get the total size of the dump file for progress tracking 63 | total_size = os.path.getsize(self.dump_file) 64 | 65 | with bz2.BZ2File(self.dump_file, 'rb') as file: 66 | # Open the output file for writing 67 | with open(os.path.join(self.extract_folder, 'wikipedia.xml'), 'wb') as out_file: 68 | # Initialize tqdm for progress tracking 69 | with tqdm(total=total_size, unit='B', unit_scale=True, desc='Extracting dump') as pbar: 70 | for data in iter(lambda: file.read(100 * 1024), b''): # Read in chunks of 100 KB 71 | out_file.write(data) 72 | pbar.update(len(data)) # Update progress bar with the size of data written 73 | 74 | print(f"Extracted dump to {self.extract_folder}") 75 | if not self.production: 76 | try: 77 | os.remove(self.dump_file) 78 | print(f"Deleted dump file: {self.dump_file}") 79 | except OSError as e: 80 | print(f"Error deleting file {self.dump_file}: {e}") 81 | 82 | def process_data(self) -> Generator[Dict[str, Any], None, None]: 83 | xml_file_path = os.path.join(self.extract_folder, 'wikipedia.xml') 84 | 85 | # Count total pages in the dump file for tqdm 86 | with open(xml_file_path, 'rb') as file: 87 | dump = mwxml.Dump.from_file(file) 88 | total_pages = sum(1 for _ in dump) 89 | 90 | # Process pages with tqdm progress bar 91 | with open(xml_file_path, 'rb') as file: 92 | dump = mwxml.Dump.from_file(file) 93 | pages_count = 0 94 | for page in tqdm(dump, total=total_pages, desc="Processing Wikipedia Pages"): 95 | if not self.production and pages_count >= 500: 96 | break 97 | title = page.title 98 | # Skip non-article pages (e.g., User:, Talk:, etc.) 99 | if ':' in title: 100 | continue 101 | # Construct source link 102 | source_link = f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}" 103 | # Get the latest revision text 104 | try: 105 | revision = next(page) # Assuming the first revision is the latest 106 | except StopIteration: 107 | continue # No revisions, skip page 108 | text = revision.text 109 | 110 | if text is None or text.startswith("#REDIRECT"): 111 | continue # Skip void and REDIRECT pages 112 | 113 | # Parse wikitext to extract plain text 114 | wikicode = mwparserfromhell.parse(text) 115 | plain_text = wikicode.strip_code() 116 | 117 | # Yield the data in the standardized format 118 | yield { 119 | 'title': title, 120 | 'source_link': source_link, 121 | 'text': plain_text, 122 | 'page_id': page.id, 123 | 'labels': ['Wikipedia'], 124 | 'metadata': json.dumps({ 125 | 'id': page.id, 126 | 'title': title 127 | }) 128 | } 129 | pages_count +=1 130 | 131 | def get_recent_changes(self, minutes: int = 5) -> Generator[Dict[str, Any], None, None]: 132 | """ 133 | Fetch and process recent changes from Wikipedia. 
134 | 135 | Args: 136 | minutes: Number of minutes to look back for changes (default: 5) 137 | 138 | Yields: 139 | Dict containing processed page information in the same format as process_data() 140 | """ 141 | # Calculate the start time 142 | start_time = (datetime.now(timezone.utc) - timedelta(minutes=minutes)).strftime("%Y%m%d%H%M%S") 143 | 144 | # Parameters for the API request 145 | params = { 146 | "action": "query", 147 | "format": "json", 148 | "list": "recentchanges", 149 | "rcstart": start_time, 150 | "rcdir": "newer", 151 | "rcnamespace": "0", # Main namespace only 152 | "rclimit": "500", # Maximum allowed limit 153 | "rcprop": "title|ids|timestamp|comment|user|flags|sizes", 154 | } 155 | 156 | processed_pages = set() # Track processed pages to avoid duplicates 157 | 158 | while True: 159 | try: 160 | response = requests.get(self.api_base_url, params=params) 161 | response.raise_for_status() 162 | data = response.json() 163 | 164 | changes = data.get("query", {}).get("recentchanges", []) 165 | 166 | if not changes: 167 | break 168 | 169 | pages_count = 0 170 | for change in tqdm(changes, desc="Processing changes", unit="change"): 171 | if not self.production and pages_count >= 50: 172 | break 173 | page_id = change["pageid"] 174 | # Skip if we've already processed this page 175 | if page_id in processed_pages: 176 | continue 177 | processed_pages.add(page_id) 178 | 179 | # Get the current content of the page 180 | content_params = { 181 | "action": "query", 182 | "format": "json", 183 | "prop": "revisions", 184 | "pageids": page_id, 185 | "rvprop": "content", 186 | "rvslots": "main" 187 | } 188 | 189 | try: 190 | content_response = requests.get(self.api_base_url, params=content_params) 191 | content_response.raise_for_status() 192 | content_data = content_response.json() 193 | 194 | # Extract the page content 195 | page = next(iter(content_data["query"]["pages"].values())) 196 | revision = page["revisions"][0] 197 | text = revision["slots"]["main"]["*"] 198 | 199 | # Skip redirects 200 | if text.startswith("#REDIRECT"): 201 | continue 202 | 203 | title = page["title"] 204 | source_link = f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}" 205 | 206 | # Parse wikitext to extract plain text 207 | wikicode = mwparserfromhell.parse(text) 208 | plain_text = wikicode.strip_code() 209 | 210 | yield { 211 | 'title': title, 212 | 'source_link': source_link, 213 | 'text': plain_text, 214 | 'page_id': page_id, 215 | 'labels': ['Wikipedia'], 216 | 'metadata': json.dumps({ 217 | 'id': page_id, 218 | 'title': title, 219 | 'last_modified': change['timestamp'], 220 | 'editor': change['user'], 221 | 'comment': change.get('comment', '') 222 | }) 223 | } 224 | 225 | # Be nice to the API 226 | sleep(0.1) 227 | pages_count += 1 228 | 229 | except Exception as e: 230 | print(f"Error processing page {page_id}: {str(e)}") 231 | continue 232 | 233 | # Check if there are more results 234 | if "continue" in data: 235 | params.update(data["continue"]) 236 | else: 237 | break 238 | 239 | except Exception as e: 240 | print(f"Error fetching recent changes: {str(e)}") 241 | break -------------------------------------------------------------------------------- /start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Load environment variables from .env file 4 | set -a 5 | source .env 6 | set +a 7 | 8 | # List of external volumes to check and create if needed 9 | VOLUMES=("weaviate_data" "updater" "ollama") 10 | # Name of the tunnel network to 
check (leave empty to skip network creation) 11 | NETWORKS="" 12 | 13 | # Function to create a volume if it does not exist 14 | create_volume_if_not_exists() { 15 | local volume_name=$1 16 | if ! docker volume ls -q | grep -qx "$volume_name"; then 17 | echo "Creating external volume '$volume_name'..." 18 | docker volume create "$volume_name" 19 | fi 20 | } 21 | 22 | # Function to create a network if it does not exist (skipped when no name is given) 23 | create_network_if_not_exists() { 24 | local network_name=$1 25 | if [ -n "$network_name" ] && ! docker network inspect "$network_name" >/dev/null 2>&1; then 26 | echo "Creating network '$network_name'..." 27 | docker network create "$network_name" 28 | fi 29 | } 30 | 31 | # Loop through each volume in the list and create it if it doesn't exist 32 | for volume in "${VOLUMES[@]}"; do 33 | create_volume_if_not_exists "$volume" 34 | done 35 | 36 | # Create the tunnel network if one is configured 37 | create_network_if_not_exists "$NETWORKS" 38 | 39 | # Check if PRODUCTION is set to False 40 | if [ "$PRODUCTION" == "False" ]; then 41 | echo "Testing with small docker-compose-s.yml ..." 42 | docker compose --env-file .env -f docker-compose-s.yml up --build 43 | else 44 | echo "Deploying with docker-compose.yml ..." 45 | docker compose --env-file .env -f docker-compose.yml up -d --build 46 | fi --------------------------------------------------------------------------------
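`start.sh` sources `.env`, ensures the external volumes exist, and then selects the compose file based on `PRODUCTION`. A hypothetical minimal `.env` for a first lightweight run could look like the sketch below; the keys forwarded by `docker-compose.yml` (`OPENAI_API_KEY`, `COHERE_API_KEY`, `UNSTRUCTURED_API_KEY`, `UNSTRUCTURED_API_URL`, `GITHUB_TOKEN`) only need real values for the full stack and the Verba modules you actually use:

```bash
# Hypothetical .env for the lightweight demo -- every value here is a placeholder
cat > .env <<'EOF'
# False -> docker-compose-s.yml with a small Wikipedia subset; True -> full docker-compose.yml
PRODUCTION=False
# Only needed for the full stack / API-based modules:
# OPENAI_API_KEY=...
# COHERE_API_KEY=...
EOF

chmod +x start.sh
./start.sh
```

Note that the development branch of the script runs the stack in the foreground (`up --build`), while the production branch detaches with `up -d --build`.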