├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── assets └── Flowchart.png ├── docker-compose-s.yml ├── docker-compose.yml ├── main.py ├── requirements.txt ├── src └── wikipedia.py └── start.sh /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 162 | #.idea/ 163 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11 2 | 3 | # Set working directory 4 | WORKDIR /app 5 | 6 | # Copy application code 7 | COPY . /app 8 | 9 | # Set environment variable 10 | ENV PYTHONPATH="/app" 11 | 12 | # Install dependencies 13 | RUN pip install --no-cache-dir -r requirements.txt 14 | 15 | # Expose port 16 | # EXPOSE 8000 17 | 18 | # Start both verba and main.py 19 | CMD ["sh", "-c", "verba start --port 8000 --host 0.0.0.0 & python main.py"] 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Piazza 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Piazza AI: Wikipedia Updater Framework 2 | 3 | Welcome to the **Piazza Updater**, a framework developed by **Piazza AI** that demonstrates the power of **Weaviate vector databases** combined with **real-time data updates**. 
This repository is an open-source demo showcasing how our framework processes Wikipedia data, fetches new information in real-time from the web, and updates a vector database. The goal is to simplify **Large Language Model (LLM) deployments** by leveraging advanced techniques like **Retrieval-Augmented Generation (RAG)**. 4 | 5 | --- 6 | 7 | ## Piazza Workflow 8 | 9 |
10 | ![Piazza Workflow](assets/Flowchart.png) 11 |
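The flowchart above summarizes the two phases of the updater: a one-time bulk ingest of a Wikipedia dump, followed by a polling loop that pushes recent edits through Verba into Weaviate. Once the stack is running (see the steps below), a few quick checks confirm each hop of that pipeline is alive. This is a minimal sketch that assumes the default ports exposed by the compose files in this repository; adjust hosts and ports if you changed them:

```bash
# Weaviate readiness (the same endpoint the compose healthchecks poll)
curl -s http://localhost:8080/v1/.well-known/ready && echo "weaviate ready"

# Collections created as Verba/the updater import documents
curl -s http://localhost:8080/v1/schema

# Models available to Ollama (llama3.2 should appear once it has been pulled)
curl -s http://localhost:11434/api/tags

# Verba web interface
curl -sI http://localhost:8000
```

If the readiness check fails, `docker compose logs weaviate` is the first place to look, since the compose healthchecks gate the updater on that same endpoint.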
12 | 13 | ## How to Run the Demo 14 | 15 | ### Prerequisites 16 | 1. **Docker**: Ensure Docker (with the Compose plugin) is installed on your machine. 17 | 2. **Environment file**: Create a `.env` file containing the `PRODUCTION` flag (see Configuration Options below) and any API keys required by optional modules (e.g., OpenAI, Cohere). Refer to the [Verba repository](https://github.com/weaviate/Verba) for details. 18 | 19 | ### Steps to Run 20 | 1. Clone this repository and install the Python dependencies (only needed if you plan to run `main.py` outside Docker): 21 | ```bash 22 | git clone https://github.com/piazza-tech/Piazza-Updater.git 23 | cd Piazza-Updater 24 | pip install -r requirements.txt 25 | ``` 26 | 27 | 2. Make the `start.sh` script executable: 28 | ```bash 29 | chmod +x start.sh 30 | ``` 31 | 32 | 3. Run the framework: 33 | ```bash 34 | ./start.sh 35 | ``` 36 | 37 | 4. Open your browser and navigate to `http://localhost:8000`. 38 | 39 | 5. In the Verba web interface: 40 | - Choose **Docker Deployment**. 41 | - Select **Documents** to watch Wikipedia data being processed and updated in real time. 42 | 43 | 6. Once the initial Wikipedia dump has been processed: 44 | - The updater starts polling Wikipedia's recent-changes feed for new edits. 45 | - Chat with the LLM using up-to-date information! 46 | 47 | --- 48 | 49 | ### Configuration Options 50 | 1. **Development Mode**: 51 | - Set `PRODUCTION=False` in `.env` for a lightweight demo (processes only a small subset of Wikipedia). 52 | - `start.sh` then uses `docker-compose-s.yml` for minimal resource usage. 53 | 54 | 2. **Production Mode**: 55 | - Set `PRODUCTION=True` in `.env` to process the entire Wikipedia dataset. 56 | - `start.sh` then uses `docker-compose.yml` for a full-scale deployment (requires significantly more time, storage, and GPU resources). 57 | 58 | --- 59 | 60 | ## Technologies Used 61 | - **Weaviate**: Vector database for efficient semantic search and data retrieval. 62 | - **LLMs**: Llama 3.2 served via **Ollama** for natural language understanding. 63 | - **RAG Framework**: Combines vectorized data with real-time updates to enhance LLM answers. 64 | - **Verba**: Web app for seamless user interaction and deployment ([learn more](https://github.com/weaviate/Verba)). 65 | 66 | --- 67 | 68 | ## Use Cases Beyond Wikipedia 69 | While this demo focuses on Wikipedia, the Piazza Updater framework is highly adaptable: 70 | - Integrate with any database or website. 71 | - Fetch and process real-time data for other domains, such as: 72 | - News websites 73 | - E-commerce platforms 74 | - Scientific research databases 75 | 76 | --- 77 | 78 | ## License 79 | This project is open-source under the [MIT License](LICENSE). 80 | 81 | --- 82 | 83 | ## Contributing 84 | We welcome contributions that improve this demo! Feel free to fork the repository, make changes, and submit pull requests. 85 | 86 | --- 87 | 88 | For questions or support, reach out to **Piazza AI** or visit the [Verba repository](https://github.com/weaviate/Verba) for additional deployment details. 89 | 90 | Start exploring the future of real-time LLMs today!
🚀 91 | -------------------------------------------------------------------------------- /assets/Flowchart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Piazza-tech/Piazza-Updater/5064db489257f5f330bae9ce3254028feb77ec13/assets/Flowchart.png -------------------------------------------------------------------------------- /docker-compose-s.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | services: 4 | 5 | weaviate: 6 | command: 7 | - --host 8 | - 0.0.0.0 9 | - --port 10 | - '8080' 11 | - --scheme 12 | - http 13 | image: cr.weaviate.io/semitechnologies/weaviate:1.27.0 14 | ports: 15 | - 8080:8080 16 | - 3000:8080 17 | - 50051:50051 18 | volumes: 19 | - weaviate_data:/var/lib/weaviate 20 | restart: on-failure:0 21 | healthcheck: 22 | test: wget --no-verbose --tries=3 --spider http://127.0.0.1:8080/v1/.well-known/ready || exit 1 23 | interval: 5s 24 | timeout: 10s 25 | retries: 5 26 | start_period: 10s 27 | environment: 28 | QUERY_DEFAULTS_LIMIT: 50 29 | AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true' 30 | PERSISTENCE_DATA_PATH: '/var/lib/weaviate' 31 | DEFAULT_VECTORIZER_MODULE: 'text2vec-ollama' 32 | ENABLE_API_BASED_MODULES: 'true' 33 | ENABLE_MODULES: 'generative-ollama,text2vec-ollama' 34 | CLUSTER_HOSTNAME: 'node1' 35 | 36 | updater: 37 | build: 38 | context: . 39 | dockerfile: Dockerfile 40 | environment: 41 | - WEAVIATE_URL=http://weaviate:8080 42 | - OLLAMA_URL=http://ollama:11434 43 | - OLLAMA_MODEL=llama3.2 44 | ports: 45 | - 8000:8000 46 | depends_on: 47 | weaviate: 48 | condition: service_healthy 49 | volumes: 50 | - updater:/data/ 51 | healthcheck: 52 | test: wget --no-verbose --tries=3 --spider http://127.0.0.1:8000 || exit 1 53 | interval: 5s 54 | timeout: 10s 55 | retries: 5 56 | start_period: 10s 57 | 58 | ollama: 59 | image: ollama/ollama 60 | ports: 61 | - "11434:11434" # Expose the port for Ollama 62 | volumes: 63 | - ollama:/root/.ollama # Persistent storage for Ollama data 64 | deploy: 65 | resources: 66 | reservations: 67 | devices: 68 | - driver: nvidia 69 | capabilities: [gpu] 70 | count: all # Use 'all' for all available GPUs or specify a number 71 | restart: always 72 | 73 | volumes: 74 | weaviate_data: 75 | external: true 76 | ollama: 77 | external: true 78 | updater: 79 | external: true 80 | ... 
-------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | services: 4 | 5 | weaviate: 6 | command: 7 | - --host 8 | - 0.0.0.0 9 | - --port 10 | - '8080' 11 | - --scheme 12 | - http 13 | image: cr.weaviate.io/semitechnologies/weaviate:1.27.0 14 | ports: 15 | - 8080:8080 16 | - 3000:8080 17 | - 50051:50051 18 | volumes: 19 | - weaviate_data:/var/lib/weaviate 20 | restart: on-failure:0 21 | healthcheck: 22 | test: wget --no-verbose --tries=3 --spider http://127.0.0.1:8080/v1/.well-known/ready || exit 1 23 | interval: 5s 24 | timeout: 10s 25 | retries: 5 26 | start_period: 10s 27 | environment: 28 | OPENAI_APIKEY: $OPENAI_API_KEY 29 | COHERE_APIKEY: $COHERE_API_KEY 30 | TRANSFORMERS_INFERENCE_API: 'http://t2v-transformers:8080' 31 | QNA_INFERENCE_API: 'http://qna-transformers:8080' 32 | IMAGE_INFERENCE_API: 'http://i2v-neural:8080' 33 | NER_INFERENCE_API: 'http://ner-transformers:8080' 34 | RERANKER_INFERENCE_API: 'http://reranker-transformers:8080' 35 | QUERY_DEFAULTS_LIMIT: 25 36 | AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true' 37 | PERSISTENCE_DATA_PATH: '/var/lib/weaviate' 38 | DEFAULT_VECTORIZER_MODULE: 'text2vec-ollama' 39 | ENABLE_API_BASED_MODULES: 'true' 40 | ENABLE_MODULES: 'text2vec-ollama,generative-ollama,text2vec-transformers,qna-transformers,ner-transformers,img2vec-neural,reranker-transformers' 41 | CLUSTER_HOSTNAME: 'node1' 42 | 43 | t2v-transformers: 44 | image: cr.weaviate.io/semitechnologies/transformers-inference:sentence-transformers-multi-qa-MiniLM-L6-cos-v1 45 | environment: 46 | ENABLE_CUDA: '1' 47 | NVIDIA_VISIBLE_DEVICES: 'all' 48 | deploy: 49 | resources: 50 | reservations: 51 | devices: 52 | - capabilities: 53 | - 'gpu' 54 | 55 | qna-transformers: 56 | image: cr.weaviate.io/semitechnologies/qna-transformers:distilbert-base-uncased-distilled-squad 57 | environment: 58 | ENABLE_CUDA: '1' 59 | NVIDIA_VISIBLE_DEVICES: 'all' 60 | deploy: 61 | resources: 62 | reservations: 63 | devices: 64 | - capabilities: 65 | - 'gpu' 66 | 67 | ner-transformers: 68 | image: cr.weaviate.io/semitechnologies/ner-transformers:dbmdz-bert-large-cased-finetuned-conll03-english 69 | environment: 70 | ENABLE_CUDA: '1' 71 | NVIDIA_VISIBLE_DEVICES: 'all' 72 | deploy: 73 | resources: 74 | reservations: 75 | devices: 76 | - capabilities: 77 | - 'gpu' 78 | 79 | i2v-neural: 80 | image: cr.weaviate.io/semitechnologies/img2vec-pytorch:resnet50 81 | environment: 82 | ENABLE_CUDA: '1' 83 | NVIDIA_VISIBLE_DEVICES: 'all' 84 | deploy: 85 | resources: 86 | reservations: 87 | devices: 88 | - capabilities: 89 | - 'gpu' 90 | 91 | reranker-transformers: 92 | image: cr.weaviate.io/semitechnologies/reranker-transformers:cross-encoder-ms-marco-MiniLM-L-6-v2 93 | environment: 94 | ENABLE_CUDA: '1' 95 | NVIDIA_VISIBLE_DEVICES: 'all' 96 | deploy: 97 | resources: 98 | reservations: 99 | devices: 100 | - capabilities: 101 | - 'gpu' 102 | 103 | updater: 104 | build: 105 | context: . 
106 | dockerfile: Dockerfile 107 | ports: 108 | - 8000:8000 109 | environment: 110 | - WEAVIATE_URL=http://weaviate:8080 111 | - OPENAI_API_KEY=$OPENAI_API_KEY 112 | - COHERE_API_KEY=$COHERE_API_KEY 113 | - UNSTRUCTURED_API_KEY=$UNSTRUCTURED_API_KEY 114 | - UNSTRUCTURED_API_URL=$UNSTRUCTURED_API_URL 115 | - OLLAMA_URL=http://ollama:11434 116 | - OLLAMA_MODEL=llama3.2 117 | - OLLAMA_EMBED_MODEL=nomic-embed-text 118 | - GITHUB_TOKEN=$GITHUB_TOKEN 119 | depends_on: 120 | weaviate: 121 | condition: service_healthy 122 | volumes: 123 | - updater:/data/ 124 | healthcheck: 125 | test: wget --no-verbose --tries=3 --spider http://127.0.0.1:8000 || exit 1 126 | interval: 5s 127 | timeout: 10s 128 | retries: 5 129 | start_period: 10s 130 | 131 | ollama: 132 | image: ollama/ollama 133 | ports: 134 | - "11434:11434" # Expose the port for Ollama 135 | volumes: 136 | - ollama:/root/.ollama # Persistent storage for Ollama data 137 | deploy: 138 | resources: 139 | reservations: 140 | devices: 141 | - driver: nvidia 142 | capabilities: [gpu] 143 | count: all # Use 'all' for all available GPUs or specify a number 144 | restart: always 145 | 146 | volumes: 147 | weaviate_data: 148 | external: true 149 | ollama: 150 | external: true 151 | updater: 152 | external: true 153 | ... -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import asyncio 3 | import dotenv 4 | from datetime import datetime, timezone 5 | import base64 6 | from typing import Dict, Any 7 | import argparse 8 | from goldenverba.server.api import manager, client_manager 9 | from goldenverba.server.helpers import LoggerManager 10 | from goldenverba.server.types import Credentials, FileConfig, FileStatus 11 | from src.wikipedia import WikipediaDataProcessor 12 | 13 | def create_fileconfig(article_data: dict, rag_config: dict, overwrite: bool=False) -> FileConfig: 14 | """Create a FileConfig object from article data.""" 15 | original_content = article_data['text'].encode('utf-8') 16 | encoded_content = base64.b64encode(original_content).decode('utf-8') 17 | 18 | return FileConfig( 19 | fileID=f"wiki_{article_data['page_id']}", 20 | filename=article_data['title'], 21 | isURL=False, 22 | overwrite=overwrite, 23 | extension="txt", 24 | source=article_data['source_link'], 25 | content=encoded_content, 26 | labels=article_data['labels'], 27 | rag_config=rag_config, 28 | file_size=len(original_content), 29 | status=FileStatus.READY, 30 | metadata=article_data['metadata'], 31 | status_report={} 32 | ) 33 | 34 | async def process_batch(client, batch: list[Dict[str, Any]], rag_config: dict, logger: LoggerManager): 35 | """Process a batch of articles concurrently.""" 36 | tasks = [] 37 | for article_data in batch: 38 | file_config = create_fileconfig(article_data, rag_config) 39 | tasks.append(manager.import_document(client, file_config, logger)) 40 | return await asyncio.gather(*tasks) 41 | 42 | async def process_static_data(data_processor: WikipediaDataProcessor, client, rag_config, logger, batch_size: int = 10): 43 | """ 44 | Process the initial Wikipedia dump data in batches and upload them. 45 | 46 | Args: 47 | data_processor: Instance of WikipediaDataProcessor. 48 | client: Client connection to the server. 49 | rag_config: RAG configuration. 50 | logger: Logger instance. 51 | batch_size: Number of articles to process per batch. 52 | limit: Maximum number of articles to process (for testing). 
53 | """ 54 | current_batch = [] 55 | 56 | for article_data in data_processor.process_data(): 57 | current_batch.append(article_data) 58 | # Process batch when it reaches the desired size 59 | if len(current_batch) >= batch_size: 60 | await process_batch(client, current_batch, rag_config, logger) 61 | current_batch = [] 62 | 63 | # Process any remaining articles in the final batch 64 | if current_batch: 65 | await process_batch(client, current_batch, rag_config, logger) 66 | 67 | print("Finished processing static data.") 68 | 69 | async def fetch_recent_changes_continuously(data_processor: WikipediaDataProcessor, client, rag_config, logger, batch_size: int=10, interval_minutes: int=5): 70 | """ 71 | Fetch recent Wikipedia changes every specified interval and upload them. 72 | 73 | Args: 74 | data_processor: Instance of WikipediaDataProcessor. 75 | client: Client connection to the server. 76 | rag_config: RAG configuration. 77 | logger: Logger instance. 78 | interval_minutes: Minutes to wait between each fetch of recent changes. 79 | """ 80 | while True: 81 | print(f"Fetching recent changes at {datetime.now(timezone.utc).isoformat()}") 82 | 83 | current_batch = [] 84 | 85 | # Fetch recent changes from the last interval 86 | for change in data_processor.get_recent_changes(minutes=interval_minutes): 87 | current_batch.append(change) 88 | # Process batch when it reaches the desired size 89 | if len(current_batch) >= batch_size: 90 | await process_batch(client, current_batch, rag_config, logger) 91 | current_batch = [] 92 | 93 | # Process any remaining articles in the final batch 94 | if current_batch: 95 | await process_batch(client, current_batch, rag_config, logger) 96 | 97 | # Wait for the next interval 98 | await asyncio.sleep(interval_minutes * 60) 99 | 100 | def get_production_mode(args): 101 | # Check if args.production is set and is a boolean 102 | production_mode = getattr(args, 'production', None) 103 | 104 | # If args.production is not set, check the environment variable 105 | if production_mode is None: 106 | env_value = os.getenv('PRODUCTION', 'False') # Default to 'False' as a string 107 | production_mode = env_value.lower() == 'true' # Convert to boolean 108 | 109 | return production_mode 110 | 111 | # Function to parse command-line arguments 112 | def parse_arguments(): 113 | parser = argparse.ArgumentParser(description="Run the data processing script.") 114 | parser.add_argument('--production', action='store_true', help="Specify if running in production mode.") 115 | return parser.parse_args() 116 | 117 | async def main(): 118 | # Parse command-line arguments 119 | args = parse_arguments() 120 | dotenv.load_dotenv() 121 | # Initialize logger 122 | logger = LoggerManager() 123 | 124 | # Set up credentials and connect to the client 125 | credentials = Credentials(deployment="Docker", url="weaviate", key="") 126 | client = await client_manager.connect(credentials) 127 | production = get_production_mode(args) 128 | 129 | try: 130 | # Fetch the rag_config from the server 131 | rag_config = await manager.load_rag_config(client) 132 | if not rag_config: 133 | raise Exception("rag_config is empty. 
Cannot proceed.") 134 | 135 | # Initialize the data processor 136 | data_processor = WikipediaDataProcessor(production=production) 137 | 138 | # Process the static data first 139 | await process_static_data(data_processor, client, rag_config, logger, 1) 140 | 141 | # Start fetching recent changes continuously 142 | await fetch_recent_changes_continuously(data_processor, client, rag_config, logger, 1, 5) 143 | 144 | finally: 145 | await client.close() 146 | 147 | if __name__ == "__main__": 148 | asyncio.run(main()) 149 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | mwxml 3 | mwparserfromhell 4 | tqdm 5 | goldenverba 6 | python-dotenv -------------------------------------------------------------------------------- /src/wikipedia.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | import bz2 4 | import mwxml 5 | import mwparserfromhell 6 | import json 7 | from typing import Generator, Dict, Any 8 | from datetime import datetime, timedelta, timezone 9 | from time import sleep 10 | from tqdm import tqdm 11 | 12 | class WikipediaDataProcessor: 13 | def __init__(self, extract_folder: str = "/data/datasets/wikipedia/extracted_wikipedia", production=False): 14 | self.dump_url_test = "https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles1.xml-p1p41242.bz2" 15 | self.dump_url_prod = "https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles-multistream.xml.bz2" 16 | self.dump_url = self.dump_url_prod if production else self.dump_url_test 17 | self.dump_file = os.path.join(os.path.dirname(extract_folder), "enwiki-sample.xml.bz2") 18 | self.extract_folder = extract_folder 19 | self.production = production 20 | self.api_base_url = "https://en.wikipedia.org/w/api.php" 21 | 22 | # Check and download the dump file if necessary 23 | self.check_and_download_dump() 24 | # Check and extract the dump file if necessary 25 | self.check_and_extract_dump() 26 | 27 | def check_and_download_dump(self): 28 | # Check if the dump file exists 29 | if not os.path.exists(self.dump_file): 30 | # Get the directory for the dump file 31 | os.makedirs(os.path.abspath(os.path.dirname(self.dump_file)), exist_ok=True) 32 | 33 | print(f"Dump file {self.dump_file} not found.") 34 | self.download_wikipedia_dump() 35 | else: 36 | print(f"Dump file {self.dump_file} already exists.") 37 | 38 | def download_wikipedia_dump(self): 39 | response = requests.get(self.dump_url, stream=True) 40 | total_size = int(response.headers.get('content-length', 0)) 41 | print(f"Downloading {total_size / 1048576:.2f} MB from {self.dump_url} ...") 42 | block_size = 1024 * 1024 # 1 MB 43 | 44 | with open(self.dump_file, 'wb') as file: 45 | # Initialize tqdm for progress tracking 46 | with tqdm(total=total_size, unit='MB', unit_scale=True, desc=self.dump_file) as pbar: 47 | for data in response.iter_content(block_size): 48 | file.write(data) 49 | pbar.update(len(data) / (1024 * 1024)) # Update progress bar 50 | 51 | print(f"Downloaded {self.dump_file}") 52 | 53 | def check_and_extract_dump(self): 54 | if not os.path.isdir(self.extract_folder) or not bool(os.listdir(self.extract_folder)): 55 | print(f"Extract folder {self.extract_folder} not found. 
Extracting...") 56 | os.makedirs(self.extract_folder, exist_ok=True) 57 | self.extract_dump() 58 | else: 59 | print(f"Extract folder {self.extract_folder} already exists.") 60 | 61 | def extract_dump(self): 62 | # Get the total size of the dump file for progress tracking 63 | total_size = os.path.getsize(self.dump_file) 64 | 65 | with bz2.BZ2File(self.dump_file, 'rb') as file: 66 | # Open the output file for writing 67 | with open(os.path.join(self.extract_folder, 'wikipedia.xml'), 'wb') as out_file: 68 | # Initialize tqdm for progress tracking 69 | with tqdm(total=total_size, unit='B', unit_scale=True, desc='Extracting dump') as pbar: 70 | for data in iter(lambda: file.read(100 * 1024), b''): # Read in chunks of 100 KB 71 | out_file.write(data) 72 | pbar.update(len(data)) # Update progress bar with the size of data written 73 | 74 | print(f"Extracted dump to {self.extract_folder}") 75 | if not self.production: 76 | try: 77 | os.remove(self.dump_file) 78 | print(f"Deleted dump file: {self.dump_file}") 79 | except OSError as e: 80 | print(f"Error deleting file {self.dump_file}: {e}") 81 | 82 | def process_data(self) -> Generator[Dict[str, Any], None, None]: 83 | xml_file_path = os.path.join(self.extract_folder, 'wikipedia.xml') 84 | 85 | # Count total pages in the dump file for tqdm 86 | with open(xml_file_path, 'rb') as file: 87 | dump = mwxml.Dump.from_file(file) 88 | total_pages = sum(1 for _ in dump) 89 | 90 | # Process pages with tqdm progress bar 91 | with open(xml_file_path, 'rb') as file: 92 | dump = mwxml.Dump.from_file(file) 93 | pages_count = 0 94 | for page in tqdm(dump, total=total_pages, desc="Processing Wikipedia Pages"): 95 | if not self.production and pages_count >= 500: 96 | break 97 | title = page.title 98 | # Skip non-article pages (e.g., User:, Talk:, etc.) 99 | if ':' in title: 100 | continue 101 | # Construct source link 102 | source_link = f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}" 103 | # Get the latest revision text 104 | try: 105 | revision = next(page) # Assuming the first revision is the latest 106 | except StopIteration: 107 | continue # No revisions, skip page 108 | text = revision.text 109 | 110 | if text is None or text.startswith("#REDIRECT"): 111 | continue # Skip void and REDIRECT pages 112 | 113 | # Parse wikitext to extract plain text 114 | wikicode = mwparserfromhell.parse(text) 115 | plain_text = wikicode.strip_code() 116 | 117 | # Yield the data in the standardized format 118 | yield { 119 | 'title': title, 120 | 'source_link': source_link, 121 | 'text': plain_text, 122 | 'page_id': page.id, 123 | 'labels': ['Wikipedia'], 124 | 'metadata': json.dumps({ 125 | 'id': page.id, 126 | 'title': title 127 | }) 128 | } 129 | pages_count +=1 130 | 131 | def get_recent_changes(self, minutes: int = 5) -> Generator[Dict[str, Any], None, None]: 132 | """ 133 | Fetch and process recent changes from Wikipedia. 
134 | 135 | Args: 136 | minutes: Number of minutes to look back for changes (default: 5) 137 | 138 | Yields: 139 | Dict containing processed page information in the same format as process_data() 140 | """ 141 | # Calculate the start time 142 | start_time = (datetime.now(timezone.utc) - timedelta(minutes=minutes)).strftime("%Y%m%d%H%M%S") 143 | 144 | # Parameters for the API request 145 | params = { 146 | "action": "query", 147 | "format": "json", 148 | "list": "recentchanges", 149 | "rcstart": start_time, 150 | "rcdir": "newer", 151 | "rcnamespace": "0", # Main namespace only 152 | "rclimit": "500", # Maximum allowed limit 153 | "rcprop": "title|ids|timestamp|comment|user|flags|sizes", 154 | } 155 | 156 | processed_pages = set() # Track processed pages to avoid duplicates 157 | 158 | while True: 159 | try: 160 | response = requests.get(self.api_base_url, params=params) 161 | response.raise_for_status() 162 | data = response.json() 163 | 164 | changes = data.get("query", {}).get("recentchanges", []) 165 | 166 | if not changes: 167 | break 168 | 169 | pages_count = 0 170 | for change in tqdm(changes, desc="Processing changes", unit="change"): 171 | if not self.production and pages_count >= 50: 172 | break 173 | page_id = change["pageid"] 174 | # Skip if we've already processed this page 175 | if page_id in processed_pages: 176 | continue 177 | processed_pages.add(page_id) 178 | 179 | # Get the current content of the page 180 | content_params = { 181 | "action": "query", 182 | "format": "json", 183 | "prop": "revisions", 184 | "pageids": page_id, 185 | "rvprop": "content", 186 | "rvslots": "main" 187 | } 188 | 189 | try: 190 | content_response = requests.get(self.api_base_url, params=content_params) 191 | content_response.raise_for_status() 192 | content_data = content_response.json() 193 | 194 | # Extract the page content 195 | page = next(iter(content_data["query"]["pages"].values())) 196 | revision = page["revisions"][0] 197 | text = revision["slots"]["main"]["*"] 198 | 199 | # Skip redirects 200 | if text.startswith("#REDIRECT"): 201 | continue 202 | 203 | title = page["title"] 204 | source_link = f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}" 205 | 206 | # Parse wikitext to extract plain text 207 | wikicode = mwparserfromhell.parse(text) 208 | plain_text = wikicode.strip_code() 209 | 210 | yield { 211 | 'title': title, 212 | 'source_link': source_link, 213 | 'text': plain_text, 214 | 'page_id': page_id, 215 | 'labels': ['Wikipedia'], 216 | 'metadata': json.dumps({ 217 | 'id': page_id, 218 | 'title': title, 219 | 'last_modified': change['timestamp'], 220 | 'editor': change['user'], 221 | 'comment': change.get('comment', '') 222 | }) 223 | } 224 | 225 | # Be nice to the API 226 | sleep(0.1) 227 | pages_count += 1 228 | 229 | except Exception as e: 230 | print(f"Error processing page {page_id}: {str(e)}") 231 | continue 232 | 233 | # Check if there are more results 234 | if "continue" in data: 235 | params.update(data["continue"]) 236 | else: 237 | break 238 | 239 | except Exception as e: 240 | print(f"Error fetching recent changes: {str(e)}") 241 | break -------------------------------------------------------------------------------- /start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Load environment variables from .env file 4 | set -a 5 | source .env 6 | set +a 7 | 8 | # List of external volumes to check and create if needed 9 | VOLUMES=("weaviate_data" "updater" "ollama") 10 | # Name of the tunnel network to 
check (leave empty to skip network creation) 11 | NETWORKS="" 12 | 13 | # Function to create a volume if it does not exist 14 | create_volume_if_not_exists() { 15 | local volume_name=$1 16 | if ! docker volume ls -q | grep -qx "$volume_name"; then 17 | echo "Creating external volume '$volume_name'..." 18 | docker volume create "$volume_name" 19 | fi 20 | } 21 | 22 | # Function to create a network if it does not exist (skipped when no name is given) 23 | create_network_if_not_exists() { 24 | local network_name=$1 25 | if [ -n "$network_name" ] && ! docker network inspect "$network_name" >/dev/null 2>&1; then 26 | echo "Creating network '$network_name'..." 27 | docker network create "$network_name" 28 | fi 29 | } 30 | 31 | # Loop through each volume in the list and create it if it doesn't exist 32 | for volume in "${VOLUMES[@]}"; do 33 | create_volume_if_not_exists "$volume" 34 | done 35 | 36 | # Create the tunnel network if one is configured 37 | create_network_if_not_exists "$NETWORKS" 38 | 39 | # Check if PRODUCTION is set to False 40 | if [ "$PRODUCTION" == "False" ]; then 41 | echo "Testing with small docker-compose-s.yml ..." 42 | docker compose --env-file .env -f docker-compose-s.yml up --build 43 | else 44 | echo "Deploying with docker-compose.yml ..." 45 | docker compose --env-file .env -f docker-compose.yml up -d --build 46 | fi --------------------------------------------------------------------------------
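`start.sh` sources `.env`, ensures the external volumes exist, and then selects the compose file based on `PRODUCTION`. A hypothetical minimal `.env` for a first lightweight run could look like the sketch below; the keys forwarded by `docker-compose.yml` (`OPENAI_API_KEY`, `COHERE_API_KEY`, `UNSTRUCTURED_API_KEY`, `UNSTRUCTURED_API_URL`, `GITHUB_TOKEN`) only need real values for the full stack and the Verba modules you actually use:

```bash
# Hypothetical .env for the lightweight demo -- every value here is a placeholder
cat > .env <<'EOF'
# False -> docker-compose-s.yml with a small Wikipedia subset; True -> full docker-compose.yml
PRODUCTION=False
# Only needed for the full stack / API-based modules:
# OPENAI_API_KEY=...
# COHERE_API_KEY=...
EOF

chmod +x start.sh
./start.sh
```

Note that the development branch of the script runs the stack in the foreground (`up --build`), while the production branch detaches with `up -d --build`.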