├── .gitignore
├── Dockerfile
├── LICENSE
├── README.md
├── assets
│   └── Flowchart.png
├── docker-compose-s.yml
├── docker-compose.yml
├── main.py
├── requirements.txt
├── src
│   └── wikipedia.py
└── start.sh
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110 | .pdm.toml
111 | .pdm-python
112 | .pdm-build/
113 |
114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115 | __pypackages__/
116 |
117 | # Celery stuff
118 | celerybeat-schedule
119 | celerybeat.pid
120 |
121 | # SageMath parsed files
122 | *.sage.py
123 |
124 | # Environments
125 | .env
126 | .venv
127 | env/
128 | venv/
129 | ENV/
130 | env.bak/
131 | venv.bak/
132 |
133 | # Spyder project settings
134 | .spyderproject
135 | .spyproject
136 |
137 | # Rope project settings
138 | .ropeproject
139 |
140 | # mkdocs documentation
141 | /site
142 |
143 | # mypy
144 | .mypy_cache/
145 | .dmypy.json
146 | dmypy.json
147 |
148 | # Pyre type checker
149 | .pyre/
150 |
151 | # pytype static type analyzer
152 | .pytype/
153 |
154 | # Cython debug symbols
155 | cython_debug/
156 |
157 | # PyCharm
158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160 | # and can be added to the global gitignore or merged into this file. For a more nuclear
161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162 | #.idea/
163 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.11
2 |
3 | # Set working directory
4 | WORKDIR /app
5 |
6 | # Copy application code
7 | COPY . /app
8 |
9 | # Set environment variable
10 | ENV PYTHONPATH="/app"
11 |
12 | # Install dependencies
13 | RUN pip install --no-cache-dir -r requirements.txt
14 |
15 | # Expose port
16 | # EXPOSE 8000
17 |
18 | # Start both verba and main.py
19 | CMD ["sh", "-c", "verba start --port 8000 --host 0.0.0.0 & python main.py"]
20 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Piazza
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Piazza AI: Wikipedia Updater Framework
2 |
3 | Welcome to the **Piazza Updater**, a framework developed by **Piazza AI** that demonstrates the power of **Weaviate vector databases** combined with **real-time data updates**. This repository is an open-source demo showcasing how our framework processes Wikipedia data, fetches new information in real-time from the web, and updates a vector database. The goal is to simplify **Large Language Model (LLM) deployments** by leveraging advanced techniques like **Retrieval-Augmented Generation (RAG)**.
4 |
5 | ---
6 |
7 | ## Piazza Workflow
8 |
9 | ![Piazza Workflow](assets/Flowchart.png)
10 |
11 |
12 |
13 | ## How to Run the Demo
14 |
15 | ### Prerequisites
16 | 1. **Docker**: Ensure Docker is installed on your machine.
17 | 2. **API Keys**: Create a `.env` file containing the API keys required by the modules you plan to use (e.g., OpenAI, Anthropic). Refer to the [Verba repository](https://github.com/weaviate/Verba) for details.
18 |
19 | ### Steps to Run
20 | 1. Clone this repository and install the dependencies:
21 | ```bash
22 | git clone https://github.com/piazza-tech/Piazza-Updater.git
23 | cd Piazza-Updater
24 | pip install -r requirements.txt
25 | ```
26 |
27 | 2. Provide execution permissions for the `start.sh` script:
28 | ```bash
29 | chmod +x start.sh
30 | ```
31 |
32 | 3. Run the framework:
33 | ```bash
34 | ./start.sh
35 | ```
36 |
37 | 4. Open your browser and navigate to `http://localhost:8000`.
38 |
39 | 5. In the Verba web interface:
40 | - Choose **Docker Deployment**.
41 | - Select **Documents** to observe Wikipedia data being processed and updated in real-time.
42 |
43 | 6. Once the initial Wikipedia dump is processed (you can follow progress with the log command shown below):
44 | - The script begins searching the internet for new data.
45 | - Chat with the LLM using up-to-date information!
46 |
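To follow the updater's progress from a terminal (optional), you can tail the container logs; this assumes the `updater` service name and the compose files from this repository:

```bash
# Use docker-compose.yml instead when running in production mode
docker compose -f docker-compose-s.yml logs --follow updater
```
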
47 | ---
48 |
49 | ### Configuration Options
50 | 1. **Development Mode**:
51 | - Set `PRODUCTION=False` in `.env` for a lightweight demo that processes a small subset of Wikipedia (see the sample `.env` below).
52 | - Use `docker-compose-s.yml` for minimal resource usage.
53 |
54 | 2. **Production Mode**:
55 | - Set `PRODUCTION=True` in `.env` to process the entire Wikipedia dataset.
56 | - Use `docker-compose.yml` for full-scale deployment (requires more time and resources).
57 |
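A minimal sample `.env` (all values are placeholders; `PRODUCTION` is read by `start.sh` and `main.py`, and the remaining keys are the ones forwarded to the containers in `docker-compose.yml` — set only the ones you actually need):

```bash
# Demo vs. full run
PRODUCTION=False

# Optional API keys forwarded to the containers (see docker-compose.yml)
OPENAI_API_KEY=sk-...
COHERE_API_KEY=...
UNSTRUCTURED_API_KEY=...
UNSTRUCTURED_API_URL=...
GITHUB_TOKEN=...
```
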
58 | ---
59 |
60 | ## Technologies Used
61 | - **Weaviate**: Vector database for efficient semantic search and data retrieval.
62 | - **LLMs**: Powered by **Llama 3.2** running on **Ollama** for natural language understanding.
63 | - **RAG Framework**: Combines vectorized data with real-time search to enhance LLM performance.
64 | - **Verba**: Web app for seamless user interaction and deployment ([learn more](https://github.com/weaviate/Verba)).
65 |
66 | ---
67 |
68 | ## Use Cases Beyond Wikipedia
69 | While this demo focuses on Wikipedia, the Piazza Updater framework is highly adaptable (a minimal adapter sketch follows this list):
70 | - Integrate with any database or website.
71 | - Fetch and process real-time internet data for various domains, such as:
72 | - News websites
73 | - E-commerce platforms
74 | - Scientific research databases
75 |
76 | ---
77 |
78 | ## License
79 | This project is open-source under the [MIT License](LICENSE).
80 |
81 | ---
82 |
83 | ## Contributing
84 | We welcome contributions to enhance this demo! Feel free to fork the repository, make changes, and submit pull requests.
85 |
86 | ---
87 |
88 | For questions or support, reach out to **Piazza AI** or visit the [Verba repository](https://github.com/weaviate/Verba) for additional deployment details.
89 |
90 | Start exploring the future of real-time LLMs today! 🚀
91 |
--------------------------------------------------------------------------------
/assets/Flowchart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Piazza-tech/Piazza-Updater/5064db489257f5f330bae9ce3254028feb77ec13/assets/Flowchart.png
--------------------------------------------------------------------------------
/docker-compose-s.yml:
--------------------------------------------------------------------------------
1 | ---
2 |
3 | services:
4 |
5 | weaviate:
6 | command:
7 | - --host
8 | - 0.0.0.0
9 | - --port
10 | - '8080'
11 | - --scheme
12 | - http
13 | image: cr.weaviate.io/semitechnologies/weaviate:1.27.0
14 | ports:
15 | - 8080:8080
16 | - 3000:8080
17 | - 50051:50051
18 | volumes:
19 | - weaviate_data:/var/lib/weaviate
20 | restart: on-failure:0
21 | healthcheck:
22 | test: wget --no-verbose --tries=3 --spider http://127.0.0.1:8080/v1/.well-known/ready || exit 1
23 | interval: 5s
24 | timeout: 10s
25 | retries: 5
26 | start_period: 10s
27 | environment:
28 | QUERY_DEFAULTS_LIMIT: 50
29 | AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'
30 | PERSISTENCE_DATA_PATH: '/var/lib/weaviate'
31 | DEFAULT_VECTORIZER_MODULE: 'text2vec-ollama'
32 | ENABLE_API_BASED_MODULES: 'true'
33 | ENABLE_MODULES: 'generative-ollama,text2vec-ollama'
34 | CLUSTER_HOSTNAME: 'node1'
35 |
36 | updater:
37 | build:
38 | context: .
39 | dockerfile: Dockerfile
40 | environment:
41 | - WEAVIATE_URL=http://weaviate:8080
42 | - OLLAMA_URL=http://ollama:11434
43 | - OLLAMA_MODEL=llama3.2
44 | ports:
45 | - 8000:8000
46 | depends_on:
47 | weaviate:
48 | condition: service_healthy
49 | volumes:
50 | - updater:/data/
51 | healthcheck:
52 | test: wget --no-verbose --tries=3 --spider http://127.0.0.1:8000 || exit 1
53 | interval: 5s
54 | timeout: 10s
55 | retries: 5
56 | start_period: 10s
57 |
58 | ollama:
59 | image: ollama/ollama
60 | ports:
61 | - "11434:11434" # Expose the port for Ollama
62 | volumes:
63 | - ollama:/root/.ollama # Persistent storage for Ollama data
64 | deploy:
65 | resources:
66 | reservations:
67 | devices:
68 | - driver: nvidia
69 | capabilities: [gpu]
70 | count: all # Use 'all' for all available GPUs or specify a number
71 | restart: always
72 |
73 | volumes:
74 | weaviate_data:
75 | external: true
76 | ollama:
77 | external: true
78 | updater:
79 | external: true
80 | ...
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | ---
2 |
3 | services:
4 |
5 | weaviate:
6 | command:
7 | - --host
8 | - 0.0.0.0
9 | - --port
10 | - '8080'
11 | - --scheme
12 | - http
13 | image: cr.weaviate.io/semitechnologies/weaviate:1.27.0
14 | ports:
15 | - 8080:8080
16 | - 3000:8080
17 | - 50051:50051
18 | volumes:
19 | - weaviate_data:/var/lib/weaviate
20 | restart: on-failure:0
21 | healthcheck:
22 | test: wget --no-verbose --tries=3 --spider http://127.0.0.1:8080/v1/.well-known/ready || exit 1
23 | interval: 5s
24 | timeout: 10s
25 | retries: 5
26 | start_period: 10s
27 | environment:
28 | OPENAI_APIKEY: $OPENAI_API_KEY
29 | COHERE_APIKEY: $COHERE_API_KEY
30 | TRANSFORMERS_INFERENCE_API: 'http://t2v-transformers:8080'
31 | QNA_INFERENCE_API: 'http://qna-transformers:8080'
32 | IMAGE_INFERENCE_API: 'http://i2v-neural:8080'
33 | NER_INFERENCE_API: 'http://ner-transformers:8080'
34 | RERANKER_INFERENCE_API: 'http://reranker-transformers:8080'
35 | QUERY_DEFAULTS_LIMIT: 25
36 | AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'
37 | PERSISTENCE_DATA_PATH: '/var/lib/weaviate'
38 | DEFAULT_VECTORIZER_MODULE: 'text2vec-ollama'
39 | ENABLE_API_BASED_MODULES: 'true'
40 | ENABLE_MODULES: 'text2vec-ollama,generative-ollama,text2vec-transformers,qna-transformers,ner-transformers,img2vec-neural,reranker-transformers'
41 | CLUSTER_HOSTNAME: 'node1'
42 |
43 | t2v-transformers:
44 | image: cr.weaviate.io/semitechnologies/transformers-inference:sentence-transformers-multi-qa-MiniLM-L6-cos-v1
45 | environment:
46 | ENABLE_CUDA: '1'
47 | NVIDIA_VISIBLE_DEVICES: 'all'
48 | deploy:
49 | resources:
50 | reservations:
51 | devices:
52 | - capabilities:
53 | - 'gpu'
54 |
55 | qna-transformers:
56 | image: cr.weaviate.io/semitechnologies/qna-transformers:distilbert-base-uncased-distilled-squad
57 | environment:
58 | ENABLE_CUDA: '1'
59 | NVIDIA_VISIBLE_DEVICES: 'all'
60 | deploy:
61 | resources:
62 | reservations:
63 | devices:
64 | - capabilities:
65 | - 'gpu'
66 |
67 | ner-transformers:
68 | image: cr.weaviate.io/semitechnologies/ner-transformers:dbmdz-bert-large-cased-finetuned-conll03-english
69 | environment:
70 | ENABLE_CUDA: '1'
71 | NVIDIA_VISIBLE_DEVICES: 'all'
72 | deploy:
73 | resources:
74 | reservations:
75 | devices:
76 | - capabilities:
77 | - 'gpu'
78 |
79 | i2v-neural:
80 | image: cr.weaviate.io/semitechnologies/img2vec-pytorch:resnet50
81 | environment:
82 | ENABLE_CUDA: '1'
83 | NVIDIA_VISIBLE_DEVICES: 'all'
84 | deploy:
85 | resources:
86 | reservations:
87 | devices:
88 | - capabilities:
89 | - 'gpu'
90 |
91 | reranker-transformers:
92 | image: cr.weaviate.io/semitechnologies/reranker-transformers:cross-encoder-ms-marco-MiniLM-L-6-v2
93 | environment:
94 | ENABLE_CUDA: '1'
95 | NVIDIA_VISIBLE_DEVICES: 'all'
96 | deploy:
97 | resources:
98 | reservations:
99 | devices:
100 | - capabilities:
101 | - 'gpu'
102 |
103 | updater:
104 | build:
105 | context: .
106 | dockerfile: Dockerfile
107 | ports:
108 | - 8000:8000
109 | environment:
110 | - WEAVIATE_URL=http://weaviate:8080
111 | - OPENAI_API_KEY=$OPENAI_API_KEY
112 | - COHERE_API_KEY=$COHERE_API_KEY
113 | - UNSTRUCTURED_API_KEY=$UNSTRUCTURED_API_KEY
114 | - UNSTRUCTURED_API_URL=$UNSTRUCTURED_API_URL
115 | - OLLAMA_URL=http://ollama:11434
116 | - OLLAMA_MODEL=llama3.2
117 | - OLLAMA_EMBED_MODEL=nomic-embed-text
118 | - GITHUB_TOKEN=$GITHUB_TOKEN
119 | depends_on:
120 | weaviate:
121 | condition: service_healthy
122 | volumes:
123 | - updater:/data/
124 | healthcheck:
125 | test: wget --no-verbose --tries=3 --spider http://127.0.0.1:8000 || exit 1
126 | interval: 5s
127 | timeout: 10s
128 | retries: 5
129 | start_period: 10s
130 |
131 | ollama:
132 | image: ollama/ollama
133 | ports:
134 | - "11434:11434" # Expose the port for Ollama
135 | volumes:
136 | - ollama:/root/.ollama # Persistent storage for Ollama data
137 | deploy:
138 | resources:
139 | reservations:
140 | devices:
141 | - driver: nvidia
142 | capabilities: [gpu]
143 | count: all # Use 'all' for all available GPUs or specify a number
144 | restart: always
145 |
146 | volumes:
147 | weaviate_data:
148 | external: true
149 | ollama:
150 | external: true
151 | updater:
152 | external: true
153 | ...
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | import os
2 | import asyncio
3 | import dotenv
4 | from datetime import datetime, timezone
5 | import base64
6 | from typing import Dict, Any
7 | import argparse
8 | from goldenverba.server.api import manager, client_manager
9 | from goldenverba.server.helpers import LoggerManager
10 | from goldenverba.server.types import Credentials, FileConfig, FileStatus
11 | from src.wikipedia import WikipediaDataProcessor
12 |
13 | def create_fileconfig(article_data: dict, rag_config: dict, overwrite: bool=False) -> FileConfig:
14 | """Create a FileConfig object from article data."""
15 | original_content = article_data['text'].encode('utf-8')
16 | encoded_content = base64.b64encode(original_content).decode('utf-8')
17 |
18 | return FileConfig(
19 | fileID=f"wiki_{article_data['page_id']}",
20 | filename=article_data['title'],
21 | isURL=False,
22 | overwrite=overwrite,
23 | extension="txt",
24 | source=article_data['source_link'],
25 | content=encoded_content,
26 | labels=article_data['labels'],
27 | rag_config=rag_config,
28 | file_size=len(original_content),
29 | status=FileStatus.READY,
30 | metadata=article_data['metadata'],
31 | status_report={}
32 | )
33 |
34 | async def process_batch(client, batch: list[Dict[str, Any]], rag_config: dict, logger: LoggerManager):
35 | """Process a batch of articles concurrently."""
36 | tasks = []
37 | for article_data in batch:
38 | file_config = create_fileconfig(article_data, rag_config)
39 | tasks.append(manager.import_document(client, file_config, logger))
40 | return await asyncio.gather(*tasks)
41 |
42 | async def process_static_data(data_processor: WikipediaDataProcessor, client, rag_config, logger, batch_size: int = 10):
43 | """
44 | Process the initial Wikipedia dump data in batches and upload them.
45 |
46 | Args:
47 | data_processor: Instance of WikipediaDataProcessor.
48 | client: Client connection to the server.
49 | rag_config: RAG configuration.
50 | logger: Logger instance.
51 | batch_size: Number of articles to process per batch.
53 | """
54 | current_batch = []
55 |
56 | for article_data in data_processor.process_data():
57 | current_batch.append(article_data)
58 | # Process batch when it reaches the desired size
59 | if len(current_batch) >= batch_size:
60 | await process_batch(client, current_batch, rag_config, logger)
61 | current_batch = []
62 |
63 | # Process any remaining articles in the final batch
64 | if current_batch:
65 | await process_batch(client, current_batch, rag_config, logger)
66 |
67 | print("Finished processing static data.")
68 |
69 | async def fetch_recent_changes_continuously(data_processor: WikipediaDataProcessor, client, rag_config, logger, batch_size: int=10, interval_minutes: int=5):
70 | """
71 | Fetch recent Wikipedia changes every specified interval and upload them.
72 |
73 | Args:
74 | data_processor: Instance of WikipediaDataProcessor.
75 | client: Client connection to the server.
76 | rag_config: RAG configuration.
77 | logger: Logger instance.
78 | interval_minutes: Minutes to wait between each fetch of recent changes.
79 | """
80 | while True:
81 | print(f"Fetching recent changes at {datetime.now(timezone.utc).isoformat()}")
82 |
83 | current_batch = []
84 |
85 | # Fetch recent changes from the last interval
86 | for change in data_processor.get_recent_changes(minutes=interval_minutes):
87 | current_batch.append(change)
88 | # Process batch when it reaches the desired size
89 | if len(current_batch) >= batch_size:
90 | await process_batch(client, current_batch, rag_config, logger)
91 | current_batch = []
92 |
93 | # Process any remaining articles in the final batch
94 | if current_batch:
95 | await process_batch(client, current_batch, rag_config, logger)
96 |
97 | # Wait for the next interval
98 | await asyncio.sleep(interval_minutes * 60)
99 |
100 | def get_production_mode(args):
101 | # Check if args.production is set and is a boolean
102 | production_mode = getattr(args, 'production', None)
103 |
104 | # If args.production is not set, check the environment variable
105 | if production_mode is None:
106 | env_value = os.getenv('PRODUCTION', 'False') # Default to 'False' as a string
107 | production_mode = env_value.lower() == 'true' # Convert to boolean
108 |
109 | return production_mode
110 |
111 | # Function to parse command-line arguments
112 | def parse_arguments():
113 | parser = argparse.ArgumentParser(description="Run the data processing script.")
114 | parser.add_argument('--production', action='store_true', default=None, help="Specify if running in production mode.")
115 | return parser.parse_args()
116 |
117 | async def main():
118 | # Parse command-line arguments
119 | args = parse_arguments()
120 | dotenv.load_dotenv()
121 | # Initialize logger
122 | logger = LoggerManager()
123 |
124 | # Set up credentials and connect to the client
125 | credentials = Credentials(deployment="Docker", url="weaviate", key="")
126 | client = await client_manager.connect(credentials)
127 | production = get_production_mode(args)
128 |
129 | try:
130 | # Fetch the rag_config from the server
131 | rag_config = await manager.load_rag_config(client)
132 | if not rag_config:
133 | raise Exception("rag_config is empty. Cannot proceed.")
134 |
135 | # Initialize the data processor
136 | data_processor = WikipediaDataProcessor(production=production)
137 |
138 | # Process the static data first
139 | await process_static_data(data_processor, client, rag_config, logger, 1)
140 |
141 | # Start fetching recent changes continuously
142 | await fetch_recent_changes_continuously(data_processor, client, rag_config, logger, 1, 5)
143 |
144 | finally:
145 | await client.close()
146 |
147 | if __name__ == "__main__":
148 | asyncio.run(main())
149 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | requests
2 | mwxml
3 | mwparserfromhell
4 | tqdm
5 | goldenverba
6 | python-dotenv
--------------------------------------------------------------------------------
/src/wikipedia.py:
--------------------------------------------------------------------------------
1 | import os
2 | import requests
3 | import bz2
4 | import mwxml
5 | import mwparserfromhell
6 | import json
7 | from typing import Generator, Dict, Any
8 | from datetime import datetime, timedelta, timezone
9 | from time import sleep
10 | from tqdm import tqdm
11 |
12 | class WikipediaDataProcessor:
13 | def __init__(self, extract_folder: str = "/data/datasets/wikipedia/extracted_wikipedia", production=False):
14 | self.dump_url_test = "https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles1.xml-p1p41242.bz2"
15 | self.dump_url_prod = "https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles-multistream.xml.bz2"
16 | self.dump_url = self.dump_url_prod if production else self.dump_url_test
17 | self.dump_file = os.path.join(os.path.dirname(extract_folder), "enwiki-sample.xml.bz2")
18 | self.extract_folder = extract_folder
19 | self.production = production
20 | self.api_base_url = "https://en.wikipedia.org/w/api.php"
21 |
22 | # Check and download the dump file if necessary
23 | self.check_and_download_dump()
24 | # Check and extract the dump file if necessary
25 | self.check_and_extract_dump()
26 |
27 | def check_and_download_dump(self):
28 | # Check if the dump file exists
29 | if not os.path.exists(self.dump_file):
30 | # Get the directory for the dump file
31 | os.makedirs(os.path.abspath(os.path.dirname(self.dump_file)), exist_ok=True)
32 |
33 | print(f"Dump file {self.dump_file} not found.")
34 | self.download_wikipedia_dump()
35 | else:
36 | print(f"Dump file {self.dump_file} already exists.")
37 |
38 | def download_wikipedia_dump(self):
39 | response = requests.get(self.dump_url, stream=True)
40 | total_size = int(response.headers.get('content-length', 0))
41 | print(f"Downloading {total_size / 1048576:.2f} MB from {self.dump_url} ...")
42 | block_size = 1024 * 1024 # 1 MB
43 |
44 | with open(self.dump_file, 'wb') as file:
45 | # Initialize tqdm for progress tracking
46 | with tqdm(total=total_size, unit='B', unit_scale=True, desc=self.dump_file) as pbar:
47 | for data in response.iter_content(block_size):
48 | file.write(data)
49 | pbar.update(len(data))  # Update progress bar with the number of bytes written
50 |
51 | print(f"Downloaded {self.dump_file}")
52 |
53 | def check_and_extract_dump(self):
54 | if not os.path.isdir(self.extract_folder) or not bool(os.listdir(self.extract_folder)):
55 | print(f"Extract folder {self.extract_folder} not found. Extracting...")
56 | os.makedirs(self.extract_folder, exist_ok=True)
57 | self.extract_dump()
58 | else:
59 | print(f"Extract folder {self.extract_folder} already exists.")
60 |
61 | def extract_dump(self):
62 | # Get the total size of the dump file for progress tracking
63 | total_size = os.path.getsize(self.dump_file)
64 |
65 | with bz2.BZ2File(self.dump_file, 'rb') as file:
66 | # Open the output file for writing
67 | with open(os.path.join(self.extract_folder, 'wikipedia.xml'), 'wb') as out_file:
68 | # Initialize tqdm for progress tracking
69 | with tqdm(total=total_size, unit='B', unit_scale=True, desc='Extracting dump') as pbar:
70 | for data in iter(lambda: file.read(100 * 1024), b''): # Read in chunks of 100 KB
71 | out_file.write(data)
72 | pbar.update(len(data)) # Update progress bar with the size of data written
73 |
74 | print(f"Extracted dump to {self.extract_folder}")
75 | if not self.production:
76 | try:
77 | os.remove(self.dump_file)
78 | print(f"Deleted dump file: {self.dump_file}")
79 | except OSError as e:
80 | print(f"Error deleting file {self.dump_file}: {e}")
81 |
82 | def process_data(self) -> Generator[Dict[str, Any], None, None]:
83 | xml_file_path = os.path.join(self.extract_folder, 'wikipedia.xml')
84 |
85 | # Count total pages in the dump file for tqdm
86 | with open(xml_file_path, 'rb') as file:
87 | dump = mwxml.Dump.from_file(file)
88 | total_pages = sum(1 for _ in dump)
89 |
90 | # Process pages with tqdm progress bar
91 | with open(xml_file_path, 'rb') as file:
92 | dump = mwxml.Dump.from_file(file)
93 | pages_count = 0
94 | for page in tqdm(dump, total=total_pages, desc="Processing Wikipedia Pages"):
95 | if not self.production and pages_count >= 500:
96 | break
97 | title = page.title
98 | # Skip non-article pages (e.g., User:, Talk:, etc.)
99 | if ':' in title:
100 | continue
101 | # Construct source link
102 | source_link = f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}"
103 | # Get the latest revision text
104 | try:
105 | revision = next(page) # Assuming the first revision is the latest
106 | except StopIteration:
107 | continue # No revisions, skip page
108 | text = revision.text
109 |
110 | if text is None or text.startswith("#REDIRECT"):
111 | continue # Skip void and REDIRECT pages
112 |
113 | # Parse wikitext to extract plain text
114 | wikicode = mwparserfromhell.parse(text)
115 | plain_text = wikicode.strip_code()
116 |
117 | # Yield the data in the standardized format
118 | yield {
119 | 'title': title,
120 | 'source_link': source_link,
121 | 'text': plain_text,
122 | 'page_id': page.id,
123 | 'labels': ['Wikipedia'],
124 | 'metadata': json.dumps({
125 | 'id': page.id,
126 | 'title': title
127 | })
128 | }
129 | pages_count +=1
130 |
131 | def get_recent_changes(self, minutes: int = 5) -> Generator[Dict[str, Any], None, None]:
132 | """
133 | Fetch and process recent changes from Wikipedia.
134 |
135 | Args:
136 | minutes: Number of minutes to look back for changes (default: 5)
137 |
138 | Yields:
139 | Dict containing processed page information in the same format as process_data()
140 | """
141 | # Calculate the start time
142 | start_time = (datetime.now(timezone.utc) - timedelta(minutes=minutes)).strftime("%Y%m%d%H%M%S")
143 |
144 | # Parameters for the API request
145 | params = {
146 | "action": "query",
147 | "format": "json",
148 | "list": "recentchanges",
149 | "rcstart": start_time,
150 | "rcdir": "newer",
151 | "rcnamespace": "0", # Main namespace only
152 | "rclimit": "500", # Maximum allowed limit
153 | "rcprop": "title|ids|timestamp|comment|user|flags|sizes",
154 | }
155 |
156 | processed_pages = set() # Track processed pages to avoid duplicates
157 |
158 | while True:
159 | try:
160 | response = requests.get(self.api_base_url, params=params)
161 | response.raise_for_status()
162 | data = response.json()
163 |
164 | changes = data.get("query", {}).get("recentchanges", [])
165 |
166 | if not changes:
167 | break
168 |
169 | pages_count = 0
170 | for change in tqdm(changes, desc="Processing changes", unit="change"):
171 | if not self.production and pages_count >= 50:
172 | break
173 | page_id = change["pageid"]
174 | # Skip if we've already processed this page
175 | if page_id in processed_pages:
176 | continue
177 | processed_pages.add(page_id)
178 |
179 | # Get the current content of the page
180 | content_params = {
181 | "action": "query",
182 | "format": "json",
183 | "prop": "revisions",
184 | "pageids": page_id,
185 | "rvprop": "content",
186 | "rvslots": "main"
187 | }
188 |
189 | try:
190 | content_response = requests.get(self.api_base_url, params=content_params)
191 | content_response.raise_for_status()
192 | content_data = content_response.json()
193 |
194 | # Extract the page content
195 | page = next(iter(content_data["query"]["pages"].values()))
196 | revision = page["revisions"][0]
197 | text = revision["slots"]["main"]["*"]
198 |
199 | # Skip redirects
200 | if text.startswith("#REDIRECT"):
201 | continue
202 |
203 | title = page["title"]
204 | source_link = f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}"
205 |
206 | # Parse wikitext to extract plain text
207 | wikicode = mwparserfromhell.parse(text)
208 | plain_text = wikicode.strip_code()
209 |
210 | yield {
211 | 'title': title,
212 | 'source_link': source_link,
213 | 'text': plain_text,
214 | 'page_id': page_id,
215 | 'labels': ['Wikipedia'],
216 | 'metadata': json.dumps({
217 | 'id': page_id,
218 | 'title': title,
219 | 'last_modified': change['timestamp'],
220 | 'editor': change['user'],
221 | 'comment': change.get('comment', '')
222 | })
223 | }
224 |
225 | # Be nice to the API
226 | sleep(0.1)
227 | pages_count += 1
228 |
229 | except Exception as e:
230 | print(f"Error processing page {page_id}: {str(e)}")
231 | continue
232 |
233 | # Check if there are more results
234 | if "continue" in data:
235 | params.update(data["continue"])
236 | else:
237 | break
238 |
239 | except Exception as e:
240 | print(f"Error fetching recent changes: {str(e)}")
241 | break
--------------------------------------------------------------------------------
/start.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Load environment variables from .env file
4 | set -a
5 | source .env
6 | set +a
7 |
8 | # List of external volumes to check and create if needed
9 | VOLUMES=("weaviate_data" "updater" "ollama")
10 | # Name of the tunnel network to check (leave empty if no extra network is needed)
11 | NETWORKS=""
12 |
13 | # Function to create a volume if it does not exist
14 | create_volume_if_not_exists() {
15 | local volume_name=$1
16 | if ! docker volume ls | grep -q "$volume_name"; then
17 | echo "Creating external volume '$volume_name'..."
18 | docker volume create "$volume_name"
19 | fi
20 | }
21 |
22 | # Function to create a network if it does not exist
23 | create_network_if_not_exists() {
24 | local network_name=$1
25 | if ! docker network ls | grep -q "$network_name"; then
26 | echo "Creating network '$network_name'..."
27 | docker network create "$network_name"
28 | fi
29 | }
30 |
31 | # Loop through each volume in the list and create if it doesn't exist
32 | for volume in "${VOLUMES[@]}"; do
33 | create_volume_if_not_exists "$volume"
34 | done
35 |
36 | # Check for the tunnel network and create it if it doesn't exist (skipped when NETWORKS is empty)
37 | [ -n "$NETWORKS" ] && create_network_if_not_exists "$NETWORKS"
38 |
39 | # Check if PRODUCTION is set to False
40 | if [ "$PRODUCTION" == "False" ]; then
41 | echo "Testing with small docker-compose-s.yml ..."
42 | docker compose --env-file .env -f docker-compose-s.yml up --build
43 | else
44 | echo "Deploying with docker-compose.yml ..."
45 | docker compose --env-file .env -f docker-compose.yml up -d --build
46 | fi
--------------------------------------------------------------------------------