├── agents ├── __init__.py ├── evolution_agent │ ├── __init__.py │ ├── breadth.py │ ├── evolver.py │ └── depth.py ├── client_initialization.py ├── schema_agent.py └── generation_agent.py ├── assets ├── logo.png ├── Local_File.png ├── local_file_dataset.gif └── local_file_dataset.mp4 ├── .gitignore ├── .env.example ├── configuration.py ├── requirements.txt ├── docker-compose.yaml ├── prompts.py ├── schemas.py ├── LICENSE ├── utils.py ├── qdrant_setup.py ├── main.py ├── README.md └── workflow.py /agents/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /agents/evolution_agent/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oqura-ai/local-datagen-cli/HEAD/assets/logo.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .venv 2 | venv 3 | qdrant_data 4 | .env 5 | **/__pycache__ 6 | final_dataset.json 7 | resource -------------------------------------------------------------------------------- /assets/Local_File.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oqura-ai/local-datagen-cli/HEAD/assets/Local_File.png -------------------------------------------------------------------------------- /assets/local_file_dataset.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oqura-ai/local-datagen-cli/HEAD/assets/local_file_dataset.gif -------------------------------------------------------------------------------- /assets/local_file_dataset.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oqura-ai/local-datagen-cli/HEAD/assets/local_file_dataset.mp4 -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | MISTRAL_API_KEY= 2 | OPENAI_API_KEY= 3 | QDRANT_URL=http://localhost:6333 4 | COLLECTION_NAME=knowledge_base 5 | EMBEDDING_MODEL=BAAI/bge-small-en-v1.5 -------------------------------------------------------------------------------- /agents/client_initialization.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | from openai import OpenAI 4 | from google import genai 5 | 6 | load_dotenv() 7 | 8 | openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) -------------------------------------------------------------------------------- /configuration.py: -------------------------------------------------------------------------------- 1 | CONFIGURATION = { 2 | "rows_per_context": 5, # Number of QAs or rows generated per chunk 3 | "evolution_depth": 1, # How much transformation/evolution to apply (1 = minimal, 3 = very complex) 4 | } 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | python-dotenv 2 | qdrant-client[fastembed] 3 | openai 4 | 
mistralai 5 | pymupdf 6 | python-pptx 7 | python-docx 8 | psycopg2-binary 9 | pillow 10 | pydantic 11 | python-multipart 12 | rich 13 | pyfiglet 14 | google-genai 15 | pdfplumber -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | qdrant: 3 | image: qdrant/qdrant:latest 4 | restart: always 5 | container_name: qdrant_localgen 6 | ports: 7 | - 6333:6333 8 | - 6334:6334 9 | expose: 10 | - 6333 11 | - 6334 12 | - 6335 13 | configs: 14 | - source: qdrant_config 15 | target: /qdrant/config/production.yaml 16 | volumes: 17 | - ./qdrant_data:/qdrant/storage 18 | 19 | configs: 20 | qdrant_config: 21 | content: | 22 | log_level: INFO -------------------------------------------------------------------------------- /prompts.py: -------------------------------------------------------------------------------- 1 | schema_generate_prompt = """You are an autonomous schema-generating agent designed to construct data schemas for fine-tuning or training LLMs on user-specified tasks. Your job is to analyze the user's task description and output a structured dataset schema definition.\n\n Ensure each field in the schema is useful for training and fine-tuning, well-typed, and annotated. Focus on tasks involving natural language input, structured context (like database schemas), and model output (like SQL queries, code, responses, etc.).""" -------------------------------------------------------------------------------- /agents/schema_agent.py: -------------------------------------------------------------------------------- 1 | import json 2 | from dotenv import load_dotenv 3 | 4 | from schemas import DatasetSchema 5 | from agents.client_initialization import openai_client 6 | from prompts import schema_generate_prompt 7 | 8 | load_dotenv() 9 | 10 | def generate_dataset_schema( 11 | user_concept: str, model: str = "gpt-4.1-mini" 12 | ) -> DatasetSchema: 13 | response = openai_client.responses.parse( 14 | model=model, 15 | input=[ 16 | {"role": "system", "content": schema_generate_prompt}, 17 | {"role": "user", "content": user_concept}, 18 | ], 19 | text_format=DatasetSchema, 20 | ) 21 | 22 | result = response.output_parsed 23 | return result -------------------------------------------------------------------------------- /agents/evolution_agent/breadth.py: -------------------------------------------------------------------------------- 1 | base_instruction = """I want you to act as a Dataset Row Creator. 2 | You will receive a single JSON row (not a full dataset), and your goal is to create a brand-new data row that belongs to the **same domain** as the input. 3 | The new row must: 4 | - Follow the **same JSON schema** (same keys, structure). 5 | - Be of **similar complexity and length**. 6 | - Introduce **different but related content** (not a simple rewording). 7 | - Be fully understandable and valid as a training data row. 8 | 9 | Do NOT use terms like '#Input Row#', '#New Row#', 'original row', or 'created row' in your response. 
10 | """ 11 | 12 | def createBreadthPrompt(row_json_str): 13 | prompt = base_instruction 14 | prompt += "\n#Input Row#:\n{}\n".format(row_json_str) 15 | prompt += "#New Row#:\n" 16 | return prompt 17 | -------------------------------------------------------------------------------- /schemas.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, Field 2 | from typing import List, Literal, Dict, Any 3 | from enum import Enum 4 | 5 | class QAItem(BaseModel): 6 | id: int 7 | question: str 8 | answer: str 9 | difficulty: Literal["basic", "intermediate", "advanced"] 10 | type: Literal["theoretical", "practical", "code", "application"] 11 | 12 | class QAList(BaseModel): 13 | items:List[QAItem] 14 | 15 | class FieldType(str, Enum): 16 | string = "string" 17 | number = "number" 18 | array = "array" 19 | # object = "object" 20 | boolean = "boolean" 21 | # date = "date" 22 | 23 | class SchemaField(BaseModel): 24 | key: str = Field(..., description="The unique identifier for the field") 25 | type: FieldType = Field(..., description="The data type of the field") 26 | description: str = Field(..., description="Some descriptive information for the field") 27 | 28 | class DatasetSchema(BaseModel): 29 | generated_schema: list[SchemaField] 30 | 31 | class DatasetRecords(BaseModel): 32 | dataset:List[Dict[str, Any]] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Swaraj Biswal 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import List 3 | 4 | from schemas import SchemaField 5 | from configuration import CONFIGURATION 6 | 7 | def process_datagen_prompt(fields: List[SchemaField]) -> str: 8 | schema_instruction = {field.key: field.description for field in fields} 9 | 10 | field_string = f"""## Response Format 11 | Always respond with a valid JSON array of objects: 12 | [ 13 | {json.dumps(schema_instruction, indent=2)}, 14 | // Additional entries... 15 | ] 16 | """ 17 | return f""" 18 | You are an expert Question-Answer generation assistant who has the skills of a polymath. 
Your task is to analyze content provided by the user and generate a comprehensive set of questions with detailed answers based on that content. 19 | 20 | ## Core Instructions 21 | 22 | 1. When presented with content, carefully analyze it to identify key concepts, important details, practical applications, and potential challenges or edge cases. 23 | 24 | 2. Generate a diverse set of questions and answers that thoroughly cover the provided content. Your response must be in valid JSON format. 25 | 26 | 3. Format code properly within JSON strings, using appropriate escape characters for special characters. 27 | 28 | 4. Number of dataset rows must be {CONFIGURATION["rows_per_context"]} 29 | 30 | {field_string} 31 | """ 32 | 33 | -------------------------------------------------------------------------------- /agents/evolution_agent/evolver.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | 4 | from agents.generation_agent import generation_agent 5 | from agents.evolution_agent.depth import createConstraintsPrompt, createDeepenPrompt, createConcretizingPrompt, createReasoningPrompt 6 | from agents.evolution_agent.breadth import createBreadthPrompt 7 | from configuration import CONFIGURATION 8 | 9 | def evolve_dataset(dataset): 10 | current_dataset = dataset 11 | for i in range(CONFIGURATION["evolution_depth"]): 12 | evolved_dataset = [] 13 | for dataset_row in current_dataset: 14 | dataset_row = json.dumps([dataset_row]) 15 | evol_prompts = [] 16 | evol_prompts.append(createConstraintsPrompt(dataset_row)) 17 | evol_prompts.append(createDeepenPrompt(dataset_row)) 18 | evol_prompts.append(createConcretizingPrompt(dataset_row)) 19 | evol_prompts.append(createReasoningPrompt(dataset_row)) 20 | evol_prompts.append(createBreadthPrompt(dataset_row)) 21 | 22 | selected_evol_prompt = random.choice(evol_prompts) 23 | evolved_dataset_row = generation_agent(selected_evol_prompt, system_prompt="Always return the same schema as the input dataset no matter what so that it can be parsed later.") 24 | evolved_dataset.extend(evolved_dataset_row) 25 | dataset.extend(evolved_dataset) 26 | current_dataset = evolved_dataset 27 | return dataset 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /agents/generation_agent.py: -------------------------------------------------------------------------------- 1 | import time 2 | import json 3 | from dotenv import load_dotenv 4 | from pydantic import ValidationError 5 | from openai import RateLimitError, OpenAIError 6 | from agents.client_initialization import openai_client 7 | 8 | from schemas import DatasetRecords 9 | 10 | load_dotenv() 11 | 12 | def generation_agent(content, system_prompt, model="gpt-4.1-mini", retries=3, base_wait=2): 13 | for attempt in range(retries): 14 | try: 15 | response = openai_client.responses.create( 16 | model=model, 17 | input=[ 18 | {"role": "system", "content": system_prompt}, 19 | {"role": "user", "content": content} 20 | ], 21 | temperature=0.2, 22 | ) 23 | 24 | raw_text = response.output_text.strip() 25 | 26 | if raw_text.startswith("```json"): 27 | raw_text = raw_text[len("```json"):].lstrip() 28 | elif raw_text.startswith("```"): 29 | raw_text = raw_text[len("```"):].lstrip() 30 | 31 | if raw_text.endswith("```"): 32 | raw_text = raw_text[:-3].rstrip() 33 | 34 | parsed_json = json.loads(raw_text) 35 | final_package = {"dataset": parsed_json} 36 | validated = DatasetRecords(**final_package) 
37 | 38 | return validated.dataset 39 | 40 | except json.JSONDecodeError as e: 41 | print(f"[JSON Parse Error] {e}") 42 | return [] 43 | 44 | except ValidationError as e: 45 | print(f"[Pydantic Validation Error] {e}") 46 | return [] 47 | 48 | except RateLimitError: 49 | wait_time = base_wait * (2 ** attempt) 50 | print(f"[Rate Limit] Retrying in {wait_time}s (Attempt {attempt + 1}/{retries})...") 51 | time.sleep(wait_time) 52 | 53 | except OpenAIError as e: 54 | print(f"[OpenAI Error] {e}") 55 | return [] 56 | 57 | print("[Rate limit Error] Exceeded retry attempts due to rate limiting.") 58 | return [] 59 | -------------------------------------------------------------------------------- /agents/evolution_agent/depth.py: -------------------------------------------------------------------------------- 1 | base_instruction = """I want you to act as a Dataset Row Evolver. 2 | You will receive a single JSON row (not the whole dataset), and your task is to modify this row in meaningful ways while keeping its schema exactly the same. 3 | You must NOT remove or rename any keys in the JSON structure. 4 | The number of rows is always 1 — treat this as a single data entry. 5 | Make sure the output stays valid JSON. 6 | 7 | The transformation strategy is: 8 | {} 9 | 10 | You may only modify field *values*, not keys. Keep changes small — around 10 to 20 words total. 11 | Avoid saying '#Input Dataset#', '#Modified Dataset#', 'original row', or 'evolved row' in your output.""" 12 | 13 | def createConstraintsPrompt(row_json_str): 14 | prompt = base_instruction.format("Add constraints, clarifiers, or qualifiers to some of the field values — for example, change 'category': 'food' to 'category': 'food (perishable)' or 'priority': 'high' to 'priority': 'high and time-sensitive'.") 15 | prompt += "\n\n#Input Dataset#:\n{}\n".format(row_json_str) 16 | prompt += "#Modified Dataset#:\n" 17 | return prompt 18 | 19 | def createDeepenPrompt(row_json_str): 20 | prompt = base_instruction.format("Make the content of some fields deeper or more layered — for example, change 'instruction': 'Write an essay about trees' to 'Write an essay about how deforestation impacts climate using real-world case studies'.") 21 | prompt += "\n\n#Input Dataset#:\n{}\n".format(row_json_str) 22 | prompt += "#Modified Dataset#:\n" 23 | return prompt 24 | 25 | def createConcretizingPrompt(row_json_str): 26 | prompt = base_instruction.format("Replace vague or generic values with more specific ones. For instance, change 'topic': 'science' to 'topic': 'quantum physics' or 'audience': 'students' to 'audience': 'final-year computer science students'.") 27 | prompt += "\n\n#Input Dataset#:\n{}\n".format(row_json_str) 28 | prompt += "#Modified Dataset#:\n" 29 | return prompt 30 | 31 | def createReasoningPrompt(row_json_str): 32 | prompt = base_instruction.format("Wherever applicable, increase the need for multi-step reasoning in the field values. 
For example, turn a simple 'question' into a multi-part one that requires combining facts or drawing inferences.") 33 | prompt += "\n\n#Input Dataset#:\n{}\n".format(row_json_str) 34 | prompt += "#Modified Dataset#:\n" 35 | return prompt 36 | -------------------------------------------------------------------------------- /qdrant_setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | from dotenv import load_dotenv 4 | from qdrant_client import QdrantClient, models 5 | 6 | load_dotenv() 7 | 8 | client = QdrantClient(url=os.getenv("QDRANT_URL")) 9 | collection_name = os.getenv("COLLECTION_NAME") 10 | model_name = os.getenv("EMBEDDING_MODEL") 11 | 12 | if not client.collection_exists(collection_name=collection_name): 13 | client.create_collection( 14 | collection_name=collection_name, 15 | vectors_config=models.VectorParams(size=384, distance=models.Distance.COSINE)) 16 | 17 | def retrieve_from_store(question: str, user_id: str, n_points: int = 3): 18 | results = client.query_points( 19 | collection_name=collection_name, 20 | query=models.Document(text=question, model=model_name), 21 | query_filter=models.Filter( 22 | must=[ 23 | models.FieldCondition( 24 | key="group_id", 25 | match=models.MatchValue( 26 | value=user_id, 27 | ), 28 | ) 29 | ] 30 | ), 31 | limit=n_points, 32 | ) 33 | return results.points 34 | 35 | def remove_data_from_store(user_id: str): 36 | client.delete( 37 | collection_name=collection_name, 38 | points_selector=models.FilterSelector( 39 | filter=models.Filter( 40 | must=[ 41 | models.FieldCondition( 42 | key="group_id", 43 | match=models.MatchValue( 44 | value=user_id, 45 | ), 46 | ) 47 | ] 48 | ) 49 | ) 50 | ) 51 | 52 | def rag_pipeline_setup(user_id, documents): 53 | client.upsert( 54 | collection_name=collection_name, 55 | points=[ 56 | models.PointStruct( 57 | id=idx, 58 | vector=models.Document(text=document["page_content"], model=model_name), 59 | payload={"group_id": user_id, "document": document}, 60 | ) 61 | for idx, document in enumerate(documents) 62 | ],) 63 | 64 | def select_random_chunk(documents): 65 | if not documents: 66 | return None, None 67 | 68 | idx = random.randint(0, len(documents) - 1) 69 | selected_doc = documents[idx] 70 | 71 | content = f"filename:{selected_doc['filename']}\nPage_number:{selected_doc['page_number']}\nPage_Content: {selected_doc['page_content']}\n\n\n" 72 | 73 | return idx, content -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import json 2 | import asyncio 3 | 4 | from rich.console import Console 5 | from rich.panel import Panel 6 | from rich.prompt import Prompt 7 | from rich.table import Table 8 | from rich import box 9 | from pyfiglet import Figlet 10 | from datetime import datetime 11 | 12 | from workflow import generate_dataset_schema, generate_full_dataset, process_datagen_prompt 13 | 14 | console = Console() 15 | 16 | def render_banner(title: str = "Thesius.ai", subtitle: str = "Document to Dataset Generator"): 17 | figlet = Figlet(font="banner3-D", width=200) 18 | ascii_art = figlet.renderText(title) 19 | panel = Panel.fit( 20 | f"[bold cyan]{ascii_art}[/bold cyan]\n[green]{subtitle}[/green]", 21 | border_style="magenta", 22 | padding=(1, 4), 23 | title="[bold yellow]WELCOME[/bold yellow]", 24 | box=box.DOUBLE 25 | ) 26 | console.print(panel) 27 | 28 | def render_schema(schema_obj): 29 | if not 
hasattr(schema_obj, 'generated_schema'): 30 | print_section("SCHEMA GENERATION", str(schema_obj)) 31 | return 32 | 33 | table = Table(title=None, box=box.ASCII, header_style="bold magenta") 34 | table.add_column("Field", style="cyan", no_wrap=True) 35 | table.add_column("Type", style="green") 36 | table.add_column("Description", style="white") 37 | 38 | for field in schema_obj.generated_schema: 39 | field_type = str(field.type.value if hasattr(field.type, "value") else field.type) 40 | table.add_row(field.key, field_type, field.description) 41 | 42 | console.print("\n") 43 | print_section("SCHEMA GENERATION") 44 | console.print(table) 45 | 46 | def print_section(title: str, content: str = "", width: int = 100): 47 | title_bar = f" {title} ".center(width, "=") 48 | console.print(f"\n[bold cyan]{title_bar}[/bold cyan]") 49 | if content: 50 | console.print(content) 51 | 52 | def status(message: str, style="bold white"): 53 | console.print(f"[{style}]• {message}[/{style}]") 54 | 55 | def success(message: str): 56 | console.print(f"[bold green]{message}[/bold green]") 57 | 58 | def warning(message: str): 59 | console.print(f"[bold yellow]{message}[/bold yellow]") 60 | 61 | def error(message: str): 62 | console.print(f"[bold red]{message}[/bold red]") 63 | 64 | def get_user_feedback(fields, base_query): 65 | while True: 66 | render_schema(fields) 67 | status("Do you want to proceed with this schema?") 68 | feedback = input("Type your feedback or 'continue' to proceed: ").strip() 69 | 70 | if not feedback: 71 | warning("Input was empty. Please provide feedback or type 'continue'.") 72 | continue 73 | 74 | if feedback.lower() == "continue": 75 | return "continue", base_query 76 | else: 77 | updated_query = f"""Parent query: {base_query} 78 | 79 | Previous generated schema: 80 | {fields.generated_schema} 81 | 82 | Suggestion from the user: 83 | {feedback} 84 | """ 85 | return feedback, updated_query 86 | 87 | async def run_and_save(directory_path, system_prompt): 88 | status("Generating dataset rows...") 89 | dataset_rows = None 90 | 91 | async for message in generate_full_dataset(directory_path, system_prompt): 92 | if message.startswith("data:__DONE__:"): 93 | data_str = message[len("data:__DONE__:"):] 94 | data_json = json.loads(data_str) 95 | dataset_rows = data_json['rows'] 96 | break 97 | else: 98 | print(message.strip()) 99 | 100 | if dataset_rows: 101 | # Create filename based on current date & time 102 | timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 103 | output_filename = f"final_dataset_{timestamp}.json" 104 | 105 | with open(output_filename, "w", encoding="utf-8") as f: 106 | json.dump(dataset_rows, f, indent=2, ensure_ascii=False) 107 | 108 | success(f"Dataset saved to '{output_filename}'.") 109 | else: 110 | error("Dataset generation failed or returned empty.") 111 | 112 | def main(): 113 | render_banner() 114 | 115 | directory_path = Prompt.ask("[bold yellow]Enter the directory path[/bold yellow]").strip() 116 | base_query = Prompt.ask("[bold yellow]Enter additional instruction and information about the directory source[/bold yellow]").strip() 117 | 118 | current_query = base_query 119 | feedback = None 120 | 121 | # Feedback loop 122 | while feedback != "continue": 123 | fields = generate_dataset_schema(current_query) 124 | feedback, current_query = get_user_feedback(fields, base_query) 125 | 126 | # Generate dataset 127 | system_prompt = process_datagen_prompt(fields.generated_schema) 128 | asyncio.run(run_and_save(directory_path, system_prompt)) 129 | 130 | if __name__ 
== "__main__": 131 | main() 132 | 133 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | [logo: Oqura.ai] 3 | 4 | 5 | [badges: GitHub Stars, License, Last Commit, Python Version, Contributors] 10 | 

11 | 12 | 13 | ## Overview 14 | 15 | Oqura's local-datagen-cli is a terminal tool for generating structured datasets from local files like PDFs, Word docs, images, and text. You point it at your files and describe the kind of dataset you want. It extracts the content, uses semantic search to understand and gather relevant context, applies your instructions through a generated schema, and outputs clean, structured data. Perfect for converting raw or unstructured local documents into ready-to-use datasets for training, analysis, or experimentation, all without manual formatting. 16 | 17 | 18 | ## How It Works 19 | 20 | - takes the path to a local directory containing any of the supported file types (PDF, DOCX, JPG, TXT, etc.) 21 | - extracts text from each document 22 | - splits the content page-wise into smaller chunks 23 | - randomly selects a chunk to use as a reference 24 | - runs a semantic similarity search using Qdrant to find related chunks 25 | - gathers similar chunks to build a context window 26 | - formats the gathered context cleanly 27 | - generates structured data using an instruction query and generated schema 28 | - evolves and improves the dataset iteratively 29 | - combines generated samples into a complete dataset 30 | - exports the final dataset as a JSON file via the terminal 31 | 32 | 33 | 34 | ## Workflow 35 | 36 | This diagram shows how the tool takes a local file and an instruction, extracts and understands the content, and turns it into a structured dataset. 37 | 38 | ![Local File Dataset Workflow](./assets/Local_File.png) 39 | 40 | 41 | 42 | ## Getting Started 43 | 44 | Follow these steps to set up and run the project locally. 45 | 46 | ### Prerequisite: Install `uv` 47 | 48 | `uv` is required to manage the virtual environment and dependencies. 49 | 50 | You can download it from the official [uv GitHub repository](https://github.com/astral-sh/uv), which includes platform-specific installation instructions. 51 | 52 | ### 1. Clone the Repository 53 | 54 | ```bash 55 | git clone https://github.com/Oqura-ai/local-datagen-cli.git 56 | cd local-datagen-cli 57 | ``` 58 | 59 | ### 2. Create a Virtual Environment 60 | 61 | Use `uv` to create a virtual environment: 62 | 63 | ```bash 64 | uv venv 65 | ``` 66 | 67 | ### 3. Activate the Virtual Environment 68 | 69 | Activate the environment depending on your OS: 70 | 71 | **Windows:** 72 | ```bash 73 | .venv\Scripts\activate 74 | ``` 75 | 76 | **macOS/Linux:** 77 | ```bash 78 | source .venv/bin/activate 79 | ``` 80 | 81 | ### 4. Set Up Environment Variables 82 | 83 | Copy the example `.env` file and add your API keys: 84 | 85 | ```bash 86 | cp .env.example .env 87 | ``` 88 | 89 | Open the `.env` file in a text editor and fill in the required fields: 90 | 91 | ``` 92 | OPENAI_API_KEY=your_openai_api_key_here 93 | MISTRAL_API_KEY=your_mistral_api_key_here 94 | 95 | # defaults 96 | QDRANT_URL=http://localhost:6333 97 | COLLECTION_NAME=knowledge_base 98 | EMBEDDING_MODEL=BAAI/bge-small-en-v1.5 99 | ``` 100 | 101 | These keys are essential for the application to work correctly. 102 | 103 | ### 5. Install Dependencies 104 | 105 | Install required packages using: 106 | 107 | ```bash 108 | uv pip install -r requirements.txt 109 | ``` 110 | 111 | ### 6. Set Up Docker for the Qdrant Vector DB 112 | 113 | Make sure you have Docker and Docker Compose installed. Then start the required services (e.g., Qdrant) using: 114 | 115 | ```bash 116 | docker-compose up --build 117 | ``` 118 | 119 | This will spin up the Qdrant service; keep it running while you use the tool. 
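Before moving on, you can optionally confirm that the Qdrant container is reachable. The snippet below is not part of the repository; it is a minimal sketch that reuses the already-installed `qdrant-client` package and assumes the default `QDRANT_URL` from `.env.example`:

```python
# check_qdrant.py (hypothetical helper, not included in the repo)
import os

from dotenv import load_dotenv
from qdrant_client import QdrantClient

load_dotenv()

# Fall back to the same default URL that .env.example ships with
client = QdrantClient(url=os.getenv("QDRANT_URL", "http://localhost:6333"))

# get_collections() makes a real round trip to the server and raises if Qdrant is unreachable
collections = client.get_collections()
print("Qdrant is up. Existing collections:", [c.name for c in collections.collections])
```

If this prints without an error, you are good to go; the `knowledge_base` collection itself is created automatically the first time `qdrant_setup.py` is imported.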
120 | 121 | ### 7. Run the Application 122 | 123 | Once the environment and services are ready, start the application: 124 | 125 | ```bash 126 | python main.py 127 | ``` 128 | 129 | You're all set! The application will guide you through the dataset creation process step by step, and the final dataset will be saved in the project root as a timestamped `final_dataset_<timestamp>.json` file. 130 | 131 | ### Optional: `configuration.py` 132 | 133 | You can customize how the tool behaves using the `configuration.py` file. It lets you adjust two parameters: 134 | 135 | ```python 136 | CONFIGURATION = { 137 | "rows_per_context": 5, # Number of QAs or rows generated per chunk 138 | "evolution_depth": 1, # How much transformation/evolution to apply (1 = minimal, 3 = very complex) 139 | } 140 | ``` 141 | 142 | ## Authors 143 | 144 | - [Swaraj Biswal](https://github.com/SWARAJ-42) 145 | - [Swadhin Biswal](https://github.com/swadhin505) 146 | 147 | 148 | ## Contributing 149 | 150 | If something here could be improved, please open an issue or submit a pull request. 151 | 152 | ## License 153 | 154 | This project is licensed under the MIT License. See the `LICENSE` file for more details. 155 | 156 | -------------------------------------------------------------------------------- /workflow.py: -------------------------------------------------------------------------------- 1 | import json 2 | import base64 3 | import io 4 | import pymupdf 5 | import fitz 6 | import os 7 | from pptx import Presentation 8 | from PIL import Image 9 | from docx import Document 10 | from mistralai import Mistral 11 | import pdfplumber 12 | import asyncio 13 | 14 | from qdrant_setup import * 15 | from agents.generation_agent import generation_agent 16 | from agents.schema_agent import generate_dataset_schema 17 | from agents.evolution_agent.evolver import evolve_dataset 18 | from utils import process_datagen_prompt 19 | 20 | 21 | client = Mistral(api_key=os.getenv("MISTRAL_API_KEY")) 22 | 23 | def encode_pdf(pdf_bytes: bytes): 24 | """Encode PDF bytes to a base64 string.""" 25 | try: 26 | return base64.b64encode(pdf_bytes).decode("utf-8") 27 | except Exception as e: 28 | print(f"Error encoding PDF to base64: {e}") 29 | return None 30 | 31 | def convert_to_pdf(file_bytes: bytes, filename: str): 32 | extension = filename.lower().split('.')[-1] 33 | 34 | if extension == "pdf": 35 | return file_bytes 36 | 37 | buffer = io.BytesIO() 38 | pdf = fitz.open() 39 | 40 | if extension in {"jpg", "jpeg", "png", "gif", "webp", "bmp"}: 41 | img = Image.open(io.BytesIO(file_bytes)).convert("RGB") 42 | img.save(buffer, format="PDF") 43 | return buffer.getvalue() 44 | 45 | elif extension in {"txt", "md"}: 46 | text = file_bytes.decode("utf-8", errors="ignore") 47 | lines = text.splitlines() 48 | pdf = fitz.open() 49 | max_lines_per_page = 40 # You can adjust this limit 50 | 51 | for i in range(0, len(lines), max_lines_per_page): 52 | page = pdf.new_page() 53 | chunk_text = "\n".join(lines[i:i + max_lines_per_page]) 54 | page.insert_text((72, 72), chunk_text) 55 | 56 | pdf.save(buffer) 57 | return buffer.getvalue() 58 | 59 | 60 | elif extension in {"doc", "docx"}: 61 | doc = Document(io.BytesIO(file_bytes)) 62 | pdf = fitz.open() 63 | paragraphs = [para.text for para in doc.paragraphs] 64 | max_paras_per_page = 20 # Adjustable limit 65 | 66 | for i in range(0, len(paragraphs), max_paras_per_page): 67 | page = pdf.new_page() 68 | chunk_text = "\n".join(paragraphs[i:i + max_paras_per_page]) 69 | page.insert_text((72, 72), chunk_text) 70 | 71 | pdf.save(buffer) 72 
| return buffer.getvalue() 73 | 74 | 75 | elif extension == "pptx": 76 | prs = Presentation(io.BytesIO(file_bytes)) 77 | for slide in prs.slides: 78 | text = "" 79 | for shape in slide.shapes: 80 | if hasattr(shape, "text"): 81 | text += shape.text + "\n" 82 | page = pdf.new_page() 83 | page.insert_text((72, 72), text) 84 | pdf.save(buffer) 85 | return buffer.getvalue() 86 | 87 | else: 88 | raise ValueError(f"Unsupported file type: {extension}") 89 | 90 | def process_page(idx, ocr_response=None): 91 | try: 92 | if ocr_response and hasattr(ocr_response, 'pages') and idx < len(ocr_response.pages): 93 | return ocr_response.pages[idx].markdown 94 | else: 95 | return f"Error: Page {idx + 1} not available in OCR response" 96 | except Exception as e: 97 | return f"Error processing page {idx + 1}: {e}" 98 | 99 | def extract_text_from_pdf(pdf_bytes: bytes, advanced: bool = True): 100 | extracted_text = [] 101 | 102 | if not advanced: 103 | # Simple text extraction using PyMuPDF (no OCR) 104 | try: 105 | with pymupdf.open(stream=pdf_bytes, filetype="pdf") as doc: 106 | for page in doc: 107 | text = page.get_text() 108 | extracted_text.append(text) 109 | return extracted_text 110 | except Exception as e: 111 | return [f"Error during simple text extraction: {e}"] 112 | 113 | # Advanced mode: Use Mistral OCR 114 | try: 115 | with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf: 116 | total_pages = len(pdf.pages) 117 | except Exception: 118 | try: 119 | with pymupdf.open(stream=pdf_bytes, filetype="pdf") as doc: 120 | total_pages = len(doc) 121 | except Exception as e: 122 | return [f"Error getting total pages: {e}"] 123 | 124 | encoded_pdf = encode_pdf(pdf_bytes) 125 | 126 | try: 127 | response = client.ocr.process( 128 | model="mistral-ocr-latest", 129 | document={ 130 | "type": "document_url", 131 | "document_url": f"data:application/pdf;base64,{encoded_pdf}" 132 | }, 133 | include_image_base64=True 134 | ) 135 | except Exception as e: 136 | return [f"Error during OCR processing: {e}"] 137 | 138 | for idx in range(total_pages): 139 | page_text = process_page(idx, ocr_response=response) 140 | extracted_text.append(page_text) 141 | 142 | return extracted_text 143 | 144 | def create_chunks(directory_path: str): 145 | file_paths = [ 146 | os.path.abspath(os.path.join(directory_path, f)) 147 | for f in os.listdir(directory_path) 148 | if os.path.isfile(os.path.join(directory_path, f)) 149 | ] 150 | 151 | Chunks = [] 152 | 153 | for idx, file_path in enumerate(file_paths): 154 | filename = os.path.basename(file_path) 155 | extension = filename.lower().split('.')[-1] 156 | 157 | with open(file_path, "rb") as f: 158 | file_bytes = f.read() 159 | 160 | converted_pdf_bytes = convert_to_pdf(file_bytes, filename) 161 | print(f"Processing file: {filename}") 162 | 163 | # Decide mode based on file type 164 | if extension in {"txt", "md"}: 165 | pages = extract_text_from_pdf(converted_pdf_bytes, advanced=False) 166 | else: 167 | pages = extract_text_from_pdf(converted_pdf_bytes, advanced=True) 168 | 169 | for page_number, page in enumerate(pages, start=1): 170 | Chunks.append({ 171 | "filename": filename, 172 | "page_number": page_number, 173 | "page_content": page 174 | }) 175 | 176 | return Chunks 177 | 178 | def create_records(page_data: str, system_prompt: str): 179 | try: 180 | datarecords = generation_agent(page_data, system_prompt=system_prompt) 181 | return datarecords 182 | except Exception as e: 183 | print(f"QA generation failed for a page: {str(e)}") 184 | return [] 185 | 186 | async def 
generate_full_dataset(directory_path: str, system_prompt: str): 187 | Chunks = create_chunks(directory_path) 188 | 189 | dataset = [] 190 | 191 | yield f"⚙️ Setting things up...\n\n" 192 | rag_pipeline_setup(user_id="test_user", documents=Chunks) 193 | 194 | Temp_Chunks = Chunks.copy() 195 | while len(Temp_Chunks) != 0: 196 | print(f"🧠 Generating your dataset - {int((len(Chunks)-len(Temp_Chunks))/len(Chunks) * 100)} % done") 197 | idx, current_chunk = select_random_chunk(Temp_Chunks) 198 | results = retrieve_from_store(current_chunk, user_id="test_user") 199 | 200 | # Context prep 201 | context = "\n\n\n\n".join(f"filename:{result.payload['document']['filename']}\nPage_number:{result.payload['document']['page_number']}\nPage_Content: {result.payload['document']['page_content']}" for result in results) 202 | 203 | page_qas = create_records(context, system_prompt) 204 | # evolve_dataset returns the original rows plus their evolved variants, so extend once below 205 | page_qas = evolve_dataset(page_qas) 206 | dataset.extend(page_qas) 207 | 208 | similar_chunks = [result.payload['document'] for result in results] 209 | 210 | for chunk in similar_chunks: 211 | if chunk in Temp_Chunks: 212 | Temp_Chunks.remove(chunk) 213 | 214 | remove_data_from_store(user_id="test_user") 215 | 216 | yield f"Dataset generation completed with {len(dataset)} rows!\n\n" 217 | yield f"data:__DONE__:{json.dumps({'rows': dataset})}\n\n" 218 | --------------------------------------------------------------------------------
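As a closing note, `generate_full_dataset` is an async generator that streams progress strings and finishes with a single `data:__DONE__:` message carrying the rows as JSON, which is exactly how `run_and_save` in `main.py` consumes it. The sketch below shows the same consumption pattern outside the CLI; the directory path and task description are illustrative placeholders, not values from the repository:

```python
# programmatic_run.py: a minimal sketch, assuming the same .env and Qdrant setup as main.py
import asyncio
import json

from workflow import generate_dataset_schema, generate_full_dataset, process_datagen_prompt

async def build_dataset(directory_path: str, task_description: str) -> list[dict]:
    # Generate a schema from the task description, then turn it into the generation system prompt
    schema = generate_dataset_schema(task_description)
    system_prompt = process_datagen_prompt(schema.generated_schema)

    rows: list[dict] = []
    async for message in generate_full_dataset(directory_path, system_prompt):
        if message.startswith("data:__DONE__:"):
            # The final message carries the whole dataset as JSON after the marker
            rows = json.loads(message[len("data:__DONE__:"):])["rows"]
        else:
            print(message.strip())  # progress updates
    return rows

if __name__ == "__main__":
    # "./resource" and the task description below are placeholders for illustration
    dataset = asyncio.run(build_dataset("./resource", "Q&A pairs about the provided documents"))
    print(f"Generated {len(dataset)} rows")
```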