├── agents
│   ├── __init__.py
│   ├── evolution_agent
│   │   ├── __init__.py
│   │   ├── breadth.py
│   │   ├── evolver.py
│   │   └── depth.py
│   ├── client_initialization.py
│   ├── schema_agent.py
│   └── generation_agent.py
├── assets
│   ├── logo.png
│   ├── Local_File.png
│   ├── local_file_dataset.gif
│   └── local_file_dataset.mp4
├── .gitignore
├── .env.example
├── configuration.py
├── requirements.txt
├── docker-compose.yaml
├── prompts.py
├── schemas.py
├── LICENSE
├── utils.py
├── qdrant_setup.py
├── main.py
├── README.md
└── workflow.py

/agents/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/agents/evolution_agent/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/assets/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Oqura-ai/local-datagen-cli/HEAD/assets/logo.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .venv
2 | venv
3 | qdrant_data
4 | .env
5 | **/__pycache__
6 | final_dataset.json
7 | resource
--------------------------------------------------------------------------------
/assets/Local_File.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Oqura-ai/local-datagen-cli/HEAD/assets/Local_File.png
--------------------------------------------------------------------------------
/assets/local_file_dataset.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Oqura-ai/local-datagen-cli/HEAD/assets/local_file_dataset.gif
--------------------------------------------------------------------------------
/assets/local_file_dataset.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Oqura-ai/local-datagen-cli/HEAD/assets/local_file_dataset.mp4
--------------------------------------------------------------------------------
/.env.example:
--------------------------------------------------------------------------------
1 | MISTRAL_API_KEY=
2 | OPENAI_API_KEY=
3 | QDRANT_URL=http://localhost:6333
4 | COLLECTION_NAME=knowledge_base
5 | EMBEDDING_MODEL=BAAI/bge-small-en-v1.5
--------------------------------------------------------------------------------
/agents/client_initialization.py:
--------------------------------------------------------------------------------
1 | import os
2 | from dotenv import load_dotenv
3 | from openai import OpenAI
4 | from google import genai
5 | 
6 | load_dotenv()
7 | 
8 | openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
--------------------------------------------------------------------------------
/configuration.py:
--------------------------------------------------------------------------------
1 | CONFIGURATION = {
2 |     "rows_per_context": 5,  # Number of QAs or rows generated per chunk
3 |     "evolution_depth": 1,  # How much transformation/evolution to apply (1 = minimal, 3 = very complex)
4 | }
5 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | python-dotenv
2 | qdrant-client[fastembed]
3 | openai
4 | mistralai
5 | pymupdf
6 | python-pptx
7 | python-docx
8 | psycopg2-binary
9 | pillow
10 | pydantic
11 | python-multipart
12 | rich
13 | pyfiglet
14 | google-genai
15 | pdfplumber
--------------------------------------------------------------------------------
/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | services:
2 |   qdrant:
3 |     image: qdrant/qdrant:latest
4 |     restart: always
5 |     container_name: qdrant_localgen
6 |     ports:
7 |       - 6333:6333
8 |       - 6334:6334
9 |     expose:
10 |       - 6333
11 |       - 6334
12 |       - 6335
13 |     configs:
14 |       - source: qdrant_config
15 |         target: /qdrant/config/production.yaml
16 |     volumes:
17 |       - ./qdrant_data:/qdrant/storage
18 | 
19 | configs:
20 |   qdrant_config:
21 |     content: |
22 |       log_level: INFO
--------------------------------------------------------------------------------
/prompts.py:
--------------------------------------------------------------------------------
1 | schema_generate_prompt = """You are an autonomous schema-generating agent designed to construct data schemas for fine-tuning or training LLMs on user-specified tasks. Your job is to analyze the user's task description and output a structured dataset schema definition.\n\n Ensure each field in the schema is useful for training and fine-tuning, well-typed, and annotated. Focus on tasks involving natural language input, structured context (like database schemas), and model output (like SQL queries, code, responses, etc.)."""
--------------------------------------------------------------------------------
/agents/schema_agent.py:
--------------------------------------------------------------------------------
1 | import json
2 | from dotenv import load_dotenv
3 | 
4 | from schemas import DatasetSchema
5 | from agents.client_initialization import openai_client
6 | from prompts import schema_generate_prompt
7 | 
8 | load_dotenv()
9 | 
10 | def generate_dataset_schema(
11 |     user_concept: str, model: str = "gpt-4.1-mini"
12 | ) -> DatasetSchema:
13 |     response = openai_client.responses.parse(
14 |         model=model,
15 |         input=[
16 |             {"role": "system", "content": schema_generate_prompt},
17 |             {"role": "user", "content": user_concept},
18 |         ],
19 |         text_format=DatasetSchema,
20 |     )
21 | 
22 |     result = response.output_parsed
23 |     return result
--------------------------------------------------------------------------------
/agents/evolution_agent/breadth.py:
--------------------------------------------------------------------------------
1 | base_instruction = """I want you to act as a Dataset Row Creator.
2 | You will receive a single JSON row (not a full dataset), and your goal is to create a brand-new data row that belongs to the **same domain** as the input.
3 | The new row must:
4 | - Follow the **same JSON schema** (same keys, structure).
5 | - Be of **similar complexity and length**.
6 | - Introduce **different but related content** (not a simple rewording).
7 | - Be fully understandable and valid as a training data row.
8 | 
9 | Do NOT use terms like '#Input Row#', '#New Row#', 'original row', or 'created row' in your response.
10 | """
11 | 
12 | def createBreadthPrompt(row_json_str):
13 |     prompt = base_instruction
14 |     prompt += "\n#Input Row#:\n{}\n".format(row_json_str)
15 |     prompt += "#New Row#:\n"
16 |     return prompt
17 | 
--------------------------------------------------------------------------------
/schemas.py:
--------------------------------------------------------------------------------
1 | from pydantic import BaseModel, Field
2 | from typing import List, Literal, Dict, Any
3 | from enum import Enum
4 | 
5 | class QAItem(BaseModel):
6 |     id: int
7 |     question: str
8 |     answer: str
9 |     difficulty: Literal["basic", "intermediate", "advanced"]
10 |     type: Literal["theoretical", "practical", "code", "application"]
11 | 
12 | class QAList(BaseModel):
13 |     items: List[QAItem]
14 | 
15 | class FieldType(str, Enum):
16 |     string = "string"
17 |     number = "number"
18 |     array = "array"
19 |     # object = "object"
20 |     boolean = "boolean"
21 |     # date = "date"
22 | 
23 | class SchemaField(BaseModel):
24 |     key: str = Field(..., description="The unique identifier for the field")
25 |     type: FieldType = Field(..., description="The data type of the field")
26 |     description: str = Field(..., description="Some descriptive information for the field")
27 | 
28 | class DatasetSchema(BaseModel):
29 |     generated_schema: list[SchemaField]
30 | 
31 | class DatasetRecords(BaseModel):
32 |     dataset: List[Dict[str, Any]]
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2025 Swaraj Biswal
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | import json
2 | from typing import List
3 | 
4 | from schemas import SchemaField
5 | from configuration import CONFIGURATION
6 | 
7 | def process_datagen_prompt(fields: List[SchemaField]) -> str:
8 |     schema_instruction = {field.key: field.description for field in fields}
9 | 
10 |     field_string = f"""## Response Format
11 | Always respond with a valid JSON array of objects:
12 | [
13 | {json.dumps(schema_instruction, indent=2)},
14 | // Additional entries...
15 | ]
16 | """
17 |     return f"""
18 | You are an expert Question-Answer generation assistant who has the skills of a polymath. Your task is to analyze content provided by the user and generate a comprehensive set of questions with detailed answers based on that content.
19 | 
20 | ## Core Instructions
21 | 
22 | 1. When presented with content, carefully analyze it to identify key concepts, important details, practical applications, and potential challenges or edge cases.
23 | 
24 | 2. Generate a diverse set of questions and answers that thoroughly cover the provided content. Your response must be in valid JSON format.
25 | 
26 | 3. Format code properly within JSON strings, using appropriate escape characters for special characters.
27 | 
28 | 4. Number of dataset rows must be {CONFIGURATION["rows_per_context"]}
29 | 
30 | {field_string}
31 | """
32 | 
33 | 
--------------------------------------------------------------------------------
/agents/evolution_agent/evolver.py:
--------------------------------------------------------------------------------
1 | import json
2 | import random
3 | 
4 | from agents.generation_agent import generation_agent
5 | from agents.evolution_agent.depth import createConstraintsPrompt, createDeepenPrompt, createConcretizingPrompt, createReasoningPrompt
6 | from agents.evolution_agent.breadth import createBreadthPrompt
7 | from configuration import CONFIGURATION
8 | 
9 | def evolve_dataset(dataset):
10 |     current_dataset = dataset
11 |     for i in range(CONFIGURATION["evolution_depth"]):
12 |         evolved_dataset = []
13 |         for dataset_row in current_dataset:
14 |             dataset_row = json.dumps([dataset_row])
15 |             evol_prompts = []
16 |             evol_prompts.append(createConstraintsPrompt(dataset_row))
17 |             evol_prompts.append(createDeepenPrompt(dataset_row))
18 |             evol_prompts.append(createConcretizingPrompt(dataset_row))
19 |             evol_prompts.append(createReasoningPrompt(dataset_row))
20 |             evol_prompts.append(createBreadthPrompt(dataset_row))
21 | 
22 |             selected_evol_prompt = random.choice(evol_prompts)
23 |             evolved_dataset_row = generation_agent(selected_evol_prompt, system_prompt="Always return the same schema as the input dataset no matter what so that it can be parsed later.")
24 |             evolved_dataset.extend(evolved_dataset_row)
25 |         dataset.extend(evolved_dataset)
26 |         current_dataset = evolved_dataset
27 |     return dataset
28 | 
29 | 
30 | 
31 | 
32 | 
33 | 
34 | 
35 | 
36 | 
--------------------------------------------------------------------------------
/agents/generation_agent.py:
--------------------------------------------------------------------------------
1 | import time
2 | import json
3 | from dotenv import load_dotenv
4 | from pydantic import ValidationError
5 | from openai import RateLimitError, OpenAIError
6 | from agents.client_initialization import openai_client
7 | 
8 | from schemas import DatasetRecords
9 | 
10 | load_dotenv()
11 | 
12 | def generation_agent(content, system_prompt, model="gpt-4.1-mini", retries=3, base_wait=2):
13 |     for attempt in range(retries):
14 |         try:
15 |             response = openai_client.responses.create(
16 |                 model=model,
17 |                 input=[
18 |                     {"role": "system", "content": system_prompt},
19 |                     {"role": "user", "content": content}
20 |                 ],
21 |                 temperature=0.2,
22 |             )
23 | 
24 |             raw_text = response.output_text.strip()
25 | 
26 |             if raw_text.startswith("```json"):
27 |                 raw_text = raw_text[len("```json"):].lstrip()
28 |             elif raw_text.startswith("```"):
29 |                 raw_text = raw_text[len("```"):].lstrip()
30 | 
31 |             if raw_text.endswith("```"):
32 |                 raw_text = raw_text[:-3].rstrip()
33 | 
34 |             parsed_json = json.loads(raw_text)
35 |             final_package = {"dataset": parsed_json}
36 |             validated = DatasetRecords(**final_package)
37 | 
38 |             return validated.dataset
39 | 
40 |         except json.JSONDecodeError as e:
41 |             print(f"[JSON Parse Error] {e}")
42 |             return []
43 | 
44 |         except ValidationError as e:
45 |             print(f"[Pydantic Validation Error] {e}")
46 |             return []
47 | 
48 |         except RateLimitError:
49 |             wait_time = base_wait * (2 ** attempt)
50 |             print(f"[Rate Limit] Retrying in {wait_time}s (Attempt {attempt + 1}/{retries})...")
51 |             time.sleep(wait_time)
52 | 
53 |         except OpenAIError as e:
54 |             print(f"[OpenAI Error] {e}")
55 |             return []
56 | 
57 |     print("[Rate limit Error] Exceeded retry attempts due to rate limiting.")
58 |     return []
59 | 
--------------------------------------------------------------------------------
/agents/evolution_agent/depth.py:
--------------------------------------------------------------------------------
1 | base_instruction = """I want you to act as a Dataset Row Evolver.
2 | You will receive a single JSON row (not the whole dataset), and your task is to modify this row in meaningful ways while keeping its schema exactly the same.
3 | You must NOT remove or rename any keys in the JSON structure.
4 | The number of rows is always 1 — treat this as a single data entry.
5 | Make sure the output stays valid JSON.
6 | 
7 | The transformation strategy is:
8 | {}
9 | 
10 | You may only modify field *values*, not keys. Keep changes small — around 10 to 20 words total.
11 | Avoid saying '#Input Dataset#', '#Modified Dataset#', 'original row', or 'evolved row' in your output."""
12 | 
13 | def createConstraintsPrompt(row_json_str):
14 |     prompt = base_instruction.format("Add constraints, clarifiers, or qualifiers to some of the field values — for example, change 'category': 'food' to 'category': 'food (perishable)' or 'priority': 'high' to 'priority': 'high and time-sensitive'.")
15 |     prompt += "\n\n#Input Dataset#:\n{}\n".format(row_json_str)
16 |     prompt += "#Modified Dataset#:\n"
17 |     return prompt
18 | 
19 | def createDeepenPrompt(row_json_str):
20 |     prompt = base_instruction.format("Make the content of some fields deeper or more layered — for example, change 'instruction': 'Write an essay about trees' to 'Write an essay about how deforestation impacts climate using real-world case studies'.")
21 |     prompt += "\n\n#Input Dataset#:\n{}\n".format(row_json_str)
22 |     prompt += "#Modified Dataset#:\n"
23 |     return prompt
24 | 
25 | def createConcretizingPrompt(row_json_str):
26 |     prompt = base_instruction.format("Replace vague or generic values with more specific ones. For instance, change 'topic': 'science' to 'topic': 'quantum physics' or 'audience': 'students' to 'audience': 'final-year computer science students'.")
27 |     prompt += "\n\n#Input Dataset#:\n{}\n".format(row_json_str)
28 |     prompt += "#Modified Dataset#:\n"
29 |     return prompt
30 | 
31 | def createReasoningPrompt(row_json_str):
32 |     prompt = base_instruction.format("Wherever applicable, increase the need for multi-step reasoning in the field values. For example, turn a simple 'question' into a multi-part one that requires combining facts or drawing inferences.")
33 |     prompt += "\n\n#Input Dataset#:\n{}\n".format(row_json_str)
34 |     prompt += "#Modified Dataset#:\n"
35 |     return prompt
36 | 
--------------------------------------------------------------------------------
/qdrant_setup.py:
--------------------------------------------------------------------------------
1 | import os
2 | import random
3 | from dotenv import load_dotenv
4 | from qdrant_client import QdrantClient, models
5 | 
6 | load_dotenv()
7 | 
8 | client = QdrantClient(url=os.getenv("QDRANT_URL"))
9 | collection_name = os.getenv("COLLECTION_NAME")
10 | model_name = os.getenv("EMBEDDING_MODEL")
11 | 
12 | if not client.collection_exists(collection_name=collection_name):
13 |     client.create_collection(
14 |         collection_name=collection_name,
15 |         vectors_config=models.VectorParams(size=384, distance=models.Distance.COSINE))
16 | 
17 | def retrieve_from_store(question: str, user_id: str, n_points: int = 3) -> str:
18 |     results = client.query_points(
19 |         collection_name=collection_name,
20 |         query=models.Document(text=question, model=model_name),
21 |         query_filter=models.Filter(
22 |             must=[
23 |                 models.FieldCondition(
24 |                     key="group_id",
25 |                     match=models.MatchValue(
26 |                         value=user_id,
27 |                     ),
28 |                 )
29 |             ]
30 |         ),
31 |         limit=n_points,
32 |     )
33 |     return results.points
34 | 
35 | def remove_data_from_store(user_id: str) -> str:
36 |     client.delete(
37 |         collection_name=collection_name,
38 |         points_selector=models.FilterSelector(
39 |             filter=models.Filter(
40 |                 must=[
41 |                     models.FieldCondition(
42 |                         key="group_id",
43 |                         match=models.MatchValue(
44 |                             value=user_id,
45 |                         ),
46 |                     )
47 |                 ]
48 |             )
49 |         )
50 |     )
51 | 
52 | def rag_pipeline_setup(user_id, documents):
53 |     client.upsert(
54 |         collection_name=collection_name,
55 |         points=[
56 |             models.PointStruct(
57 |                 id=idx,
58 |                 vector=models.Document(text=document["page_content"], model=model_name),
59 |                 payload={"group_id": user_id, "document": document},
60 |             )
61 |             for idx, document in enumerate(documents)
62 |         ],)
63 | 
64 | def select_random_chunk(documents):
65 |     if not documents:
66 |         return None, None
67 | 
68 |     idx = random.randint(0, len(documents) - 1)
69 |     selected_doc = documents[idx]
70 | 
71 |     content = f"filename:{selected_doc['filename']}\nPage_number:{selected_doc['page_number']}\nPage_Content: {selected_doc['page_content']}\n\n\n"
72 | 
73 |     return idx, content
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | import json
2 | import asyncio
3 | 
4 | from rich.console import Console
5 | from rich.panel import Panel
6 | from rich.prompt import Prompt
7 | from rich.table import Table
8 | from rich import box
9 | from pyfiglet import Figlet
10 | from datetime import datetime
11 | 
12 | from workflow import generate_dataset_schema, generate_full_dataset, process_datagen_prompt
13 | 
14 | console = Console()
15 | 
16 | def render_banner(title: str = "Thesius.ai", subtitle: str = "Document to Dataset Generator"):
17 |     figlet = Figlet(font="banner3-D", width=200)
18 |     ascii_art = figlet.renderText(title)
19 |     panel = Panel.fit(
20 |         f"[bold cyan]{ascii_art}[/bold cyan]\n[green]{subtitle}[/green]",
21 |         border_style="magenta",
22 |         padding=(1, 4),
23 |         title="[bold yellow]WELCOME[/bold yellow]",
24 |         box=box.DOUBLE
25 |     )
26 |     console.print(panel)
27 | 
28 | def render_schema(schema_obj):
29 |     if not hasattr(schema_obj, 'generated_schema'):
30 |         print_section("SCHEMA GENERATION", str(schema_obj))
31 |         return
32 | 
33 |     table = Table(title=None, box=box.ASCII, header_style="bold magenta")
34 |     table.add_column("Field", style="cyan", no_wrap=True)
35 |     table.add_column("Type", style="green")
36 |     table.add_column("Description", style="white")
37 | 
38 |     for field in schema_obj.generated_schema:
39 |         field_type = str(field.type.value if hasattr(field.type, "value") else field.type)
40 |         table.add_row(field.key, field_type, field.description)
41 | 
42 |     console.print("\n")
43 |     print_section("SCHEMA GENERATION")
44 |     console.print(table)
45 | 
46 | def print_section(title: str, content: str = "", width: int = 100):
47 |     title_bar = f" {title} ".center(width, "=")
48 |     console.print(f"\n[bold cyan]{title_bar}[/bold cyan]")
49 |     if content:
50 |         console.print(content)
51 | 
52 | def status(message: str, style="bold white"):
53 |     console.print(f"[{style}]• {message}[/{style}]")
54 | 
55 | def success(message: str):
56 |     console.print(f"[bold green]{message}[/bold green]")
57 | 
58 | def warning(message: str):
59 |     console.print(f"[bold yellow]{message}[/bold yellow]")
60 | 
61 | def error(message: str):
62 |     console.print(f"[bold red]{message}[/bold red]")
63 | 
64 | def get_user_feedback(fields, base_query):
65 |     while True:
66 |         render_schema(fields)
67 |         status("Do you want to proceed with this schema?")
68 |         feedback = input("Type your feedback or 'continue' to proceed: ").strip()
69 | 
70 |         if not feedback:
71 |             warning("Input was empty. Please provide feedback or type 'continue'.")
72 |             continue
73 | 
74 |         if feedback.lower() == "continue":
75 |             return "continue", base_query
76 |         else:
77 |             updated_query = f"""Parent query: {base_query}
78 | 
79 | Previous generated schema:
80 | {fields.generated_schema}
81 | 
82 | Suggestion from the user:
83 | {feedback}
84 | """
85 |             return feedback, updated_query
86 | 
87 | async def run_and_save(directory_path, system_prompt):
88 |     status("Generating dataset rows...")
89 |     dataset_rows = None
90 | 
91 |     async for message in generate_full_dataset(directory_path, system_prompt):
92 |         if message.startswith("data:__DONE__:"):
93 |             data_str = message[len("data:__DONE__:"):]
94 |             data_json = json.loads(data_str)
95 |             dataset_rows = data_json['rows']
96 |             break
97 |         else:
98 |             print(message.strip())
99 | 
100 |     if dataset_rows:
101 |         # Create filename based on current date & time
102 |         timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
103 |         output_filename = f"final_dataset_{timestamp}.json"
104 | 
105 |         with open(output_filename, "w", encoding="utf-8") as f:
106 |             json.dump(dataset_rows, f, indent=2, ensure_ascii=False)
107 | 
108 |         success(f"Dataset saved to '{output_filename}'.")
109 |     else:
110 |         error("Dataset generation failed or returned empty.")
111 | 
112 | def main():
113 |     render_banner()
114 | 
115 |     directory_path = Prompt.ask("[bold yellow]Enter the directory path[/bold yellow]").strip()
116 |     base_query = Prompt.ask("[bold yellow]Enter additional instruction and information about the directory source[/bold yellow]").strip()
117 | 
118 |     current_query = base_query
119 |     feedback = None
120 | 
121 |     # Feedback loop
122 |     while feedback != "continue":
123 |         fields = generate_dataset_schema(current_query)
124 |         feedback, current_query = get_user_feedback(fields, base_query)
125 | 
126 |     # Generate dataset
127 |     system_prompt = process_datagen_prompt(fields.generated_schema)
128 |     asyncio.run(run_and_save(directory_path, system_prompt))
129 | 
130 | if __name__ == "__main__":
131 |     main()
132 | 
133 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 |
3 |