├── agents ├── __init__.py ├── evolution_agent │ ├── __init__.py │ ├── breadth.py │ ├── evolver.py │ └── depth.py ├── client_initialization.py ├── schema_agent.py └── generation_agent.py ├── assets ├── logo.png ├── Local_File.png ├── local_file_dataset.gif └── local_file_dataset.mp4 ├── .gitignore ├── .env.example ├── configuration.py ├── requirements.txt ├── docker-compose.yaml ├── prompts.py ├── schemas.py ├── LICENSE ├── utils.py ├── qdrant_setup.py ├── main.py ├── README.md └── workflow.py /agents/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /agents/evolution_agent/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oqura-ai/local-datagen-cli/HEAD/assets/logo.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .venv 2 | venv 3 | qdrant_data 4 | .env 5 | **/__pycache__ 6 | final_dataset.json 7 | resource -------------------------------------------------------------------------------- /assets/Local_File.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oqura-ai/local-datagen-cli/HEAD/assets/Local_File.png -------------------------------------------------------------------------------- /assets/local_file_dataset.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oqura-ai/local-datagen-cli/HEAD/assets/local_file_dataset.gif -------------------------------------------------------------------------------- /assets/local_file_dataset.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oqura-ai/local-datagen-cli/HEAD/assets/local_file_dataset.mp4 -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | MISTRAL_API_KEY= 2 | OPENAI_API_KEY= 3 | QDRANT_URL=http://localhost:6333 4 | COLLECTION_NAME=knowledge_base 5 | EMBEDDING_MODEL=BAAI/bge-small-en-v1.5 -------------------------------------------------------------------------------- /agents/client_initialization.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | from openai import OpenAI 4 | from google import genai 5 | 6 | load_dotenv() 7 | 8 | openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) -------------------------------------------------------------------------------- /configuration.py: -------------------------------------------------------------------------------- 1 | CONFIGURATION = { 2 | "rows_per_context": 5, # Number of QAs or rows generated per chunk 3 | "evolution_depth": 1, # How much transformation/evolution to apply (1 = minimal, 3 = very complex) 4 | } 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | python-dotenv 2 | qdrant-client[fastembed] 3 | openai 4 | 
mistralai 5 | pymupdf 6 | python-pptx 7 | python-docx 8 | psycopg2-binary 9 | pillow 10 | pydantic 11 | python-multipart 12 | rich 13 | pyfiglet 14 | google-genai 15 | pdfplumber -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | qdrant: 3 | image: qdrant/qdrant:latest 4 | restart: always 5 | container_name: qdrant_localgen 6 | ports: 7 | - 6333:6333 8 | - 6334:6334 9 | expose: 10 | - 6333 11 | - 6334 12 | - 6335 13 | configs: 14 | - source: qdrant_config 15 | target: /qdrant/config/production.yaml 16 | volumes: 17 | - ./qdrant_data:/qdrant/storage 18 | 19 | configs: 20 | qdrant_config: 21 | content: | 22 | log_level: INFO -------------------------------------------------------------------------------- /prompts.py: -------------------------------------------------------------------------------- 1 | schema_generate_prompt = """You are an autonomous schema-generating agent designed to construct data schemas for fine-tuning or training LLMs on user-specified tasks. Your job is to analyze the user's task description and output a structured dataset schema definition.\n\n Ensure each field in the schema is useful for training and fine-tuning, well-typed, and annotated. Focus on tasks involving natural language input, structured context (like database schemas), and model output (like SQL queries, code, responses, etc.).""" -------------------------------------------------------------------------------- /agents/schema_agent.py: -------------------------------------------------------------------------------- 1 | import json 2 | from dotenv import load_dotenv 3 | 4 | from schemas import DatasetSchema 5 | from agents.client_initialization import openai_client 6 | from prompts import schema_generate_prompt 7 | 8 | load_dotenv() 9 | 10 | def generate_dataset_schema( 11 | user_concept: str, model: str = "gpt-4.1-mini" 12 | ) -> DatasetSchema: 13 | response = openai_client.responses.parse( 14 | model=model, 15 | input=[ 16 | {"role": "system", "content": schema_generate_prompt}, 17 | {"role": "user", "content": user_concept}, 18 | ], 19 | text_format=DatasetSchema, 20 | ) 21 | 22 | result = response.output_parsed 23 | return result -------------------------------------------------------------------------------- /agents/evolution_agent/breadth.py: -------------------------------------------------------------------------------- 1 | base_instruction = """I want you to act as a Dataset Row Creator. 2 | You will receive a single JSON row (not a full dataset), and your goal is to create a brand-new data row that belongs to the **same domain** as the input. 3 | The new row must: 4 | - Follow the **same JSON schema** (same keys, structure). 5 | - Be of **similar complexity and length**. 6 | - Introduce **different but related content** (not a simple rewording). 7 | - Be fully understandable and valid as a training data row. 8 | 9 | Do NOT use terms like '#Input Row#', '#New Row#', 'original row', or 'created row' in your response. 
10 | """ 11 | 12 | def createBreadthPrompt(row_json_str): 13 | prompt = base_instruction 14 | prompt += "\n#Input Row#:\n{}\n".format(row_json_str) 15 | prompt += "#New Row#:\n" 16 | return prompt 17 | -------------------------------------------------------------------------------- /schemas.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, Field 2 | from typing import List, Literal, Dict, Any 3 | from enum import Enum 4 | 5 | class QAItem(BaseModel): 6 | id: int 7 | question: str 8 | answer: str 9 | difficulty: Literal["basic", "intermediate", "advanced"] 10 | type: Literal["theoretical", "practical", "code", "application"] 11 | 12 | class QAList(BaseModel): 13 | items:List[QAItem] 14 | 15 | class FieldType(str, Enum): 16 | string = "string" 17 | number = "number" 18 | array = "array" 19 | # object = "object" 20 | boolean = "boolean" 21 | # date = "date" 22 | 23 | class SchemaField(BaseModel): 24 | key: str = Field(..., description="The unique identifier for the field") 25 | type: FieldType = Field(..., description="The data type of the field") 26 | description: str = Field(..., description="Some descriptive information for the field") 27 | 28 | class DatasetSchema(BaseModel): 29 | generated_schema: list[SchemaField] 30 | 31 | class DatasetRecords(BaseModel): 32 | dataset:List[Dict[str, Any]] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Swaraj Biswal 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import List 3 | 4 | from schemas import SchemaField 5 | from configuration import CONFIGURATION 6 | 7 | def process_datagen_prompt(fields: List[SchemaField]) -> str: 8 | schema_instruction = {field.key: field.description for field in fields} 9 | 10 | field_string = f"""## Response Format 11 | Always respond with a valid JSON array of objects: 12 | [ 13 | {json.dumps(schema_instruction, indent=2)}, 14 | // Additional entries... 15 | ] 16 | """ 17 | return f""" 18 | You are an expert Question-Answer generation assistant who has the skills of a polymath. 
Your task is to analyze content provided by the user and generate a comprehensive set of questions with detailed answers based on that content. 19 | 20 | ## Core Instructions 21 | 22 | 1. When presented with content, carefully analyze it to identify key concepts, important details, practical applications, and potential challenges or edge cases. 23 | 24 | 2. Generate a diverse set of questions and answers that thoroughly cover the provided content. Your response must be in valid JSON format. 25 | 26 | 3. Format code properly within JSON strings, using appropriate escape characters for special characters. 27 | 28 | 4. Number of dataset rows must be {CONFIGURATION["rows_per_context"]} 29 | 30 | {field_string} 31 | """ 32 | 33 | -------------------------------------------------------------------------------- /agents/evolution_agent/evolver.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | 4 | from agents.generation_agent import generation_agent 5 | from agents.evolution_agent.depth import createConstraintsPrompt, createDeepenPrompt, createConcretizingPrompt, createReasoningPrompt 6 | from agents.evolution_agent.breadth import createBreadthPrompt 7 | from configuration import CONFIGURATION 8 | 9 | def evolve_dataset(dataset): 10 | current_dataset = dataset 11 | for i in range(CONFIGURATION["evolution_depth"]): 12 | evolved_dataset = [] 13 | for dataset_row in current_dataset: 14 | dataset_row = json.dumps([dataset_row]) 15 | evol_prompts = [] 16 | evol_prompts.append(createConstraintsPrompt(dataset_row)) 17 | evol_prompts.append(createDeepenPrompt(dataset_row)) 18 | evol_prompts.append(createConcretizingPrompt(dataset_row)) 19 | evol_prompts.append(createReasoningPrompt(dataset_row)) 20 | evol_prompts.append(createBreadthPrompt(dataset_row)) 21 | 22 | selected_evol_prompt = random.choice(evol_prompts) 23 | evolved_dataset_row = generation_agent(selected_evol_prompt, system_prompt="Always return the same schema as the input dataset no matter what so that it can be parsed later.") 24 | evolved_dataset.extend(evolved_dataset_row) 25 | dataset.extend(evolved_dataset) 26 | current_dataset = evolved_dataset 27 | return dataset 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /agents/generation_agent.py: -------------------------------------------------------------------------------- 1 | import time 2 | import json 3 | from dotenv import load_dotenv 4 | from pydantic import ValidationError 5 | from openai import RateLimitError, OpenAIError 6 | from agents.client_initialization import openai_client 7 | 8 | from schemas import DatasetRecords 9 | 10 | load_dotenv() 11 | 12 | def generation_agent(content, system_prompt, model="gpt-4.1-mini", retries=3, base_wait=2): 13 | for attempt in range(retries): 14 | try: 15 | response = openai_client.responses.create( 16 | model=model, 17 | input=[ 18 | {"role": "system", "content": system_prompt}, 19 | {"role": "user", "content": content} 20 | ], 21 | temperature=0.2, 22 | ) 23 | 24 | raw_text = response.output_text.strip() 25 | 26 | if raw_text.startswith("```json"): 27 | raw_text = raw_text[len("```json"):].lstrip() 28 | elif raw_text.startswith("```"): 29 | raw_text = raw_text[len("```"):].lstrip() 30 | 31 | if raw_text.endswith("```"): 32 | raw_text = raw_text[:-3].rstrip() 33 | 34 | parsed_json = json.loads(raw_text) 35 | final_package = {"dataset": parsed_json} 36 | validated = DatasetRecords(**final_package) 
37 | 38 | return validated.dataset 39 | 40 | except json.JSONDecodeError as e: 41 | print(f"[JSON Parse Error] {e}") 42 | return [] 43 | 44 | except ValidationError as e: 45 | print(f"[Pydantic Validation Error] {e}") 46 | return [] 47 | 48 | except RateLimitError: 49 | wait_time = base_wait * (2 ** attempt) 50 | print(f"[Rate Limit] Retrying in {wait_time}s (Attempt {attempt + 1}/{retries})...") 51 | time.sleep(wait_time) 52 | 53 | except OpenAIError as e: 54 | print(f"[OpenAI Error] {e}") 55 | return [] 56 | 57 | print("[Rate limit Error] Exceeded retry attempts due to rate limiting.") 58 | return [] 59 | -------------------------------------------------------------------------------- /agents/evolution_agent/depth.py: -------------------------------------------------------------------------------- 1 | base_instruction = """I want you to act as a Dataset Row Evolver. 2 | You will receive a single JSON row (not the whole dataset), and your task is to modify this row in meaningful ways while keeping its schema exactly the same. 3 | You must NOT remove or rename any keys in the JSON structure. 4 | The number of rows is always 1 — treat this as a single data entry. 5 | Make sure the output stays valid JSON. 6 | 7 | The transformation strategy is: 8 | {} 9 | 10 | You may only modify field *values*, not keys. Keep changes small — around 10 to 20 words total. 11 | Avoid saying '#Input Dataset#', '#Modified Dataset#', 'original row', or 'evolved row' in your output.""" 12 | 13 | def createConstraintsPrompt(row_json_str): 14 | prompt = base_instruction.format("Add constraints, clarifiers, or qualifiers to some of the field values — for example, change 'category': 'food' to 'category': 'food (perishable)' or 'priority': 'high' to 'priority': 'high and time-sensitive'.") 15 | prompt += "\n\n#Input Dataset#:\n{}\n".format(row_json_str) 16 | prompt += "#Modified Dataset#:\n" 17 | return prompt 18 | 19 | def createDeepenPrompt(row_json_str): 20 | prompt = base_instruction.format("Make the content of some fields deeper or more layered — for example, change 'instruction': 'Write an essay about trees' to 'Write an essay about how deforestation impacts climate using real-world case studies'.") 21 | prompt += "\n\n#Input Dataset#:\n{}\n".format(row_json_str) 22 | prompt += "#Modified Dataset#:\n" 23 | return prompt 24 | 25 | def createConcretizingPrompt(row_json_str): 26 | prompt = base_instruction.format("Replace vague or generic values with more specific ones. For instance, change 'topic': 'science' to 'topic': 'quantum physics' or 'audience': 'students' to 'audience': 'final-year computer science students'.") 27 | prompt += "\n\n#Input Dataset#:\n{}\n".format(row_json_str) 28 | prompt += "#Modified Dataset#:\n" 29 | return prompt 30 | 31 | def createReasoningPrompt(row_json_str): 32 | prompt = base_instruction.format("Wherever applicable, increase the need for multi-step reasoning in the field values. 
For example, turn a simple 'question' into a multi-part one that requires combining facts or drawing inferences.") 33 | prompt += "\n\n#Input Dataset#:\n{}\n".format(row_json_str) 34 | prompt += "#Modified Dataset#:\n" 35 | return prompt 36 | -------------------------------------------------------------------------------- /qdrant_setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | from dotenv import load_dotenv 4 | from qdrant_client import QdrantClient, models 5 | 6 | load_dotenv() 7 | 8 | client = QdrantClient(url=os.getenv("QDRANT_URL")) 9 | collection_name = os.getenv("COLLECTION_NAME") 10 | model_name = os.getenv("EMBEDDING_MODEL") 11 | 12 | if not client.collection_exists(collection_name=collection_name): 13 | client.create_collection( 14 | collection_name=collection_name, 15 | vectors_config=models.VectorParams(size=384, distance=models.Distance.COSINE)) 16 | 17 | def retrieve_from_store(question: str, user_id: str, n_points: int = 3): 18 | results = client.query_points( 19 | collection_name=collection_name, 20 | query=models.Document(text=question, model=model_name), 21 | query_filter=models.Filter( 22 | must=[ 23 | models.FieldCondition( 24 | key="group_id", 25 | match=models.MatchValue( 26 | value=user_id, 27 | ), 28 | ) 29 | ] 30 | ), 31 | limit=n_points, 32 | ) 33 | return results.points 34 | 35 | def remove_data_from_store(user_id: str): 36 | client.delete( 37 | collection_name=collection_name, 38 | points_selector=models.FilterSelector( 39 | filter=models.Filter( 40 | must=[ 41 | models.FieldCondition( 42 | key="group_id", 43 | match=models.MatchValue( 44 | value=user_id, 45 | ), 46 | ) 47 | ] 48 | ) 49 | ) 50 | ) 51 | 52 | def rag_pipeline_setup(user_id, documents): 53 | client.upsert( 54 | collection_name=collection_name, 55 | points=[ 56 | models.PointStruct( 57 | id=idx, 58 | vector=models.Document(text=document["page_content"], model=model_name), 59 | payload={"group_id": user_id, "document": document}, 60 | ) 61 | for idx, document in enumerate(documents) 62 | ],) 63 | 64 | def select_random_chunk(documents): 65 | if not documents: 66 | return None, None 67 | 68 | idx = random.randint(0, len(documents) - 1) 69 | selected_doc = documents[idx] 70 | 71 | content = f"filename:{selected_doc['filename']}\nPage_number:{selected_doc['page_number']}\nPage_Content: {selected_doc['page_content']}\n\n\n" 72 | 73 | return idx, content -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import json 2 | import asyncio 3 | 4 | from rich.console import Console 5 | from rich.panel import Panel 6 | from rich.prompt import Prompt 7 | from rich.table import Table 8 | from rich import box 9 | from pyfiglet import Figlet 10 | from datetime import datetime 11 | 12 | from workflow import generate_dataset_schema, generate_full_dataset, process_datagen_prompt 13 | 14 | console = Console() 15 | 16 | def render_banner(title: str = "Thesius.ai", subtitle: str = "Document to Dataset Generator"): 17 | figlet = Figlet(font="banner3-D", width=200) 18 | ascii_art = figlet.renderText(title) 19 | panel = Panel.fit( 20 | f"[bold cyan]{ascii_art}[/bold cyan]\n[green]{subtitle}[/green]", 21 | border_style="magenta", 22 | padding=(1, 4), 23 | title="[bold yellow]WELCOME[/bold yellow]", 24 | box=box.DOUBLE 25 | ) 26 | console.print(panel) 27 | 28 | def render_schema(schema_obj): 29 | if not 
hasattr(schema_obj, 'generated_schema'): 30 | print_section("SCHEMA GENERATION", str(schema_obj)) 31 | return 32 | 33 | table = Table(title=None, box=box.ASCII, header_style="bold magenta") 34 | table.add_column("Field", style="cyan", no_wrap=True) 35 | table.add_column("Type", style="green") 36 | table.add_column("Description", style="white") 37 | 38 | for field in schema_obj.generated_schema: 39 | field_type = str(field.type.value if hasattr(field.type, "value") else field.type) 40 | table.add_row(field.key, field_type, field.description) 41 | 42 | console.print("\n") 43 | print_section("SCHEMA GENERATION") 44 | console.print(table) 45 | 46 | def print_section(title: str, content: str = "", width: int = 100): 47 | title_bar = f" {title} ".center(width, "=") 48 | console.print(f"\n[bold cyan]{title_bar}[/bold cyan]") 49 | if content: 50 | console.print(content) 51 | 52 | def status(message: str, style="bold white"): 53 | console.print(f"[{style}]• {message}[/{style}]") 54 | 55 | def success(message: str): 56 | console.print(f"[bold green]{message}[/bold green]") 57 | 58 | def warning(message: str): 59 | console.print(f"[bold yellow]{message}[/bold yellow]") 60 | 61 | def error(message: str): 62 | console.print(f"[bold red]{message}[/bold red]") 63 | 64 | def get_user_feedback(fields, base_query): 65 | while True: 66 | render_schema(fields) 67 | status("Do you want to proceed with this schema?") 68 | feedback = input("Type your feedback or 'continue' to proceed: ").strip() 69 | 70 | if not feedback: 71 | warning("Input was empty. Please provide feedback or type 'continue'.") 72 | continue 73 | 74 | if feedback.lower() == "continue": 75 | return "continue", base_query 76 | else: 77 | updated_query = f"""Parent query: {base_query} 78 | 79 | Previous generated schema: 80 | {fields.generated_schema} 81 | 82 | Suggestion from the user: 83 | {feedback} 84 | """ 85 | return feedback, updated_query 86 | 87 | async def run_and_save(directory_path, system_prompt): 88 | status("Generating dataset rows...") 89 | dataset_rows = None 90 | 91 | async for message in generate_full_dataset(directory_path, system_prompt): 92 | if message.startswith("data:__DONE__:"): 93 | data_str = message[len("data:__DONE__:"):] 94 | data_json = json.loads(data_str) 95 | dataset_rows = data_json['rows'] 96 | break 97 | else: 98 | print(message.strip()) 99 | 100 | if dataset_rows: 101 | # Create filename based on current date & time 102 | timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 103 | output_filename = f"final_dataset_{timestamp}.json" 104 | 105 | with open(output_filename, "w", encoding="utf-8") as f: 106 | json.dump(dataset_rows, f, indent=2, ensure_ascii=False) 107 | 108 | success(f"Dataset saved to '{output_filename}'.") 109 | else: 110 | error("Dataset generation failed or returned empty.") 111 | 112 | def main(): 113 | render_banner() 114 | 115 | directory_path = Prompt.ask("[bold yellow]Enter the directory path[/bold yellow]").strip() 116 | base_query = Prompt.ask("[bold yellow]Enter additional instruction and information about the directory source[/bold yellow]").strip() 117 | 118 | current_query = base_query 119 | feedback = None 120 | 121 | # Feedback loop 122 | while feedback != "continue": 123 | fields = generate_dataset_schema(current_query) 124 | feedback, current_query = get_user_feedback(fields, base_query) 125 | 126 | # Generate dataset 127 | system_prompt = process_datagen_prompt(fields.generated_schema) 128 | asyncio.run(run_and_save(directory_path, system_prompt)) 129 | 130 | if __name__ 
== "__main__": 131 | main() 132 | 133 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | [logo: Oqura.ai] 3 | 4 | 5 | [badges: GitHub Stars, License, Last Commit, Python Version, Contributors] 10 | 

11 | 12 | 13 | ## Overview 14 | 15 | Oqura's local-datagen-cli is a terminal tool for generating structured datasets from local files like PDFs, Word docs, images, and text. You point it at your files and describe the kind of dataset you want. It extracts the content, uses semantic search to understand and gather relevant context, applies your instructions through a generated schema, and outputs clean, structured data. Perfect for converting raw or unstructured local documents into ready-to-use datasets for training, analysis, or experimentation, all without manual formatting. 16 | 17 | 18 | ## How It Works 19 | 20 | - takes the path to a local directory containing any of the supported file types (PDF, DOCX, JPG, TXT, etc.) 21 | - extracts text from each document 22 | - splits the content page-wise into smaller chunks 23 | - randomly selects a chunk to use as a reference 24 | - runs a semantic similarity search using Qdrant to find related chunks 25 | - gathers similar chunks to build a context window 26 | - formats the gathered context cleanly 27 | - generates structured data using an instruction query and generated schema 28 | - evolves and improves the dataset iteratively 29 | - combines generated samples into a complete dataset 30 | - exports the final dataset as a JSON file via the terminal 31 | 32 | 33 | 34 | ## Workflow 35 | 36 | This diagram shows how the tool takes a local file and an instruction, extracts and understands the content, and turns it into a structured dataset. 37 | 38 | ![Local File Dataset Workflow](./assets/Local_File.png) 39 | 40 | 41 | 42 | ## Getting Started 43 | 44 | Follow these steps to set up and run the project locally. 45 | 46 | ### Prerequisite: Install `uv` 47 | 48 | `uv` is required to manage the virtual environment and dependencies. 49 | 50 | You can download it from the official [uv GitHub repository](https://github.com/astral-sh/uv), which includes platform-specific installation instructions. 51 | 52 | ### 1. Clone the Repository 53 | 54 | ```bash 55 | git clone https://github.com/Oqura-ai/local-datagen-cli.git 56 | cd local-datagen-cli 57 | ``` 58 | 59 | ### 2. Create a Virtual Environment 60 | 61 | Use `uv` to create a virtual environment: 62 | 63 | ```bash 64 | uv venv 65 | ``` 66 | 67 | ### 3. Activate the Virtual Environment 68 | 69 | Activate the environment depending on your OS: 70 | 71 | **Windows:** 72 | ```bash 73 | .venv\Scripts\activate 74 | ``` 75 | 76 | **macOS/Linux:** 77 | ```bash 78 | source .venv/bin/activate 79 | ``` 80 | 81 | ### 4. Set Up Environment Variables 82 | 83 | Copy the example `.env` file and add your API keys: 84 | 85 | ```bash 86 | cp .env.example .env 87 | ``` 88 | 89 | Open the `.env` file in a text editor and fill in the required fields: 90 | 91 | ``` 92 | OPENAI_API_KEY=your_openai_api_key_here 93 | MISTRAL_API_KEY=your_mistral_api_key_here 94 | 95 | # defaults 96 | QDRANT_URL=http://localhost:6333 97 | COLLECTION_NAME=knowledge_base 98 | EMBEDDING_MODEL=BAAI/bge-small-en-v1.5 99 | ``` 100 | 101 | These keys are essential for the application to work correctly. 102 | 103 | ### 5. Install Dependencies 104 | 105 | Install required packages using: 106 | 107 | ```bash 108 | uv pip install -r requirements.txt 109 | ``` 110 | 111 | ### 6. Set Up Docker for the Qdrant Vector DB 112 | 113 | Make sure you have Docker and Docker Compose installed. Then start the required services (e.g., Qdrant) using: 114 | 115 | ```bash 116 | docker-compose up --build 117 | ``` 118 | 119 | This will spin up the Qdrant service; keep it running while you use the tool. 
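Before moving on, you can optionally confirm that the Qdrant container is reachable. The snippet below is not part of the repository; it is a minimal sketch that reuses the already-installed `qdrant-client` package and assumes the default `QDRANT_URL` from `.env.example`:

```python
# check_qdrant.py (hypothetical helper, not included in the repo)
import os

from dotenv import load_dotenv
from qdrant_client import QdrantClient

load_dotenv()

# Fall back to the same default URL that .env.example ships with
client = QdrantClient(url=os.getenv("QDRANT_URL", "http://localhost:6333"))

# get_collections() makes a real round trip to the server and raises if Qdrant is unreachable
collections = client.get_collections()
print("Qdrant is up. Existing collections:", [c.name for c in collections.collections])
```

If this prints without an error, you are good to go; the `knowledge_base` collection itself is created automatically the first time `qdrant_setup.py` is imported.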
120 | 121 | ### 7. Run the Application 122 | 123 | Once the environment and services are ready, start the application: 124 | 125 | ```bash 126 | python main.py 127 | ``` 128 | 129 | You're all set! The application will guide you through the dataset creation process step by step, and the final dataset will be saved in the project root as a timestamped `final_dataset_<timestamp>.json` file. 130 | 131 | ### Optional: `configuration.py` 132 | 133 | You can customize how the tool behaves using the `configuration.py` file. It lets you adjust two parameters: 134 | 135 | ```python 136 | CONFIGURATION = { 137 | "rows_per_context": 5, # Number of QAs or rows generated per chunk 138 | "evolution_depth": 1, # How much transformation/evolution to apply (1 = minimal, 3 = very complex) 139 | } 140 | ``` 141 | 142 | ## Authors 143 | 144 | - [Swaraj Biswal](https://github.com/SWARAJ-42) 145 | - [Swadhin Biswal](https://github.com/swadhin505) 146 | 147 | 148 | ## Contributing 149 | 150 | If something here could be improved, please open an issue or submit a pull request. 151 | 152 | ## License 153 | 154 | This project is licensed under the MIT License. See the `LICENSE` file for more details. 155 | 156 | -------------------------------------------------------------------------------- /workflow.py: -------------------------------------------------------------------------------- 1 | import json 2 | import base64 3 | import io 4 | import pymupdf 5 | import fitz 6 | import os 7 | from pptx import Presentation 8 | from PIL import Image 9 | from docx import Document 10 | from mistralai import Mistral 11 | import pdfplumber 12 | import asyncio 13 | 14 | from qdrant_setup import * 15 | from agents.generation_agent import generation_agent 16 | from agents.schema_agent import generate_dataset_schema 17 | from agents.evolution_agent.evolver import evolve_dataset 18 | from utils import process_datagen_prompt 19 | 20 | 21 | client = Mistral(api_key=os.getenv("MISTRAL_API_KEY")) 22 | 23 | def encode_pdf(pdf_bytes: bytes): 24 | """Encode PDF bytes to a base64 string.""" 25 | try: 26 | return base64.b64encode(pdf_bytes).decode("utf-8") 27 | except Exception as e: 28 | print(f"Error encoding PDF to base64: {e}") 29 | return None 30 | 31 | def convert_to_pdf(file_bytes: bytes, filename: str): 32 | extension = filename.lower().split('.')[-1] 33 | 34 | if extension == "pdf": 35 | return file_bytes 36 | 37 | buffer = io.BytesIO() 38 | pdf = fitz.open() 39 | 40 | if extension in {"jpg", "jpeg", "png", "gif", "webp", "bmp"}: 41 | img = Image.open(io.BytesIO(file_bytes)).convert("RGB") 42 | img.save(buffer, format="PDF") 43 | return buffer.getvalue() 44 | 45 | elif extension in {"txt", "md"}: 46 | text = file_bytes.decode("utf-8", errors="ignore") 47 | lines = text.splitlines() 48 | pdf = fitz.open() 49 | max_lines_per_page = 40 # You can adjust this limit 50 | 51 | for i in range(0, len(lines), max_lines_per_page): 52 | page = pdf.new_page() 53 | chunk_text = "\n".join(lines[i:i + max_lines_per_page]) 54 | page.insert_text((72, 72), chunk_text) 55 | 56 | pdf.save(buffer) 57 | return buffer.getvalue() 58 | 59 | 60 | elif extension in {"doc", "docx"}: 61 | doc = Document(io.BytesIO(file_bytes)) 62 | pdf = fitz.open() 63 | paragraphs = [para.text for para in doc.paragraphs] 64 | max_paras_per_page = 20 # Adjustable limit 65 | 66 | for i in range(0, len(paragraphs), max_paras_per_page): 67 | page = pdf.new_page() 68 | chunk_text = "\n".join(paragraphs[i:i + max_paras_per_page]) 69 | page.insert_text((72, 72), chunk_text) 70 | 71 | pdf.save(buffer) 72 
| return buffer.getvalue() 73 | 74 | 75 | elif extension == "pptx": 76 | prs = Presentation(io.BytesIO(file_bytes)) 77 | for slide in prs.slides: 78 | text = "" 79 | for shape in slide.shapes: 80 | if hasattr(shape, "text"): 81 | text += shape.text + "\n" 82 | page = pdf.new_page() 83 | page.insert_text((72, 72), text) 84 | pdf.save(buffer) 85 | return buffer.getvalue() 86 | 87 | else: 88 | raise ValueError(f"Unsupported file type: {extension}") 89 | 90 | def process_page(idx, ocr_response=None): 91 | try: 92 | if ocr_response and hasattr(ocr_response, 'pages') and idx < len(ocr_response.pages): 93 | return ocr_response.pages[idx].markdown 94 | else: 95 | return f"Error: Page {idx + 1} not available in OCR response" 96 | except Exception as e: 97 | return f"Error processing page {idx + 1}: {e}" 98 | 99 | def extract_text_from_pdf(pdf_bytes: bytes, advanced: bool = True): 100 | extracted_text = [] 101 | 102 | if not advanced: 103 | # Simple text extraction using PyMuPDF (no OCR) 104 | try: 105 | with pymupdf.open(stream=pdf_bytes, filetype="pdf") as doc: 106 | for page in doc: 107 | text = page.get_text() 108 | extracted_text.append(text) 109 | return extracted_text 110 | except Exception as e: 111 | return [f"Error during simple text extraction: {e}"] 112 | 113 | # Advanced mode: Use Mistral OCR 114 | try: 115 | with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf: 116 | total_pages = len(pdf.pages) 117 | except Exception: 118 | try: 119 | with pymupdf.open(stream=pdf_bytes, filetype="pdf") as doc: 120 | total_pages = len(doc) 121 | except Exception as e: 122 | return [f"Error getting total pages: {e}"] 123 | 124 | encoded_pdf = encode_pdf(pdf_bytes) 125 | 126 | try: 127 | response = client.ocr.process( 128 | model="mistral-ocr-latest", 129 | document={ 130 | "type": "document_url", 131 | "document_url": f"data:application/pdf;base64,{encoded_pdf}" 132 | }, 133 | include_image_base64=True 134 | ) 135 | except Exception as e: 136 | return [f"Error during OCR processing: {e}"] 137 | 138 | for idx in range(total_pages): 139 | page_text = process_page(idx, ocr_response=response) 140 | extracted_text.append(page_text) 141 | 142 | return extracted_text 143 | 144 | def create_chunks(directory_path: str): 145 | file_paths = [ 146 | os.path.abspath(os.path.join(directory_path, f)) 147 | for f in os.listdir(directory_path) 148 | if os.path.isfile(os.path.join(directory_path, f)) 149 | ] 150 | 151 | Chunks = [] 152 | 153 | for idx, file_path in enumerate(file_paths): 154 | filename = os.path.basename(file_path) 155 | extension = filename.lower().split('.')[-1] 156 | 157 | with open(file_path, "rb") as f: 158 | file_bytes = f.read() 159 | 160 | converted_pdf_bytes = convert_to_pdf(file_bytes, filename) 161 | print(f"Processing file: {filename}") 162 | 163 | # Decide mode based on file type 164 | if extension in {"txt", "md"}: 165 | pages = extract_text_from_pdf(converted_pdf_bytes, advanced=False) 166 | else: 167 | pages = extract_text_from_pdf(converted_pdf_bytes, advanced=True) 168 | 169 | for page_number, page in enumerate(pages, start=1): 170 | Chunks.append({ 171 | "filename": filename, 172 | "page_number": page_number, 173 | "page_content": page 174 | }) 175 | 176 | return Chunks 177 | 178 | def create_records(page_data: str, system_prompt: str): 179 | try: 180 | datarecords = generation_agent(page_data, system_prompt=system_prompt) 181 | return datarecords 182 | except Exception as e: 183 | print(f"QA generation failed for a page: {str(e)}") 184 | return [] 185 | 186 | async def 
generate_full_dataset(directory_path: str, system_prompt: str): 187 | Chunks = create_chunks(directory_path) 188 | 189 | dataset = [] 190 | 191 | yield f"⚙️ Setting things up...\n\n" 192 | rag_pipeline_setup(user_id="test_user", documents=Chunks) 193 | 194 | Temp_Chunks = Chunks.copy() 195 | while len(Temp_Chunks) != 0: 196 | print(f"🧠 Generating your dataset - {int((len(Chunks)-len(Temp_Chunks))/len(Chunks) * 100)} % done") 197 | idx, current_chunk = select_random_chunk(Temp_Chunks) 198 | results = retrieve_from_store(current_chunk, user_id="test_user") 199 | 200 | # Context prep 201 | context = "\n\n\n\n".join(f"filename:{result.payload['document']['filename']}\nPage_number:{result.payload['document']['page_number']}\nPage_Content: {result.payload['document']['page_content']}" for result in results) 202 | 203 | page_qas = create_records(context, system_prompt) 204 | # evolve_dataset returns the original rows plus their evolved variants, so extend once below 205 | page_qas = evolve_dataset(page_qas) 206 | dataset.extend(page_qas) 207 | 208 | similar_chunks = [result.payload['document'] for result in results] 209 | 210 | for chunk in similar_chunks: 211 | if chunk in Temp_Chunks: 212 | Temp_Chunks.remove(chunk) 213 | 214 | remove_data_from_store(user_id="test_user") 215 | 216 | yield f"Dataset generation completed with {len(dataset)} rows!\n\n" 217 | yield f"data:__DONE__:{json.dumps({'rows': dataset})}\n\n" 218 | --------------------------------------------------------------------------------
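As a closing note, `generate_full_dataset` is an async generator that streams progress strings and finishes with a single `data:__DONE__:` message carrying the rows as JSON, which is exactly how `run_and_save` in `main.py` consumes it. The sketch below shows the same consumption pattern outside the CLI; the directory path and task description are illustrative placeholders, not values from the repository:

```python
# programmatic_run.py: a minimal sketch, assuming the same .env and Qdrant setup as main.py
import asyncio
import json

from workflow import generate_dataset_schema, generate_full_dataset, process_datagen_prompt

async def build_dataset(directory_path: str, task_description: str) -> list[dict]:
    # Generate a schema from the task description, then turn it into the generation system prompt
    schema = generate_dataset_schema(task_description)
    system_prompt = process_datagen_prompt(schema.generated_schema)

    rows: list[dict] = []
    async for message in generate_full_dataset(directory_path, system_prompt):
        if message.startswith("data:__DONE__:"):
            # The final message carries the whole dataset as JSON after the marker
            rows = json.loads(message[len("data:__DONE__:"):])["rows"]
        else:
            print(message.strip())  # progress updates
    return rows

if __name__ == "__main__":
    # "./resource" and the task description below are placeholders for illustration
    dataset = asyncio.run(build_dataset("./resource", "Q&A pairs about the provided documents"))
    print(f"Generated {len(dataset)} rows")
```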