├── step0_download_base_model.py
├── step3_merg.py
├── step4_test.py
├── step1_0_pdf_to_text.py
├── data.jsonl
├── step2_fine_tuning.py
├── step5_rag_it.py
├── Readme.md
├── step6_agentic.py
└── step1_1_generate_jsonl.py

/step0_download_base_model.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoModelForCausalLM, AutoTokenizer
2 | 
3 | model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
4 | model = AutoModelForCausalLM.from_pretrained(model_id)
5 | tokenizer = AutoTokenizer.from_pretrained(model_id)
6 | 
7 | model.save_pretrained("tinyllama-base")
8 | tokenizer.save_pretrained("tinyllama-base")
9 | print("✅ Base model downloaded to ./tinyllama-base")
10 | 
--------------------------------------------------------------------------------
/step3_merg.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoModelForCausalLM, AutoTokenizer
2 | from peft import PeftModel
3 | 
4 | # Load base model and LoRA adapter
5 | base_model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
6 | model = PeftModel.from_pretrained(base_model, "tinyllama-finetuned")
7 | 
8 | # Merge and unload
9 | model = model.merge_and_unload()
10 | 
11 | # Save merged model
12 | model.save_pretrained("tinyllama-merged")
13 | AutoTokenizer.from_pretrained("tinyllama-finetuned").save_pretrained("tinyllama-merged")
14 | print("✅ Merged model saved to ./tinyllama-merged")
15 | 
--------------------------------------------------------------------------------
/step4_test.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoTokenizer, AutoModelForCausalLM
2 | import torch
3 | 
4 | # Load merged model and tokenizer
5 | model = AutoModelForCausalLM.from_pretrained("tinyllama-merged", torch_dtype=torch.float16)
6 | tokenizer = AutoTokenizer.from_pretrained("tinyllama-merged")
7 | 
8 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
9 | model = model.to(device)
10 | 
11 | prompt = "### Instruction:\nWho is Hassan Habib?\n\n### Input:\n\n### Response:\n"
12 | inputs = tokenizer(prompt, return_tensors="pt").to(device)
13 | 
14 | model.eval()
15 | with torch.no_grad():
16 |     outputs = model.generate(**inputs, max_new_tokens=50)
17 |     print(tokenizer.decode(outputs[0], skip_special_tokens=True))
--------------------------------------------------------------------------------
/step1_0_pdf_to_text.py:
--------------------------------------------------------------------------------
1 | import fitz # PyMuPDF
2 | import os
3 | import re
4 | 
5 | 
6 | def clean_text(text):
7 |     # Replace hyphenated words at line breaks (e.g., knowl-\nedge → knowledge)
8 |     text = re.sub(r'-\n', '', text)
9 | 
10 |     # Join lines inside a paragraph (i.e., not between paragraphs)
11 |     text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)
--------------------------------------------------------------------------------
/Readme.md:
--------------------------------------------------------------------------------
26 | > 🚫 Do **NOT** install `bitsandbytes` if you're on Windows or using an AMD GPU
27 | 
28 | ---
29 | 
30 | ## 📁 Folder Structure
31 | 
32 | ```
33 | standard-llama-finetune/
34 | ├── data.jsonl ← training dataset (editable)
35 | ├── step0_download_base_model.py ← download base model from Hugging Face
36 | ├── step1_0_pdf_to_text.py ← convert PDF to text
37 | ├── step1_1_generate_jsonl.py ← generate JSONL file for fine-tuning data
38 | ├── step2_fine_tuning.py ← fine-tune TinyLlama with LoRA
39 | ├── step3_merg.py ← merge LoRA adapter into base model
40 | ├── step4_test.py ← sanity-check the merged model
41 | ```
42 | 
43 | ---
44 | 
45 | ## 🔽 0. Download the Base Model
46 | 
47 | ```bash
48 | python step0_download_base_model.py
49 | ```
50 | 
51 | This will save the model to `./tinyllama-base/`.
52 | 
53 | ---
54 | 
55 | ## 🔽 1.0. Convert PDF to Raw Text
56 | 
57 | ```bash
58 | python step1_0_pdf_to_text.py
59 | ```
60 | 
61 | This will save the extracted `.txt` file at the repo root.
62 | 
63 | ---
64 | 
65 | ## 🔽 1.1. Generate JSONL Files
66 | 
67 | ```bash
68 | python step1_1_generate_jsonl.py
69 | ```
70 | 
71 | This will save the generated `.jsonl` file at the repo root.
72 | 
73 | ---
74 | 
75 | ## 🧠 2. Fine-Tune with LoRA
76 | 
77 | ```bash
78 | python step2_fine_tuning.py
79 | ```
80 | 
81 | - Trains on `data.jsonl`
82 | - Runs for 30 epochs (you can adjust inside the script)
83 | - Saves LoRA adapter to `tinyllama-finetuned/`
84 | 
85 | ---
86 | 
87 | ## 🔗 3. Merge LoRA into Base Model
88 | 
89 | ```bash
90 | python step3_merg.py
91 | ```
92 | 
93 | - Merges the LoRA weights into the base model
94 | - Saves to `tinyllama-merged/` — ready for conversion or inference
95 | 
96 | ---
97 | 
98 | ## 🧪 4. Run Sanity Check (Optional)
99 | 
100 | ```bash
101 | python step4_test.py
102 | ```
103 | 
104 | Expected output:
105 | 
106 | ```
107 | Hassan Habib is a software engineering leader and the author of The Standard.
108 | ```
109 | 
110 | ---
111 | 
112 | ## 🦙 5. Convert to `.gguf` for llama.cpp (requires CMake and a cloned, built llama.cpp)
113 | 
114 | ```bash
115 | cd llama.cpp/
116 | python3 convert_hf_to_gguf.py ../tinyllama-merged --outfile standard-mini.gguf --outtype f16
117 | ```
118 | 
119 | Then run with:
120 | 
121 | ```bash
122 | ./build/bin/llama-cli --model standard-mini.gguf --prompt "Describe Orchestration services"
123 | 
124 | ```
125 | 
126 | For fine-tuned answers, use the same prompt template the model was trained on:
127 | 
128 | ```
129 | ### Instruction:
130 | Who is Hassan Habib?
131 | 
132 | ### Input:
133 | 
134 | ### Response:
135 | ```
136 | 
137 | ---
138 | 
139 | ## 📽️ Video Step-by-Step
140 | ### How to Run AI Offline w/ .NET
141 | https://www.youtube.com/watch?v=lc6lVCe0XHI&t=3s
142 | 
143 | ### How to Fine-Tune your AI Model
144 | https://www.youtube.com/watch?v=FQr7VrK5RRQ&t=1087s
145 | 
146 | ### How to Feed your Llama Model (TXT to JSONL)
147 | https://www.youtube.com/watch?v=YB9cVyjV9Bo
148 | 
149 | ### Make Your Offline AI Model Talk to Local SQL — Fully Private RAG with LLaMA + FAISS
150 | https://www.youtube.com/watch?v=3jFpLNglWBc&t=293s
151 | 
152 | ## 👨‍🏫 Author
153 | Built and tested by [Hassan Habib](https://github.com/hassanhabib), fine-tuned with ❤️ and terminal grit.
154 | 
155 | ---
156 | 
157 | Want to turn this into a video or GitHub tutorial? It’s built to teach.
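
## 🗄️ 6. Talk to Local SQL (Optional)

The repo also includes `step5_rag_it.py` and `step6_agentic.py`, which are walked through in the videos above. As a rough sketch of what the agentic demo expects (these values are hard-coded in `step6_agentic.py`, so adjust them for your machine): ODBC Driver 17 for SQL Server, a local `SchoolDb` database (the connection string points at a server named `BIGB`), and a Mistral 7B Instruct `.gguf` under `models/`.

```bash
pip install pyodbc llama-cpp-python
python step6_agentic.py
```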
158 | 
--------------------------------------------------------------------------------
/step6_agentic.py:
--------------------------------------------------------------------------------
1 | import pyodbc
2 | from llama_cpp import Llama
3 | import re
4 | 
5 | # ------------------------
6 | # 🔍 Get DB schema
7 | # ------------------------
8 | def get_db_schema(cursor):
9 |     schema = ""
10 | 
11 |     cursor.execute("""
12 |         SELECT TABLE_NAME
13 |         FROM INFORMATION_SCHEMA.TABLES
14 |         WHERE TABLE_TYPE = 'BASE TABLE' AND TABLE_CATALOG = DB_NAME()
15 |     """)
16 |     tables = [row[0] for row in cursor.fetchall()]
17 | 
18 |     for table in tables:
19 |         cursor.execute(f"""
20 |             SELECT COLUMN_NAME, DATA_TYPE
21 |             FROM INFORMATION_SCHEMA.COLUMNS
22 |             WHERE TABLE_NAME = '{table}'
23 |         """)
24 |         columns = cursor.fetchall()
25 |         schema += f"Table: {table}\n"
26 |         for column_name, data_type in columns:
27 |             schema += f"- {column_name} ({data_type})\n"
28 |         schema += "\n"
29 |     return schema.strip()
30 | 
31 | # ------------------------
32 | # 🧠 Build LLM prompt
33 | # ------------------------
34 | def build_prompt(user_question, schema, error_message=None, previous_sql=None):
35 |     if error_message:
36 |         return f"""
37 | You previously generated this SQL which failed:
38 | 
39 | {previous_sql}
40 | 
41 | The error was:
42 | {error_message}
43 | 
44 | Try again. ONLY use the tables and columns listed in this schema.
45 | 
46 | Schema:
47 | {schema}
48 | 
49 | User question:
50 | "{user_question}"
51 | 
52 | Respond ONLY with a valid Microsoft SQL Server SELECT query and end it with a semicolon.
53 | """
54 |     else:
55 |         return f"""
56 | You are a SQL expert. You will receive a database schema and a user question.
57 | 
58 | You MUST:
59 | - Use ONLY table and column names exactly as provided in the schema
60 | - NEVER invent or singularize table names like 'Student' if only 'Students' exists
61 | - Output ONLY a valid Microsoft SQL Server SELECT statement ending with a semicolon
62 | 
63 | Schema:
64 | {schema}
65 | 
66 | Question: "{user_question}"
67 | 
68 | Output:
69 | """
70 | 
71 | # ------------------------
72 | # 🧼 Extract SELECT statement
73 | # ------------------------
74 | def extract_valid_sql(text):
75 |     match = re.search(r"(SELECT\s.+?;)", text, re.IGNORECASE | re.DOTALL)
76 |     if not match:
77 |         raise ValueError("No valid SELECT statement found.")
78 |     sql = match.group(1).strip()
79 |     if "LIMIT" in sql.upper():
80 |         raise ValueError("Invalid keyword 'LIMIT' for T-SQL.")
81 |     return sql
82 | 
83 | # ------------------------
84 | # 🚀 MAIN SCRIPT
85 | # ------------------------
86 | 
87 | # Step 0: Connect to MSSQL
88 | conn = pyodbc.connect("DRIVER={ODBC Driver 17 for SQL Server};SERVER=BIGB;DATABASE=SchoolDb;Trusted_Connection=yes;")
89 | cursor = conn.cursor()
90 | 
91 | # Step 1: Load local LLM
92 | llm = Llama(model_path="models/mistral-7b-instruct-v0.2.Q4_K_M.gguf")
93 | 
94 | # Step 2: User question
95 | user_question = "Find the names of all students who have a score higher than 150."
96 | 
97 | # Step 3: Introspect schema
98 | schema = get_db_schema(cursor)
99 | 
100 | # Step 4: Attempt loop
101 | MAX_RETRIES = 2
102 | attempt = 0
103 | error_message = None
104 | sql_query = None
105 | data = None
106 | 
107 | while attempt < MAX_RETRIES:
108 |     prompt = build_prompt(user_question, schema, error_message, sql_query)
109 |     response = llm(prompt=prompt, max_tokens=200)
110 |     raw_response = response['choices'][0]['text'].strip()
111 |     print(f"🔍 Attempt {attempt + 1} - LLM Response:\n{raw_response}")
112 | 
113 |     try:
114 |         sql_query = extract_valid_sql(raw_response)
115 |         print("✅ Extracted SQL:\n", sql_query)
116 | 
117 |         cursor.execute(sql_query)
118 |         results = cursor.fetchall()
119 |         columns = [column[0] for column in cursor.description]
120 |         data = [dict(zip(columns, row)) for row in results]
121 | 
122 |         if not data:
123 |             raise ValueError("Query ran but returned no results.")
124 | 
125 |         break # ✅ Success
126 | 
127 |     except Exception as e:
128 |         error_message = str(e)
129 |         print("🚨 Error:", error_message)
130 |         attempt += 1
131 | 
132 | # Step 5: Summarize or report failure
133 | if data:
134 |     summary_prompt = f"""
135 | Here are the results of the query:\n{data}
136 | Summarize this nicely for the user.
137 | """
138 |     summary = llm(prompt=summary_prompt, max_tokens=200)['choices'][0]['text'].strip()
139 |     print("📄 Summary:\n", summary)
140 | else:
141 |     print("❌ Failed after retries. Please rephrase your question.")
142 | 
--------------------------------------------------------------------------------
/step1_1_generate_jsonl.py:
--------------------------------------------------------------------------------
1 | # generate_questions_offline.py (CLI version)
2 | import os
3 | import json
4 | import re
5 | import nltk
6 | import subprocess
7 | import tempfile
8 | 
9 | # === SETUP ===
10 | nltk.download('punkt', quiet=True)
11 | from nltk.tokenize import sent_tokenize
12 | 
13 | # === CONFIGURATION ===
14 | TXT_FILE = "TheStandard.txt"
15 | JSONL_FILE = "TheStandard.jsonl"
16 | RAW_LOG_FILE = "raw_output.jsonl"
17 | MODEL_CLI = "llama.cpp/build/bin/llama-cli"
18 | MODEL_PATH = "llama.cpp/mistral/mistral-7b-instruct-v0.1.Q4_K_M.gguf"
19 | 
20 | CHUNK_MIN_WORDS = 30
21 | CHUNK_MAX_WORDS = 100
22 | MAX_PARAGRAPHS = 1000
23 | MAX_QUESTIONS_PER_PARAGRAPH = 6
24 | 
25 | # === CLEAN TEXT ===
26 | def clean_text(text):
27 |     text = re.sub(r'-\n', '', text)
28 |     text = re.sub(r'(?= 3 and not q.lower().startswith(("of", "and", "the")):
94 |             q = re.sub(r'^\d+(\.\d+)*[\).]?\s*', '', q)
95 |             questions.append(q)
96 | 
97 |     return list(set(questions))
98 | 
99 | # === MAIN ===
100 | def main():
101 |     if not os.path.exists(TXT_FILE):
102 |         print(f"❌ File not found: {TXT_FILE}")
103 |         return
104 | 
105 |     with open(TXT_FILE, 'r', encoding='utf-8') as f:
106 |         raw_text = f.read()
107 | 
108 |     paragraphs = split_into_paragraphs(clean_text(raw_text))
109 |     print(f"📖 Found {len(paragraphs)} paragraphs to process.")
110 | 
111 |     written_count = 0
112 |     skipped_count = 0
113 | 
114 |     with open(JSONL_FILE, 'w', encoding='utf-8') as out:
115 |         for i, paragraph in enumerate(paragraphs[:MAX_PARAGRAPHS]):
116 |             try:
117 |                 questions = generate_questions(paragraph)
118 | 
119 |                 if not questions:
120 |                     print(f"[{i+1}] ⚠️ No questions generated.")
121 |                     skipped_count += 1
122 |                     continue
123 | 
124 |                 for j, question in enumerate(questions):
125 |                     entry = {
126 |                         "instruction": question,
127 |                         "input": "",
128 |                         "output": paragraph.strip()
129 |                     }
130 |                     out.write(json.dumps(entry, ensure_ascii=False) + '\n')
131 |                     written_count += 1
132 |                     print(f"[{i+1}.{j+1}] ✅ {question}")
133 | 
134 |             except Exception as e:
135 |                 print(f"[{i+1}] ❌ Error: {e}")
136 |                 skipped_count += 1
137 |                 continue
138 | 
139 |     print(f"\n📦 Done: {written_count} questions saved to '{JSONL_FILE}'")
140 |     print(f"⚠️ Skipped: {skipped_count} paragraphs")
141 | 
142 | # === ENTRY ===
143 | if __name__ == "__main__":
144 |     main()
145 | 
--------------------------------------------------------------------------------
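
A quick way to sanity-check the generated dataset before fine-tuning (this helper is not part of the repo; a minimal sketch that assumes the `TheStandard.jsonl` output file and the `instruction`/`input`/`output` keys written above):

```python
# check_jsonl.py (hypothetical helper): verify the generated JSONL before fine-tuning
import json

path = "TheStandard.jsonl"  # JSONL_FILE produced by step1_1_generate_jsonl.py
valid = 0

with open(path, "r", encoding="utf-8") as handle:
    for line_number, raw_line in enumerate(handle, start=1):
        raw_line = raw_line.strip()
        if not raw_line:
            continue
        try:
            record = json.loads(raw_line)
        except json.JSONDecodeError as error:
            print(f"Line {line_number}: invalid JSON ({error})")
            continue
        # Each record should carry the same keys the generator writes above.
        missing = [key for key in ("instruction", "input", "output") if key not in record]
        if missing:
            print(f"Line {line_number}: missing keys {missing}")
            continue
        valid += 1

print(f"{valid} well-formed records ready for fine-tuning")
```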