├── LICENSE
├── README.md
├── requirements.txt
└── streamlit-qa-generator.py

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2024 Sagar Khanna

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# LLM-Dataset-Generator
This repo contains the code for LLM Dataset Generator, an app that creates question-answer pairs from your own PDF file.

Creating datasets for fine-tuning an LLM can be a daunting task and not so enjoyable :(

Hence, I created and deployed an app called LLM Dataset Generator where you can upload your PDF file, generate training and validation datasets, and download them in the two most widely used fine-tuning dataset formats: CSV and JSONL.

The app gives you the flexibility to set your own training/validation split and to supply your own context.
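The CSV export contains `Question`, `Answer`, and `Context` columns. Each line of the JSONL export is a chat-style record built from your system message plus one generated pair; a made-up example is shown below (the question and answer are placeholders, and the system message is whatever you configure in the sidebar):

```json
{"messages": [{"role": "system", "content": "You are a helpful assistant that provides accurate and informative answers."}, {"role": "user", "content": "What is the main topic of the document?"}, {"role": "assistant", "content": "The document explains how to prepare a fine-tuning dataset from a PDF."}]}
```

To run the app locally instead of using the hosted version, install the dependencies from `requirements.txt` and launch it with `streamlit run streamlit-qa-generator.py`.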

App link: https://llm-dataset-generator-hvup73dvmqfk7s5xvg9bng.streamlit.app/
--------------------------------------------------------------------------------

/requirements.txt:
--------------------------------------------------------------------------------
streamlit
openai
PyPDF2
pandas
scikit-learn
--------------------------------------------------------------------------------

/streamlit-qa-generator.py:
--------------------------------------------------------------------------------
import streamlit as st
import PyPDF2
from openai import OpenAI
import pandas as pd
import json
import io
from sklearn.model_selection import train_test_split

# Initialize session state variables if they don't exist
if 'train_df' not in st.session_state:
    st.session_state.train_df = None
if 'val_df' not in st.session_state:
    st.session_state.val_df = None
if 'generated' not in st.session_state:
    st.session_state.generated = False
if 'previous_upload_state' not in st.session_state:
    st.session_state.previous_upload_state = False

def reset_session_state():
    """Reset all relevant session state variables"""
    st.session_state.train_df = None
    st.session_state.val_df = None
    st.session_state.generated = False

def parse_pdf(uploaded_file) -> str:
    # Read the uploaded PDF directly from memory; no temporary file is needed
    reader = PyPDF2.PdfReader(io.BytesIO(uploaded_file.getvalue()))
    text = ""
    for page in reader.pages:
        text += page.extract_text() or ""
    return text

def generate_qa_pairs(text: str, api_key: str, model: str, num_pairs: int, context: str) -> pd.DataFrame:
    client = OpenAI(api_key=api_key)

    prompt = f"""
    Given the following text, generate {num_pairs} question-answer pairs:

    {text}

    Format each pair as:
    Q: [Question]
    A: [Answer]

    Ensure the questions are diverse and cover different aspects of the text.
    """

    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant that generates question-answer pairs based on given text."},
                {"role": "user", "content": prompt}
            ]
        )

        qa_text = response.choices[0].message.content
        qa_pairs = []

        for pair in qa_text.split('\n\n'):
            if pair.startswith('Q:') and 'A:' in pair:
                # Split on the first 'A:' only, in case the answer itself contains 'A:'
                question, answer = pair.split('A:', 1)
                question = question.replace('Q:', '').strip()
                answer = answer.strip()
                qa_pairs.append({
                    'Question': question,
                    'Answer': answer,
                    'Context': context
                })

        return pd.DataFrame(qa_pairs)

    except Exception as e:
        st.error(f"Error generating QA pairs: {str(e)}")
        return pd.DataFrame()

def create_jsonl_content(df: pd.DataFrame, system_content: str) -> str:
    """Convert DataFrame to JSONL string content"""
    jsonl_content = []
    for _, row in df.iterrows():
        entry = {
            "messages": [
                {"role": "system", "content": system_content},
                {"role": "user", "content": row['Question']},
                {"role": "assistant", "content": row['Answer']}
            ]
        }
        jsonl_content.append(json.dumps(entry, ensure_ascii=False))
    return '\n'.join(jsonl_content)

def process_and_split_data(text: str, api_key: str, model: str, num_pairs: int, context: str, train_size: float):
    """Process data and store results in session state"""
    df = generate_qa_pairs(text, api_key, model, num_pairs, context)

    if not df.empty:
        # Split the dataset
        train_df, val_df = train_test_split(
            df,
            train_size=train_size/100,
            random_state=42
        )

        # Store in session state
        st.session_state.train_df = train_df
        st.session_state.val_df = val_df
        st.session_state.generated = True
        return True
    return False

def main():
    st.title("LLM Dataset Generator")
    st.write("Upload a PDF file and generate training & validation sets of question-answer pairs from your data using an LLM.")

    # Sidebar configurations
    st.sidebar.header("Configuration")

    api_key = st.sidebar.text_input("Enter OpenAI API Key", type="password")

    model = st.sidebar.selectbox(
        "Select Model",
        ["gpt-4", "gpt-4o-mini", "gpt-4o", "gpt-3.5-turbo"]
    )

    num_pairs = st.sidebar.number_input(
        "Number of QA Pairs",
        min_value=1,
        max_value=10000,
        value=5
    )

    context = st.sidebar.text_area(
        "Custom Context",
        value="Write a response that appropriately completes the request.",
        help="This text will be added to the Context column for each QA pair.",
        placeholder="Add custom context here."
    )

    # Dataset split configuration
    st.sidebar.header("Dataset Split")
    train_size = st.sidebar.slider(
        "Training Set Size (%)",
        min_value=50,
        max_value=90,
        value=80,
        step=5
    )

    # Output format configuration
    st.sidebar.header("Output Format")
    output_format = st.sidebar.selectbox(
        "Select Output Format",
        ["CSV", "JSONL"]
    )

    if output_format == "JSONL":
        system_content = st.sidebar.text_area(
            "System Message",
            value="You are a helpful assistant that provides accurate and informative answers.",
            help="This message will be used as the system content in the JSONL format."
        )

    # Main area
    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

    # Check if upload state has changed
    current_upload_state = uploaded_file is not None
    if current_upload_state != st.session_state.previous_upload_state:
        if not current_upload_state:  # File was removed
            reset_session_state()
        st.session_state.previous_upload_state = current_upload_state

    if uploaded_file is not None:
        if not api_key:
            st.warning("Please enter your OpenAI API key in the sidebar.")
            return

        text = parse_pdf(uploaded_file)
        st.success("PDF processed successfully!")

        if st.button("Generate QA Pairs"):
            with st.spinner("Generating QA pairs..."):
                success = process_and_split_data(text, api_key, model, num_pairs, context, train_size)
                if success:
                    st.success("QA pairs generated successfully!")

        # Display results if data has been generated
        if st.session_state.generated and st.session_state.train_df is not None and st.session_state.val_df is not None:
            # Display the dataframes
            st.subheader("Training Set")
            st.dataframe(st.session_state.train_df)

            st.subheader("Validation Set")
            st.dataframe(st.session_state.val_df)

            # Create download section
            st.subheader("Download Generated Datasets")
            col1, col2 = st.columns(2)

            with col1:
                st.markdown("##### Training Set")
                if output_format == "CSV":
                    train_csv = st.session_state.train_df.to_csv(index=False)
                    st.download_button(
                        label="Download Training Set (CSV)",
                        data=train_csv,
                        file_name="train_qa_pairs.csv",
                        mime="text/csv",
                        key="train_csv"
                    )
                else:  # JSONL format
                    train_jsonl = create_jsonl_content(st.session_state.train_df, system_content)
                    st.download_button(
                        label="Download Training Set (JSONL)",
                        data=train_jsonl,
                        file_name="train_qa_pairs.jsonl",
                        mime="application/jsonl",
                        key="train_jsonl"
                    )

            with col2:
                st.markdown("##### Validation Set")
                if output_format == "CSV":
                    val_csv = st.session_state.val_df.to_csv(index=False)
                    st.download_button(
                        label="Download Validation Set (CSV)",
                        data=val_csv,
                        file_name="val_qa_pairs.csv",
                        mime="text/csv",
                        key="val_csv"
                    )
                else:  # JSONL format
                    val_jsonl = create_jsonl_content(st.session_state.val_df, system_content)
                    st.download_button(
                        label="Download Validation Set (JSONL)",
                        data=val_jsonl,
                        file_name="val_qa_pairs.jsonl",
                        mime="application/jsonl",
                        key="val_jsonl"
                    )

            # Display statistics
            st.subheader("Statistics")
            st.write(f"Total QA pairs: {len(st.session_state.train_df) + len(st.session_state.val_df)}")
            st.write(f"Training set size: {len(st.session_state.train_df)} ({train_size}%)")
            st.write(f"Validation set size: {len(st.session_state.val_df)} ({100-train_size}%)")
            st.write(f"Average question length (training set): {st.session_state.train_df['Question'].str.len().mean():.1f} characters")
            st.write(f"Average answer length (training set): {st.session_state.train_df['Answer'].str.len().mean():.1f} characters")

if __name__ == "__main__":
    st.set_page_config(
        page_title="LLM Dataset Generator",
        page_icon="📚",
        layout="wide"
    )
    main()
--------------------------------------------------------------------------------