├── LICENSE
├── README.md
├── requirements.txt
└── streamlit-qa-generator.py

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2024 Sagar Khanna

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# LLM-Dataset-Generator
This repo contains the code for LLM Dataset Generator, an app that creates question-answer pairs from your own PDF file.

Creating datasets for fine-tuning an LLM can be a daunting task and not so enjoyable :(

Hence, I created and deployed an app called LLM Dataset Generator where you can upload your PDF file, generate training and validation datasets, and download them in the two most widely used fine-tuning dataset formats: CSV and JSONL.

The app gives you the flexibility to set your own training/validation split and to supply your own context.
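The CSV export contains `Question`, `Answer`, and `Context` columns. Each line of the JSONL export is a chat-style record built from your system message plus one generated pair; a made-up example is shown below (the question and answer are placeholders, and the system message is whatever you configure in the sidebar):

```json
{"messages": [{"role": "system", "content": "You are a helpful assistant that provides accurate and informative answers."}, {"role": "user", "content": "What is the main topic of the document?"}, {"role": "assistant", "content": "The document explains how to prepare a fine-tuning dataset from a PDF."}]}
```

To run the app locally instead of using the hosted version, install the dependencies from `requirements.txt` and launch it with `streamlit run streamlit-qa-generator.py`.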

App link: https://llm-dataset-generator-hvup73dvmqfk7s5xvg9bng.streamlit.app/
--------------------------------------------------------------------------------

/requirements.txt:
--------------------------------------------------------------------------------
streamlit
openai
PyPDF2
pandas
scikit-learn
--------------------------------------------------------------------------------

/streamlit-qa-generator.py:
--------------------------------------------------------------------------------
import streamlit as st
import PyPDF2
from openai import OpenAI
import pandas as pd
import json
import io
from sklearn.model_selection import train_test_split

# Initialize session state variables if they don't exist
if 'train_df' not in st.session_state:
    st.session_state.train_df = None
if 'val_df' not in st.session_state:
    st.session_state.val_df = None
if 'generated' not in st.session_state:
    st.session_state.generated = False
if 'previous_upload_state' not in st.session_state:
    st.session_state.previous_upload_state = False

def reset_session_state():
    """Reset all relevant session state variables"""
    st.session_state.train_df = None
    st.session_state.val_df = None
    st.session_state.generated = False

def parse_pdf(uploaded_file) -> str:
    # Read the uploaded PDF directly from memory; no temporary file is needed
    reader = PyPDF2.PdfReader(io.BytesIO(uploaded_file.getvalue()))
    text = ""
    for page in reader.pages:
        text += page.extract_text() or ""
    return text

def generate_qa_pairs(text: str, api_key: str, model: str, num_pairs: int, context: str) -> pd.DataFrame:
    client = OpenAI(api_key=api_key)

    prompt = f"""
    Given the following text, generate {num_pairs} question-answer pairs:

    {text}

    Format each pair as:
    Q: [Question]
    A: [Answer]

    Ensure the questions are diverse and cover different aspects of the text.
    """

    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant that generates question-answer pairs based on given text."},
                {"role": "user", "content": prompt}
            ]
        )

        qa_text = response.choices[0].message.content
        qa_pairs = []

        for pair in qa_text.split('\n\n'):
            if pair.startswith('Q:') and 'A:' in pair:
                # Split on the first 'A:' only, in case the answer itself contains 'A:'
                question, answer = pair.split('A:', 1)
                question = question.replace('Q:', '').strip()
                answer = answer.strip()
                qa_pairs.append({
                    'Question': question,
                    'Answer': answer,
                    'Context': context
                })

        return pd.DataFrame(qa_pairs)

    except Exception as e:
        st.error(f"Error generating QA pairs: {str(e)}")
        return pd.DataFrame()

def create_jsonl_content(df: pd.DataFrame, system_content: str) -> str:
    """Convert DataFrame to JSONL string content"""
    jsonl_content = []
    for _, row in df.iterrows():
        entry = {
            "messages": [
                {"role": "system", "content": system_content},
                {"role": "user", "content": row['Question']},
                {"role": "assistant", "content": row['Answer']}
            ]
        }
        jsonl_content.append(json.dumps(entry, ensure_ascii=False))
    return '\n'.join(jsonl_content)

def process_and_split_data(text: str, api_key: str, model: str, num_pairs: int, context: str, train_size: float):
    """Process data and store results in session state"""
    df = generate_qa_pairs(text, api_key, model, num_pairs, context)

    if not df.empty:
        # Split the dataset
        train_df, val_df = train_test_split(
            df,
            train_size=train_size/100,
            random_state=42
        )

        # Store in session state
        st.session_state.train_df = train_df
        st.session_state.val_df = val_df
        st.session_state.generated = True
        return True
    return False

def main():
    st.title("LLM Dataset Generator")
    st.write("Upload a PDF file and generate training & validation sets of question-answer pairs from your data using an LLM.")

    # Sidebar configurations
    st.sidebar.header("Configuration")

    api_key = st.sidebar.text_input("Enter OpenAI API Key", type="password")

    model = st.sidebar.selectbox(
        "Select Model",
        ["gpt-4", "gpt-4o-mini", "gpt-4o", "gpt-3.5-turbo"]
    )

    num_pairs = st.sidebar.number_input(
        "Number of QA Pairs",
        min_value=1,
        max_value=10000,
        value=5
    )

    context = st.sidebar.text_area(
        "Custom Context",
        value="Write a response that appropriately completes the request.",
        help="This text will be added to the Context column for each QA pair.",
        placeholder="Add custom context here."
    )

    # Dataset split configuration
    st.sidebar.header("Dataset Split")
    train_size = st.sidebar.slider(
        "Training Set Size (%)",
        min_value=50,
        max_value=90,
        value=80,
        step=5
    )

    # Output format configuration
    st.sidebar.header("Output Format")
    output_format = st.sidebar.selectbox(
        "Select Output Format",
        ["CSV", "JSONL"]
    )

    if output_format == "JSONL":
        system_content = st.sidebar.text_area(
            "System Message",
            value="You are a helpful assistant that provides accurate and informative answers.",
            help="This message will be used as the system content in the JSONL format."
        )

    # Main area
    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

    # Check if upload state has changed
    current_upload_state = uploaded_file is not None
    if current_upload_state != st.session_state.previous_upload_state:
        if not current_upload_state:  # File was removed
            reset_session_state()
        st.session_state.previous_upload_state = current_upload_state

    if uploaded_file is not None:
        if not api_key:
            st.warning("Please enter your OpenAI API key in the sidebar.")
            return

        text = parse_pdf(uploaded_file)
        st.success("PDF processed successfully!")

        if st.button("Generate QA Pairs"):
            with st.spinner("Generating QA pairs..."):
                success = process_and_split_data(text, api_key, model, num_pairs, context, train_size)
                if success:
                    st.success("QA pairs generated successfully!")

        # Display results if data has been generated
        if st.session_state.generated and st.session_state.train_df is not None and st.session_state.val_df is not None:
            # Display the dataframes
            st.subheader("Training Set")
            st.dataframe(st.session_state.train_df)

            st.subheader("Validation Set")
            st.dataframe(st.session_state.val_df)

            # Create download section
            st.subheader("Download Generated Datasets")
            col1, col2 = st.columns(2)

            with col1:
                st.markdown("##### Training Set")
                if output_format == "CSV":
                    train_csv = st.session_state.train_df.to_csv(index=False)
                    st.download_button(
                        label="Download Training Set (CSV)",
                        data=train_csv,
                        file_name="train_qa_pairs.csv",
                        mime="text/csv",
                        key="train_csv"
                    )
                else:  # JSONL format
                    train_jsonl = create_jsonl_content(st.session_state.train_df, system_content)
                    st.download_button(
                        label="Download Training Set (JSONL)",
                        data=train_jsonl,
                        file_name="train_qa_pairs.jsonl",
                        mime="application/jsonl",
                        key="train_jsonl"
                    )

            with col2:
                st.markdown("##### Validation Set")
                if output_format == "CSV":
                    val_csv = st.session_state.val_df.to_csv(index=False)
                    st.download_button(
                        label="Download Validation Set (CSV)",
                        data=val_csv,
                        file_name="val_qa_pairs.csv",
                        mime="text/csv",
                        key="val_csv"
                    )
                else:  # JSONL format
                    val_jsonl = create_jsonl_content(st.session_state.val_df, system_content)
                    st.download_button(
                        label="Download Validation Set (JSONL)",
                        data=val_jsonl,
                        file_name="val_qa_pairs.jsonl",
                        mime="application/jsonl",
                        key="val_jsonl"
                    )

            # Display statistics
            st.subheader("Statistics")
            st.write(f"Total QA pairs: {len(st.session_state.train_df) + len(st.session_state.val_df)}")
            st.write(f"Training set size: {len(st.session_state.train_df)} ({train_size}%)")
            st.write(f"Validation set size: {len(st.session_state.val_df)} ({100-train_size}%)")
            st.write(f"Average question length (training set): {st.session_state.train_df['Question'].str.len().mean():.1f} characters")
            st.write(f"Average answer length (training set): {st.session_state.train_df['Answer'].str.len().mean():.1f} characters")

if __name__ == "__main__":
    st.set_page_config(
        page_title="LLM Dataset Generator",
        page_icon="📚",
        layout="wide"
    )
    main()
--------------------------------------------------------------------------------