├── .env.example ├── .gitignore ├── LICENSE ├── README.md ├── main.py └── requirements.txt /.env.example: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY= 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .venv 3 | .env 4 | output.csv 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2024 Quentin Lintz 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Synthetic Data Generator 2 | 3 | This Python script uses OpenAI's `gpt-3.5-turbo` language model to generate synthetic data for NLP training. The script prompts the language model to generate random comments, each followed by a label (with the default prompt: movie reviews labeled as positive `1` or negative `0`). The resulting data is saved to a CSV file. 4 | 5 | ## Environment 6 | 7 | - Python 3.11 8 | 9 | ## Getting Started 10 | 11 | ```bash 12 | python -m venv .venv 13 | source .venv/bin/activate 14 | python -m pip install -U pip 15 | pip install -r requirements.txt 16 | ``` 17 | 18 | ## Setup 19 | 20 | 1. Sign up for an OpenAI API key, if you don't have one. Visit https://platform.openai.com/ to sign up. 21 | 22 | 2. Copy `.env.example`, rename the copy to `.env`, and put your OpenAI API key in it. 23 | 24 | ## Usage 25 | 26 | 1. Open a terminal and navigate to the directory containing the `main.py` script. 27 | 28 | 2. Modify the global variables as necessary. 29 | 30 | a. `PROMPT` should be changed based on what you want to generate. 31 | 32 | b. `NUM_OF_CALLS` determines how many times the OpenAI API gets called. 33 | 34 | 3. Run the script using the following command: 35 | 36 | ```bash 37 | python main.py 38 | ``` 39 | 40 | 4. The script will generate synthetic text data along with its labels and save it to a CSV file named `output.csv` in the same directory. 
# README.md (continued)
#
# ## Example
#
# Here's an example of what your output might look like:
#
# > The characters were poorly developed and the dialogue was cheesy.,0
# >
# > The cinematography was stunning and the soundtrack was perfect.,1
# >
# > I found the plot confusing and the acting was mediocre.,0
# >
# > This movie was a real tearjerker, I couldn't stop crying.,1
# >
# > The humor was crude and offensive, I didn't find it funny at all.,0
# >
# > The chemistry between the two leads was electric, I was rooting for them the whole time.,1
#
# --------------------------------------------------------------------------------
# /main.py:
# --------------------------------------------------------------------------------
"""Generate labeled synthetic text with the OpenAI chat API and save it as CSV.

Each API call sends ``PROMPT`` to the model, the response is split into
``text,label`` lines, and all successfully parsed rows are written to
``OUTPUT_FILE``.
"""
import csv
import os

import openai
from dotenv import load_dotenv

load_dotenv()

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PROMPT = (
    "Generate 10 random movie reviews in CSV format, each followed by a comma and a label indicating if it's a positive (1) or negative (0) review. "
    "Please do not include any numbering or double quotes at the beginning of the lines. "
    "Format each line exactly like this (without double quotes): "
    "review text,1 or review text,0\n"
    "Example:\n"
    "The acting was great, but the plot was a bit slow.,0\n"
    "This movie was amazing! I loved everything about it.,1\n"
    "The special effects were impressive, but the story was weak.,0\n"
    "I thought the pacing was perfect and the ending was satisfying.,1\n"
)
NUM_OF_CALLS = 1
OUTPUT_FILE = "output.csv"

if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY environment variable not set")

# Module-level default key, set as before; the explicit client below is what
# this script actually uses.
openai.api_key = OPENAI_API_KEY

# `openai.ChatCompletion` was removed in openai 1.0; requests now go through a
# client object. requirements.txt installs the latest `openai`, so the legacy
# call fails on a fresh install.
_client = openai.OpenAI(api_key=OPENAI_API_KEY)


def generate_text():
    """Send ``PROMPT`` to ``gpt-3.5-turbo`` and return the reply text.

    Returns:
        The stripped message content of the first choice, or an empty string
        when the API returns no content.
    """
    response = _client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": PROMPT}],
        temperature=0.7,
    )
    # `content` may be None (e.g. a filtered reply); treat that as no text
    # rather than crashing on `.strip()`.
    content = response.choices[0].message.content
    return (content or "").strip()


def save_to_csv(data, file_name):
    """Write ``data`` (an iterable of rows) to ``file_name`` as UTF-8 CSV.

    The file is opened in write mode, so existing content is replaced.
    """
    with open(file_name, mode="w", newline="", encoding="utf-8") as f:
        csv.writer(f).writerows(data)


def _parse_labeled_lines(response):
    """Turn a model reply into a list of ``(text, int_label)`` tuples.

    Each non-blank line is split at its LAST comma, because the text itself
    may contain commas. Lines without a comma or with a non-integer label are
    reported on stdout and skipped.
    """
    rows = []
    # splitlines() also copes with "\r\n" endings.
    for line in response.splitlines():
        if not line.strip():
            continue  # blank or whitespace-only line: nothing to report
        try:
            comment, label = line.rsplit(",", 1)
            rows.append((comment.strip(), int(label.strip())))
        except ValueError:
            print(f"Skipping line due to invalid format: '{line}'")
    return rows


def main():
    """Call the API ``NUM_OF_CALLS`` times and save every parsed row.

    Rows collected so far are saved even when the run is interrupted with
    Ctrl+C or aborted by an error; the error itself still propagates.
    """
    data = []

    try:
        for i in range(NUM_OF_CALLS):
            data.extend(_parse_labeled_lines(generate_text()))
            print(f"Completed API call {i + 1} of {NUM_OF_CALLS}")
    except KeyboardInterrupt:
        print("\nKeyboard interrupt detected. Saving partial data to the CSV file...")
    finally:
        if data:
            save_to_csv(data, OUTPUT_FILE)
            print(f"Data saved to '{OUTPUT_FILE}'.")
        else:
            # Don't overwrite an earlier output file with an empty one when
            # nothing was generated (e.g. the first API call failed).
            print(f"No data was generated; '{OUTPUT_FILE}' was not written.")


if __name__ == "__main__":
    main()

# --------------------------------------------------------------------------------
# /requirements.txt:
# --------------------------------------------------------------------------------
# openai          (main.py above uses the client API, which needs openai>=1.0)
# python-dotenv
# --------------------------------------------------------------------------------