├── .gitignore ├── examples ├── training.jsonl └── messages.json ├── README.md └── preparer.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.json 2 | *.jsonl 3 | *.csv 4 | !examples/*.json 5 | !examples/*.jsonl 6 | archive/ -------------------------------------------------------------------------------- /examples/training.jsonl: -------------------------------------------------------------------------------- 1 | {"prompt": "", "completion": " Bob: Hey Alice. How are you doing?\n Alice: I'm good Bob. How are you?\n Bob: Great thanks"} 2 | {"prompt": "", "completion": " Alice: When do you want to come over for dinner?\n Bob: How about tomorrow at 6pm?\n Alice: Sounds perfect"} 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Google Chat GPT-3 2 | 3 | This repo will help you fine-tune GPT-3 with a Google Chat conversation history. The trained model will be able to converse as one or both sides of the conversation in the participants' style. 4 | 5 | 1. Download your Chat archive from [Google Takeout](https://takeout.google.com/settings/takeout). 6 | 2. Locate the `messages.json` file of the conversation you would like to use as a training set. 7 | 3. Use the script to prepare data for training (the result is written to `training.jsonl` unless you pass `--output`): 8 | 9 | ```shell 10 | python preparer.py --messages path/to/messages.json 11 | ``` 12 | 13 | 4. Test your training data with OpenAI's tool: 14 | 15 | ```shell 16 | openai tools fine_tunes.prepare_data -f training.jsonl 17 | ``` 18 | 19 | You should see: `No remediations found.` 20 | 21 | 5. Fine-tune GPT-3 with your training data: 22 | 23 | ```shell 24 | openai api fine_tunes.create -t training.jsonl 25 | ``` 26 | 27 | You should see: `Job complete! Status: succeeded 🎉`. Don't forget to note the name of the model. 28 | 29 | 6. 
Try out your model in the [Playground](https://beta.openai.com/playground) or with the CLI, passing the model name you noted in the previous step: 30 | 31 | ``` 32 | openai api completions.create -m YOUR_MODEL_NAME 33 | ``` 34 | -------------------------------------------------------------------------------- /examples/messages.json: -------------------------------------------------------------------------------- 1 | { 2 | "messages": [ 3 | { 4 | "creator": { 5 | "name": "Bob", 6 | "email": "bob@example.com", 7 | "user_type": "Human" 8 | }, 9 | "created_date": "Saturday, December 12, 2015 at 12:41:45 AM UTC", 10 | "text": "Hey Alice. How are you doing?", 11 | "topic_id": "2yFH2tnYpps" 12 | }, 13 | { 14 | "creator": { 15 | "name": "Alice", 16 | "email": "alice@example.com", 17 | "user_type": "Human" 18 | }, 19 | "created_date": "Saturday, December 12, 2015 at 12:41:49 AM UTC", 20 | "text": "I'm good Bob. How are you?", 21 | "topic_id": "NGC4aYxTzeW" 22 | }, 23 | { 24 | "creator": { 25 | "name": "Bob", 26 | "email": "bob@example.com", 27 | "user_type": "Human" 28 | }, 29 | "created_date": "Saturday, December 12, 2015 at 12:43:09 AM UTC", 30 | "text": "Great thanks", 31 | "topic_id": "Ovfn9myFgul" 32 | }, 33 | { 34 | "creator": { 35 | "name": "Alice", 36 | "email": "alice@example.com", 37 | "user_type": "Human" 38 | }, 39 | "created_date": "Thursday, December 17, 2015 at 4:31:17 PM UTC", 40 | "text": "When do you want to come over for dinner?", 41 | "topic_id": "9JOAkJrWIh9" 42 | }, 43 | { 44 | "creator": { 45 | "name": "Bob", 46 | "email": "bob@example.com", 47 | "user_type": "Human" 48 | }, 49 | "created_date": "Thursday, December 17, 2015 at 4:33:17 PM UTC", 50 | "text": "How about tomorrow at 6pm?", 51 | "topic_id": "LCtvw75X2H7" 52 | }, 53 | { 54 | "creator": { 55 | "name": "Alice", 56 | "email": "alice@example.com", 57 | "user_type": "Human" 58 | }, 59 | "created_date": "Thursday, December 17, 2015 at 4:40:17 PM UTC", 60 | "text": "Sounds perfect", 61 | "topic_id": "R9Iu9IVaN6E" 62 | } 63 | ] 64 | } 
"""Prepare a Google Chat ``messages.json`` export as GPT-3 fine-tuning data."""

import argparse
import datetime
import json

import dateutil.parser

# A silence longer than these gaps ends the current conversation.  The
# threshold is shorter when the same person speaks again and longer when the
# other participant replies.
SAME_CREATOR_GAP = datetime.timedelta(hours=3)
OTHER_CREATOR_GAP = datetime.timedelta(hours=12)


def read_messages(filepath):
    """Return the list stored under the ``messages`` key of a JSON file.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
        KeyError: if the document has no ``messages`` key.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(filepath, encoding='utf-8') as file:
        return json.load(file)['messages']


def write_completions(filepath, completions):
    """Write each completion as one JSON object per line (JSONL)."""
    with open(filepath, 'w', encoding='utf-8') as file:
        file.writelines(json.dumps(completion) + '\n'
                        for completion in completions)


def _message_time(message):
    """Parse a message timestamp, preferring ``created_date``."""
    date_text = message.get('created_date') or message['updated_date']
    return dateutil.parser.parse(date_text)


def prepare_messages(messages):
    """Group chat messages into conversation-sized training completions.

    Messages are walked in order.  Each message that has a ``text`` field is
    appended to the current conversation as `` <name>: <text>``, with
    messages separated by a newline.  When the gap since the previous
    message exceeds ``SAME_CREATOR_GAP`` (same author) or
    ``OTHER_CREATOR_GAP`` (different author), the conversation is closed.
    A closed conversation is kept only if it contains a newline, i.e.
    normally more than one message.

    Args:
        messages: sequence of message dicts, each with ``creator.name``, a
            ``created_date`` or ``updated_date`` string, and optionally
            ``text``.

    Returns:
        A list of ``{'prompt': '', 'completion': <text>}`` dicts with
        duplicates removed (first occurrence wins, order preserved).  The
        list is empty when there are no messages or none of them has text.
    """
    texts = []
    current = ''
    last_message_time = None
    last_creator = ''

    for message in messages:
        creator = message['creator']['name']
        same_creator = creator == last_creator
        last_creator = creator

        # Time elapsed since the previous message (zero for the first one).
        message_time = _message_time(message)
        if last_message_time is None:
            delta = datetime.timedelta(0)
        else:
            delta = message_time - last_message_time
        last_message_time = message_time

        # Close the running conversation once the silence was long enough.
        if current:
            gap = SAME_CREATOR_GAP if same_creator else OTHER_CREATOR_GAP
            if delta > gap:
                if '\n' in current:
                    texts.append(current)
                current = ''

        # Messages without text (e.g. attachment-only) add nothing.
        if 'text' in message:
            current += '\n ' if current else ' '
            current += creator + ': ' + message['text']

    # Keep the trailing conversation, but never emit an empty record.
    if current:
        texts.append(current)

    # dict.fromkeys removes duplicates while preserving first-seen order.
    return [{'prompt': '', 'completion': text}
            for text in dict.fromkeys(texts)]
# Defaults shared by the command-line parser and the statistics report.
DEFAULT_MESSAGES_PATH = 'messages.json'
DEFAULT_OUTPUT_PATH = 'training.jsonl'

# Rough heuristic used for the token estimate: about four characters per token.
CHARS_PER_TOKEN = 4


def parse_arguments(argv=None):
    """Parse the command-line options.

    Args:
        argv: optional list of argument strings; ``None`` (the default)
            makes argparse read ``sys.argv[1:]``.

    Returns:
        A namespace with ``messages`` (input path) and ``output`` (output
        path) attributes.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--messages', type=str, default=DEFAULT_MESSAGES_PATH)
    parser.add_argument('--output', type=str, default=DEFAULT_OUTPUT_PATH)
    return parser.parse_args(argv)


def estimate_tokens(completions):
    """Estimate the token count as total prompt+completion characters / 4."""
    num_chars = sum(len(completion['prompt']) + len(completion['completion'])
                    for completion in completions)
    return num_chars // CHARS_PER_TOKEN


def print_statistics(messages, completions, output=None):
    """Print a short summary of the finished run.

    Args:
        messages: the input messages (only their count is reported).
        completions: the generated completions; must support ``len()`` and
            iteration.
        output: path the completions were written to.  When omitted, a
            module-level ``args.output`` is used if one exists (this is how
            the function used to find the path); otherwise the default
            output path is reported.
    """
    if output is None:
        legacy_args = globals().get('args')
        output = getattr(legacy_args, 'output', DEFAULT_OUTPUT_PATH)

    # Plain f-string only: a further str.format() pass would misinterpret
    # any '{' or '}' contained in the path.
    print(f'Finished writing {output}')
    print(f' tokens: {estimate_tokens(completions)}')
    print(f' messages: {len(messages)}')
    print(f'completions: {len(completions)}')


def main(argv=None):
    """Run the full pipeline: read messages, build completions, write, report."""
    args = parse_arguments(argv)
    messages = read_messages(args.messages)
    completions = prepare_messages(messages)
    write_completions(args.output, completions)
    print_statistics(messages, completions, output=args.output)


if __name__ == "__main__":
    main()