├── .gitignore
├── examples
    ├── training.jsonl
    └── messages.json
├── README.md
└── preparer.py


/.gitignore:
--------------------------------------------------------------------------------
1 | *.json
2 | *.jsonl
3 | *.csv
4 | !examples/*.json
5 | !examples/*.jsonl
6 | archive/


--------------------------------------------------------------------------------
/examples/training.jsonl:
--------------------------------------------------------------------------------
1 | {"prompt": "", "completion": " Bob: Hey Alice. How are you doing?\n Alice: I'm good Bob. How are you?\n Bob: Great thanks"}
2 | {"prompt": "", "completion": " Alice: When do you want to come over for dinner?\n Bob: How about tomorrow at 6pm?\n Alice: Sounds perfect"}
3 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Google Chat GPT-3
 2 | 
 3 | This repo will help you fine-tune GPT-3 with a Google Chat conversation history. The trained model will be able to converse as one or both sides of the conversation in the participants' style.
 4 | 
 5 | 1. Download your Chat archive from [Google Takeout](https://takeout.google.com/settings/takeout).
 6 | 2. Locate the `messages.json` file of the conversation you would like to use as a training set.
 7 | 3. Use the script to prepare data for training:
 8 | 
 9 | ```shell
10 | python preparer.py --messages <MESSAGES_FILE>
11 | ```
12 | 
13 | 4. Test your training data with OpenAI's tool:
14 | 
15 | ```shell
16 | openai tools fine_tunes.prepare_data -f <TRAINING_FILE>
17 | ```
18 | 
19 | You should see: `No remediations found.`
20 | 
21 | 5. Fine-tune GPT-3 with your training data:
22 | 
23 | ```shell
24 | openai api fine_tunes.create -t <TRAINING_FILE>
25 | ```
26 | 
27 | You should see: `Job complete! Status: succeeded 🎉`. Don't forget to note the name of the model.
28 | 
29 | 6. Try out your model in the [Playground](https://beta.openai.com/playground) or with the CLI:
30 | 
31 | ```shell
32 | openai api completions.create -m <MODEL_NAME>
33 | ```
34 | 


--------------------------------------------------------------------------------
/examples/messages.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "messages": [
 3 |     {
 4 |       "creator": {
 5 |         "name": "Bob",
 6 |         "email": "bob@example.com",
 7 |         "user_type": "Human"
 8 |       },
 9 |       "created_date": "Saturday, December 12, 2015 at 12:41:45 AM UTC",
10 |       "text": "Hey Alice. How are you doing?",
11 |       "topic_id": "2yFH2tnYpps"
12 |     },
13 |     {
14 |       "creator": {
15 |         "name": "Alice",
16 |         "email": "alice@example.com",
17 |         "user_type": "Human"
18 |       },
19 |       "created_date": "Saturday, December 12, 2015 at 12:41:49 AM UTC",
20 |       "text": "I'm good Bob. How are you?",
21 |       "topic_id": "NGC4aYxTzeW"
22 |     },
23 |     {
24 |       "creator": {
25 |         "name": "Bob",
26 |         "email": "bob@example.com",
27 |         "user_type": "Human"
28 |       },
29 |       "created_date": "Saturday, December 12, 2015 at 12:43:09 AM UTC",
30 |       "text": "Great thanks",
31 |       "topic_id": "Ovfn9myFgul"
32 |     },
33 |     {
34 |       "creator": {
35 |         "name": "Alice",
36 |         "email": "alice@example.com",
37 |         "user_type": "Human"
38 |       },
39 |       "created_date": "Thursday, December 17, 2015 at 4:31:17 PM UTC",
40 |       "text": "When do you want to come over for dinner?",
41 |       "topic_id": "9JOAkJrWIh9"
42 |     },
43 |     {
44 |       "creator": {
45 |         "name": "Bob",
46 |         "email": "bob@example.com",
47 |         "user_type": "Human"
48 |       },
49 |       "created_date": "Thursday, December 17, 2015 at 4:33:17 PM UTC",
50 |       "text": "How about tomorrow at 6pm?",
51 |       "topic_id": "LCtvw75X2H7"
52 |     },
53 |     {
54 |       "creator": {
55 |         "name": "Alice",
56 |         "email": "alice@example.com",
57 |         "user_type": "Human"
58 |       },
59 |       "created_date": "Thursday, December 17, 2015 at 4:40:17 PM UTC",
60 |       "text": "Sounds perfect",
61 |       "topic_id": "R9Iu9IVaN6E"
62 |     }
63 |   ]
64 | }


--------------------------------------------------------------------------------
/preparer.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import datetime
 3 | import argparse
 4 | import dateutil.parser
 5 | 
 6 | 
 7 | def read_messages(filepath):
 8 |     with open(filepath) as file:
 9 |         return json.load(file)['messages']
10 | 
11 | 
def write_completions(filepath, completions):
    """Write *completions* to *filepath* in JSON Lines format.

    Every item is serialized with ``json.dumps`` and written on its own
    newline-terminated line. The file is opened in ``'w'`` mode, so any
    existing content is replaced.
    """
    with open(filepath, 'w') as out:
        out.writelines(json.dumps(record) + '\n' for record in completions)
16 | 
17 | 
def prepare_messages(messages):
    """Group chat messages into multi-line training completions.

    Messages are walked in order and collected into conversations. A new
    conversation starts when the gap since the previous message exceeds
    3 hours (same author as the previous message) or 12 hours (different
    author). Each conversation becomes one completion whose text is the
    ``"<name>: <text>"`` lines joined by ``'\\n '`` with a leading space.

    Args:
        messages: Iterable of message dicts. Each needs
            ``['creator']['name']`` and a ``'created_date'`` (or, as a
            fallback, ``'updated_date'``) string parseable by
            ``dateutil.parser.parse``. Messages without a ``'text'`` key
            contribute no line but still advance the author/time tracking.

    Returns:
        A list of ``{'prompt': '', 'completion': <text>}`` dicts in first-seen
        order, with duplicate texts removed. Only completions containing a
        newline are kept; this rule now also applies to the last conversation.
        An empty input yields an empty list.

    Raises:
        KeyError: If a message has neither ``'created_date'`` nor
            ``'updated_date'``, or lacks ``['creator']['name']``.
    """
    same_creator_gap = datetime.timedelta(hours=3)
    other_creator_gap = datetime.timedelta(hours=12)

    conversations = []
    lines = []  # "<name>: <text>" lines of the conversation being built
    last_message_time = None
    last_creator = ''

    for message in messages:
        creator = message['creator']['name']
        same_creator = creator == last_creator
        last_creator = creator

        # Calculate time elapsed since the last message. The first message
        # has no predecessor, so its gap is zero; this also avoids indexing
        # messages[0] up front, which failed on empty input and ignored the
        # 'updated_date' fallback.
        date_text = message.get('created_date') or message['updated_date']
        message_time = dateutil.parser.parse(date_text)
        if last_message_time is None:
            delta = datetime.timedelta(0)
        else:
            delta = message_time - last_message_time
        last_message_time = message_time

        # Start a new conversation if more than the allowed time has passed
        max_gap = same_creator_gap if same_creator else other_creator_gap
        if lines and delta > max_gap:
            conversations.append(lines)
            lines = []

        # Add message to the current conversation
        if 'text' in message:
            lines.append(creator + ': ' + message['text'])

    # Add final conversation (filtered below like every other one)
    conversations.append(lines)

    texts = (' ' + '\n '.join(conversation) for conversation in conversations)
    # Keep only completions with a newline, which drops empty conversations,
    # then remove duplicates while preserving first-seen order.
    unique_texts = dict.fromkeys(text for text in texts if '\n' in text)
    return [{'prompt': '', 'completion': text} for text in unique_texts]
53 | 
54 | 
def parse_arguments():
    """Parse the script's command-line options from ``sys.argv``.

    Returns:
        An ``argparse.Namespace`` with ``messages`` (default
        ``'messages.json'``) and ``output`` (default ``'training.jsonl'``).
    """
    cli = argparse.ArgumentParser()
    # Both options are plain string paths; declare them from one table.
    for flag, fallback in (('--messages', 'messages.json'),
                           ('--output', 'training.jsonl')):
        cli.add_argument(flag, default=fallback, type=str)
    namespace = cli.parse_args()
    return namespace
60 | 
61 | 
def estimate_tokens(completions):
    """Estimate the token count of *completions* as total characters / 4.

    Sums the lengths of every item's ``'prompt'`` and ``'completion'``
    strings and returns a quarter of that total, truncated to an int.
    """
    total_chars = sum(
        len(entry['prompt']) + len(entry['completion'])
        for entry in completions
    )
    return int(total_chars / 4)
67 | 
68 | 
def print_statistics(messages, completions):
    """Print a short summary of the finished run to stdout.

    The output path is read from the module-level ``args`` namespace, which
    the ``__main__`` block assigns before this function is called.

    Args:
        messages: The input messages; only their count is reported.
        completions: The generated completions; their count and estimated
            token total (via ``estimate_tokens``) are reported.
    """
    # The f-string already interpolates the path. A trailing ``.format()``
    # would re-parse the result as a template, so braces in the file name
    # could raise or be substituted.
    print(f'Finished writing {args.output}')
    print(f'     tokens: {estimate_tokens(completions)}')
    print(f'   messages: {len(messages)}')
    print(f'completions: {len(completions)}')
74 | 
75 | 
if __name__ == "__main__":
    # Script entry point: parse the CLI options, read the chat export, group
    # it into completions, write them to the output file and print a summary.
    # NOTE: `args` must stay a module-level name, because print_statistics()
    # reads it as a global rather than taking it as a parameter.
    args = parse_arguments()
    messages = read_messages(args.messages)
    completions = prepare_messages(messages)
    write_completions(args.output, completions)
    print_statistics(messages, completions)
82 | 


--------------------------------------------------------------------------------