├── LICENSE ├── README.md ├── gtalk_export.py └── hangouts.py /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 coandco 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | gtalk_export 2 | ============ 3 | 4 | * Author: Clint Olson (with JSON-parsing code from [Jay2K1](http://blog.jay2k1.com/)) 5 | * License: MIT 6 | * Meaning: Use everywhere, keep copyright, it'd be swell if you'd link back here. 
7 | 8 | ## Usage 9 | _Export Google Talk/Hangouts chats to logfiles_ 10 | 11 | Uses a modified version of [Jay2K1's Takeouts parser](http://hangoutparser.jay2k1.com/) (for Hangouts chats) alongside a custom mbox parser (for older Google Talk chats made before mid-2013) to produce a single set of unified logfiles. 12 | 13 | To use it, follow these steps: 14 | 15 | 1. Export your Google Hangouts data using [Google Takeout](https://www.google.com/settings/takeout). You'll be using the "Hangouts.json" file from the archive this gives you. 16 | 2. [Enable IMAP](https://support.google.com/mail/troubleshooter/1668960?hl=en#ts=1665018) on your GMail account. 17 | 3. Check the box to make Chats show up in IMAP, as detailed [here](http://readwrite.com/2011/09/16/google_liberates_gmail_chat_logs_via_imap) 18 | 4. Download your GMail IMAP chats folder ([Gmail]/Chats) using a desktop email client (script tested with Thunderbird). 19 | 5. Get the files that contain your chats. This script supports both the mbox format (one file, many "emails") and the Maildir format (one file per "email"): 20 | - If using mbox, you can simply copy the mbox file directly from your profile directory (it may be located at `[thunderbird_profile]/ImapMail/imap.gmail.com/[Gmail].sbd/Chats`). Or you can use Thunderbird's [ImportExportTools](https://addons.mozilla.org/en-us/thunderbird/addon/importexporttools/) addon to assist in obtaining the file required. 21 | - To use Maildir, you must enable the Maildir backend for Thunderbird. It's suggested to backup your emails first & enable maildir at the local level if you regularly use Thunderbird. Maildir can be enabled in the settings 3 different ways: 22 | - Options - Advanced - Advanced Configuration - Message Store Type for new accounts 23 | - Account Settings - Server Settings - Message Storage - Message Store Type 24 | - Account Settings - Local Folders - Message Storage - Message Store Type 25 | 6. Check out this repository to a directory. 26 | 7. 
"""Export Google Talk/Hangouts chats to plain-text logfiles.

Usage::

    python gtalk_export.py -p <mailbox path> -j <Hangouts.json path> -n <your name> -e <your email> [-m]

Add ``-m`` when the mailbox is a single mbox file instead of a Maildir
directory.  The name and email tell the program who "you" are, and by
extension who the other party is -- some mailbox chats only list the
participants.  One ``.txt`` logfile per contact is appended to in the
current working directory.
"""
import argparse
import io
import mailbox
import os
import quopri
import re
import sys
import time
import xml.dom.minidom
from email.utils import parsedate

try:  # Python 3.4+ (HTMLParser.unescape was removed in Python 3.9)
    from html import unescape as _html_unescape
except ImportError:  # Python 2
    from HTMLParser import HTMLParser
    _html_unescape = HTMLParser().unescape

# Anything outside this whitelist is replaced by "_" in generated filenames.
_UNSAFE_FILENAME_CHARS = re.compile(r"[^a-zA-Z0-9._-]")
# Matches the " <address>" part of a "Name <address>" header value.
_ADDRESS_SUFFIX = re.compile(r" <[^>]*>")


def extract_date_mbox(email):
    """Return the message's ``Date`` header as a time tuple.

    @param email: message object exposing ``get`` for header lookup
    @return: the tuple produced by ``email.utils.parsedate``, or None when
             the header is missing or cannot be parsed
    """
    date = email.get('Date')
    return parsedate(date) if date else None


def filename_sanitize(input):
    """Replace every character that is not ``a-zA-Z0-9._-`` with ``_``."""
    # The parameter keeps its original name (which shadows the builtin) so
    # that existing keyword callers keep working.
    return _UNSAFE_FILENAME_CHARS.sub("_", input)


def make_filename_json(member_array, name, email):
    """Build a logfile name from the other participants of a conversation.

    @param member_array: dict mapping participant id to display name
    @param name: the exporting user's own name (left out of the filename)
    @param email: the exporting user's own email (left out of the filename)
    @return: the remaining names joined with "_", cut to 250 characters,
             followed by ".txt"
    """
    others = [member for member in member_array.values()
              if member not in (name, email)]
    # Need to limit total filename size to 255
    return "_".join(others)[:250] + ".txt"


def msg_to_logline_json(message):
    """Format one parsed Hangouts message as a ``time <sender> text`` line.

    A message without a 'message' entry is logged with empty text instead
    of raising KeyError.
    """
    return "%s <%s> %s\n" % (message['datetime'], message['sender'],
                             message.get('message', ''))


def write_to_file(filename, lines):
    '''Append a set of lines to a specified file, encoded as UTF-8.

    @param filename: path to file
    @type filename: string
    @param lines: array of log lines to write; UTF-8 encoded byte strings
                  are accepted as well as text
    @type lines: [string, string, ...]

    '''
    text = u"".join(
        line.decode("utf-8") if isinstance(line, bytes) else line
        for line in lines)
    with io.open(filename, "a", encoding="utf-8") as logfile:
        logfile.write(text)


def _hybrid_logline(message, timestamp_format):
    """Format a single-message ("hybrid style") chat email as one log line."""
    # decode=True undoes quoted-printable only when the message declares it;
    # 7bit bodies are left alone, so a literal "=" is never mistaken for an
    # escape character.
    body = message.get_payload(decode=True).decode('utf-8').strip()
    from_name = _ADDRESS_SUFFIX.sub("", message.get('From', ''))
    timestamp = time.strftime(timestamp_format, parsedate(message.get('Date')))
    return "%s <%s> %s\n" % (timestamp, from_name, _html_unescape(body))


def _jabber_loglines(message, timestamp_format):
    """Format an old Google Talk (Jabber) conversation email as log lines."""
    part_text = message.get_payload()[0].as_string()
    if not isinstance(part_text, bytes):
        part_text = part_text.encode('utf-8')
    # Seemingly all of these messages use quoted-printable encoding,
    # even though 'Content-Transfer-Encoding' is never set.
    xml_text = quopri.decodestring(part_text).decode('utf-8')
    # The emails have a couple of chaff lines before the XML starts
    xml_text = re.sub(r'^[^<]*<', "<", xml_text)
    chatxml = xml.dom.minidom.parseString(xml_text.encode('utf-8'))

    lines = []
    for messagexml in chatxml.getElementsByTagName("cli:message"):
        bodies = messagexml.getElementsByTagName("cli:body")
        if not bodies:
            # No "cli:body" elements means that it's a non-message event,
            # like a time-gap or user-unavailable message
            continue
        # A body without a text child is an empty message.
        content = getattr(bodies[0].firstChild, 'data', "")
        speaker = messagexml.getAttribute("from")
        # The "ms" attribute holds milliseconds since the epoch.
        rawtimestr = messagexml.getElementsByTagName("time")[0].getAttribute("ms")
        timestamp = time.strftime(timestamp_format,
                                  time.localtime(int(rawtimestr) / 1000.0))
        lines.append("%s <%s> %s\n" % (timestamp, speaker, content))
    return lines


def parse_mailbox(mailbox_path, my_name, my_email, timestamp_format, use_mbox):
    """Convert every chat in a mailbox into per-contact logfiles.

    @param mailbox_path: path to the mbox file or Maildir directory
    @param my_name: the exporting user's name; used to pick the other party
                    when a chat has no usable subject
    @param my_email: the exporting user's email (currently unused)
    @param timestamp_format: ``time.strftime`` format for the log timestamps
    @param use_mbox: True to read an mbox file, False to read a Maildir
    """
    if use_mbox:
        mbox = mailbox.mbox(mailbox_path)
    else:
        # Make sure new/ and tmp/ exist before handing the directory to
        # mailbox.Maildir (presumably the downloaded folder may only
        # contain cur/ -- TODO confirm).
        for subdir in ('new', 'tmp'):
            subdir_path = os.path.join(mailbox_path, subdir)
            if not os.path.isdir(subdir_path):
                os.mkdir(subdir_path)
        mbox = mailbox.Maildir(mailbox_path, None)
    # Messages without a usable date sort first; "or ()" keeps the sort key
    # comparable on Python 3, where None cannot be ordered against tuples.
    sorted_mails = sorted(mbox, key=lambda mail: extract_date_mbox(mail) or ())

    # Sometimes thunderbird will produce mbox files with duplicate messages.
    # Keep track of all seen Message-ID's to prevent writing out duplicate
    # lines to the logs.
    seen_ids = set()

    for message in sorted_mails:
        # Very rarely (happened to me with only 1 message out of 25,000),
        # Thunderbird/GMail will produce a malformed message with a payload,
        # but no metadata.  Just skip these, but print a warning so the user
        # can ensure that this is not happening too often.
        if not message.keys():
            print("Warning: Skipping malformed message")
            continue

        # Skip duplicates.  Messages without any Message-ID are never treated
        # as duplicates of each other.
        message_id = message['Message-ID']
        if message_id is not None:
            if message_id in seen_ids:
                continue
            seen_ids.add(message_id)

        name = re.sub("Chat with ", "", message['subject'] or "")

        if message.is_multipart():
            # We're in an old Google Talk Jabber conversation message
            loglines = _jabber_loglines(message, timestamp_format)
        else:
            # We're in one of the new hybrid-style single-use messages
            if not name:
                to_name = _ADDRESS_SUFFIX.sub("", message.get('To', ''))
                from_name = _ADDRESS_SUFFIX.sub("", message.get('From', ''))
                name = to_name if to_name != my_name else from_name
            loglines = [_hybrid_logline(message, timestamp_format)]

        write_to_file("%s.txt" % filename_sanitize(name)[:250], loglines)


def parse_json(json_path, name, email, timestamp_format):
    """Convert a Takeout ``Hangouts.json`` file into per-contact logfiles.

    @param json_path: path to the Takeout JSON file
    @param name: the exporting user's name (left out of logfile names)
    @param email: the exporting user's email (left out of logfile names)
    @param timestamp_format: ``time.strftime`` format for the log timestamps
    """
    # Imported here so that the rest of this module can be used without the
    # sibling parser module being importable.
    import hangouts

    with io.open(json_path, "r", encoding="utf-8") as json_file:
        raw_json = json_file.read()

    conversations = hangouts.hangoutsToArray(raw_json, timestamp_format)

    print("JSON file first pass completed.  Writing to logfiles...")

    for conversation in conversations:
        filename = filename_sanitize(
            make_filename_json(conversation.get('members', {}), name, email))
        write_to_file(filename, [msg_to_logline_json(message)
                                 for message in conversation['messages']])


def main(argv=None):
    """Parse the command line and run the requested exports.

    @param argv: argument list to parse; defaults to ``sys.argv[1:]``
    """
    parser = argparse.ArgumentParser(prog="gtalk_export")
    parser.add_argument("-p", "--mailbox-path",
                        required=False,
                        default=None,
                        help="The location of the IMAP Maildir or mbox to parse")
    parser.add_argument("-j", "--json-path",
                        required=False,
                        default=None,
                        help="The location of the Takeouts JSON to parse")
    parser.add_argument("-n", "--name",
                        required=True,
                        help="The chat participant name whose files are being parsed")
    parser.add_argument("-e", "--email",
                        required=True,
                        help="The chat participant email whose files are being parsed")
    parser.add_argument("-t", "--timestamp-format",
                        required=False,
                        default='%Y-%m-%d %H:%M:%S',
                        help="Timestamp format to display in output logs")
    parser.add_argument("-m", "--mbox",
                        action='store_true',
                        help="Use mbox instead of Maildir")

    args = parser.parse_args(argv)

    if args.mailbox_path is None and args.json_path is None:
        sys.exit("No mbox or JSON provided -- nothing to do!")

    if args.mailbox_path:
        print("Processing mailbox at %s" % args.mailbox_path)
        parse_mailbox(args.mailbox_path, args.name, args.email,
                      args.timestamp_format, args.mbox)
        print("Finished processing mailbox")

    if args.json_path:
        print("Processing json file at %s" % args.json_path)
        parse_json(args.json_path, args.name, args.email, args.timestamp_format)
        print("Finished processing json file")

    print("GTalk/Hangouts export completed!")


# Only run the export when executed as a script, so importing this module
# has no side effects.
if __name__ == "__main__":
    main()
"""Turn a Google Takeout ``Hangouts.json`` export into plain Python data."""
import json
import re
import time

# This code was inspired by Jay2K1's Hangouts parser.  You can see the
# blogpost for the original at:
# http://blog.jay2k1.com/2014/11/10/how-to-export-and-backup-your-google-hangouts-chat-history/
# He also runs a webservice for parsing Google Hangouts JSON files at:
# http://hangoutparser.jay2k1.com/

# (emoji, ASCII equivalent) pairs used by replaceSmileys.
# list of emoji codes taken from https://aprescott.com/posts/hangouts-emoji
_EMOJI_TO_ASCII = (
    (u'\U0001F41D', '-<@%'),    # honeybee
    (u'\U0001F435', ':(|)'),    # monkey face
    (u'\U0001F437', ':(:)'),    # pig face
    (u'\U0001F473', '(]:{'),    # man with turban
    (u'\U0001F494', '</3'),     # broken heart
    # NOTE(review): the entries from here down to "disappointed face" were
    # unreadable in the copy under review and have been rebuilt from the
    # emoji list referenced above -- verify against upstream before relying
    # on the exact ASCII forms.
    (u'\U0001F49C', '<3'),      # purple heart
    (u'\U0001F4A9', '~@~'),     # pile of poo
    (u'\U0001F600', ':D'),      # grinning face
    (u'\U0001F601', '^_^'),     # grinning face with smiling eyes
    (u'\U0001F602', 'XD'),      # face with tears of joy
    (u'\U0001F603', ':)'),      # smiling face with open mouth
    (u'\U0001F604', '=D'),      # smiling face with open mouth and smiling eyes
    (u'\U0001F605', '^_^;;'),   # smiling face with open mouth and cold sweat
    (u'\U0001F607', 'O:)'),     # smiling face with halo
    (u'\U0001F608', '}:)'),     # smiling face with horns
    (u'\U0001F609', ';)'),      # winking face
    (u'\U0001F60E', 'B-)'),     # smiling face with sunglasses
    (u'\U0001F610', ':-|'),     # neutral face
    (u'\U0001F611', '-_-'),     # expressionless face
    (u'\U0001F613', 'o_o;'),    # face with cold sweat
    (u'\U0001F614', 'u_u'),     # pensive face
    (u'\U0001F615', ':/'),      # confused face
    (u'\U0001F616', ':S'),      # confounded face
    (u'\U0001F617', ':*'),      # kissing face
    (u'\U0001F618', ';*'),      # face throwing a kiss
    (u'\U0001F61B', ':P'),      # face with stuck-out tongue
    (u'\U0001F61C', ';P'),      # face with stuck-out tongue and winking eye
    (u'\U0001F61E', ':('),      # disappointed face
    # (end of the rebuilt section)
    (u'\U0001F621', '>.<'),     # pouting face
    (u'\U0001F622', ":'("),     # crying face
    (u'\U0001F623', '>_<'),     # persevering face
    (u'\U0001F626', 'D:'),      # frowning face with open mouth
    (u'\U0001F62E', ':o'),      # face with open mouth
    (u'\U0001F632', ':O'),      # astonished face
    (u'\U0001F634', '-_-Zzz'),  # sleeping face
    (u'\U0001F635', 'x_x'),     # dizzy face
    (u'\U0001F638', ':3'),      # grinning cat face with smiling eyes
    (u'\U0001F64C', '_'),       # Dunno, but it needs to be replaced for ASCII
)

# NOTE(review): the markup of these three snippets had been stripped from the
# copy under review; they are reconstructed from the surviving format
# arguments -- confirm the exact tags against upstream.  The values are
# interpolated without HTML escaping.
_LINE_BREAK_HTML = "<br>"
_LINK_HTML = '<a href="%s">%s</a>'
_PHOTO_HTML = '<a href="%s"><img src="%s" style="max-width:%s"></a>'

_HANGOUT_EVENT_TEXT = {
    'START_HANGOUT': 'started a video chat',
    'END_HANGOUT': 'ended a video chat',
}


def replaceSmileys(string):
    """Replace graphical emoticons in *string* by their ASCII equivalents."""
    # Plain substring replacement: the emoji are literals, so no regular
    # expression (or replacement-string escaping) is involved.
    for emoji, ascii_form in _EMOJI_TO_ASCII:
        string = string.replace(emoji, ascii_form)
    return string


def _segment_text(message_content):
    """Concatenate the 'text' of every segment that has one."""
    return "".join(segment['text']
                   for segment in message_content.get('segment', ())
                   if 'text' in segment)


def _render_chat_message(message_content):
    """Return ``(plain_text, html)`` for a regular chat message."""
    text_parts = []
    html_parts = []
    # join message segments together
    if 'segment' in message_content:
        for segment in message_content['segment']:
            if 'text' not in segment:
                continue
            if segment['type'] in ('TEXT', 'LINE_BREAK'):
                text_parts.append(segment['text'])
                html_parts.append(
                    segment['text'].replace("\n", _LINE_BREAK_HTML))
            elif segment['type'] == 'LINK':
                text_parts.append(segment['text'])
                html_parts.append(_LINK_HTML % (
                    segment['link_data']['link_target'], segment['text']))
    # handle attachments (only looked at when there are no segments)
    elif 'attachment' in message_content:
        for att in message_content['attachment']:
            if att['embed_item']['type'][0] == 'PLUS_PHOTO':
                imgurl = att['embed_item']['plus_photo']['url']
                text_parts.append(imgurl)
                html_parts.append(_PHOTO_HTML % (imgurl, imgurl, "100%"))
    return "".join(text_parts), "".join(html_parts)


def _describe_event(event, members):
    """Return ``(message, message_html)`` for one raw event.

    ``message_html`` is None unless the event is a regular chat message
    whose HTML rendering differs from its plain text.
    """
    event_type = event['event_type']

    if event_type == 'RENAME_CONVERSATION':
        rename = event['conversation_rename']
        parts = ["changed conversation name"]
        if rename['old_name']:
            parts.append("from '%s'" % rename['old_name'])
        if rename['new_name']:
            parts.append("to '%s'" % rename['new_name'])
        return " ".join(parts), None

    if event_type == 'HANGOUT_EVENT':
        hangout_type = event['hangout_event']['event_type']
        # Unrecognised hangout events are logged under their raw type name.
        return _HANGOUT_EVENT_TEXT.get(hangout_type, hangout_type), None

    if event_type == 'REGULAR_CHAT_MESSAGE':
        text, html = _render_chat_message(
            event['chat_message']['message_content'])
        # replace unicode emoticon characters by smileys
        return (replaceSmileys(text),
                replaceSmileys(html) if html != text else None)

    if event_type in ('ADD_USER', 'REMOVE_USER'):
        user_id = event['membership_change']['participant_id'][0]['chat_id']
        user_name = members.get(user_id, 'unknown_%s' % user_id)
        if event_type == 'ADD_USER':
            return "added user '%s' to conversation" % user_name, None
        return "removed user '%s' from conversation" % user_name, None

    if event_type == 'SMS':
        return replaceSmileys(
            _segment_text(event['chat_message']['message_content'])), None

    if event_type == 'VOICEMAIL':
        return replaceSmileys(
            "new voicemail:\n"
            + _segment_text(event['chat_message']['message_content'])), None

    # OTR_MODIFICATION and any event type not handled above still get a
    # 'message', so callers can always format a log line.
    return 'unknown %s' % event_type, None


def hangoutsToArray(json_input, timestamp_format):
    """Parse a Takeout Hangouts JSON document into a list of conversations.

    :param json_input: the JSON document as a string
    :param timestamp_format: ``time.strftime`` format for each message's
        'datetime' (for example ``'%Y-%m-%d %H:%M:%S'``), rendered in local
        time
    :returns: one dict per conversation with the keys 'type', 'msgcount',
        'name' ("" when unnamed), 'members' (chat id -> display name) and
        'messages'.  Each message is a dict with 'timestamp', 'datetime',
        'sender_id', 'sender', 'event_type', 'message' and, for chat
        messages whose HTML rendering differs from the text, 'message_html'.
        Messages are sorted by timestamp.
    """
    decoded = json.loads(json_input)
    retval = []
    for convo in decoded['conversations']:
        in_conv = convo['conversation']['conversation']
        in_event = convo['events']

        # conversation participants
        members = {}
        for participant in in_conv['participant_data']:
            chat_id = participant['id']['chat_id']
            # use "unknown_" as name if they don't have a fallback_name
            members[chat_id] = participant.get('fallback_name',
                                               "unknown_%s" % chat_id)

        messages = []
        for event in in_event:
            sender_id = event['sender_id']['chat_id']
            timestamp = event['timestamp']
            message = {
                'timestamp': timestamp,
                # the first ten characters of the timestamp are taken as
                # the epoch seconds
                'datetime': time.strftime(
                    timestamp_format, time.localtime(int(timestamp[0:10]))),
                'sender_id': sender_id,
                'sender': members.get(sender_id, "unknown_%s" % sender_id),
                'event_type': event['event_type'],
            }
            text, html = _describe_event(event, members)
            message['message'] = text
            if html is not None:
                message['message_html'] = html
            messages.append(message)

        # sort messages by timestamp because for some reason they're cluttered
        messages.sort(key=lambda message: int(message['timestamp']))

        retval.append({
            'type': in_conv['type'],
            'msgcount': len(in_event),
            'name': in_conv.get('name', ""),
            'members': members,
            'messages': messages,
        })
    return retval