├── LICENSE ├── mail.py ├── organize.py ├── README.md └── photos.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Trey Moore 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /mail.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for extracting all attachments from an mbox archive. 3 | """ 4 | 5 | import mailbox 6 | import os 7 | 8 | 9 | def _get_attachments(message): 10 | """Returns a list of attachments in an email Message.""" 11 | return [part for part in message.get_payload() if 12 | part.get_content_disposition() == 'attachment'] 13 | 14 | def _strip_illegal_char(string, seperator=''): 15 | string = str(string) 16 | parsed = re.sub(r'[\<,\>,\:,\",\/,\\,\|,\?,\*,\n,\t]', seperator, string) 17 | if string != parsed: 18 | print('Illegal windows char. Renamed "{}" to "{}"'.format(string, parsed)) 19 | return parsed 20 | 21 | def _write_attachment(attachment, mbox_file_path): 22 | """Writes the attached file to an output directory.""" 23 | attachments_path = os.path.join(os.path.dirname(mbox_file_path), 24 | 'extracted_attachments') 25 | if not os.path.isdir(attachments_path): 26 | os.mkdir(attachments_path) 27 | 28 | timestamp = str(int(time.time())) # use on collisions 29 | filename = attachment.get_filename() 30 | if filename is None: 31 | return 32 | 33 | filename = _strip_illegal_char(filename) 34 | path = os.path.join(attachments_path, filename) 35 | # If same filename, append with unique string 36 | if os.path.exists(path): 37 | original_path = path 38 | path = os.path.join(attachments_path, timestamp + '-' + filename) 39 | print('Renamed "{}" to "{}"'.format(original_path, path)) 40 | 41 | try: 42 | with open(path, 'wb') as fb: 43 | fb.write(attachment.get_payload(decode=True)) 44 | print('Saved "{}"'.format(path)) 45 | except Exception as e: 46 | print('Error: ' + e) 47 | return 48 | 49 | 50 | def extract_mail_attachments(mbox_file_path): 51 | """Extracts and writes email attachments from an mbox archive.""" 52 | mbox = mailbox.mbox(mbox_file_path) 53 | for message in mbox: 54 | # Right now I'm assuming attachments only exist on multipart messages. I 55 | # haven't read the specifications of the mbox file format. 56 | if message.is_multipart(): 57 | for attachment in _get_attachments(message): 58 | _write_attachment(attachment, mbox_file_path) 59 | mbox = mailbox.mbox(mbox_file_path) 60 | -------------------------------------------------------------------------------- /organize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Tool for organizing and cleaning up Google takeout data. 4 | """ 5 | 6 | import argparse 7 | import distutils.core 8 | import os 9 | 10 | 11 | def dir_path(string): 12 | """Determines if the argument is an existing directory.""" 13 | if os.path.isdir(string): 14 | return string 15 | else: 16 | raise argparse.ArgumentTypeError( 17 | 'Path "' + string + '" is not a directory.') 18 | 19 | PARSER = argparse.ArgumentParser(description=('Tool for processing and ' 20 | 'organizing Google takeout ' 21 | 'data.')) 22 | PARSER.add_argument('--photos_dir', type=dir_path, 23 | help='The directory containing Google Photos takeout ' 24 | 'archives (i.e. one or multiple zip file)') 25 | PARSER.add_argument('--mbox_file', 26 | help='The mbox file with Gmail takeout data.') 27 | 28 | 29 | def _maybe_organize_photos_takeout(takeout_dir): 30 | if not takeout_dir: 31 | print('Invalid (or no) Photos takeout archive directory specified. Not ' 32 | 'extracting photos from archives.') 33 | return 34 | else: 35 | organize_photos = input('Organize Photos takeout archives? y/n: ') 36 | organize_photos = distutils.util.strtobool(organize_photos) 37 | if not organize_photos: 38 | return 39 | import photos 40 | photos.organize_photos_takeout(takeout_dir) 41 | 42 | 43 | def _maybe_extract_email_attachments(mbox_file_path): 44 | if (mbox_file_path and os.path.isfile(mbox_file_path) and 45 | mbox_file_path.endswith('.mbox')): 46 | answer = input('Extract mailbox attachments? y/n: ') 47 | answer = distutils.util.strtobool(answer) 48 | if answer: 49 | import mail 50 | mail.extract_mail_attachments(mbox_file_path) 51 | else: 52 | print('Invalid (or no) .mbox path specified. Not extracting email ' 53 | 'attachments.') 54 | 55 | def main(): 56 | args = PARSER.parse_args() 57 | _maybe_organize_photos_takeout(args.photos_dir) 58 | _maybe_extract_email_attachments(args.mbox_file) 59 | 60 | if __name__ == '__main__': 61 | main() 62 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Google Takeout Helper 2 | 3 | ## Install 4 | 1. Make sure Python 3 is installed as well as `pip` for python 3 5 | 1. Install ImageMagick version 7 with HEIC support. 6 | * uninstall default libhefi (if it's older than 1.51. If it's >= 1.5.1 you can skip building and installing libheif) 7 | * build and install (libheif-1.5.1)[https://github.com/ImageMagick/libheif] manually from source. This was required for 8 | me on Ubuntu to build ImageMagick 7 correctly with HEIC support. 9 | * `sudo apt install build-essentials libheif-dev` 10 | * Download source for ImageMagick 7 11 | * Uncomment all 'build-deps's in `/etc/apt/sources.list` 12 | * `apt-get build-dep imagemagick` 13 | * `./configure --with-heic` 14 | * `make` 15 | * `sudo make install` 16 | * `sudo ldconfig /usr/local/lib` 17 | 1. Install `libmagickwand-dev`? 18 | 1. Install Wand: `pip install Wand` 19 | 20 | ## Recover your photos from Google Photos 21 | 22 | 1. Download all `.zip` archives for 23 | [*only* Google Photos data](https://takeout.google.com/settings/takeout). 24 | For me, selecting more than Google Photos resulted in an error. This is most 25 | likely because of the size of **all** of my Google data being so massive. 26 | * Note: It can take hours or even days for Google to generate Zip files with 27 | all your data. 28 | 1. Run `./organize.py `. For example: 29 | `./organize.py --photos_dir ~/Downloads/google_takeout_archives/`. The 30 | script will: 31 | * Find all of the takeout archives in this directory 32 | * Extract the photos from these archives 33 | * Delete extra metadata files (i.e. not the images and videos) 34 | * Give you an option to delete the archives after to reclaim some hard 35 | drive space. 36 | 1. (Strongly recommended) Back up your photos somewhere else. Google does a ton 37 | of work to make sure your images will never be accidentally deleted or lost. 38 | It's very unlikely you are willing/able/can afford to have the same 39 | level of reliability as Google. I'm no expert, but I've found [this subreddit's 40 | wiki](https://www.reddit.com/r/DataHoarder/wiki/backups) to be a good 41 | starting point for learning about how to back up. 42 | 43 | ## Extract all attachments from the mail archive 44 | 45 | 1. Download the `.mbox` archive with Gmail data 46 | 2. Run `./organize.py --mbox_file ` 47 | -------------------------------------------------------------------------------- /photos.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module extracts and does some simple cleanup of Google Photos takeout 3 | archives. 4 | 5 | WARNING: This module assumes all JSON metadata files aren't useful and deletes 6 | them by default. 7 | """ 8 | 9 | import argparse 10 | import distutils.core 11 | import fnmatch 12 | import os 13 | import tarfile 14 | import zipfile 15 | 16 | 17 | # Path to the Google Photos data directory in the extracted takeout data. 18 | # When extracted the photos will be in: /Takeout/Google Photos/ 19 | PHOTOS_SUBDIR = ('Takeout', 'Google Photos') 20 | 21 | 22 | def _list_takeout_archives(takeout_dir): 23 | """Lists the full path of all Google Takeout archives.""" 24 | dir_files = [] 25 | for filename in os.listdir(takeout_dir): 26 | if filename.startswith("takeout") and (filename.endswith(".zip") or 27 | filename.endswith(".tgz")): 28 | dir_files.append(os.path.join(takeout_dir, filename)) 29 | return dir_files 30 | 31 | 32 | def _unarchive_archives(takeout_dir): 33 | """Extracts all archives to the archive directory.""" 34 | for archive in _list_takeout_archives(takeout_dir): 35 | print('unarchiveing: ', archive) 36 | if archive.endswith(".zip"): 37 | with zipfile.ZipFile(archive, 'r') as zip_ref: 38 | zip_ref.extractall(takeout_dir) 39 | else: 40 | my_tar = tarfile.open(archive) 41 | my_tar.extractall(takeout_dir) 42 | my_tar.close() 43 | 44 | 45 | def _convert_heic_files(takeout_dir): 46 | """Convert HEIC files to JPG in place and keep the original.""" 47 | from wand.image import Image 48 | for dirpath, _, filenames in os.walk(os.path.join(takeout_dir, 49 | *PHOTOS_SUBDIR)): 50 | heic_files = [os.path.join(dirpath, name) for name in filenames if 51 | name.endswith('.HEIC')] 52 | 53 | for heic_file in heic_files: 54 | with Image(filename=heic_file) as original: 55 | with original.convert('jpeg') as converted: 56 | jpg_file = os.path.splitext(heic_file)[0] + '.jpg' 57 | print('Saved converted JPG: ', jpg_file) 58 | converted.save(filename=jpg_file) 59 | 60 | 61 | def _delete_metadata_files(takeout_dir): 62 | """Deletes all metadata files in the Photos data.""" 63 | for dirpath, _, filenames in os.walk(os.path.join(takeout_dir, 64 | *PHOTOS_SUBDIR)): 65 | metadata_files = [os.path.join(dirpath, name) for name in filenames if 66 | name.endswith('.json')] 67 | 68 | for metadata_file in metadata_files: 69 | os.remove(metadata_file) 70 | 71 | 72 | def _clean_up(takeout_dir, delete_archives=False): 73 | """Cleans up extra files and the compressed archives.""" 74 | _delete_metadata_files(takeout_dir) 75 | if delete_archives: 76 | takeout_archives = _list_takeout_archives(takeout_dir) 77 | for archive in takeout_archives: 78 | print('deleting archive: ', archive) 79 | os.remove(archive) 80 | else: 81 | print('Not deleting archives.') 82 | 83 | 84 | def organize_photos_takeout(takeout_dir): 85 | _unarchive_archives(takeout_dir) 86 | 87 | answer = input('Convert HEIC to JPG and keep original? y/n: ') 88 | answer = distutils.util.strtobool(answer) 89 | if answer: 90 | _convert_heic_files(takeout_dir) 91 | 92 | # Clean up. 93 | answer = input('Delete all takeout archives? y/n: ') 94 | answer = distutils.util.strtobool(answer) 95 | _clean_up(takeout_dir, answer) 96 | --------------------------------------------------------------------------------