├── LICENSE
├── mail.py
├── organize.py
├── README.md
└── photos.py


/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 Trey Moore
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/mail.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Module for extracting all attachments from an mbox archive.
 3 | """
 4 | 
 5 | import mailbox
 6 | import os
 7 | 
 8 | 
 9 | def _get_attachments(message):
10 |     """Returns a list of attachments in an email Message."""
11 |     return [part for part in message.get_payload() if
12 |             part.get_content_disposition() == 'attachment']
13 | 
14 | def _strip_illegal_char(string, seperator=''):
15 | 	string = str(string)
16 | 	parsed = re.sub(r'[\<,\>,\:,\",\/,\\,\|,\?,\*,\n,\t]', seperator, string)
17 | 	if string != parsed:
18 | 		print('Illegal windows char. Renamed "{}" to "{}"'.format(string, parsed))
19 | 	return parsed
20 | 
def _write_attachment(attachment, mbox_file_path):
    """Write one attachment into ``extracted_attachments`` beside the mbox.

    Args:
        attachment: The message part to save.
        mbox_file_path: Path of the mbox file; the output directory is created
            in the same directory.

    Parts without a filename, or without a decodable payload, are skipped.
    When the target name already exists the file is written under a
    timestamp-prefixed name (plus a counter if that is taken as well) instead
    of overwriting the existing file.
    """
    import time  # Local import: the module's import block does not provide it.

    attachments_path = os.path.join(os.path.dirname(mbox_file_path),
                                    'extracted_attachments')
    os.makedirs(attachments_path, exist_ok=True)

    filename = attachment.get_filename()
    if filename is None:
        return

    filename = _strip_illegal_char(filename)
    payload = attachment.get_payload(decode=True)
    if payload is None:
        # get_payload(decode=True) returns None for multipart parts, which
        # cannot be written as a single file.
        print('Skipped "{}": no decodable payload'.format(filename))
        return

    path = os.path.join(attachments_path, filename)
    # If same filename, prepend a unique string.
    if os.path.exists(path):
        original_path = path
        timestamp = str(int(time.time()))
        path = os.path.join(attachments_path, timestamp + '-' + filename)
        # Several collisions within the same second would otherwise map to
        # the same timestamped name and overwrite each other.
        counter = 1
        while os.path.exists(path):
            path = os.path.join(
                attachments_path,
                '{}-{}-{}'.format(timestamp, counter, filename))
            counter += 1
        print('Renamed "{}" to "{}"'.format(original_path, path))

    try:
        with open(path, 'wb') as fb:
            fb.write(payload)
    except OSError as e:
        # str + exception raises TypeError, so format the error instead.
        print('Error: {}'.format(e))
        return
    print('Saved "{}"'.format(path))
48 | 
49 | 
def extract_mail_attachments(mbox_file_path):
    """Extract and write the email attachments found in an mbox archive.

    Args:
        mbox_file_path: Path to the mbox file.  Each attachment is handed to
            ``_write_attachment`` together with this path.
    """
    # Opened once: the previous code re-created the mailbox object after
    # every attachment and never closed any of them.
    mbox = mailbox.mbox(mbox_file_path)
    try:
        for message in mbox:
            # Attachments are assumed to exist only on multipart messages;
            # the mbox format specification has not been checked for this.
            if message.is_multipart():
                for attachment in _get_attachments(message):
                    _write_attachment(attachment, mbox_file_path)
    finally:
        mbox.close()
60 | 


--------------------------------------------------------------------------------
/organize.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | """
 3 | Tool for organizing and cleaning up Google takeout data.
 4 | """
 5 | 
 6 | import argparse
 7 | import distutils.core
 8 | import os
 9 | 
10 | 
def dir_path(string):
    """Validate that the argument names an existing directory.

    Returns the argument unchanged when it is a directory; otherwise raises
    ``argparse.ArgumentTypeError``.
    """
    if not os.path.isdir(string):
        message = 'Path "' + string + '" is not a directory.'
        raise argparse.ArgumentTypeError(message)
    return string
18 | 
# Command-line interface.  Neither option is required.
PARSER = argparse.ArgumentParser(
    description='Tool for processing and organizing Google takeout data.')
PARSER.add_argument(
    '--photos_dir',
    type=dir_path,
    help=('The directory containing Google Photos takeout archives '
          '(i.e. one or multiple zip file)'))
PARSER.add_argument(
    '--mbox_file',
    help='The mbox file with Gmail takeout data.')
27 | 
28 | 
29 | def _maybe_organize_photos_takeout(takeout_dir):
30 |     if not takeout_dir:
31 |         print('Invalid (or no) Photos takeout archive directory specified. Not '
32 |               'extracting photos from archives.')
33 |         return
34 |     else:
35 |         organize_photos = input('Organize Photos takeout archives? y/n: ')
36 |         organize_photos = distutils.util.strtobool(organize_photos)
37 |         if not organize_photos:
38 |             return
39 |     import photos
40 |     photos.organize_photos_takeout(takeout_dir)
41 | 
42 | 
43 | def _maybe_extract_email_attachments(mbox_file_path):
44 |     if (mbox_file_path and os.path.isfile(mbox_file_path) and
45 |             mbox_file_path.endswith('.mbox')):
46 |         answer = input('Extract mailbox attachments? y/n: ')
47 |         answer = distutils.util.strtobool(answer)
48 |         if answer:
49 |             import mail
50 |             mail.extract_mail_attachments(mbox_file_path)
51 |     else:
52 |         print('Invalid (or no) .mbox path specified. Not extracting email '
53 |               'attachments.')
54 | 
def main():
    """Parse the command line and run both optional takeout steps."""
    options = PARSER.parse_args()
    photos_dir, mbox_file = options.photos_dir, options.mbox_file
    _maybe_organize_photos_takeout(photos_dir)
    _maybe_extract_email_attachments(mbox_file)


if __name__ == '__main__':
    main()
62 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Google Takeout Helper
 2 | 
 3 | ## Install
 4 | 1. Make sure Python 3 is installed, as well as `pip` for Python 3.
 5 | 1. Install ImageMagick version 7 with HEIC support.
 6 |   * Uninstall the default libheif if it's older than 1.5.1. (If it's >= 1.5.1 you can skip building and installing libheif.)
 7 |   * Build and install [libheif-1.5.1](https://github.com/ImageMagick/libheif) manually from source. This was required for
 8 |     me on Ubuntu to build ImageMagick 7 correctly with HEIC support.
 9 |   * `sudo apt install build-essential libheif-dev`
10 |   * Download source for ImageMagick 7
11 |   * Uncomment all 'build-deps's in `/etc/apt/sources.list`
12 |   * `apt-get build-dep imagemagick`
13 |   * `./configure --with-heic`
14 |   * `make`
15 |   * `sudo make install`
16 |   * `sudo ldconfig /usr/local/lib`
17 | 1. Install `libmagickwand-dev`?
18 | 1. Install Wand: `pip install Wand`
19 | 
20 | ## Recover your photos from Google Photos
21 | 
22 | 1. Download all `.zip` archives for
23 |    [*only* Google Photos data](https://takeout.google.com/settings/takeout).
24 |    For me, selecting more than Google Photos resulted in an error. This is most
25 |    likely because of the size of **all** of my Google data being so massive.
26 |     *  Note: It can take hours or even days for Google to generate Zip files with
27 |        all your data.
28 | 1. Run `./organize.py --photos_dir <directory where all your archives are>`.  For example:
29 |    `./organize.py --photos_dir ~/Downloads/google_takeout_archives/`.  The
30 |    script will:
31 |     *  Find all of the takeout archives in this directory
32 |     *  Extract the photos from these archives
33 |     *  Delete extra metadata files (i.e. not the images and videos)
34 |     *  Give you an option to delete the archives after to reclaim some hard
35 |        drive space.
36 | 1. (Strongly recommended) Back up your photos somewhere else.  Google does a ton
37 |    of work to make sure your images will never be accidentally deleted or lost.
38 |    It's very unlikely that you are willing or able to afford the same
39 |    level of reliability as Google.  I'm no expert, but I've found [this subreddit's
40 |    wiki](https://www.reddit.com/r/DataHoarder/wiki/backups) to be a good
41 |    starting point for learning about how to back up.
42 | 
43 | ## Extract all attachments from the mail archive
44 | 
45 | 1.  Download the `.mbox` archive with Gmail data
46 | 2.  Run `./organize.py --mbox_file <path to .mbox file>`
47 | 


--------------------------------------------------------------------------------
/photos.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This module extracts and does some simple cleanup of Google Photos takeout
 3 | archives.
 4 | 
 5 | WARNING: This module assumes all JSON metadata files aren't useful and deletes
 6 | them by default.
 7 | """
 8 | 
 9 | import argparse
10 | import distutils.core
11 | import fnmatch
12 | import os
13 | import tarfile
14 | import zipfile
15 | 
16 | 
# Path components of the Google Photos data directory inside the extracted
# takeout data: the photos end up in <takeout_dir>/Takeout/Google Photos/.
PHOTOS_SUBDIR = (
    'Takeout',
    'Google Photos',
)
20 | 
21 | 
22 | def _list_takeout_archives(takeout_dir):
23 |     """Lists the full path of all Google Takeout archives."""
24 |     dir_files = []
25 |     for filename in os.listdir(takeout_dir):
26 |         if filename.startswith("takeout") and (filename.endswith(".zip") or
27 |                                                filename.endswith(".tgz")):
28 |             dir_files.append(os.path.join(takeout_dir, filename))
29 |     return dir_files
30 | 
31 | 
def _unarchive_archives(takeout_dir):
    """Extract every takeout archive into *takeout_dir*.

    Archives ending in ``.zip`` are read with ``zipfile``; every other archive
    returned by ``_list_takeout_archives`` is opened with ``tarfile``.
    """
    for archive in _list_takeout_archives(takeout_dir):
        print('unarchiveing: ', archive)
        if archive.endswith('.zip'):
            with zipfile.ZipFile(archive, 'r') as zip_ref:
                zip_ref.extractall(takeout_dir)
        else:
            # The context manager closes the archive even when extraction
            # fails part-way; the explicit close() call was skipped then.
            # NOTE(review): extractall() trusts the member paths stored in
            # the archive; consider passing filter='data' on Python versions
            # that support extraction filters.
            with tarfile.open(archive) as tar_ref:
                tar_ref.extractall(takeout_dir)
43 | 
44 | 
def _convert_heic_files(takeout_dir):
    """Convert HEIC files to JPG next to the originals, which are kept.

    Args:
        takeout_dir: Directory the takeout archives were extracted into; the
            tree below its Photos sub-directory is searched.

    Requires the third-party ``wand`` package, imported on first use.
    """
    from wand.image import Image
    photos_root = os.path.join(takeout_dir, *PHOTOS_SUBDIR)
    for dirpath, _, filenames in os.walk(photos_root):
        for name in filenames:
            # Compare case-insensitively so that '.heic' is found as well as
            # '.HEIC'.
            if not name.lower().endswith('.heic'):
                continue
            heic_file = os.path.join(dirpath, name)
            jpg_file = os.path.splitext(heic_file)[0] + '.jpg'
            with Image(filename=heic_file) as original:
                with original.convert('jpeg') as converted:
                    converted.save(filename=jpg_file)
            # Report success only after the file has actually been saved.
            print('Saved converted JPG: ', jpg_file)
59 | 
60 | 
def _delete_metadata_files(takeout_dir):
    """Remove every ``.json`` file found below the Photos data directory."""
    photos_root = os.path.join(takeout_dir, *PHOTOS_SUBDIR)
    for folder, _subdirs, entries in os.walk(photos_root):
        for entry in entries:
            if entry.endswith('.json'):
                os.remove(os.path.join(folder, entry))
70 | 
71 | 
def _clean_up(takeout_dir, delete_archives=False):
    """Delete the metadata files and, when requested, the archives too.

    Args:
        takeout_dir: Directory containing the archives and extracted data.
        delete_archives: When truthy, every takeout archive in *takeout_dir*
            is removed; otherwise a notice is printed and they are kept.
    """
    _delete_metadata_files(takeout_dir)
    if not delete_archives:
        print('Not deleting archives.')
        return
    for archive in _list_takeout_archives(takeout_dir):
        print('deleting archive: ', archive)
        os.remove(archive)
82 | 
83 | 
def _confirm(prompt):
    """Ask *prompt* on the console; return True only for an affirmative reply.

    Replaces distutils' strtobool, which was removed together with distutils
    in Python 3.12 and raised ValueError on unexpected input such as an empty
    line.  Anything unrecognised counts as "no".
    """
    answer = input(prompt).strip().lower()
    return answer in ('y', 'yes', 't', 'true', 'on', '1')


def organize_photos_takeout(takeout_dir):
    """Extract the takeout archives, optionally convert HEIC, then clean up.

    Args:
        takeout_dir: Directory containing the Google Photos takeout archives.

    The user is asked whether HEIC files should be converted to JPG and
    whether the archives should be deleted afterwards.  The clean-up step
    itself always runs.
    """
    _unarchive_archives(takeout_dir)

    if _confirm('Convert HEIC to JPG and keep original? y/n: '):
        _convert_heic_files(takeout_dir)

    # Clean up.
    delete_archives = _confirm('Delete all takeout archives? y/n: ')
    _clean_up(takeout_dir, delete_archives)
96 | 


--------------------------------------------------------------------------------