├── .gitignore
├── PhotosLibraryExtractor.py
├── README.md
├── Screenshots
│   ├── Download Originals.png
│   ├── Screenshot - handling duplicate destination.png
│   ├── Screenshot 2020-11-07 at 11.34.21.png
│   ├── Screenshot 2020-11-07 at 11.35.28.png
│   ├── Screenshot 2020-11-07 at 13.32.30.png
│   └── Screenshot- Handling live photo.png
└── fix_extensions.py


/.gitignore:
--------------------------------------------------------------------------------
.DS_Store
exiftool.py
*.pyc
PhotosLibraryExtractor_ProcessedFiles

--------------------------------------------------------------------------------
/PhotosLibraryExtractor.py:
--------------------------------------------------------------------------------
import exiftool, sys, shutil, os, hashlib
from argparse import ArgumentParser
# exiftool Python wrapper: http://github.com/smarnach/pyexiftool


# Uncomment this to dump all metadata tags of a file (useful for finding tag names):
#with exiftool.ExifTool() as et:
#    metadata = et.get_metadata(sys.argv[1])
#    print(metadata)
#    sys.exit(1)

parser = ArgumentParser()
parser.add_argument('-i', action='store', dest='input', help='Folder with pictures you want to process a.k.a. input folder', required=True)
parser.add_argument('-o', action='store', dest='output', help='Photos will be copied to this folder a.k.a. output folder', required=True)
parser.add_argument('-db', action='store', dest='db_path', help='Path to PLEDB file', required=False)
parsed = parser.parse_args()

ignored_files = [".DS_Store", "PLEDB"]
ignored_file_exts = [".bat", ".sh", ".py", ".zip", ".7z"]

in_dir = os.path.abspath(parsed.input)


# If the input is a .photoslibrary package, correct it to point at the folder with the original files
if ".photoslibrary" in in_dir:
    test_PhotosLibrary_path = os.path.join(in_dir, "originals/")
    if os.path.isdir(test_PhotosLibrary_path):
        in_dir = test_PhotosLibrary_path
    else:
        test_PhotosLibrary_path = os.path.join(in_dir, "Masters/")  # older libraries (High Sierra era) use "Masters"
        if os.path.isdir(test_PhotosLibrary_path):
            in_dir = test_PhotosLibrary_path

out_dir = os.path.abspath(parsed.output)
if parsed.db_path:
    already_processed_db = os.path.abspath(parsed.db_path)
else:
    already_processed_db = os.path.join(out_dir, 'PLEDB')

print("Input folder:", in_dir)
print("Destination:", out_dir)
print("DB file:", already_processed_db)
print("---")

if not os.path.isdir(in_dir):
    print("Error! This doesn't seem to be a folder:", in_dir)
    sys.exit(1)

if not os.path.isdir(out_dir):
    print("Making output folder:", out_dir)
    os.makedirs(out_dir)

contentID_filenames = []  # files waiting for their Live Photo counterpart
contentID_IDs = []        # the matching Content IDs, index-aligned with contentID_filenames
handled_files = []        # MD5 hashes seen during this run
previously_handled_files = []
duplicate_files = []
files_copied = 0

def md5sum(filename):
    # Hash the file in chunks so large videos don't have to fit in memory
    h = hashlib.md5()
    with open(filename, 'rb') as file:
        chunk = file.read(65536)
        while chunk:
            h.update(chunk)
            chunk = file.read(65536)
    return h.hexdigest()

def grab_metadata(fp):
    with exiftool.ExifTool() as et:
        metadata = et.get_metadata(fp)
    #print(metadata)

    # Pick the most trustworthy date tag available
    if 'QuickTime:ContentCreateDate' in metadata:  # usually the date to go by for videos transcoded in Photos.app... CreateDate will be the transcode date for those
        date = metadata['QuickTime:ContentCreateDate']
    elif 'EXIF:DateTimeOriginal' in metadata:
        date = metadata['EXIF:DateTimeOriginal']
    elif 'EXIF:CreateDate' in metadata:
        date = metadata['EXIF:CreateDate']
    elif 'QuickTime:CreateDate' in metadata:
        date = metadata['QuickTime:CreateDate']
    elif 'EXIF:ModifyDate' in metadata:
        date = metadata['EXIF:ModifyDate']
    else:
        date = False

    if 'MakerNotes:ContentIdentifier' in metadata:
        content_ID = metadata['MakerNotes:ContentIdentifier']
    elif 'QuickTime:ContentIdentifier' in metadata:
        content_ID = metadata['QuickTime:ContentIdentifier']
    else:
        content_ID = False

    return {"date": date, "content_ID": content_ID}

def destination_from_date(in_date, in_path):
    # Build "<out_dir>/YYYY/MM/YYYY-MM-DD HH.MM.SS<ext>" from an exiftool date string
    f = os.path.basename(in_path)
    if in_date:
        date = in_date.split(' ')[0]
        time = in_date.split(' ')[1]

        year = date.split(':')[0]
        month = date.split(':')[1]
        day = date.split(':')[2]

        hour = time.split(':')[0]
        minute = time.split(':')[1]
        second = time.split(':')[2]
        if "+" in second:
            s = second
            second = s.split('+')[0]
            offset = s.split('+')[1]  # timezone offset, we don't need it but someday we might
        elif "-" in second:  # there can be negative offsets too
            s = second
            second = s.split('-')[0]
            offset = s.split('-')[1]

        ext = os.path.splitext(f)[1]

        folder_path = os.path.join(out_dir, year, month)
        filename = year + "-" + month + "-" + day + " " + hour + "." + minute + "." + second + ext
    else:  # no date: keep the original filename and put it in a separate folder
        folder_path = os.path.join(out_dir, "Unknown Dates")
        filename = f

    final = os.path.join(folder_path, filename)
    return final

def copy_handler(input_path, destination):
    global files_copied
    dest_folder = os.path.dirname(destination)

    if not os.path.isdir(dest_folder):
        os.makedirs(dest_folder)

    base = os.path.splitext(os.path.basename(destination))[0]
    ext = os.path.splitext(os.path.basename(destination))[1]
    if ext.lower() == ".jpeg":  # change jpeg to jpg for consistency
        ext = ".jpg"

    i = 97  # 97 is 'a'
    new_name = base + ext
    final_path = os.path.join(dest_folder, new_name)
    while os.path.isfile(final_path):
        existing_file_hash = md5sum(final_path)
        new_file_hash = md5sum(input_path)

        if existing_file_hash == new_file_hash:
            return  # an identical file is already at the destination, nothing to do
        else:
            new_name = base + chr(i) + ext  # append a letter (a, b, c, ...) if the filename is already taken
            final_path = os.path.join(dest_folder, new_name)
            i += 1

    print(input_path, "-->", final_path)
    files_copied += 1
    shutil.copy(input_path, final_path)


def add_to_processed_files(filepath):
    with open(already_processed_db, 'a') as sf:
        sf.write(filepath + '\n')

# Read previously handled files
if os.path.isfile(already_processed_db):
    print("Reading", already_processed_db)
    with open(already_processed_db) as f:
        lines = f.readlines()
        for line in lines:
            previously_handled_files.append(line.rstrip())
    print(len(previously_handled_files), "files in PLEDB")
    print("They will be skipped this run. If you want to start fresh, delete the file:", already_processed_db)
    print("---")

skipped_files = len(previously_handled_files)

# Main loop
for dirpath, dirnames, filenames in os.walk(in_dir):
    for f in filenames:
        if f in ignored_files or os.path.splitext(f)[1].lower() in ignored_file_exts:
            print("Ignored:", f)
            continue  # ignore

        in_file = os.path.abspath(os.path.join(dirpath, f))

        if in_file in previously_handled_files:
            continue

        md5 = md5sum(in_file)
        if md5 in handled_files:
            print("Duplicate:", f)
            duplicate_files.append(in_file)
            add_to_processed_files(in_file)
            continue

        info = grab_metadata(in_file)
        d = info['date']
        cID = info['content_ID']
        ext = os.path.splitext(in_file)[1]

        #if d:
        #    print("Date:", d)
        #if cID:
        #    print("Content ID:", cID)

        if cID:  # has a Content ID, could be a Live Photo
            if cID in contentID_IDs:  # found the other half of a Live Photo?
                #print("This seems to be the other part of a Live Photo... let's copy them together")
                matched_filename = contentID_filenames[contentID_IDs.index(cID)]

                if ext.lower() == '.mov':
                    video_path = in_file
                    picture_path = matched_filename
                else:
                    video_path = matched_filename
                    picture_path = in_file

                info_pic = grab_metadata(picture_path)
                d_pic = info_pic['date']

                # copy the video part, named after the picture's date so the pair share a filename
                dest = destination_from_date(d_pic, video_path)
                copy_handler(video_path, dest)
                # copy the picture part
                dest = destination_from_date(d_pic, picture_path)
                copy_handler(picture_path, dest)

                contentID_IDs.remove(cID)
                contentID_filenames.remove(matched_filename)
            else:
                #print("This seems to be part of a Live Photo... let's wait for the other part before we do anything")
                contentID_filenames.append(in_file)
                contentID_IDs.append(cID)

        else:
            # no Content ID, just handle it regularly
            dest = destination_from_date(d, in_file)
            copy_handler(in_file, dest)

        handled_files.append(md5)
        add_to_processed_files(in_file)  # note: this also records Live Photo halves that are still waiting for their pair
        #print('-')


# now handle leftover files in contentID_filenames (Content IDs that never found a pair)
if len(contentID_filenames) > 0:
    print("---")
    print("There are", len(contentID_filenames), "files with unpaired Content IDs left")
    for f in contentID_filenames:
        info = grab_metadata(f)
        d = info['date']
        #print('Input:', f)
        #if d:
        #    print(d)
        dest = destination_from_date(d, f)
        copy_handler(f, dest)
        #print('-')
    print("---")

skipped_files += len(duplicate_files)

print("Files processed:", len(handled_files))
print("Files copied:", files_copied)
print("Skipped files:", skipped_files)
#print("Ignored files (duplicates):", len(duplicate_files))
print("Done")

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Photos.app Library Extractor

Exports photos from a macOS Photos.app library into Year/Month folders, with the date the photo was taken as the filename. Perfect for a local offline backup.

It is specifically made for macOS Photos.app libraries, as it can match Live Photos by looking at their Content ID tags, but it works for generic folders of photos too.
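For example, a photo taken on 7 November 2020 at 11:34:21 ends up at a path like this (the output folder is whatever you pass with `-o`):

    <output>/2020/11/2020-11-07 11.34.21.jpg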
Basically, you go from this:

![Before](https://raw.githubusercontent.com/lambdan/PhotosLibraryExtractor/main/Screenshots/Screenshot%202020-11-07%20at%2011.34.21.png)

To this:

![After](https://raw.githubusercontent.com/lambdan/PhotosLibraryExtractor/main/Screenshots/Screenshot%202020-11-07%20at%2011.35.28.png)

## Why not just use the "Export Unmodified Originals" option in Photos.app?

In my experience:

- I frequently get nondescript errors doing that
- It's slow
- The filenames aren't what I want
- You can't practically export everything at once; you have to make Smart Albums (by year, for example) to get smaller batches

But feel free to try that option yourself. Maybe it works for you!

## What is the structure of the Photos.app Library?

The `Photos Library.photoslibrary` is basically a folder! If you right-click it in macOS you can choose "Show Package Contents", and inside you will find a number of folders.

The most interesting one to us is the `originals` folder, as it seems to contain the unmodified original files.

Unfortunately, the `originals` folder is a mess of scary subfolders and scary file names that are essentially random. Luckily for us, the files themselves seem to be untouched and still contain a lot of metadata we can read, such as the date the photo was taken, which we can use to build a nicer filename.

## How are Live Photos paired?

Apple conveniently stores a Content ID that pairs the photo and the video component of a Live Photo. Just read the tags, and if two files share the same Content ID: they're a pair.

Unfortunately, the photo's date is in one timezone and the video's date is in another, so this script names the video after the photo's date. That way both the photo and the video file get the same filename except for the extension.

Here you can see a screenshot of how the script handles Live Photos:

![Live Photos](https://raw.githubusercontent.com/lambdan/PhotosLibraryExtractor/main/Screenshots/Screenshot-%20Handling%20live%20photo.png)

(Notice how the dates are two hours off from each other and that the MOV gets the same filename as the JPG.)
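If you want to check a suspected pair yourself, here is a minimal sketch using the same pyexiftool calls the script relies on (the two filenames are just hypothetical examples):

    import exiftool

    # hypothetical example files: the still and the video half of one Live Photo
    photo_path = "IMG_0001.HEIC"
    video_path = "IMG_0001.MOV"

    with exiftool.ExifTool() as et:
        photo_meta = et.get_metadata(photo_path)
        video_meta = et.get_metadata(video_path)

    # the still carries the ID in the MakerNotes group, the video in the QuickTime group
    photo_id = photo_meta.get("MakerNotes:ContentIdentifier")
    video_id = video_meta.get("QuickTime:ContentIdentifier")

    if photo_id and photo_id == video_id:
        print("Same Content ID: these two files belong to the same Live Photo")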
## Is this safe?

It should be. The script doesn't modify anything inside the input folder; it only reads from it and copies out of it.

## How are duplicates handled?

Every input file gets hashed using MD5. If the hash is one we have already seen, the file is skipped.

If the destination file already exists, the new file and the existing file both get MD5 hashed and compared to see if they're identical, and if so, the new file isn't copied.

If the destination file already exists but the MD5 hashes aren't identical, the new file is copied with a letter appended to its name. If that filename is also taken we MD5-compare against that file too, and so on until we either find an identical match or a free filename.

You're meant to be able to re-use the same input and output folders repeatedly without redoing the work every time.

Here you can see a screenshot of what happens when the destination file(s) already exist:

![Destination Exists](https://raw.githubusercontent.com/lambdan/PhotosLibraryExtractor/main/Screenshots/Screenshot%20-%20handling%20duplicate%20destination.png)

## How do I use this?

I've only tested it on macOS (because it is meant for the Photos.app library after all), but I don't see any reason why it wouldn't work on Windows or Linux.

### Preparing in Photos.app

Make sure you have the "Download Originals to this Mac" option enabled in Photos.app:

![Download Originals](https://raw.githubusercontent.com/lambdan/PhotosLibraryExtractor/main/Screenshots/Download%20Originals.png)

Make sure your photos are completely downloaded and up to date (the bottom of the Photos tab in Photos.app should just say "Updated" or something similar):

![Updated](https://raw.githubusercontent.com/lambdan/PhotosLibraryExtractor/main/Screenshots/Screenshot%202020-11-07%20at%2013.32.30.png)

### Prerequisites

- The script is written for Python 3, so make sure you have that
- It uses [exiftool](https://exiftool.org) to read metadata from the images, so make sure that is installed and accessible from the command line
  - If you're on macOS you can use [Brew](https://brew.sh) to install it: `brew install exiftool`
- The script uses the Python library [pyexiftool](https://github.com/smarnach/pyexiftool) to talk to exiftool
  - Apparently you can install that library using pip, but that didn't work for me (I got an import error). What did work was simply placing the `exiftool.py` file in the same folder as this script

### Running the Script

Run the script:

    python3 PhotosLibraryExtractor.py -i /folder/with/pictures/ -o /destination/

If you are processing a Photos.app library you should point it at the `originals` sub folder. The easiest way to get to it is to open the library in Finder (right-click it and select _Show Package Contents_) and then drag and drop the `originals` folder onto your Terminal window:

    python3 PhotosLibraryExtractor.py -i ~/"Photos Library.photoslibrary/originals" -o /destination/

The script is very verbose, almost annoyingly so, because these are highly valuable photos we are dealing with and I want you to know exactly what is going on.

### The "PLEDB" File

Files that have been processed are added to a PLEDB (Photos Library Extractor Data Base) file. By default this PLEDB file is placed in the destination folder.

Its location can also be set with the `-db` parameter if you want to keep it outside of your precious folder of photos.

The PLEDB file is useful for future runs: files that have already been processed are skipped, which speeds up a run significantly.

The PLEDB file is just a plain text file with one filepath per line, so you can delete a specific line if you want to re-process just that file, or remove the whole PLEDB file if you want to start over from scratch.
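To give you an idea, a PLEDB written while extracting a Photos.app library looks something like this (these paths are purely illustrative; yours will contain the library's random filenames):

    /Users/you/Pictures/Photos Library.photoslibrary/originals/4/4ED32A69-1C6A-4038-9E2A-6E89D4D4CF12.heic
    /Users/you/Pictures/Photos Library.photoslibrary/originals/4/4ED32A69-1C6A-4038-9E2A-6E89D4D4CF12.mov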
## What's up with these leftover unpaired IDs?

I'm guessing these are Live Photos where the video component was removed at some point (which I don't remember doing). There shouldn't be too many: in my library of over 10,000 items, I got 16.

The unpaired files are copied into the destination folder anyway, so you won't lose anything. The script prints their paths as it processes them, so you can investigate them if you want to, but I wouldn't worry about it.

## I got a lot of files in the "Unknown Dates" folder. What do I do?

For me, most of them were screenshots, photos I saved from Snapchat, or photos I saved to the library from the internet or Twitter.

There could be some genuine photos in there too, for example if your camera didn't have a date set.

So it is unfortunately something I can't really help you with. I would just keep the folder around in case there is anything important in there.
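If you want to figure out why a particular file ended up there, you can ask exiftool to list every date tag it carries (the path here is just an example). If only the file-system dates in the File group show up, the file simply has no embedded capture date for the script to use:

    exiftool -time:all -a -G0:1 -s "Unknown Dates/IMG_1234.PNG"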
***") 18 | 19 | if not os.path.isdir(in_dir): 20 | print("Error! This doesn't seem to be a folder:", in_dir) 21 | sys.exit(1) 22 | 23 | def get_extension(fp): 24 | with exiftool.ExifTool() as et: 25 | metadata = et.get_metadata(fp) 26 | return "." + metadata['File:FileTypeExtension'] 27 | 28 | wrong_exts = 0 29 | for dirpath, dirnames, filenames in os.walk(in_dir): 30 | for f in filenames: 31 | base, ext = os.path.splitext(f) # original file 32 | 33 | if ext.lower() not in supported_exts: 34 | continue 35 | 36 | orig_path = os.path.abspath(os.path.join(dirpath,f)) 37 | 38 | exiftool_extension = get_extension(orig_path) 39 | 40 | if exiftool_extension.lower() != ext.lower(): 41 | print(orig_path, "should be", exiftool_extension) 42 | wrong_exts += 1 43 | 44 | if not test_mode: 45 | new_name = base + exiftool_extension 46 | new_path = os.path.abspath(os.path.join(dirpath, new_name)) 47 | 48 | i = 97 # a 49 | while os.path.isfile(new_path): 50 | new_name = base + chr(i) + exiftool_extension 51 | new_path = os.path.abspath(os.path.join(dirpath, new_name)) 52 | i+=1 53 | 54 | print("Renaming:", orig_path, "-->", new_path) 55 | shutil.move(orig_path, new_path) 56 | 57 | if not test_mode: 58 | print(wrong_exts, "files renamed") 59 | else: 60 | print(wrong_exts, "files have wrong extensions") 61 | print("Done!") --------------------------------------------------------------------------------