├── config_flickr.ini ├── logging.properties ├── README.md ├── process-sample.py ├── LICENSE ├── flickr-licenses.ipynb ├── process-images.py ├── .gitignore └── flickr-download.py /config_flickr.ini: -------------------------------------------------------------------------------- 1 | [FLICKR] 2 | id = [your api id] 3 | secret = [your secret/password] 4 | [Download] 5 | path = d:\data\christmas 6 | search = christmas decorations 7 | prefix = christmas 8 | update_minutes = 1 9 | license = 0,1,2,3,4,5,6,7,8,9,10 10 | max_download = 100000 11 | sources_file = sources.csv 12 | [Process] 13 | process = True 14 | crop_square = True 15 | min_width = 256 16 | min_height = 256 17 | scale_width = 256 18 | scale_height = 256 19 | image_format = jpg -------------------------------------------------------------------------------- /logging.properties: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root 3 | 4 | [handlers] 5 | keys=consoleHandler 6 | 7 | [formatters] 8 | keys=detailedFormatter 9 | 10 | [logger_root] 11 | level=DEBUG 12 | handlers=consoleHandler 13 | 14 | [handler_consoleHandler] 15 | class=StreamHandler 16 | level=INFO 17 | formatter=detailedFormatter 18 | args=(sys.stdout,) 19 | 20 | [formatter_detailedFormatter] 21 | format=%(asctime)s - %(name)s - %(levelname)s : Line %(lineno)s - %(message)s 22 | datefmt= -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pyimgdata 2 | 3 | These are the Flickr License identifiers for images. 4 | 5 | * 0 All Rights Reserved 6 | * 1 Attribution-NonCommercial-ShareAlike License 7 | * 2 Attribution-NonCommercial License 8 | * 3 Attribution-NonCommercial-NoDerivs License 9 | * 4 Attribution License 10 | * 5 Attribution-ShareAlike License 11 | * 6 Attribution-NoDerivs License 12 | * 7 No known copyright restrictions 13 | * 8 United States Government Work 14 | * 9 Public Domain Dedication (CC0) 15 | * 10 Public Domain Mark -------------------------------------------------------------------------------- /process-sample.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import glob 4 | from shutil import copyfile 5 | from tqdm import tqdm 6 | 7 | INPUT_PATH = "D:\\data\\minecraft\\2_scaled_1024\\" 8 | OUTPUT_PATH = "D:\\data\\minecraft\\3_sampled_1024_15k\\" 9 | 10 | def sample(iterable, n): 11 | reservoir = [] 12 | for t, item in enumerate(iterable): 13 | if t < n: 14 | reservoir.append(item) 15 | else: 16 | m = random.randint(0,t) 17 | if m < n: 18 | reservoir[m] = item 19 | return reservoir 20 | 21 | 22 | iter = glob.iglob(os.path.join(INPUT_PATH,"*.jpg")) 23 | itms = sample(iter, 15000) 24 | for src_path in tqdm(itms): 25 | filename = os.path.split(src_path)[-1] 26 | dst_path = os.path.join(OUTPUT_PATH, filename) 27 | copyfile(src_path, dst_path) 28 | 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Jeff Heaton 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /flickr-licenses.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "0 All Rights Reserved\n", 13 | "4 Attribution License\n", 14 | "6 Attribution-NoDerivs License\n", 15 | "3 Attribution-NonCommercial-NoDerivs License\n", 16 | "2 Attribution-NonCommercial License\n", 17 | "1 Attribution-NonCommercial-ShareAlike License\n", 18 | "5 Attribution-ShareAlike License\n", 19 | "7 No known copyright restrictions\n", 20 | "8 United States Government Work\n", 21 | "9 Public Domain Dedication (CC0)\n", 22 | "10 Public Domain Mark\n" 23 | ] 24 | } 25 | ], 26 | "source": [ 27 | "import flickrapi\n", 28 | "\n", 29 | "flickr=flickrapi.FlickrAPI('[[id]]', '[[password]]', cache=True)\n", 30 | "\n", 31 | "for l in flickr.photos.licenses.getInfo().find('licenses'):\n", 32 | " print(l.get(\"id\"), l.get('name'))" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [] 41 | } 42 | ], 43 | "metadata": { 44 | "kernelspec": { 45 | "display_name": "Python (image-dl)", 46 | "language": "python", 47 | "name": "image-dl" 48 | }, 49 | "language_info": { 50 | "codemirror_mode": { 51 | "name": "ipython", 52 | "version": 3 53 | }, 54 | "file_extension": ".py", 55 | "mimetype": "text/x-python", 56 | "name": "python", 57 | "nbconvert_exporter": "python", 58 | "pygments_lexer": "ipython3", 59 | "version": "3.8.5" 60 | } 61 | }, 62 | "nbformat": 4, 63 | "nbformat_minor": 4 64 | } 65 | -------------------------------------------------------------------------------- /process-images.py: -------------------------------------------------------------------------------- 1 | import imageio 2 | import glob 3 | from tqdm import tqdm 4 | from PIL import Image 5 | import os 6 | import logging 7 | import logging.config 8 | 9 | #SOURCE = "/Users/jheaton/Downloads/kaggle-blocks" 10 | #TARGET = "/Users/jheaton/Downloads/kaggle-convert" 11 | 12 | 13 | #SOURCE = "/mnt/d/data/scifi/70sscifiart" 14 | #TARGET = "/mnt/d/data/scifi/scifi-crop" 15 | 16 | SOURCE = "/mnt/d/data/minecraft/1_sampled/" 17 | TARGET = "/mnt/d/data/minecraft/2_scaled_1024" 18 | 19 | def crop_square(image): 20 | width, height = image.size 21 | 22 | # Crop the image, centered 23 | new_width = min(width,height) 24 | new_height = new_width 25 | left = (width - new_width)/2 26 | top = (height - new_height)/2 27 | right = (width + new_width)/2 28 | bottom = (height + new_height)/2 29 | return image.crop((left, top, right, bottom)) 30 | 31 | def scale(img, scale_width, scale_height): 32 | # Scale the image 33 | img = img.resize(( 34 | scale_width, 35 | scale_height), 36 | Image.ANTIALIAS) 37 | 38 | return img 39 | 40 | def standardize(image): 41 | rgbimg = Image.new("RGB", image.size) 42 | rgbimg.paste(image) 43 | return rgbimg 44 | 45 | def fail_below(image, check_width, check_height): 46 | width, height = image.size 47 | assert width == check_width 48 | assert height == check_height 49 | 50 | logging.config.fileConfig("logging.properties") 51 | files = glob.glob(os.path.join(SOURCE,"*.jpg")) 52 | 53 | for file in tqdm(files): 54 | try: 55 | target = "" 56 | name = os.path.basename(file) 57 | filename, _ = os.path.splitext(name) 58 | img = Image.open(file) 59 | img = standardize(img) 60 | img = crop_square(img) 61 | img = scale(img, 1024, 1024) 62 | #fail_below(img, 1024, 1024) 63 | 64 | target = os.path.join(TARGET,filename+".jpg") 65 | img.save(target, quality=25) 66 | except KeyboardInterrupt: 67 | print("Keyboard interrupt") 68 | break 69 | except AssertionError: 70 | print("Assertion") 71 | break 72 | except: 73 | logging.warning(f"Unexpected exception while processing image source: {file}, target: {target}" , exc_info=True) 74 | 75 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.code-workspace 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | pip-wheel-metadata/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # pipenv 90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 93 | # install all needed dependencies. 94 | #Pipfile.lock 95 | 96 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 97 | __pypackages__/ 98 | 99 | # Celery stuff 100 | celerybeat-schedule 101 | celerybeat.pid 102 | 103 | # SageMath parsed files 104 | *.sage.py 105 | 106 | # Environments 107 | .env 108 | .venv 109 | env/ 110 | venv/ 111 | ENV/ 112 | env.bak/ 113 | venv.bak/ 114 | 115 | # Spyder project settings 116 | .spyderproject 117 | .spyproject 118 | 119 | # Rope project settings 120 | .ropeproject 121 | 122 | # mkdocs documentation 123 | /site 124 | 125 | # mypy 126 | .mypy_cache/ 127 | .dmypy.json 128 | dmypy.json 129 | 130 | # Pyre type checker 131 | .pyre/ 132 | -------------------------------------------------------------------------------- /flickr-download.py: -------------------------------------------------------------------------------- 1 | # Flickr Download, by Jeff Heaton (http://www.heatonresearch.com) 2 | # https://github.com/jeffheaton/pyimgdata 3 | # Copyright 2020, MIT License 4 | import flickrapi 5 | import requests 6 | import logging 7 | import logging.config 8 | import os 9 | import configparser 10 | import time 11 | import csv 12 | import sys 13 | from urllib.request import urlretrieve 14 | from PIL import Image 15 | from io import BytesIO 16 | from hashlib import sha256 17 | 18 | # https://code.flickr.net/2008/08/19/standard-photos-response-apis-for-civilized-age/ 19 | 20 | # Nicely formatted time string 21 | def hms_string(sec_elapsed): 22 | h = int(sec_elapsed / (60 * 60)) 23 | m = int((sec_elapsed % (60 * 60)) / 60) 24 | s = sec_elapsed % 60 25 | return f"{h}:{m:>02}:{s:>05.2f}" 26 | 27 | def is_true(str): 28 | return str.lower()[0] == 't' 29 | 30 | class FlickrImageDownload: 31 | def __init__(self): 32 | self.config = configparser.ConfigParser() 33 | self.config.read("config_flickr.ini") 34 | logging.config.fileConfig("logging.properties") 35 | 36 | self.config_path = self.config['Download']['path'] 37 | self.config_prefix = self.config['Download']['prefix'] 38 | self.config_search = self.config['Download']['search'] 39 | self.config_update_minutes = int(self.config['Download']['update_minutes']) 40 | self.config_max_download_count = int(self.config['Download']['max_download']) 41 | self.config_license_allowed = [int(e) if e.isdigit() else e 42 | for e in self.config['Download']['license'].split(',')] 43 | self.config_format = self.config['Process']['image_format'] 44 | self.config_process = is_true(self.config['Process']['process']) 45 | self.config_crop_square = is_true(self.config['Process']['crop_square']) 46 | self.config_scale_width = int(self.config['Process']['scale_width']) 47 | self.config_scale_height = int(self.config['Process']['scale_height']) 48 | self.config_min_width = int(self.config['Process']['min_width']) 49 | self.config_min_height = int(self.config['Process']['min_height']) 50 | 51 | if "sources_file" in self.config['Download']: 52 | self.config_sources_file = self.config['Download']['sources_file'] 53 | else: 54 | self.config_sources_file = None 55 | 56 | 57 | self.flickr=flickrapi.FlickrAPI( 58 | self.config['FLICKR']['id'], 59 | self.config['FLICKR']['secret'], 60 | cache=True) 61 | 62 | def reset_counts(self): 63 | self.download_count = 0 64 | self.start_time = time.time() 65 | self.last_update = 0 66 | self.download_count = 0 67 | self.error_count = 0 68 | self.cached = 0 69 | self.sources = [] 70 | 71 | def load_image(self, url): 72 | try: 73 | response = requests.get(url) 74 | h = sha256(response.content).hexdigest() 75 | img = Image.open(BytesIO(response.content)) 76 | img.load() 77 | return img, h 78 | except KeyboardInterrupt: 79 | logging.info("Keyboard interrupt, stopping") 80 | sys.exit(0) 81 | except: 82 | logging.warning(f"Unexpected exception while downloading image: {url}" , exc_info=True) 83 | return None, None 84 | 85 | 86 | def obtain_photo(self, photo): 87 | url = photo.get('url_c') 88 | license = photo.get('license') 89 | 90 | if int(license) in self.config_license_allowed and url: 91 | image, h = self.load_image(url) 92 | 93 | if image: 94 | return image 95 | else: 96 | self.error_count += 1 97 | 98 | return None 99 | 100 | def check_to_keep_photo(self, url, image): 101 | h = sha256(image.tobytes()).hexdigest() 102 | p = os.path.join(self.config_path, f"{self.config_prefix}-{h}.{self.config_format}") 103 | self.sources.append([url,p]) 104 | if not os.path.exists(p): 105 | self.download_count += 1 106 | logging.debug(f"Downloaded: {url} to {p}") 107 | return p 108 | else: 109 | self.cached += 1 110 | logging.debug(f"Image already exists: {url}") 111 | return None 112 | 113 | def process_image(self, image, path): 114 | width, height = image.size 115 | 116 | # Crop the image, centered 117 | if self.config_crop_square and self.config_process: 118 | new_width = min(width,height) 119 | new_height = new_width 120 | left = (width - new_width)/2 121 | top = (height - new_height)/2 122 | right = (width + new_width)/2 123 | bottom = (height + new_height)/2 124 | image = image.crop((left, top, right, bottom)) 125 | 126 | # Scale the image 127 | if self.config_scale_width>0 and self.config_process: 128 | image = image.resize(( 129 | self.config_scale_width, 130 | self.config_scale_height), 131 | Image.ANTIALIAS) 132 | 133 | 134 | # Convert to full color (no grayscale, no transparent) 135 | if image.mode not in ('RGB'): 136 | logging.debug(f"Grayscale to RGB: {path}") 137 | rgbimg = Image.new("RGB", image.size) 138 | rgbimg.paste(image) 139 | image = rgbimg 140 | 141 | return image 142 | 143 | def track_progress(self): 144 | elapsed_min = int((time.time() - self.start_time)/60) 145 | self.since_last_update = elapsed_min - self.last_update 146 | if self.since_last_update >= self.config_update_minutes: 147 | logging.info(f"Update for {elapsed_min}: images={self.download_count:,}; errors={self.error_count:,}; cached={self.cached:,}") 148 | self.last_update = elapsed_min 149 | 150 | if self.download_count > self.config_max_download_count: 151 | logging.info("Reached max download count") 152 | return True 153 | 154 | return False 155 | 156 | def write_sources(self): 157 | if self.config_sources_file: 158 | logging.info("Writing sources file.") 159 | filename = os.path.join(self.config_path, self.config_sources_file) 160 | with open(filename, 'w', newline='') as csvfile: 161 | csvwriter = csv.writer(csvfile) 162 | csvwriter.writerow(['url', 'file']) 163 | csvwriter.writerows(self.sources) 164 | 165 | def run(self): 166 | logging.info("Starting...") 167 | self.reset_counts() 168 | 169 | photos = self.flickr.walk(text=self.config_search, 170 | tag_mode='all', 171 | tags=self.config_search, 172 | extras='url_c,license', 173 | per_page=100, 174 | sort='relevance', 175 | #license='0' 176 | ) 177 | 178 | for photo in photos: 179 | url = photo.get('url_c') 180 | img = self.obtain_photo(photo) 181 | if img: 182 | path = self.check_to_keep_photo(url, img) 183 | if path: 184 | img = self.process_image(img, path) 185 | img.save(path) 186 | 187 | if self.track_progress(): 188 | break 189 | 190 | self.write_sources() 191 | elapsed_time = time.time() - self.start_time 192 | logging.info("Complete, elapsed time: {}".format(hms_string(elapsed_time))) 193 | 194 | task = FlickrImageDownload() 195 | task.run() 196 | --------------------------------------------------------------------------------