├── .gitignore ├── __init__.py ├── LICENSE ├── ai_utils.py ├── search_images_ddg.py ├── make_train_valid.py ├── download_images.py ├── README.md └── image_download.py /.gitignore: -------------------------------------------------------------------------------- 1 | geckodriver.log 2 | dataset 3 | dataset/* 4 | __pycache__ 5 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | from .image_download import * 2 | from .make_train_valid import * 3 | from .ai_utils import * 4 | from .search_images_ddg import * 5 | from .download_images import * 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 C. Bryan Daniels 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
# Print the public methods of an object, one per line, with their signatures.
def methods_of(obj, lr=False):
    """Print each public callable attribute of `obj` as `name(signature):`.

    lr=True prints a blank line after each method for readability.
    """
    for attr in dir(obj):
        if attr.startswith("_"):
            continue  # skip private/dunder attributes
        try:
            member = getattr(obj, attr, None)  # fetch once, not three times
            if callable(member):
                print(f"{attr}{inspect.signature(member)}:")
                if lr:
                    print()
        except Exception:
            # Some attributes raise on access, and inspect.signature() can
            # raise for builtins; skip those rather than abort the listing.
            # (Narrowed from a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit.)
            continue

# Print the public (non-callable) attributes of an object.
def attributes_of(obj, *exclude):
    """Print each public data attribute of `obj` as `name: value`.

    Attribute names listed in `exclude` are printed as `name: ...` so
    large values (e.g. long file lists) don't flood the output.
    """
    for attr in dir(obj):
        if attr.startswith("_"):
            continue  # skip private/dunder attributes
        try:
            member = getattr(obj, attr, None)
            if not callable(member):
                if attr in exclude:
                    print(f"{attr}: ...")
                else:
                    print(f"{attr}: {member}")
        except Exception:
            # Attribute access may raise (e.g. lazy properties); skip it.
            continue
def search_images_ddg(keywords, max_n=100):
    """Search DuckDuckGo Images for `keywords`; return up to `max_n` unique image URLs.

    Prints a message and returns None if the search token cannot be parsed
    from the initial response.  Fewer than `max_n` URLs may be returned when
    results run out or contain duplicates (the result is de-duplicated via a
    set, so ordering is not guaranteed).
    """
    url = 'https://duckduckgo.com/'
    params = {'q': keywords}
    # The first request only exists to obtain the `vqd` token that
    # DuckDuckGo's image endpoint requires.
    res = requests.post(url, data=params)
    searchObj = re.search(r'vqd=([\d-]+)&', res.text)
    if not searchObj:
        print('Token Parsing Failed !')
        return
    requestUrl = url + 'i.js'
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:71.0) Gecko/20100101 Firefox/71.0'}
    params = (('l', 'us-en'), ('o', 'json'), ('q', keywords),
              ('vqd', searchObj.group(1)), ('f', ',,,'), ('p', '1'), ('v7exp', 'a'))
    urls = []
    failures = 0
    while True:
        try:
            res = requests.get(requestUrl, headers=headers, params=params)
            data = json.loads(res.text)
            for obj in data['results']:
                urls.append(obj['image'])
                max_n -= 1
                if max_n < 1:
                    return list(set(urls))
            if 'next' not in data:
                return list(set(urls))
            requestUrl = url + data['next']
            failures = 0  # page succeeded; reset the retry budget
        except Exception:
            # Transient network / JSON / schema errors: retry a few times,
            # then give up with whatever was collected.  The original bare
            # `except: pass` looped forever on a persistent failure.
            failures += 1
            if failures > 5:
                return list(set(urls))
            time.sleep(1)
def make_train_valid(labels_dir: Path, train: float = .8, valid: float = .2, test: float = 0):
    """
    usage: make_train_valid(labels_dir:Path, train:float=.8, valid:float=.2, test:float=0)

    Make train/valid(/test) directories under `labels_dir` and randomly copy
    files from each label sub-directory into them.

    positional arguments:
      labels_dir   Contains at least two directories of labels, each
                   containing files of that label

    optional arguments:
      train=.8     fraction of files for training, default=.8
      valid=.2     fraction of files for validation, default=.2
      test= 0      fraction of files for testing, default=.0

    Raises ValueError if the fractions do not sum to 1 or labels_dir is not
    a directory.
    """
    # Use a tolerance rather than exact equality: `train + valid + test == 1`
    # rejects perfectly valid splits such as .7/.3 because of binary
    # floating-point rounding.  Raise (not assert) so validation survives -O.
    if abs(train + valid + test - 1) > 1e-9:
        raise ValueError(f"train + valid + test must sum to 1, got {train + valid + test}")
    labels_path = Path(labels_dir)
    if not labels_path.is_dir():
        raise ValueError(f"'{labels_dir}' is not a directory")
    runs = {'train': train, 'valid': valid, 'test': test}

    # Remove any train/valid/test tree left over from a previous run, so the
    # label scan below sees only the label directories themselves.
    for run in runs:
        shutil.rmtree(labels_path / run, ignore_errors=True)

    labels = [d.name for d in labels_path.iterdir() if d.is_dir()]

    for label in labels:
        files = list((labels_path / label).iterdir())
        num_files = len(files)
        for run in runs:
            os.makedirs(labels_path / run / label)
            # `take` is a fraction of the ORIGINAL count; copied files are
            # removed from the pool so each file lands in at most one split.
            take = round(num_files * runs[run])
            random.shuffle(files)
            for f in files[:take]:
                shutil.copy(f, labels_path / run / label / f.name)
            files = files[take:]
def download_images(urls=None, url_file=None, dest='dataset', max_pics=1000, timeout=5):
    """Download images to directory `dest`, at most `max_pics` of them.

    Provide the URLs either directly via `urls` (an iterable of strings) or
    via `url_file` (a text file with one URL per line).  Files are named by
    their position in the list (00000000.jpg, 00000001.png, ...).  Blocks
    until every download thread has finished.
    """
    if urls is None:
        urls = open(url_file).read().strip().split("\n")[:max_pics]
    dest = Path(dest)
    dest.mkdir(exist_ok=True)
    download_image_inner(dest, list(enumerate(urls)), timeout=timeout)

def download_image_inner(dest, inp, timeout=5):
    """Spawn one thread per (index, url) pair in `inp`, saving each download
    to `dest/<index><suffix>`, then wait for all of them to finish.

    NOTE(review): this starts one thread per URL with no pool bound —
    fine for a few hundred images, worth capping for very large lists.
    """
    threads = []
    for i, url in inp:
        # Keep the URL's file extension if one is present; default to .jpg.
        suffix = re.findall(r'\.\w+?(?=(?:\?|$))', url)
        suffix = suffix[0] if suffix else '.jpg'
        try:
            t = Thread(target=download_url,
                       args=(url, dest / f"{i:08d}{suffix}"),
                       kwargs={'overwrite': True, 'show_progress': False,
                               'timeout': timeout})
            t.start()
            threads.append(t)
        except Exception:
            # The original built (and silently discarded) an f-string here;
            # actually report the failure.
            print(f"Couldn't download {url}.")
    # Join so callers see a complete dataset when this returns — the
    # original fired-and-forgot, racing any code that read `dest` next.
    for t in threads:
        t.join()

def download_url(url, dest, overwrite=False, pbar=None, show_progress=True,
                 chunk_size=1025*1024, timeout=5, retries=5):
    """Download `url` to `dest` unless it exists and not `overwrite`.

    Streams the response in `chunk_size` blocks, optionally driving a
    fastprogress bar; on a ConnectionError, prints manual-recovery
    instructions instead of raising.
    """
    if os.path.exists(dest) and not overwrite:
        return

    s = requests.Session()
    s.mount('http://', requests.adapters.HTTPAdapter(max_retries=retries))
    # Identify as a Firefox browser (see fastai/#2438): some hosts reject
    # the default python-requests User-Agent.
    s.headers.update({'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:71.0) Gecko/20100101 Firefox/71.0'})
    u = s.get(url, stream=True, timeout=timeout)
    try:
        file_size = int(u.headers["Content-Length"])
    except (KeyError, ValueError):
        show_progress = False  # size unknown or malformed: no progress bar

    with open(dest, 'wb') as f:
        nbytes = 0
        if show_progress:
            pbar = progress_bar(range(file_size), leave=False, parent=pbar)
        try:
            if show_progress:
                pbar.update(0)
            # NOTE(review): chunk_size default is 1025*1024 — looks like a
            # typo for 1024*1024, but harmless; kept as-is.
            for chunk in u.iter_content(chunk_size=chunk_size):
                nbytes += len(chunk)
                if show_progress:
                    pbar.update(nbytes)
                f.write(chunk)
        except requests.exceptions.ConnectionError:
            fname = url.split('/')[-1]
            data_dir = dest.parent
            print(f'\n Download of {url} has failed after {retries} retries\n'
                  f' Fix the download manually:\n'
                  f'$ mkdir -p {data_dir}\n'
                  f'$ cd {data_dir}\n'
                  f'$ wget -c {url}\n'
                  f'$ tar xf {fname}\n'
                  f' And re-run your code once the download is successful\n')
print_urls(search_images_ddg(sys.argv[1],int(sys.argv[2]))) 75 | # else: print("usage: search(keywords,max_n=100)") 76 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## ai_utilities 2 | 3 | Several useful scripts for use with `fast.ai` lectures and libraries. 4 | 5 | `image_download` is the primary function. It provides easy download of images from `bing`, `google`, `baidu`, and/or `flickr` (though the later requires an `apikey`). It is intended for direct import of images within a python script or Jupyter Notebook. 6 | 7 | 8 | `make-train-valid` makes a train-valid directory and randomly copy files from labels_dir to sub- 9 | directories. It has largely been replaced by the capabilities within `fastai` but is still useful. 10 | 11 | ### Installation 12 | - `pip install icrawler` 13 | - `pip install python-magic` or `pip install python-magic-bin` 14 | - `git clone https://github.com/prairie-guy/ai_utilities.git` 15 | 16 | 17 | ### image_download.py 18 | Downloads up to a `n_images` (typically limited to 100-300) from a specified search engine, including `bing`, `baidu` and `flickr`. The `search_text` can be different from its `label`. Images are checked to be valid images and duplicates are eliminated. Images are saved to the directory `dataset` by defalult. (Based upon the excellent work of: https://github.com/hellock/icrawler) 19 | 20 | ``` 21 | usage: image_download(search_text:Path, n_images, label:str=None, engine:str='bing', image_dir='dataset', apikey=None) 22 | where, 'engine' = ['bing'|'google'|baidu'|'flickr'], 23 | 'flickr' requires an apikey and 24 | 'label' can be different from 'search_text' 25 | ``` 26 | 27 | ### Example Usage 28 | Download up to 100 images of each `class`, check each file to be a valid `jpeg` image, save to directory `dataset` and create `data = ImageDataBunch.from_folder(...)`. 
Optionally create an imagenet-type directory structure. 29 | ``` 30 | import sys 31 | sys.path.append('your-parent-directory-of-ai_utilities') 32 | from ai_utilities import * 33 | from pathlib import Path 34 | from fastai.vision.all import * 35 | 36 | for p in ['dog', 'goat', 'sheep']: 37 | image_download(p, 100) 38 | path = Path.cwd()/'dataset' 39 | data = ImageDataLoaders.from_folder(path,valid_pct=0.2, item_tfms=Resize(224)) 40 | 41 | # Optionally, create an imagenet-type file directory. 42 | make_train_valid(path) 43 | data = ImageDataLoaders.from_folder(path, train='train', valid='valid', item_tfms=Resize(224)) 44 | ``` 45 | 46 | ### make_train_valid.py 47 | From a directory containing sub-directories, each with a different class of images, make an imagenet-type directory structure. 48 | It randomly copies files from `labels_dir` to sub-directories: `train`, `valid`, `test`. Creates an imagmenet-type directory usable by `ImageDataBunch.from_folder(dir,...)` 49 | 50 | ``` 51 | usage: make_train_valid(labels_dir:Path, train:float=.8, valid:float=.2, test:float=0) 52 | positional arguments: 53 | labels_dir Contains at least two directories of labels, each containing 54 | files of that label 55 | optional arguments: 56 | train=.8 files for training, default=.8 57 | valid=.2 files for validation, default=.2 58 | test= 0 files for training, default=.0 59 | ``` 60 | 61 | For example, given a directory: 62 | ``` 63 | catsdogs/ 64 | ..cat/[*.jpg] 65 | ..dog/[*.jpg] 66 | ``` 67 | 68 | Creates the following directory structure: 69 | ``` 70 | catsdogs/ 71 | ..cat/[*.jpg] 72 | ..dog/[*.jpg] 73 | ..train/ 74 | ..cat/[*.jpg] 75 | ..dog/[*.jpg] 76 | ..valid/ 77 | ..cat/[*.jpg] 78 | ..dog/[*.jpg] 79 | ``` 80 | 81 | -------------------------------------------------------------------------------- /image_download.py: -------------------------------------------------------------------------------- 1 | ### 2 | # C.Bryan Daniels 3 | # 6/20/2019 4 | # Adapted from 
###
# C.Bryan Daniels
# 6/20/2019
# Adapted from github.com/atif93/google_image_downloader
# Adapted from github.com/cwerner/fastclass.git
###

# Install these modules before fastai to avoid clobbering pillow
# conda install -c hellock icrawler
# pip install python-magic

import os, sys, shutil
from pathlib import Path
from glob import glob
import mimetypes
import hashlib

# NOTE: third-party `icrawler` and `magic` are imported lazily inside the
# functions that need them, so the stdlib-only utilities (dedupe_images,
# hashfile) remain usable when those packages are not installed.

# 'filter_images' was listed twice in the original __all__; deduplicated.
__all__ = ['dedupe_images', 'filter_images', 'image_download']

def image_download(search_text: str, n_images: int, label: str = None,
                   engine: str = 'bing', image_dir='dataset', apikey=None):
    """
    Download images from bing, baidu or flickr
    usage: image_download(search_text:Path, n_images, label:str=None, engine:str='bing', image_dir='dataset', apikey=None)
    where, engine = ['bing'|'baidu'|'flickr'],
           'flickr' requires an apikey, and
           'label' (the target directory name) can differ from 'search_text'

    Downloads into <cwd>/<image_dir>/<label>, then removes non-JPEG images
    and duplicates, printing a summary.  Prompts before overwriting an
    existing label directory; exits (SystemExit) on bad engine / declined
    overwrite, matching the original behavior.
    """
    # GoogleImageCrawler is not working from icrawler
    from icrawler.builtin import GoogleImageCrawler, BingImageCrawler, BaiduImageCrawler
    if engine not in ['google', 'bing', 'baidu', 'flickr']:
        print("supported engines are: google,bing,baidu,flickr")
        sys.exit()
    if label is None:
        label = search_text
    path = Path.cwd() / image_dir / label
    if path.exists():
        response = input(f"'{label}' exists. Overwrite? [Y/n]: ")
        if response == 'Y':
            shutil.rmtree(path)
        else:
            print(f"'{label}' unchanged", end='\r')
            sys.exit()

    if engine == 'flickr':
        start_flickr_crawler(path, search_text, n_images, apikey)
    else:
        engines = {'google': GoogleImageCrawler, 'bing': BingImageCrawler,
                   'baidu': BaiduImageCrawler}
        start_crawler(engines[engine], path, search_text, n_images)
    nons = filter_images(path)  # Remove non-jpg images
    dups = dedupe_images(path)  # Remove duplicates
    print()
    print("**********************************************************")
    print(f"Path: {path}")
    print(f"Removed: {dups} duplicate images")
    print(f"Removed: {nons} non-jpeg images ")
    print(f"Downloaded: {len(list(path.iterdir()))} images")
    print("**********************************************************")

def start_crawler(Crawler, path: Path, search_text: str, n_images: int, file_idx_offset=0):
    """Run an icrawler crawler class `Crawler` for `search_text`, saving
    up to `n_images` files into `path`."""
    crawler = Crawler(feeder_threads=2, parser_threads=2, downloader_threads=8,
                      storage={'root_dir': path})
    crawler.crawl(keyword=search_text, max_num=n_images, file_idx_offset=file_idx_offset)

def start_flickr_crawler(path: Path, search_text: str, n_images: int, apikey: str):
    """Run FlickrImageCrawler for `search_text` tags into `path`.
    Exits (SystemExit) if no apikey is supplied — Flickr requires one."""
    from icrawler.builtin import FlickrImageCrawler
    if apikey is None:
        print("Flickr requires an apikey: 'https://www.flickr.com/services/api/misc.api_keys.html'")
        sys.exit()
    crawler = FlickrImageCrawler(apikey, feeder_threads=2, parser_threads=2,
                                 downloader_threads=8, storage={'root_dir': path})
    crawler.crawl(tags=search_text, max_num=n_images, tag_mode='all')

def dedupe_images(image_dir: Path) -> int:
    """Delete duplicate images (by SHA-512 content hash) from image_dir,
    recursing into subfolders.  Only files with image extensions known to
    `mimetypes` are considered.  Returns the number of files deleted;
    the first file seen with each hash is kept."""
    image_extensions = {k for k, v in mimetypes.types_map.items()
                        if v.startswith('image/')}
    seen = set()
    dups = []
    image_files = [f for walked in os.walk(image_dir)
                   for ext in image_extensions
                   for f in glob(os.path.join(walked[0], f'*{ext}'))]
    for f in image_files:
        h = hashfile(f)
        if h in seen:
            dups.append(f)
        else:
            seen.add(h)
    for f in dups:
        Path(f).unlink()
    return len(dups)

def hashfile(path: Path) -> str:
    """Return the SHA-512 hex digest of the file at `path`, read in
    64 KiB blocks so large files don't load into memory at once."""
    blocksize = 65536
    hasher = hashlib.sha512()
    with open(path, 'rb') as f:
        buf = f.read(blocksize)
        while buf:
            hasher.update(buf)
            buf = f.read(blocksize)
    return hasher.hexdigest()

def filter_images(image_dir: Path, img_type: str = 'JPEG') -> int:
    """Keep only files libmagic identifies as `img_type` (default JPEG) in
    image_dir; delete the rest.  Returns the number of files deleted.
    Files that cannot be identified at all are also deleted."""
    import magic  # lazy: third-party, only needed here
    nons = 0
    path = Path(image_dir)
    for f in path.iterdir():
        try:
            # magic.from_file returns a description like 'JPEG image data...';
            # compare its first 4 characters against img_type.
            kind = magic.from_file(f.as_posix())[:4]
            if f.is_file() and kind != img_type:
                nons += 1
                f.unlink()
        except Exception:
            # Unreadable / unidentifiable entry: treat as a non-image and
            # remove it (matches the original best-effort behavior).
            nons += 1
            f.unlink()
    return nons