├── .gitignore ├── __init__.py ├── LICENSE ├── ai_utils.py ├── search_images_ddg.py ├── make_train_valid.py ├── download_images.py ├── README.md └── image_download.py /.gitignore: -------------------------------------------------------------------------------- 1 | geckodriver.log 2 | dataset 3 | dataset/* 4 | __pycache__ 5 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | from .image_download import * 2 | from .make_train_valid import * 3 | from .ai_utils import * 4 | from .search_images_ddg import * 5 | from .download_images import * 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 C. Bryan Daniels 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
# Print the public methods of an object, one per line, with their signatures.
def methods_of(obj, lr=False):
    """Print each public callable attribute of `obj` as `name(signature):`.

    lr=True prints a blank line after each method for readability.
    """
    for attr in dir(obj):
        if attr.startswith("_"):
            continue  # skip private/dunder attributes
        try:
            member = getattr(obj, attr, None)  # fetch once, not three times
            if callable(member):
                print(f"{attr}{inspect.signature(member)}:")
                if lr:
                    print()
        except Exception:
            # Some attributes raise on access, and inspect.signature() can
            # raise for builtins; skip those rather than abort the listing.
            # (Narrowed from a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit.)
            continue

# Print the public (non-callable) attributes of an object.
def attributes_of(obj, *exclude):
    """Print each public data attribute of `obj` as `name: value`.

    Attribute names listed in `exclude` are printed as `name: ...` so
    large values (e.g. long file lists) don't flood the output.
    """
    for attr in dir(obj):
        if attr.startswith("_"):
            continue  # skip private/dunder attributes
        try:
            member = getattr(obj, attr, None)
            if not callable(member):
                if attr in exclude:
                    print(f"{attr}: ...")
                else:
                    print(f"{attr}: {member}")
        except Exception:
            # Attribute access may raise (e.g. lazy properties); skip it.
            continue
def search_images_ddg(keywords, max_n=100):
    """Search DuckDuckGo Images for `keywords`; return up to `max_n` unique image URLs.

    Prints a message and returns None if the search token cannot be parsed
    from the initial response.  Fewer than `max_n` URLs may be returned when
    results run out or contain duplicates (the result is de-duplicated via a
    set, so ordering is not guaranteed).
    """
    url = 'https://duckduckgo.com/'
    params = {'q': keywords}
    # The first request only exists to obtain the `vqd` token that
    # DuckDuckGo's image endpoint requires.
    res = requests.post(url, data=params)
    searchObj = re.search(r'vqd=([\d-]+)&', res.text)
    if not searchObj:
        print('Token Parsing Failed !')
        return
    requestUrl = url + 'i.js'
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:71.0) Gecko/20100101 Firefox/71.0'}
    params = (('l', 'us-en'), ('o', 'json'), ('q', keywords),
              ('vqd', searchObj.group(1)), ('f', ',,,'), ('p', '1'), ('v7exp', 'a'))
    urls = []
    failures = 0
    while True:
        try:
            res = requests.get(requestUrl, headers=headers, params=params)
            data = json.loads(res.text)
            for obj in data['results']:
                urls.append(obj['image'])
                max_n -= 1
                if max_n < 1:
                    return list(set(urls))
            if 'next' not in data:
                return list(set(urls))
            requestUrl = url + data['next']
            failures = 0  # page succeeded; reset the retry budget
        except Exception:
            # Transient network / JSON / schema errors: retry a few times,
            # then give up with whatever was collected.  The original bare
            # `except: pass` looped forever on a persistent failure.
            failures += 1
            if failures > 5:
                return list(set(urls))
            time.sleep(1)
def make_train_valid(labels_dir: Path, train: float = .8, valid: float = .2, test: float = 0):
    """
    usage: make_train_valid(labels_dir:Path, train:float=.8, valid:float=.2, test:float=0)

    Make train/valid(/test) directories under `labels_dir` and randomly copy
    files from each label sub-directory into them.

    positional arguments:
      labels_dir   Contains at least two directories of labels, each
                   containing files of that label

    optional arguments:
      train=.8     fraction of files for training, default=.8
      valid=.2     fraction of files for validation, default=.2
      test= 0      fraction of files for testing, default=.0

    Raises ValueError if the fractions do not sum to 1 or labels_dir is not
    a directory.
    """
    # Use a tolerance rather than exact equality: `train + valid + test == 1`
    # rejects perfectly valid splits such as .7/.3 because of binary
    # floating-point rounding.  Raise (not assert) so validation survives -O.
    if abs(train + valid + test - 1) > 1e-9:
        raise ValueError(f"train + valid + test must sum to 1, got {train + valid + test}")
    labels_path = Path(labels_dir)
    if not labels_path.is_dir():
        raise ValueError(f"'{labels_dir}' is not a directory")
    runs = {'train': train, 'valid': valid, 'test': test}

    # Remove any train/valid/test tree left over from a previous run, so the
    # label scan below sees only the label directories themselves.
    for run in runs:
        shutil.rmtree(labels_path / run, ignore_errors=True)

    labels = [d.name for d in labels_path.iterdir() if d.is_dir()]

    for label in labels:
        files = list((labels_path / label).iterdir())
        num_files = len(files)
        for run in runs:
            os.makedirs(labels_path / run / label)
            # `take` is a fraction of the ORIGINAL count; copied files are
            # removed from the pool so each file lands in at most one split.
            take = round(num_files * runs[run])
            random.shuffle(files)
            for f in files[:take]:
                shutil.copy(f, labels_path / run / label / f.name)
            files = files[take:]
def download_images(urls=None, url_file=None, dest='dataset', max_pics=1000, timeout=5):
    """Download images to directory `dest`, at most `max_pics` of them.

    Provide the URLs either directly via `urls` (an iterable of strings) or
    via `url_file` (a text file with one URL per line).  Files are named by
    their position in the list (00000000.jpg, 00000001.png, ...).  Blocks
    until every download thread has finished.
    """
    if urls is None:
        urls = open(url_file).read().strip().split("\n")[:max_pics]
    dest = Path(dest)
    dest.mkdir(exist_ok=True)
    download_image_inner(dest, list(enumerate(urls)), timeout=timeout)

def download_image_inner(dest, inp, timeout=5):
    """Spawn one thread per (index, url) pair in `inp`, saving each download
    to `dest/<index><suffix>`, then wait for all of them to finish.

    NOTE(review): this starts one thread per URL with no pool bound —
    fine for a few hundred images, worth capping for very large lists.
    """
    threads = []
    for i, url in inp:
        # Keep the URL's file extension if one is present; default to .jpg.
        suffix = re.findall(r'\.\w+?(?=(?:\?|$))', url)
        suffix = suffix[0] if suffix else '.jpg'
        try:
            t = Thread(target=download_url,
                       args=(url, dest / f"{i:08d}{suffix}"),
                       kwargs={'overwrite': True, 'show_progress': False,
                               'timeout': timeout})
            t.start()
            threads.append(t)
        except Exception:
            # The original built (and silently discarded) an f-string here;
            # actually report the failure.
            print(f"Couldn't download {url}.")
    # Join so callers see a complete dataset when this returns — the
    # original fired-and-forgot, racing any code that read `dest` next.
    for t in threads:
        t.join()

def download_url(url, dest, overwrite=False, pbar=None, show_progress=True,
                 chunk_size=1025*1024, timeout=5, retries=5):
    """Download `url` to `dest` unless it exists and not `overwrite`.

    Streams the response in `chunk_size` blocks, optionally driving a
    fastprogress bar; on a ConnectionError, prints manual-recovery
    instructions instead of raising.
    """
    if os.path.exists(dest) and not overwrite:
        return

    s = requests.Session()
    s.mount('http://', requests.adapters.HTTPAdapter(max_retries=retries))
    # Identify as a Firefox browser (see fastai/#2438): some hosts reject
    # the default python-requests User-Agent.
    s.headers.update({'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:71.0) Gecko/20100101 Firefox/71.0'})
    u = s.get(url, stream=True, timeout=timeout)
    try:
        file_size = int(u.headers["Content-Length"])
    except (KeyError, ValueError):
        show_progress = False  # size unknown or malformed: no progress bar

    with open(dest, 'wb') as f:
        nbytes = 0
        if show_progress:
            pbar = progress_bar(range(file_size), leave=False, parent=pbar)
        try:
            if show_progress:
                pbar.update(0)
            # NOTE(review): chunk_size default is 1025*1024 — looks like a
            # typo for 1024*1024, but harmless; kept as-is.
            for chunk in u.iter_content(chunk_size=chunk_size):
                nbytes += len(chunk)
                if show_progress:
                    pbar.update(nbytes)
                f.write(chunk)
        except requests.exceptions.ConnectionError:
            fname = url.split('/')[-1]
            data_dir = dest.parent
            print(f'\n Download of {url} has failed after {retries} retries\n'
                  f' Fix the download manually:\n'
                  f'$ mkdir -p {data_dir}\n'
                  f'$ cd {data_dir}\n'
                  f'$ wget -c {url}\n'
                  f'$ tar xf {fname}\n'
                  f' And re-run your code once the download is successful\n')
print_urls(search_images_ddg(sys.argv[1],int(sys.argv[2]))) 75 | # else: print("usage: search(keywords,max_n=100)") 76 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## ai_utilities 2 | 3 | Several useful scripts for use with `fast.ai` lectures and libraries. 4 | 5 | `image_download` is the primary function. It provides easy download of images from `bing`, `google`, `baidu`, and/or `flickr` (though the later requires an `apikey`). It is intended for direct import of images within a python script or Jupyter Notebook. 6 | 7 | 8 | `make-train-valid` makes a train-valid directory and randomly copy files from labels_dir to sub- 9 | directories. It has largely been replaced by the capabilities within `fastai` but is still useful. 10 | 11 | ### Installation 12 | - `pip install icrawler` 13 | - `pip install python-magic` or `pip install python-magic-bin` 14 | - `git clone https://github.com/prairie-guy/ai_utilities.git` 15 | 16 | 17 | ### image_download.py 18 | Downloads up to a `n_images` (typically limited to 100-300) from a specified search engine, including `bing`, `baidu` and `flickr`. The `search_text` can be different from its `label`. Images are checked to be valid images and duplicates are eliminated. Images are saved to the directory `dataset` by defalult. (Based upon the excellent work of: https://github.com/hellock/icrawler) 19 | 20 | ``` 21 | usage: image_download(search_text:Path, n_images, label:str=None, engine:str='bing', image_dir='dataset', apikey=None) 22 | where, 'engine' = ['bing'|'google'|baidu'|'flickr'], 23 | 'flickr' requires an apikey and 24 | 'label' can be different from 'search_text' 25 | ``` 26 | 27 | ### Example Usage 28 | Download up to 100 images of each `class`, check each file to be a valid `jpeg` image, save to directory `dataset` and create `data = ImageDataBunch.from_folder(...)`. 
Optionally create an imagenet-type directory structure. 29 | ``` 30 | import sys 31 | sys.path.append('your-parent-directory-of-ai_utilities') 32 | from ai_utilities import * 33 | from pathlib import Path 34 | from fastai.vision.all import * 35 | 36 | for p in ['dog', 'goat', 'sheep']: 37 | image_download(p, 100) 38 | path = Path.cwd()/'dataset' 39 | data = ImageDataLoaders.from_folder(path,valid_pct=0.2, item_tfms=Resize(224)) 40 | 41 | # Optionally, create an imagenet-type file directory. 42 | make_train_valid(path) 43 | data = ImageDataLoaders.from_folder(path, train='train', valid='valid', item_tfms=Resize(224)) 44 | ``` 45 | 46 | ### make_train_valid.py 47 | From a directory containing sub-directories, each with a different class of images, make an imagenet-type directory structure. 48 | It randomly copies files from `labels_dir` to sub-directories: `train`, `valid`, `test`. Creates an imagmenet-type directory usable by `ImageDataBunch.from_folder(dir,...)` 49 | 50 | ``` 51 | usage: make_train_valid(labels_dir:Path, train:float=.8, valid:float=.2, test:float=0) 52 | positional arguments: 53 | labels_dir Contains at least two directories of labels, each containing 54 | files of that label 55 | optional arguments: 56 | train=.8 files for training, default=.8 57 | valid=.2 files for validation, default=.2 58 | test= 0 files for training, default=.0 59 | ``` 60 | 61 | For example, given a directory: 62 | ``` 63 | catsdogs/ 64 | ..cat/[*.jpg] 65 | ..dog/[*.jpg] 66 | ``` 67 | 68 | Creates the following directory structure: 69 | ``` 70 | catsdogs/ 71 | ..cat/[*.jpg] 72 | ..dog/[*.jpg] 73 | ..train/ 74 | ..cat/[*.jpg] 75 | ..dog/[*.jpg] 76 | ..valid/ 77 | ..cat/[*.jpg] 78 | ..dog/[*.jpg] 79 | ``` 80 | 81 | -------------------------------------------------------------------------------- /image_download.py: -------------------------------------------------------------------------------- 1 | ### 2 | # C.Bryan Daniels 3 | # 6/20/2019 4 | # Adapted from 
###
# C.Bryan Daniels
# 6/20/2019
# Adapted from github.com/atif93/google_image_downloader
# Adapted from github.com/cwerner/fastclass.git
###

# Install these modules before fastai to avoid clobbering pillow
# conda install -c hellock icrawler
# pip install python-magic

import os, sys, shutil
from pathlib import Path
from glob import glob
import mimetypes
import hashlib

# NOTE: third-party `icrawler` and `magic` are imported lazily inside the
# functions that need them, so the stdlib-only utilities (dedupe_images,
# hashfile) remain usable when those packages are not installed.

# 'filter_images' was listed twice in the original __all__; deduplicated.
__all__ = ['dedupe_images', 'filter_images', 'image_download']

def image_download(search_text: str, n_images: int, label: str = None,
                   engine: str = 'bing', image_dir='dataset', apikey=None):
    """
    Download images from bing, baidu or flickr
    usage: image_download(search_text:Path, n_images, label:str=None, engine:str='bing', image_dir='dataset', apikey=None)
    where, engine = ['bing'|'baidu'|'flickr'],
           'flickr' requires an apikey, and
           'label' (the target directory name) can differ from 'search_text'

    Downloads into <cwd>/<image_dir>/<label>, then removes non-JPEG images
    and duplicates, printing a summary.  Prompts before overwriting an
    existing label directory; exits (SystemExit) on bad engine / declined
    overwrite, matching the original behavior.
    """
    # GoogleImageCrawler is not working from icrawler
    from icrawler.builtin import GoogleImageCrawler, BingImageCrawler, BaiduImageCrawler
    if engine not in ['google', 'bing', 'baidu', 'flickr']:
        print("supported engines are: google,bing,baidu,flickr")
        sys.exit()
    if label is None:
        label = search_text
    path = Path.cwd() / image_dir / label
    if path.exists():
        response = input(f"'{label}' exists. Overwrite? [Y/n]: ")
        if response == 'Y':
            shutil.rmtree(path)
        else:
            print(f"'{label}' unchanged", end='\r')
            sys.exit()

    if engine == 'flickr':
        start_flickr_crawler(path, search_text, n_images, apikey)
    else:
        engines = {'google': GoogleImageCrawler, 'bing': BingImageCrawler,
                   'baidu': BaiduImageCrawler}
        start_crawler(engines[engine], path, search_text, n_images)
    nons = filter_images(path)  # Remove non-jpg images
    dups = dedupe_images(path)  # Remove duplicates
    print()
    print("**********************************************************")
    print(f"Path: {path}")
    print(f"Removed: {dups} duplicate images")
    print(f"Removed: {nons} non-jpeg images ")
    print(f"Downloaded: {len(list(path.iterdir()))} images")
    print("**********************************************************")

def start_crawler(Crawler, path: Path, search_text: str, n_images: int, file_idx_offset=0):
    """Run an icrawler crawler class `Crawler` for `search_text`, saving
    up to `n_images` files into `path`."""
    crawler = Crawler(feeder_threads=2, parser_threads=2, downloader_threads=8,
                      storage={'root_dir': path})
    crawler.crawl(keyword=search_text, max_num=n_images, file_idx_offset=file_idx_offset)

def start_flickr_crawler(path: Path, search_text: str, n_images: int, apikey: str):
    """Run FlickrImageCrawler for `search_text` tags into `path`.
    Exits (SystemExit) if no apikey is supplied — Flickr requires one."""
    from icrawler.builtin import FlickrImageCrawler
    if apikey is None:
        print("Flickr requires an apikey: 'https://www.flickr.com/services/api/misc.api_keys.html'")
        sys.exit()
    crawler = FlickrImageCrawler(apikey, feeder_threads=2, parser_threads=2,
                                 downloader_threads=8, storage={'root_dir': path})
    crawler.crawl(tags=search_text, max_num=n_images, tag_mode='all')

def dedupe_images(image_dir: Path) -> int:
    """Delete duplicate images (by SHA-512 content hash) from image_dir,
    recursing into subfolders.  Only files with image extensions known to
    `mimetypes` are considered.  Returns the number of files deleted;
    the first file seen with each hash is kept."""
    image_extensions = {k for k, v in mimetypes.types_map.items()
                        if v.startswith('image/')}
    seen = set()
    dups = []
    image_files = [f for walked in os.walk(image_dir)
                   for ext in image_extensions
                   for f in glob(os.path.join(walked[0], f'*{ext}'))]
    for f in image_files:
        h = hashfile(f)
        if h in seen:
            dups.append(f)
        else:
            seen.add(h)
    for f in dups:
        Path(f).unlink()
    return len(dups)

def hashfile(path: Path) -> str:
    """Return the SHA-512 hex digest of the file at `path`, read in
    64 KiB blocks so large files don't load into memory at once."""
    blocksize = 65536
    hasher = hashlib.sha512()
    with open(path, 'rb') as f:
        buf = f.read(blocksize)
        while buf:
            hasher.update(buf)
            buf = f.read(blocksize)
    return hasher.hexdigest()

def filter_images(image_dir: Path, img_type: str = 'JPEG') -> int:
    """Keep only files libmagic identifies as `img_type` (default JPEG) in
    image_dir; delete the rest.  Returns the number of files deleted.
    Files that cannot be identified at all are also deleted."""
    import magic  # lazy: third-party, only needed here
    nons = 0
    path = Path(image_dir)
    for f in path.iterdir():
        try:
            # magic.from_file returns a description like 'JPEG image data...';
            # compare its first 4 characters against img_type.
            kind = magic.from_file(f.as_posix())[:4]
            if f.is_file() and kind != img_type:
                nons += 1
                f.unlink()
        except Exception:
            # Unreadable / unidentifiable entry: treat as a non-image and
            # remove it (matches the original best-effort behavior).
            nons += 1
            f.unlink()
    return nons