├── .gitignore
├── LICENSE.md
├── README.md
├── duckgoose
    ├── __init__.py
    ├── cam.py
    └── image_classification_bootstrap.py
├── environment-LINUX.yml
├── examples
    └── example.py
├── images
    └── duck.png
├── requirement.txt
├── setup.cfg
└── setup.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # C extensions
 7 | *.so
 8 | 
 9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | #  Usually these files are written by a python script from a template
29 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 | 
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 | 
60 | # Scrapy stuff:
61 | .scrapy
62 | 
63 | # Sphinx documentation
64 | docs/_build/
65 | 
66 | # PyBuilder
67 | target/
68 | 
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 | 
72 | # pyenv
73 | .python-version
74 | 
75 | # celery beat schedule file
76 | celerybeat-schedule
77 | 
78 | # dotenv
79 | .env
80 | 
81 | # virtualenv
82 | venv/
83 | ENV/
84 | 
85 | # Spyder project settings
86 | .spyderproject
87 | 
88 | # Rope project settings
89 | .ropeproject
90 | *.npy
91 | *.pkl
92 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (C) 2018, Sergiusz Bleja
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of
 6 | this software and associated documentation files (the "Software"), to deal in
 7 | the Software without restriction, including without limitation the rights to
 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
 9 | of the Software, and to permit persons to whom the Software is furnished to do
10 | so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # duckgoose
 2 | Utility scripts for the online [fast.ai](https://www.fast.ai) course. There are two main parts: one to download and organise arbitrary image classes, and one to highlight which parts of an image are activating the decision for a classification.
 3 | 
 4 | 1. Utility for Lesson 1 experimentation with external image classes. The script:
 5 | * Downloads images from google images for specific classes
 6 | * Sanity checks that images can be opened and have three channels
 7 | * Organises the images into separate folders (train/valid/test + classes) as expected by the fast.ai library
 8 | 
 9 | 2. Utility for creating Class Activation Maps for both classifications.
10 | 
11 | ## Prerequisites 
12 | 
13 | * `chromedriver` is required. On ubuntu/debian: `sudo apt-get install chromium-chromedriver`
14 | 
15 | ## Installation
16 | 
17 | ```bash
18 | pip install duckgoose
19 | ```
20 | 
21 | ## Usage
22 | 
23 | ### Fetching, sanity checking and organising images
24 | 
25 | ```python
26 | from duckgoose import fetchImagesAndPrepForClassification
27 | 
28 | # dictionary structure `class_name => search term`
29 | image_classes = { 'ducks' : 'ducks -rubber' , 'geese' : 'geese' }
30 | download_path = '/home/myuser/data/downloaded_from_google'
31 | output_path = '/home/myuser/data/ducksgeese/'
32 | number_of_images = 100
33 | 
34 | fetchImagesAndPrepForClassification(image_classes, download_path, output_path, number_of_images)
35 | ```
36 | 
37 | ### Create Class Activation Maps (CAM)
38 | *Note*: This was implemented for fastai v2 part 1.
39 | Here is a full example of creating class activation maps for ducks and geese using fastai.
40 | 
41 | ```python
42 | from fastai.imports import *
43 | from fastai.transforms import *
44 | from fastai.conv_learner import *
45 | from fastai.model import *
46 | from fastai.dataset import *
47 | from fastai.sgdr import *
48 | from fastai.plots import *
49 | 
50 | from duckgoose.cam import calculateAndChartHeatZoneFor
51 | 
52 | PATH = "data/ducksgeese/"
53 | sz=224
54 | arch = resnet34
55 | bs = 64
56 | 
57 | m = arch(True)
58 | m = nn.Sequential(*children(m)[:-2], 
59 |                   nn.Conv2d(512, 2, 3, padding=1), 
60 |                   nn.AdaptiveAvgPool2d(1),
61 |                   Flatten(), 
62 |                   nn.LogSoftmax())
63 | 
64 | tfms = tfms_from_model(arch, sz, aug_tfms=transforms_side_on, max_zoom=1.1)
65 | data = ImageClassifierData.from_paths(PATH, tfms=tfms, bs=bs)
66 | learn = ConvLearner.from_model_data(m, data)
67 | 
68 | learn.freeze_to(-4)
69 | 
70 | _, val_tfms = tfms_from_model(learn.model, sz)
71 | 
72 | learn.fit(0.01, 2)
73 | 
74 | calculateAndChartHeatZoneFor('./data/ducksgeese/test/ducks/ducks_427.jpg', val_tfms, learn)
75 | ```
76 | 
77 | ![Duck and goose heatmap](images/duck.png)
78 | 
79 | 
80 | # License
81 | [The MIT License (MIT)](LICENSE.md)
82 | 


--------------------------------------------------------------------------------
/duckgoose/__init__.py:
--------------------------------------------------------------------------------
1 | from duckgoose.image_classification_bootstrap import fetchImagesAndPrepForClassification
2 | from duckgoose.image_classification_bootstrap import santityCheckAndOrganiseFromGoogle
3 | from duckgoose.image_classification_bootstrap import partitonIntoTrainValidTest
4 | from duckgoose.image_classification_bootstrap import downloadImagesForClasses
5 | 


--------------------------------------------------------------------------------
/duckgoose/cam.py:
--------------------------------------------------------------------------------
  1 | from fastai.imports import *
  2 | from fastai.transforms import *
  3 | from fastai.conv_learner import *
  4 | from fastai.model import *
  5 | from fastai.dataset import *
  6 | from fastai.sgdr import *
  7 | 
  8 | def modelForCam(PATH, sz, arch, bs):
  9 |     m = arch(True)
 10 |     m = nn.Sequential(*children(m)[:-2], 
 11 |                       nn.Conv2d(512, 2, 3, padding=1), 
 12 |                       nn.AdaptiveAvgPool2d(1),
 13 |                       Flatten(), 
 14 |                       nn.LogSoftmax())
 15 | 
 16 |     tfms = tfms_from_model(arch, sz, aug_tfms=transforms_side_on, max_zoom=1.1)
 17 |     data = ImageClassifierData.from_paths(PATH, tfms=tfms, bs=bs)
 18 |     learn = ConvLearner.from_model_data(m, data)
 19 | 
 20 |     learn.freeze_to(-4)
 21 | 
 22 |     return learn
 23 | 
 24 | 
 25 | class SaveFeatures():
 26 |     features=None
 27 |     def __init__(self, m): self.hook = m.register_forward_hook(self.hook_fn)
 28 |     def hook_fn(self, module, input, output): self.features = output
 29 |     def remove(self): self.hook.remove()
 30 | 
 31 | 
 32 | def calculateAndChartHeatZoneFor(input_image, val_tfms, learn):
 33 |     m = learn.model
 34 |     data = learn.data
 35 |     classes = learn.data.classes
 36 | 
 37 |     im = val_tfms(np.array(open_image(input_image)))
 38 |     actual = inferActualFromPath(input_image, classes)
 39 | 
 40 |     preds = learn.predict_array(im[None])
 41 |     probs = np.exp(preds)
 42 | 
 43 |     d_im = data.val_ds.denorm(im)[0]
 44 |     x,y = im[None], probs[0][0]
 45 | 
 46 |     conv_features = SaveFeatures(m[-4])
 47 |     py = m(Variable(T(x)))
 48 |     conv_features.remove()
 49 |     py = np.exp(to_np(py)[0])
 50 | 
 51 |     feat = np.maximum(0,to_np(conv_features.features[0]))
 52 | 
 53 |     non_zeroes_feat = to_np(conv_features.features[0])
 54 | 
 55 |     py_orig = py
 56 | 
 57 |     dd, gg = heatmapsFor(feat, d_im.shape)
 58 | 
 59 |     p_class1= py_orig[0]
 60 | 
 61 |     plotCAMHeatmaps(d_im, dd, gg, actual, p_class1, classes)
 62 | 
 63 | 
 64 | def inferActualFromPath(im_path, classes):
 65 |     os.path.split(im_path)
 66 |     actual = "Unknown"
 67 | 
 68 |     try:
 69 |         pp = os.path.normpath(im_path).split(os.sep)[-2]
 70 |     
 71 |         if pp in classes:
 72 |             actual = pp
 73 |     except IndexError:
 74 |         print(f'No match for {im_path}')
 75 | 
 76 |     return actual
 77 | 
 78 | 
 79 | def normalise_img(f2, all_min, all_max):
 80 |     return (f2-all_min)/all_max
 81 | 
 82 | 
 83 | def resize_img(img, shape):
 84 |     return scipy.misc.imresize(img, shape)
 85 | 
 86 | 
 87 | def heatmapsFor(feat, shape):
 88 |     class1_py = np.array([1, 0])
 89 |     class2_py = np.array([0, 1])
 90 | 
 91 |     class1_f2=np.dot(np.rollaxis(feat,0,3), class1_py)
 92 |     class2_f2=np.dot(np.rollaxis(feat,0,3), class2_py)
 93 |     
 94 |     all_max = np.concatenate([class1_f2, class2_f2]).max()
 95 |     all_min = np.concatenate([class1_f2, class2_f2]).min()
 96 |     
 97 |     class1_f2 = normalise_img(class1_f2, all_min, all_max)
 98 |     class2_f2 = normalise_img(class2_f2, all_min, all_max)
 99 |     
100 |     return resize_img(class1_f2, shape), resize_img(class2_f2,shape)
101 | 
102 | 
def plotCAMHeatmaps(d_im, dd, gg, actual, p_class1, classes):
    """
    Draw three panels side by side: the image with the first class heat zone,
    the plain image with the actual class and prediction, and the image with
    the second class heat zone.

    Parameters:
    d_im is the image to show in every panel
    dd and gg are the heatmaps overlaid for the first and second class
    actual is the label shown as the actual class
    p_class1 is the predicted probability of the first class
    classes holds the two class names
    """
    first_class, second_class = classes[0], classes[1]

    overlay_alpha = 0.7
    figure = plt.figure(figsize=(15, 8))

    # (heatmap overlay or None, panel title), left to right.
    panels = (
        (dd, f'{first_class} heat zone'),
        (None, f'Actual: {actual}. \nPrediction: P({first_class})={p_class1:0.2f}'),
        (gg, f'{second_class} heat zone'),
    )

    for position, (heatmap, title) in enumerate(panels, start=1):
        axis = figure.add_subplot(1, 3, position)
        axis.imshow(d_im)
        if heatmap is not None:
            axis.imshow(heatmap, alpha=overlay_alpha, cmap='hot')
        plt.title(title)
        plt.axis('off')
125 | 
126 | 


--------------------------------------------------------------------------------
/duckgoose/image_classification_bootstrap.py:
--------------------------------------------------------------------------------
  1 | from PIL import Image
  2 | import os
  3 | import glob
  4 | from os import path
  5 | import random
  6 | from tempfile import TemporaryDirectory
  7 | import gzip
  8 | import tarfile
  9 | import shutil
 10 | from hashlib import md5
 11 | from collections import defaultdict
 12 | 
 13 | from google_images_download import google_images_download
 14 | 
 15 | 
 16 | def fetchImagesAndPrepForClassification(image_classes, download_path, output_path, number_of_images, chromedriver='/usr/lib/chromium-browser/chromedriver', download_if_paths_exists = True):
 17 |     """
 18 |     Main entry point to prepare for image classification. The function will
 19 |     1. Download jpg images from google images search for the search terms
 20 |     2. Sanity check they can be opened and have three channels
 21 |     3. Organise into train/valid/test folder as expected by the fastai library
 22 | 
 23 |     Parameters:
 24 |     The image_classes is a dictionary of image_class to search term. Often they are identical
 25 |     """
 26 | 
 27 |     if download_if_paths_exists:
 28 |         do_download = True
 29 |     else:
 30 |         do_download = not download_paths_exist(image_classes, download_path)
 31 | 
 32 |     if do_download:
 33 |         downloadImagesForClasses(image_classes, download_path, number_of_images=number_of_images, chromedriver=chromedriver)
 34 |     else: 
 35 |         print("Skipping download")
 36 | 
 37 |     for image_class in image_classes.keys():
 38 |         sanitised_images, cannot_open, one_channel = santityCheckAndOrganiseFromGoogle(image_class, download_path, output_path)
 39 |         partitonIntoTrainValidTest(sanitised_images, image_class, output_path)
 40 | 
 41 | 
 42 | def download_paths_exist(image_classes, download_path):
 43 |     exists = [path.exists(path.join(download_path, x)) for x in image_classes]
 44 |     return all(exists)
 45 | 
 46 |                 
 47 | def file_hash(filepath):
 48 |     with open(filepath, "rb") as f:
 49 |             return md5(f.read()).hexdigest()
 50 |         
 51 | 
 52 | def santityCheckAndOrganiseFromGoogle(image_prefix, base_path, output_path):
 53 |     """ Check that the images can be opened and that there are three channels. Organise into train/valid/test split by 60/30/10% """
 54 |     # This is tied to the google download settings: specifically using the prefix == class
 55 |     gg = f'{base_path}/**/*{image_prefix} *.jpg'
 56 | 
 57 |     files = glob.glob(gg, recursive=True)
 58 |     outfiles = []
 59 |     ioe_error_files = []
 60 |     one_channel_files = []
 61 |     image_hashes = set()
 62 | 
 63 |     num = 1
 64 |     for ff in files:
 65 |         try:
 66 |             is_ok = True
 67 |             ii = Image.open(ff)
 68 |             number_of_channels = len(ii.getbands())
 69 |             
 70 |             if number_of_channels != 3: 
 71 |                 is_ok = False
 72 |                 print(f'Figure does not have 3 channels: {ff}')
 73 |             
 74 |             hash = file_hash(ff)
 75 |             if hash in image_hashes: 
 76 |                 is_ok = False
 77 |                 print(f'Found duplicate: {ff}')
 78 |             
 79 |             image_hashes.add(hash)
 80 |             
 81 |             if is_ok:
 82 |                 outfiles.append(ff)
 83 |                 num +=1
 84 | 
 85 |         except IOError as ioe:
 86 |             ioe_error_files.append(ff)
 87 |             print(f'Error encountered for {ff}: {ioe}')
 88 | 
 89 |     return(outfiles, ioe_error_files, one_channel_files)
 90 | 
 91 | 
 92 | def partitonIntoTrainValidTest(all_files, prefix, output_path, fraction_train = .6, fraction_valid = 0.3):
 93 |     """
 94 |     Randomnly parititons and copies files into train/valid/test directories with by default a 60/30/10% split.
 95 |     The target is [output_path]/train/[prefix] 
 96 |     """
 97 | 
 98 |     train_files, valid_files, test_files = shuffledSplit(all_files, fraction_train, fraction_valid)
 99 | 
100 |     copyFilesToPath(train_files, output_path, prefix, 'train')
101 |     copyFilesToPath(valid_files, output_path, prefix, 'valid')
102 |     copyFilesToPath(test_files, output_path, prefix, 'test')
103 | 
104 | 
def shuffledSplit(all_files, fraction_train, fraction_valid):
    """
    Randomly split the files into train, valid and test lists.

    Parameters:
    all_files is the sequence of items to split; it is not modified
    fraction_train and fraction_valid are the fractions of items for the
    train and valid lists (rounded to whole items)

    Returns the tuple (train_files, valid_files, test_files), where the test
    list holds whatever remains after the train and valid items are taken.
    """
    # Shuffle a copy so the caller's list keeps its order.
    shuffled = list(all_files)
    random.shuffle(shuffled)

    total_number_of_files = len(shuffled)
    train_num = round(total_number_of_files * fraction_train)
    valid_end = train_num + round(total_number_of_files * fraction_valid)

    train_files = shuffled[:train_num]
    valid_files = shuffled[train_num:valid_end]
    test_files = shuffled[valid_end:]

    return (train_files, valid_files, test_files)
119 | 
120 | 
def copyFilesToPath(files_to_move, output_path, prefix, ml_type):
    """
    Copy the files into [output_path]/[ml_type]/[prefix], creating the
    directory when needed. The source files are left in place.
    """
    destination_dir = path.join(output_path, ml_type, prefix)
    os.makedirs(destination_dir, exist_ok=True)

    for source_file in files_to_move:
        target = path.join(destination_dir, path.basename(source_file))
        shutil.copy2(source_file, target)
126 | 
127 | 
def downloadImagesForClasses(image_classes, download_path, number_of_images=1000, chromedriver='/usr/lib/chromium-browser/chromedriver'):
    """
    Download images for the specified image classes.

    Parameters:
    image_classes is a dictionary of image_class to search term
    download_path is the output directory; it is created when missing
    number_of_images is the download limit per class
    chromedriver is the path of the chromedriver binary
    """

    if not path.exists(download_path):
        os.makedirs(download_path)

    # Settings shared by every class; the per-class keys are added later.
    shared_settings = {
        'limit': number_of_images,
        'format': 'jpg',
        'color_type': 'full-color',
        'type': 'photo',
        'output_directory': download_path,
        'chromedriver': chromedriver,
    }

    for image_class, search_term in image_classes.items():
        downloadImagesFor(image_class, search_term, shared_settings)
145 | 
146 | 
def downloadImagesFor(prefix, search_term, common_arguments=None):
    """
    Download the images for a single search term.

    Parameters:
    prefix is the file name prefix given to the downloaded images
    search_term is the keyword string to search for
    common_arguments is an optional dictionary of further download
    settings; it is copied, never modified
    """
    search = dict(common_arguments or {})
    search['keywords'] = search_term
    search['prefix'] = prefix

    downloader = google_images_download.googleimagesdownload()
    downloader.download(search)
155 | 
156 | 


--------------------------------------------------------------------------------
/environment-LINUX.yml:
--------------------------------------------------------------------------------
 1 | name: duckgoose
 2 | channels:
 3 |   - defaults
 4 | dependencies:
 5 |   - ca-certificates=2018.03.07=0
 6 |   - certifi=2018.4.16=py36_0
 7 |   - libedit=3.1.20170329=h6b74fdf_2
 8 |   - libffi=3.2.1=hd88cf55_4
 9 |   - libgcc-ng=7.2.0=hdf63c60_3
10 |   - libstdcxx-ng=7.2.0=hdf63c60_3
11 |   - ncurses=6.1=hf484d3e_0
12 |   - openssl=1.0.2o=h20670df_0
13 |   - pip=10.0.1=py36_0
14 |   - python=3.6.5=hc3d631a_2
15 |   - readline=7.0=ha6073c6_4
16 |   - setuptools=39.2.0=py36_0
17 |   - sqlite=3.23.1=he433501_0
18 |   - tk=8.6.7=hc745277_3
19 |   - wheel=0.31.1=py36_0
20 |   - xz=5.2.4=h14c3975_4
21 |   - zlib=1.2.11=ha838bed_2
22 |   - pip:
23 |     - google-images-download==2.3.0
24 |     - greenlet==0.4.13
25 |     - msgpack==0.5.6
26 |     - neovim==0.2.6
27 |     - selenium==3.12.0
28 | 
29 | 


--------------------------------------------------------------------------------
/examples/example.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from duckgoose import fetchImagesAndPrepForClassification
 3 | 
 4 | # dictionary structure `class_name => search term`
 5 | user = os.environ['USER']
 6 | image_classes = { 'ducks' : 'ducks -rubber' , 'geese' : 'geese' }
 7 | download_path = f'/home/{user}/data/dev/downloaded_from_google'
 8 | output_path = f'/home/{user}/data/dev/ducksgeese/'
 9 | number_of_images = 30
10 | 
11 | fetchImagesAndPrepForClassification(image_classes, download_path, output_path, number_of_images, download_if_paths_exists=False)
12 | 
13 | 


--------------------------------------------------------------------------------
/images/duck.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/svenski/duckgoose/bcbdca7ac37c165aa319245a464e848e2721fac0/images/duck.png


--------------------------------------------------------------------------------
/requirement.txt:
--------------------------------------------------------------------------------
1 | google-images-download
2 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
3 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | 
 3 | with open("README.md", 'r') as f:
 4 |     long_description = f.read()
 5 | 
 6 | setup(name='duckgoose',
 7 |       version='0.1.8',
 8 |       description='Utility functions for the fast ai mooc',
 9 |       url='http://github.com/svenski/duckgoose',
10 |       author='Sergiusz Bleja',
11 |       author_email='duckgoose@bleja.org',
12 |       license='MIT',
13 |       long_description=long_description,
14 |       packages=['duckgoose'],
15 |       install_requires=['google-images-download'],
16 |       keywords=['fastai','image-classification', 'deep-learning', 'class-activation-maps'],
17 |       download_url='https://github.com/svenski/duckgoose/archive/0.1.8.tar.gz')
18 | 
19 | 
20 |       
21 | 


--------------------------------------------------------------------------------