├── .gitignore ├── README.md └── project ├── proj1 ├── Assignment 1.docx ├── code │ ├── __init__.py │ ├── proj1.ipynb │ ├── proj1_test_filtering.ipynb │ ├── student_code.py │ └── utils.py ├── data │ ├── bicycle.bmp │ ├── bird.bmp │ ├── cat.bmp │ ├── dog.bmp │ ├── einstein.bmp │ ├── fish.bmp │ ├── marilyn.bmp │ ├── motorcycle.bmp │ ├── plane.bmp │ └── submarine.bmp ├── results │ ├── blur_image.jpg │ ├── high_frequencies.jpg │ ├── high_pass_image.jpg │ ├── hybrid_image.jpg │ ├── hybrid_image_scales.jpg │ ├── identity_image.jpg │ ├── laplacian_image.jpg │ ├── large_blur_image.jpg │ ├── low_frequencies.jpg │ └── sobel_image.jpg └── zip_submission.py ├── proj2 ├── Assignment 2.pdf ├── annotate_correspondences │ ├── collect_ground_truth_corr.py │ ├── eval_file.pkl │ ├── sydney_opera_house1.jpg │ └── sydney_opera_house2.jpg ├── code │ ├── __init__.py │ ├── examples.py │ ├── proj2.ipynb │ ├── student_feature_matching.py │ ├── student_harris.py │ ├── student_sift.py │ └── utils.py ├── data │ ├── Episcopal Gaudi │ │ ├── 3743214471_1b5bbfda98_o.jpg │ │ └── 4386465943_8cf9776378_o.jpg │ ├── Mount Rushmore │ │ ├── 9021235130_7c2acd9554_o.jpg │ │ └── 9318872612_a255c874fb_o.jpg │ └── Notre Dame │ │ ├── 4191453057_c86028ce1f_o.jpg │ │ └── 921919841_a30df938f2_o.jpg └── results │ ├── circles0.jpg │ ├── circles1.jpg │ ├── circles2.jpg │ ├── eval.jpg │ ├── lines0.jpg │ ├── lines1.jpg │ ├── lines2.jpg │ ├── vis_circles.jpg │ └── vis_lines.jpg ├── proj3 ├── Assigment3.pdf └── code │ ├── __init__.py │ ├── assignment3.ipynb │ ├── student_code.py │ ├── utils.py │ └── vocab.pkl ├── proj4 ├── Assignment4.pdf └── code │ ├── __init__.py │ ├── proj5.ipynb │ ├── student_code.py │ └── utils.py └── proj5 ├── Assigment5.pdf └── code ├── Assignment5.ipynb ├── __init__.py ├── student_code.py ├── utils.py └── utils_gpu.py /.gitignore: -------------------------------------------------------------------------------- 1 | project/proj2/data/ 2 | project/proj3/data/ 3 | project/proj4/data/ 4 | project/proj5/data/ 5 | .ipynb_checkpoints/ 6 | *.pyc 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CS308-Computer-Vision 2 | 3 | ## Image Channels 4 | 5 | RGB -> HSV (Hue, Saturation, Value) 6 | 7 | - V = max 8 | - S = max - min / max 9 | - H = 10 | - if max = R, 60 * (G - B) / (max - min) 11 | - if max = G, 120 * (B - R) / (max - min) 12 | - if max = B, 240 * (R - G) / (max - min) 13 | 14 | ## Histogram Equalization 15 | 16 | **T(k) = floor((L - 1) sum(p0..k))** 17 | 18 | - L = intensity 19 | - pn = number of pixels with intensity n / total number of pixels 20 | 21 | ## Convolution 22 | 23 | ```python 24 | h, w, channels = image.shape 25 | fh, fw = kernel.shape[:2] 26 | pad_h = (fh - 1) // 2 27 | pad_w = (fw - 1) // 2 28 | image = np.pad(image, [(pad_h, pad_h), (pad_w, pad_w), (0, 0)], 'symmetric') 29 | 30 | kernel = kernel[..., None] # [fH, fW] -> [fH, fW, 1] 31 | output = np.zeros((h, w, channels), dtype='float32') 32 | for r in range(h): 33 | for c in range(w): 34 | # image patch: [fH, fW, 3] 35 | # `filter`: [fH, fW, 1] -> [fH, fW, 3] 36 | result = image[r:r+fh, c:c+fw] * kernel 37 | output[r, c, :] = np.sum(np.sum(result, axis=0), axis=0) 38 | ``` 39 | 40 | ## Filtering 41 | 42 | **Gaussian filter** 43 | 44 | - weights = 1/(2 * pi * std^2) * exp(-(x^2 + y^2) / (2 * std^2)) / max 45 | 46 | **Sobel filter** 47 | 48 | - mean = [1, 2, 1] 49 | - gradient = [1 0 -1] 50 | - horizontal sobel = [[1, 2, 1], 
[0, 0, 0], [-1, -2, -1]]
51 | - vertical sobel = [[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]]
52 | 
53 | Laplacian of Gaussian (LoG) filter
54 | 
55 | - 2nd derivative = ((x^2 + y^2) / std^4 - 2 / std^2) * G
56 | 
57 | Low pass
58 | 
59 | - Gaussian
60 | 
61 | High pass
62 | 
63 | - 1 - Gaussian
64 | 
65 | ## Fourier
66 | 
67 | DFT
68 | 
69 | - X(k) = sum(n=0..N-1, x(n) * exp(-2pi * j * k * n / N))
70 | 
71 | IDFT
72 | 
73 | - x(n) = 1/N * sum(k=0..N-1, X(k) * exp(2pi * j * k * n / N))
74 | 
75 | 2D DFT
76 | 
77 | - F(u, v) = sum(x=0..M-1, y=0..N-1, f(x, y) * exp(-2pi * j * (ux / M + vy / N)))
78 | 
79 | Gabor
80 | 
81 | ## Pyramid
82 | 
83 | Gaussian Pyramid
84 | 
85 | - scales (2^k)
86 | 
87 | - Gaussian filter with (2^k) stds
88 | 
89 | **Laplacian pyramid**
90 | 
91 | - interpolate
92 | 
93 | - DoG: G(k) - G(k-1)
94 | 
95 | Image Blending
96 | 
97 | - G(k) + L(k-1) + ... + L(1)
98 | 
99 | ## Transformation
100 | 
101 | Affine Transform
102 | 
103 | - I -> aI + b
104 | 
105 | Rotation Matrix
106 | 
107 | - R = [[cos, -sin], [sin, cos]]
108 | - [x', y'] = R * [x, y]
109 | 
110 | Warping
111 | 
112 | ## Keypoint Matching
113 | 
114 | Interest points
115 | 
116 | - repeatability
117 | - distinctiveness
118 | 
119 | **Harris Corner Detection**
120 | 
121 | - Change in appearance of window w(x,y) for the shift [u,v]:
122 | - E(u,v) = ∑ w(x, y)*[I(x+u, y+v) - I(x,y)]^2
123 | - w(x, y) = (1 or Gaussian) in window
124 | - Second-order Taylor expansion of E(u,v) about (0,0):
125 | - E(u,v) = [u, v] M [u, v].T
126 | - M = ∑ w(x, y) [[Ix^2, Ix * Iy], [Ix * Iy, Iy^2]]
127 | - M = [[grad(x)^2, grad(x) * grad(y)], [grad(x) * grad(y), grad(y)^2]]
128 | - Gaussian filtering
129 | - Corner response function
130 | - R = det(M) − a * trace(M)^2
131 | - R = grad(x)^2 * grad(y)^2 - [grad(x) * grad(y)]^2 - a * [grad(x)^2 + grad(y)^2]^2
132 | - R > threshold
133 | - Take the points of local maxima (non-maximum suppression)
134 | 
135 | - Invariance:
136 | - invariant to translation and rotation
137 | - not invariant to scaling
138 | 
139 | **SIFT Descriptor**
140 | 
141 | - gradient orientation histogram
142 | - 4x4x8=128 array weighted by gradient magnitude
143 | - define feature width for each keypoint
144 | - normalize, threshold, normalize
145 | 
146 | **Matching**
147 | 
148 | - Nearest Neighbor Distance Ratio
149 | - NNDR = d(NN1) / d(NN2) (distance to the nearest neighbor over distance to the second nearest)
150 | 
151 | ## RANSAC
152 | 
153 | - fit a model to random minimal samples, count inliers, keep the model with the most inliers
154 | 
155 | ## Hough Transform
156 | 
157 | shape detection
158 | 
159 | voting scheme
160 | 
161 | rho = x * cos(theta) + y * sin(theta)
162 | 
163 | ## Manifold Learning
164 | 
165 | Dimensionality Reduction
166 | 
167 | Unsupervised learning + continuous
168 | 
169 | Linear methods
170 | 
171 | - **Principal component analysis (PCA)**
172 | - the principal axes are the orthonormal axes onto which the variance retained under projection is maximal
173 | - cov(X, Y) = 1/n * ∑ (Xi - Xm) * (Yi - Ym)
174 | - Multidimensional scaling (MDS)
175 | 
176 | Nonlinear methods
177 | 
178 | - Kernel PCA
179 | - Locally linear embedding (LLE)
180 | - Isomap
181 | - Laplacian eigenmaps (LE)
182 | - **t-distributed stochastic neighbor embedding (t-SNE)**
183 | 
184 | ## Classification
185 | 
186 | Supervised learning + discrete
187 | 
188 | **Support Vector Machine**
189 | 
190 | - 2 classes
191 | 
192 | - for support vectors x+, x-: wx + b = +-1 -> w * (x+ - x-) = 2 -> Margin = 2 / |w|
193 | 
194 | - min{1/2 * |w|^2}, s.t. y(wx+b) >= 1
195 | 
196 | - Lagrange: a >= 0, L = 1/2 * |w|^2 - ∑ a * (y * (wx + b) - 1)
197 | 
198 | - dual problem: min(w,b) max(a>=0) L -> max(a>=0) min(w,b) L
199 | - derivatives = 0 -> w = ∑ a * y * x, ∑ a * y = 0
200 | - 
L = max(a>=0) ∑ a - 1/2 * ∑∑ {ai * aj * yi * yj * xi * xj} 201 | - KKT -> a(y(wx+b) - 1) = 0 (complementary slackness) 202 | - f = ∑ a * yi * xi * x + b 203 | - SMO algorithm 204 | 205 | - Soft margin 206 | 207 | - slack variables ξ 208 | 209 | - min{1/2 * |w|^2 + C * ∑ξ}, y(wx+b) >= 1 - ξ, ξ >=0 210 | - 0 <= a <= C 211 | 212 | - Non-linear classification 213 | 214 | - K(x, z) = Φ(x) * Φ(z), x = input space, z = feature space, Φ = feature map 215 | - K is semi-positive definite symmetric function 216 | - L = max(a>=0) ∑ a - 1/2 * ∑∑ {ai * aj * yi * yj * K(xi, xj)} 217 | - f = ∑ a * yi * K(xi, x) + b 218 | 219 | - Multi-class classification 220 | 221 | - m class -> m SVMs 222 | 223 | ## Clustering 224 | 225 | unsupervised learning + discrete 226 | 227 | - Applications 228 | - summary 229 | - counting 230 | - segmentation 231 | - prediction 232 | - K-means 233 | - Mean-shift -------------------------------------------------------------------------------- /project/proj1/Assignment 1.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj1/Assignment 1.docx -------------------------------------------------------------------------------- /project/proj1/code/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj1/code/__init__.py -------------------------------------------------------------------------------- /project/proj1/code/student_code.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | import utils 4 | 5 | def conv2d(img, krnl): 6 | krnl_h, krnl_w = krnl.shape[:2] 7 | pad_h = (krnl_h - 1) // 2 8 | pad_w = (krnl_w - 1) // 2 9 | img = np.pad(img, [(pad_h, pad_h), (pad_w, pad_w)], 'constant') # zero-padding 10 | img_h, img_w = img.shape[:2] 11 | shape = (img_h - krnl_h + 1, img_w - krnl_w + 1, krnl_h, krnl_w) 12 | strides = np.array([img_w, 1, img_w, 1]) * img.itemsize 13 | img = np.lib.stride_tricks.as_strided(img, shape, strides) 14 | return np.tensordot(img, krnl, axes=[(2, 3), (0, 1)]) 15 | 16 | 17 | def my_imfilter(image, filter): 18 | """ 19 | Apply a filter to an image. Return the filtered image. 20 | 21 | Args 22 | - image: numpy nd-array of dim (m, n, c) 23 | - filter: numpy nd-array of dim (k, k) 24 | Returns 25 | - filtered_image: numpy nd-array of dim (m, n, c) 26 | 27 | HINTS: 28 | - You may not use any libraries that do the work for you. Using numpy to work 29 | with matrices is fine and encouraged. Using opencv or similar to do the 30 | filtering for you is not allowed. 31 | - I encourage you to try implementing this naively first, just be aware that 32 | it may take an absurdly long time to run. You will need to get a function 33 | that takes a reasonable amount of time to run so that the TAs can verify 34 | your code works. 35 | - Remember these are RGB images, accounting for the final image dimension. 
36 | """ 37 | 38 | assert filter.shape[0] % 2 == 1 39 | assert filter.shape[1] % 2 == 1 40 | 41 | filtered_image = np.zeros_like(image) 42 | 43 | if len(image.shape) == 2: 44 | filtered_image = conv2d(image, filter) 45 | else: 46 | for i in range(image.shape[2]): 47 | filtered_image[:,:,i] = conv2d(image[:,:,i], filter) 48 | 49 | return filtered_image 50 | 51 | 52 | def create_hybrid_image(image1, image2, filter): 53 | """ 54 | Takes two images and creates a hybrid image. Returns the low 55 | frequency content of image1, the high frequency content of 56 | image 2, and the hybrid image. 57 | 58 | Args 59 | - image1: numpy nd-array of dim (m, n, c) 60 | - image2: numpy nd-array of dim (m, n, c) 61 | Returns 62 | - low_frequencies: numpy nd-array of dim (m, n, c) 63 | - high_frequencies: numpy nd-array of dim (m, n, c) 64 | - hybrid_image: numpy nd-array of dim (m, n, c) 65 | 66 | HINTS: 67 | - You will use your my_imfilter function in this function. 68 | - You can get just the high frequency content of an image by removing its low 69 | frequency content. Think about how to do this in mathematical terms. 70 | - Don't forget to make sure the pixel values are >= 0 and <= 1. This is known 71 | as 'clipping'. 72 | - If you want to use images with different dimensions, you should resize them 73 | in the notebook code. 74 | """ 75 | 76 | assert image1.shape[0] == image2.shape[0] 77 | assert image1.shape[1] == image2.shape[1] 78 | assert image1.shape[2] == image2.shape[2] 79 | 80 | low_frequencies = my_imfilter(image1, filter) 81 | high_frequencies = image2 - my_imfilter(image2, filter) 82 | hybrid_image = low_frequencies + high_frequencies 83 | 84 | # visualization 85 | high_frequencies += 1 - high_frequencies.max() 86 | 87 | return np.clip(low_frequencies, 0, 1), np.clip(high_frequencies, 0, 1), np.clip(hybrid_image, 0, 1) 88 | -------------------------------------------------------------------------------- /project/proj1/code/utils.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | 4 | def vis_hybrid_image(hybrid_image): 5 | scales = 5 6 | scale_factor = 0.5 7 | padding = 5 8 | original_height = hybrid_image.shape[0] 9 | num_colors = 1 if hybrid_image.ndim == 2 else 3 10 | 11 | output = np.copy(hybrid_image) 12 | cur_image = np.copy(hybrid_image) 13 | for scale in range(2, scales+1): 14 | # add padding 15 | output = np.hstack((output, np.ones((original_height, padding, num_colors), 16 | dtype=np.float32))) 17 | 18 | # downsample image 19 | cur_image = cv2.resize(cur_image, (0, 0), fx=scale_factor, fy=scale_factor) 20 | 21 | # pad the top to append to the output 22 | pad = np.ones((original_height-cur_image.shape[0], cur_image.shape[1], 23 | num_colors), dtype=np.float32) 24 | tmp = np.vstack((pad, cur_image)) 25 | output = np.hstack((output, tmp)) 26 | 27 | return output 28 | 29 | def im2single(im): 30 | im = im.astype(np.float32) / 255 31 | return im 32 | 33 | def single2im(im): 34 | im *= 255 35 | im = im.astype(np.uint8) 36 | return im 37 | 38 | def load_image(path): 39 | return im2single(cv2.imread(path))[:, :, ::-1] 40 | 41 | def save_image(path, im): 42 | return cv2.imwrite(path, single2im(im.copy())[:, :, ::-1]) 43 | 44 | def im_range(im): 45 | im -= im.min() 46 | im = im / im.max() 47 | return im 48 | 49 | -------------------------------------------------------------------------------- /project/proj1/data/bicycle.bmp: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj1/data/bicycle.bmp -------------------------------------------------------------------------------- /project/proj1/data/bird.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj1/data/bird.bmp -------------------------------------------------------------------------------- /project/proj1/data/cat.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj1/data/cat.bmp -------------------------------------------------------------------------------- /project/proj1/data/dog.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj1/data/dog.bmp -------------------------------------------------------------------------------- /project/proj1/data/einstein.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj1/data/einstein.bmp -------------------------------------------------------------------------------- /project/proj1/data/fish.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj1/data/fish.bmp -------------------------------------------------------------------------------- /project/proj1/data/marilyn.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj1/data/marilyn.bmp -------------------------------------------------------------------------------- /project/proj1/data/motorcycle.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj1/data/motorcycle.bmp -------------------------------------------------------------------------------- /project/proj1/data/plane.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj1/data/plane.bmp -------------------------------------------------------------------------------- /project/proj1/data/submarine.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj1/data/submarine.bmp -------------------------------------------------------------------------------- /project/proj1/results/blur_image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj1/results/blur_image.jpg -------------------------------------------------------------------------------- 
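The result images in this folder are produced by the proj1 filtering / hybrid-image code above (student_code.py and utils.py), driven by proj1.ipynb (not shown in this excerpt). Below is a minimal usage sketch; the dog/cat image pair, the cutoff frequency of 7, and running from inside project/proj1/code are illustrative assumptions, not values taken from the notebook.

```python
# Minimal sketch: build a Gaussian kernel, blur one image, and make a hybrid image.
# Assumes the chosen image pair has matching dimensions (resize otherwise, as the
# create_hybrid_image docstring notes).
import cv2
import numpy as np
from student_code import my_imfilter, create_hybrid_image
from utils import load_image, save_image, vis_hybrid_image

image1 = load_image('../data/dog.bmp')   # low-frequency source
image2 = load_image('../data/cat.bmp')   # high-frequency source

cutoff_frequency = 7                     # assumed value
k = cv2.getGaussianKernel(ksize=cutoff_frequency * 4 + 1, sigma=cutoff_frequency)
kernel = np.dot(k, k.T)                  # outer product of the 1D kernel -> 2D Gaussian

blur_image = my_imfilter(image2, kernel)
save_image('../results/blur_image.jpg', blur_image)

low_freq, high_freq, hybrid = create_hybrid_image(image1, image2, kernel)
save_image('../results/low_frequencies.jpg', low_freq)
save_image('../results/high_frequencies.jpg', high_freq)
save_image('../results/hybrid_image.jpg', hybrid)
save_image('../results/hybrid_image_scales.jpg', vis_hybrid_image(hybrid))
```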
/project/proj1/results/high_frequencies.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj1/results/high_frequencies.jpg -------------------------------------------------------------------------------- /project/proj1/results/high_pass_image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj1/results/high_pass_image.jpg -------------------------------------------------------------------------------- /project/proj1/results/hybrid_image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj1/results/hybrid_image.jpg -------------------------------------------------------------------------------- /project/proj1/results/hybrid_image_scales.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj1/results/hybrid_image_scales.jpg -------------------------------------------------------------------------------- /project/proj1/results/identity_image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj1/results/identity_image.jpg -------------------------------------------------------------------------------- /project/proj1/results/laplacian_image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj1/results/laplacian_image.jpg -------------------------------------------------------------------------------- /project/proj1/results/large_blur_image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj1/results/large_blur_image.jpg -------------------------------------------------------------------------------- /project/proj1/results/low_frequencies.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj1/results/low_frequencies.jpg -------------------------------------------------------------------------------- /project/proj1/results/sobel_image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj1/results/sobel_image.jpg -------------------------------------------------------------------------------- /project/proj1/zip_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | def copy_directory(src, dest): 5 | try: 6 | shutil.copytree(src, dest) 7 | except shutil.Error as e: 8 | print('Directory not copied. 
Error: %s' % e) 9 | except OSError as e: 10 | print('Directory not copied. Error: %s' % e) 11 | 12 | shutil.rmtree('temp_submission', ignore_errors=True) 13 | os.mkdir('temp_submission') 14 | for dir_name in ['code', 'results']: 15 | copy_directory(dir_name, '/'.join(['temp_submission', dir_name])) 16 | shutil.make_archive('submission', 'zip', 'temp_submission') 17 | shutil.rmtree('temp_submission', ignore_errors=True) -------------------------------------------------------------------------------- /project/proj2/Assignment 2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj2/Assignment 2.pdf -------------------------------------------------------------------------------- /project/proj2/annotate_correspondences/collect_ground_truth_corr.py: -------------------------------------------------------------------------------- 1 | 2 | # CS 6476 Computer Vision, Georgia Tech 3 | # Written by James Hays, John Lambert 4 | 5 | # This file is completely optional for the assignment, but is a way to provide 6 | # helpful service. 7 | 8 | # An interactive method to specify and then save many point correspondences 9 | # between two photographs, which will be used to generate a projective 10 | # transformation. 11 | 12 | # Pick a dozen corresponding points throughout the images, although more is 13 | # better. 14 | 15 | import pickle 16 | import cv2 17 | import matplotlib.pyplot as plt 18 | import numpy as np 19 | import sys 20 | import pdb 21 | FIGURE_HEIGHT = 6 22 | FIGURE_WIDTH = 10 23 | plt.rcParams["figure.figsize"] = (FIGURE_WIDTH,FIGURE_HEIGHT) 24 | 25 | sys.path.append('../code') 26 | 27 | from utils import load_image, show_correspondence_lines 28 | from pathlib import Path 29 | 30 | class CorrespondenceAnnotator(object): 31 | def __init__(self): 32 | 33 | self.image1 = load_image('./sydney_opera_house1.jpg') 34 | self.image2 = load_image('./sydney_opera_house2.jpg') 35 | self.corr_file = Path('./sydney_opera_house_correspondences.pkl') 36 | f, (ax1, ax2) = plt.subplots(1, 2, figsize=(FIGURE_WIDTH,FIGURE_HEIGHT)) 37 | self.ax1 = ax1 38 | self.ax2 = ax2 39 | self.x1 = [] # x locations in image 1 40 | self.y1 = [] # y locations in image 1 41 | self.x2 = [] # corresponding x locations in image 2 42 | self.y2 = [] # corresponding y locations in image 2 43 | 44 | def collect_ground_truth_corr(self): 45 | """ 46 | Collect ground truth image-to-image correspondences by manually annotating them. 47 | 48 | This function checks if some corresponding points are already saved, and 49 | if so, resumes work from there. 
50 | """ 51 | if self.corr_file.exists(): 52 | self.load_pkl_correspondences() 53 | 54 | # The correspondences that already exist 55 | corr_image = show_correspondence_lines( self.image1, self.image2, 56 | np.array(self.x1), np.array(self.y1), 57 | np.array(self.x2), np.array(self.y2)) 58 | else: 59 | self.x1 = [] 60 | self.y1 = [] 61 | self.x2 = [] 62 | self.y2 = [] 63 | 64 | self.ax1.imshow(self.image1) 65 | self.ax2.imshow(self.image2) 66 | 67 | self.mark_corrs_with_clicks() 68 | self.dump_pkl_correspondences() 69 | 70 | corr_image = show_correspondence_lines( self.image1, self.image2, 71 | np.array(self.x1), np.array(self.y1), 72 | np.array(self.x2), np.array(self.y2)) 73 | plt.gcf().clear() 74 | plt.imshow(corr_image) 75 | plt.show() 76 | 77 | def load_pkl_correspondences(self): 78 | with open(str(self.corr_file), 'rb') as f: 79 | d = pickle.load(f) 80 | 81 | self.x1 = d['x1'] 82 | self.y1 = d['y1'] 83 | self.x2 = d['x2'] 84 | self.y2 = d['y2'] 85 | 86 | def dump_pkl_correspondences(self): 87 | print('saving matched points') 88 | data_dict = {} 89 | data_dict['x1'] = self.x1 90 | data_dict['y1'] = self.y1 91 | data_dict['x2'] = self.x2 92 | data_dict['y2'] = self.y2 93 | 94 | with open(str(self.corr_file), 'wb') as f: 95 | pickle.dump(data_dict,f) 96 | 97 | def mark_corrs_with_clicks(self): 98 | """ 99 | Mark correspondences with clicks 100 | """ 101 | print('Exit the matplotlib window to stop annotation.') 102 | title = 'Click on a point in the left window\n' 103 | title += 'then on a point in the right window.\n' 104 | title += 'Exit the matplotlib window to stop annotation.\n' 105 | title += 'Afterwards, you will see the plotted correspondences.' 106 | self.ax1.set_title(title) 107 | while(1): 108 | pt = plt.ginput(1) 109 | if len(pt) == 0: 110 | break 111 | x = pt[0][0] 112 | y = pt[0][1] 113 | 114 | self.ax1.scatter(x,y,30,color='r', marker='o') 115 | self.x1 += [x] 116 | self.y1 += [y] 117 | 118 | pt = plt.ginput(1) 119 | if len(pt) == 0: 120 | break 121 | x = pt[0][0] 122 | y = pt[0][1] 123 | 124 | self.ax2.scatter(x,y,30,color='r', marker='o') 125 | self.x2 += [x] 126 | self.y2 += [y] 127 | 128 | print('({}, {}) matches to ({},{})'.format( self.x1[-1], 129 | self.y1[-1], 130 | self.x2[-1], 131 | self.y2[-1])) 132 | print('{} total points corresponded'.format(len(self.x1))) 133 | 134 | if __name__ == '__main__': 135 | ca = CorrespondenceAnnotator() 136 | ca.collect_ground_truth_corr() 137 | -------------------------------------------------------------------------------- /project/proj2/annotate_correspondences/eval_file.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj2/annotate_correspondences/eval_file.pkl -------------------------------------------------------------------------------- /project/proj2/annotate_correspondences/sydney_opera_house1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj2/annotate_correspondences/sydney_opera_house1.jpg -------------------------------------------------------------------------------- /project/proj2/annotate_correspondences/sydney_opera_house2.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj2/annotate_correspondences/sydney_opera_house2.jpg -------------------------------------------------------------------------------- /project/proj2/code/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj2/code/__init__.py -------------------------------------------------------------------------------- /project/proj2/code/examples.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import time 5 | from utils import * 6 | from student_feature_matching import match_features 7 | from student_sift import get_features 8 | from student_harris import get_interest_points 9 | 10 | 11 | scale_factor = 0.5 12 | feature_width = 16 # width and height of each local feature, in pixels. 13 | 14 | images = [ 15 | ['../data/Mount Rushmore/9021235130_7c2acd9554_o.jpg', 16 | '../data/Mount Rushmore/9318872612_a255c874fb_o.jpg'], 17 | ['../data/Episcopal Gaudi/4386465943_8cf9776378_o.jpg', 18 | '../data/Episcopal Gaudi/3743214471_1b5bbfda98_o.jpg'], 19 | ['../data/Capricho Gaudi/36185369_1dcbb23308_o.jpg', 20 | '../data/Capricho Gaudi/6727732233_4564516d61_o.jpg'] 21 | ] 22 | 23 | for i, pair in enumerate(images): 24 | image1 = load_image(pair[0]) 25 | image2 = load_image(pair[1]) 26 | 27 | print(f"\nstart matching image pair {i}") 28 | start_time = time.time() 29 | 30 | image1 = cv2.resize(image1, (0, 0), fx=scale_factor, fy=scale_factor) 31 | image2 = cv2.resize(image2, (0, 0), fx=scale_factor, fy=scale_factor) 32 | image1_bw = cv2.cvtColor(image1, cv2.COLOR_RGB2GRAY) 33 | image2_bw = cv2.cvtColor(image2, cv2.COLOR_RGB2GRAY) 34 | 35 | x1, y1, _, _, _ = get_interest_points(image1_bw, feature_width) 36 | x2, y2, _, _, _ = get_interest_points(image2_bw, feature_width) 37 | 38 | print('{:d} corners in image 1, {:d} corners in image 2'.format(len(x1), len(x2))) 39 | 40 | image1_features = get_features(image1_bw, x1, y1, feature_width) 41 | image2_features = get_features(image2_bw, x2, y2, feature_width) 42 | 43 | matches, _ = match_features( 44 | image1_features, image2_features, x1, y1, x2, y2) 45 | print('{:d} matches from {:d} corners'.format(len(matches), len(x1))) 46 | 47 | print(f"time cost: {time.time() - start_time}") 48 | 49 | num_pts_to_visualize = 100 50 | c1 = show_correspondence_circles(image1, image2, 51 | x1[matches[:num_pts_to_visualize, 0]], 52 | y1[matches[:num_pts_to_visualize, 0]], 53 | x2[matches[:num_pts_to_visualize, 1]], 54 | y2[matches[:num_pts_to_visualize, 1]]) 55 | plt.figure() 56 | plt.imshow(c1) 57 | plt.savefig(f'../results/circles{i}.jpg', dpi=1000) 58 | c2 = show_correspondence_lines(image1, image2, 59 | x1[matches[:num_pts_to_visualize, 0]], 60 | y1[matches[:num_pts_to_visualize, 0]], 61 | x2[matches[:num_pts_to_visualize, 1]], 62 | y2[matches[:num_pts_to_visualize, 1]]) 63 | plt.figure() 64 | plt.imshow(c2) 65 | plt.savefig(f'../results/lines{i}.jpg', dpi=1000) 66 | print("result saved") 67 | -------------------------------------------------------------------------------- /project/proj2/code/student_feature_matching.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def match_features(features1, features2, x1, y1, x2, y2): 5 
| """ 6 | This function does not need to be symmetric (e.g. it can produce 7 | different numbers of matches depending on the order of the arguments). 8 | 9 | To start with, simply implement the "ratio test", equation 4.18 in 10 | section 4.1.3 of Szeliski. There are a lot of repetitive features in 11 | these images, and all of their descriptors will look similar. The 12 | ratio test helps us resolve this issue (also see Figure 11 of David 13 | Lowe's IJCV paper). 14 | 15 | For extra credit you can implement various forms of spatial/geometric 16 | verification of matches, e.g. using the x and y locations of the features. 17 | 18 | Args: 19 | - features1: A numpy array of shape (n,feat_dim) representing one set of 20 | features, where feat_dim denotes the feature dimensionality 21 | - features2: A numpy array of shape (m,feat_dim) representing a second set 22 | features (m not necessarily equal to n) 23 | - x1: A numpy array of shape (n,) containing the x-locations of features1 24 | - y1: A numpy array of shape (n,) containing the y-locations of features1 25 | - x2: A numpy array of shape (m,) containing the x-locations of features2 26 | - y2: A numpy array of shape (m,) containing the y-locations of features2 27 | 28 | Returns: 29 | - matches: A numpy array of shape (k,2), where k is the number of matches. 30 | The first column is an index in features1, and the second column is 31 | an index in features2 32 | - confidences: A numpy array of shape (k,) with the real valued confidence for 33 | every match 34 | 35 | 'matches' and 'confidences' can be empty e.g. (0x2) and (0x1) 36 | """ 37 | threshold = 0.6 38 | max_matches = 100 39 | n = len(x1) 40 | m = len(x2) 41 | 42 | match_points = [] 43 | for i in range(n): 44 | distance = np.sum( 45 | np.square(np.tile(features1[i, :], (m, 1)) - features2), 1) 46 | argnn = np.argpartition(distance, 2)[:2] 47 | ratio = distance[argnn[0]] / distance[argnn[1]] 48 | if ratio < threshold: 49 | match_points.append([ratio, i, argnn[0]]) 50 | 51 | sorted_match_points = sorted(match_points, key=lambda x: x[0]) 52 | matches = np.array([[i[1], i[2]] 53 | for i in sorted_match_points[:max_matches]]).astype(int) 54 | confidences = np.array(i[0] for i in sorted_match_points[:max_matches]) 55 | 56 | return matches, confidences 57 | -------------------------------------------------------------------------------- /project/proj2/code/student_harris.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | def get_interest_points(image, feature_width): 7 | """ 8 | Implement the Harris corner detector (See Szeliski 4.1.1) to start with. 9 | You can create additional interest point detector functions (e.g. MSER) 10 | for extra credit. 11 | 12 | If you're finding spurious interest point detections near the boundaries, 13 | it is safe to simply suppress the gradients / corners near the edges of 14 | the image. 15 | 16 | Useful in this function in order to (a) suppress boundary interest 17 | points (where a feature wouldn't fit entirely in the image, anyway) 18 | or (b) scale the image filters being used. Or you can ignore it. 19 | 20 | By default you do not need to make scale and orientation invariant 21 | local features. 22 | 23 | The lecture slides and textbook are a bit vague on how to do the 24 | non-maximum suppression once you've thresholded the cornerness score. 25 | You are free to experiment. 
For example, you could compute connected 26 | components and take the maximum value within each component. 27 | Alternatively, you could run a max() operator on each sliding window. You 28 | could use this to ensure that every interest point is at a local maximum 29 | of cornerness. 30 | 31 | Args: 32 | - image: A numpy array of shape (m,n,c), 33 | image may be grayscale of color (your choice) 34 | - feature_width: integer representing the local feature width in pixels. 35 | 36 | Returns: 37 | - x: A numpy array of shape (N,) containing x-coordinates of interest points 38 | - y: A numpy array of shape (N,) containing y-coordinates of interest points 39 | - confidences (optional): numpy nd-array of dim (N,) containing the strength 40 | of each interest point 41 | - scales (optional): A numpy array of shape (N,) containing the scale at each 42 | interest point 43 | - orientations (optional): A numpy array of shape (N,) containing the orientation 44 | at each interest point 45 | """ 46 | confidences, scales, orientations = None, None, None 47 | 48 | # dest = cv2.cornerHarris(image, 5, 3, 0.06) 49 | # ind = np.argwhere(dest > 0.01 * dest.max()) 50 | # x = ind[:, 0] 51 | # y = ind[:, 1] 52 | 53 | ksize1 = 3 54 | ksize2 = 5 55 | sigma = 1.5 56 | k = 0.06 57 | threshold = 10000 58 | n = 1500 59 | 60 | # Compute the horizontal and vertical derivatives of the image I x and I y 61 | # by convolving the original image with derivatives of Gaussians 62 | ix = cv2.Sobel(image, -1, 1, 0, ksize=ksize1) 63 | iy = cv2.Sobel(image, -1, 0, 1, ksize=ksize1) 64 | 65 | # Compute the three images corresponding to the outer products of these gradients. 66 | # (The matrix A is symmetric, so only three entries are needed.) 67 | ixx = np.square(ix) 68 | iyy = np.square(iy) 69 | ixy = np.multiply(ix, iy) 70 | 71 | # Convolve each of these images with a larger Gaussian. 72 | gaussian = cv2.getGaussianKernel(ksize2, sigma) 73 | gxx = cv2.filter2D(ixx, -1, gaussian) 74 | gyy = cv2.filter2D(iyy, -1, gaussian) 75 | gxy = cv2.filter2D(ixy, -1, gaussian) 76 | 77 | # Compute a scalar interest measure using one of the formulas discussed above. 78 | # np.linalg.det(A) - k * (np.trace(A) ** 2) 79 | R = np.multiply(gxx, gyy) - np.square(gxy) - k * np.square(gxx + gyy) 80 | 81 | # Find local maxima above a certain threshold and report them as detected feature point locations. 
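    # Note on the block below: it ranks every pixel by its response R, keeps the
    # top `threshold` (=10000) candidates, and then applies adaptive non-maximum
    # suppression (each survivor's radius is its squared distance to the nearest
    # higher-ranked point; the `n` points with the largest radii are returned).
    # A commented-out, vectorized sketch of the simpler "local maxima above a
    # threshold" variant mentioned above (the 3x3 window and the relative
    # threshold of 0.01 * R.max() are illustrative assumptions):
    #
    #   local_max = (R == cv2.dilate(R, np.ones((3, 3), np.uint8)))
    #   candidates = np.argwhere(local_max & (R > 0.01 * R.max()))
    #   ys, xs = candidates[:, 0], candidates[:, 1]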
82 | corners = [] 83 | for row in range(R.shape[0]): 84 | for col in range(R.shape[1]): 85 | corners.append([R[row, col], col, row]) 86 | # threshold 87 | corners = np.array(sorted(corners, key=lambda x:x[0], reverse=True)[:threshold]) 88 | 89 | responses = corners[:, 0] 90 | x = corners[:, 1] 91 | y = corners[:, 2] 92 | 93 | # non-maxima suppress 94 | points = np.vstack([y, x]).T 95 | size = len(x) 96 | radii = np.zeros(size) 97 | radii[0] = np.inf 98 | for i in range(1, size): 99 | curr_response = responses[i] 100 | idx = i 101 | # while idx < size - 1: 102 | # if responses[idx+1] * 1.1 > curr_response: 103 | # idx += 1 104 | # else: 105 | # break 106 | radii[i] = np.min(np.sum(np.square(points[:idx] - points[i]), 1)) 107 | 108 | x = np.array(x[np.argpartition(radii, -n)[-n:]]) 109 | y = np.array(y[np.argpartition(radii, -n)[-n:]]) 110 | 111 | return x, y, confidences, scales, orientations 112 | -------------------------------------------------------------------------------- /project/proj2/code/student_sift.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | 4 | 5 | def get_features(image, x, y, feature_width, scales=None): 6 | """ 7 | To start with, you might want to simply use normalized patches as your 8 | local feature. This is very simple to code and works OK. However, to get 9 | full credit you will need to implement the more effective SIFT descriptor 10 | (See Szeliski 4.1.2 or the original publications at 11 | http://www.cs.ubc.ca/~lowe/keypoints/) 12 | 13 | Your implementation does not need to exactly match the SIFT reference. 14 | Here are the key properties your (baseline) descriptor should have: 15 | (1) a 4x4 grid of cells, each feature_width/4. It is simply the 16 | terminology used in the feature literature to describe the spatial 17 | bins where gradient distributions will be described. 18 | (2) each cell should have a histogram of the local distribution of 19 | gradients in 8 orientations. Appending these histograms together will 20 | give you 4x4 x 8 = 128 dimensions. 21 | (3) Each feature should be normalized to unit length. 22 | 23 | You do not need to perform the interpolation in which each gradient 24 | measurement contributes to multiple orientation bins in multiple cells 25 | As described in Szeliski, a single gradient measurement creates a 26 | weighted contribution to the 4 nearest cells and the 2 nearest 27 | orientation bins within each cell, for 8 total contributions. This type 28 | of interpolation probably will help, though. 29 | 30 | You do not have to explicitly compute the gradient orientation at each 31 | pixel (although you are free to do so). You can instead filter with 32 | oriented filters (e.g. a filter that responds to edges with a specific 33 | orientation). All of your SIFT-like feature can be constructed entirely 34 | from filtering fairly quickly in this way. 35 | 36 | You do not need to do the normalize -> threshold -> normalize again 37 | operation as detailed in Szeliski and the SIFT paper. It can help, though. 38 | 39 | Another simple trick which can help is to raise each element of the final 40 | feature vector to some power that is less than one. 41 | 42 | Args: 43 | - image: A numpy array of shape (m,n) or (m,n,c). 
can be grayscale or color, your choice 44 | - x: A numpy array of shape (k,), the x-coordinates of interest points 45 | - y: A numpy array of shape (k,), the y-coordinates of interest points 46 | - feature_width: integer representing the local feature width in pixels. 47 | You can assume that feature_width will be a multiple of 4 (i.e. every 48 | cell of your local SIFT-like feature will have an integer width 49 | and height). This is the initial window size we examine around 50 | each keypoint. 51 | - scales: Python list or tuple if you want to detect and describe features 52 | at multiple scales 53 | 54 | You may also detect and describe features at particular orientations. 55 | 56 | Returns: 57 | - fv: A numpy array of shape (k, feat_dim) representing a feature vector. 58 | "feat_dim" is the feature_dimensionality (e.g. 128 for standard SIFT). 59 | These are the computed features. 60 | """ 61 | assert image.ndim == 2, 'Image must be grayscale' 62 | ############################################################################# 63 | # TODO: YOUR CODE HERE # 64 | # If you choose to implement rotation invariance, enabling it should not # 65 | # decrease your matching accuracy. # 66 | ############################################################################# 67 | 68 | # features = 0 69 | # octave_layers = 3 70 | # contrast_threshold = 0.04 71 | # edge_threshold = 10 72 | # sigma = 1.6 73 | # sift_init_sigma = 0.5 74 | 75 | # # init Gaussian 76 | # sig_diff = np.sqrt( 77 | # max(sigma * sigma - sift_init_sigma * sift_init_sigma * 4, 0.01)) 78 | # resized = cv2.resize( 79 | # image, (2 * image.shape[1], 2 * image.shape[0]), interpolation=cv2.INTER_LINEAR) 80 | # base = cv2.GaussianBlur(resized, 0, sig_diff) 81 | 82 | # # number of octaves 83 | # # for( size_t i = 0; i < keypoints.size(); i++ ) 84 | # # { 85 | # # KeyPoint& kpt = keypoints[i]; 86 | # # float scale = 1.f/(float)(1 << -firstOctave); 87 | # # kpt.octave = (kpt.octave & ~255) | ((kpt.octave + firstOctave) & 255); 88 | # # kpt.pt *= scale; 89 | # # kpt.size *= scale; 90 | # # } 91 | # for i in range(len(x)): 92 | # scale = 0.5 93 | # octave = 0 94 | 95 | # octaves = int(round(np.log(min(base.shape)) / np.log(2) - 2)) + 1 96 | 97 | # # build Gaussian pyramid 98 | # sig_len = octave_layers + 3 99 | # sig = [sigma] * sig_len 100 | # gpyr = [None] * octaves * sig_len 101 | # k = np.pow(2, 1 / octave_layers) 102 | # for i in range(1, octave_layers + 3): 103 | # sig_prev = np.pow(k, i-1) * sigma 104 | # sig_total = sig_prev * k 105 | # sig[i] = np.sqrt(sig_total * sig_total - sig_prev * sig_prev) 106 | # for o in range(octaves): 107 | # for i in range(sig_len): 108 | # if o == 0 and i == 0: 109 | # gpyr[o * sig_len + i] = base 110 | # elif i == 0: 111 | # src = gpyr[(o - 1) * sig_len + octave_layers] 112 | # cv2.resize(src, (src.shape[1] // 2, src.shape[0] // 2), 113 | # gpyr[o * sig_len + i], interpolation=cv2.INTER_NEAREST) 114 | # else: 115 | # src = gpyr[o * sig_len + i - 1] 116 | # gpyr[o * sig_len + i] = cv2.GaussianBlur(src, 0, sig[i]) 117 | 118 | # # build DoG pyramid 119 | # dogpyr = [None] * octaves * (octave_layers + 2) 120 | # for a in range(len(dogpyr)): 121 | # o = a // (octave_layers + 2) 122 | # i = a % (octave_layers + 2) 123 | # src1 = gpyr[o*(octave_layers + 3) + i] 124 | # src2 = gpyr[o*(octave_layers + 3) + i + 1] 125 | # cv2.subtract(src1, src2, dogpyr[o*(octave_layers + 2) + i]) 126 | 127 | # SIFT descriptor 128 | ksize = 3 129 | n_angles = 8 130 | n_bins = 4 131 | n_samples = n_bins * n_bins 132 | n_pts = len(x) 
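    # Descriptor layout used below: a 4x4 grid of spatial cells (n_bins x n_bins)
    # over each feature_width x feature_width window, with n_angles = 8 orientation
    # bins per cell, giving the 4*4*8 = 128-D vector described in the docstring.
    # Orientations are quantized coarsely: theta = arctan2(iy, ix) has values above
    # 1 mapped to 2 and values below -1 mapped to -1, so ceil(theta + 1) picks one
    # of 4 bins, and the test on the sign of ix selects the first or second group
    # of 4 bins (8 bins total). The image is padded by feature_width // 2 so the
    # window indexed from each (y, x) keypoint stays in bounds. Finally each vector
    # is L2-normalized, clipped at `threshold`, and re-normalized
    # ("normalize, threshold, normalize").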
133 | threshold = 0.2 134 | 135 | fv = np.zeros((n_pts, n_angles * n_samples)) 136 | 137 | # padding 138 | image = np.pad(image, feature_width // 2) 139 | 140 | # histogram of oriented gradients 141 | ix = cv2.Sobel(image, cv2.CV_64F, 1, 0, ksize=ksize) 142 | iy = cv2.Sobel(image, cv2.CV_64F, 0, 1, ksize=ksize) 143 | magnitude = np.sqrt(np.square(ix) + np.square(iy)) 144 | theta = np.arctan2(iy, ix) 145 | 146 | theta[theta > 1] = 2 147 | theta[theta < -1] = -1 148 | x = x.astype(int) 149 | y = y.astype(int) 150 | 151 | for k in range(n_pts): 152 | histogram = np.zeros((n_bins, n_bins, n_angles)) 153 | for j in range(feature_width): 154 | for i in range(feature_width): 155 | curr = (y[k] + j, x[k] + i) 156 | if ix[curr] > 0: 157 | histogram[j // n_bins, i // n_bins, 158 | int(np.ceil(theta[curr] + 1))] += magnitude[curr] 159 | else: 160 | histogram[j // n_bins, i // n_bins, 161 | int(np.ceil(theta[curr] + 1 + n_angles//2))] += magnitude[curr] 162 | fv[k, :] = np.reshape(histogram, (1, n_angles * n_samples)) 163 | 164 | # normalize, threshold, normalize 165 | tmp = np.sqrt(np.sum(np.power(fv, 2), 1)) 166 | fv_norm = np.divide(fv, np.tile(tmp, (fv.shape[1], 1)).T) 167 | fv_norm[fv_norm > threshold] = threshold 168 | tmp = np.sqrt(np.sum(np.power(fv_norm, 2), 1)) 169 | fv = np.divide(fv_norm, np.tile(tmp, (fv.shape[1], 1)).T) 170 | return fv 171 | -------------------------------------------------------------------------------- /project/proj2/code/utils.py: -------------------------------------------------------------------------------- 1 | # Please do not modify this file. 2 | 3 | import numpy as np 4 | import cv2 5 | import pickle 6 | 7 | 8 | def im2single(im): 9 | im = im.astype(np.float32) / 255 10 | 11 | return im 12 | 13 | def single2im(im): 14 | im *= 255 15 | im = im.astype(np.uint8) 16 | 17 | return im 18 | 19 | def rgb2gray(rgb): 20 | """Convert RGB image to grayscale 21 | Args: 22 | - rgb: A numpy array of shape (m,n,c) representing an RGB image 23 | Returns: 24 | - gray: A numpy array of shape (m,n) representing the corresponding grayscale image 25 | """ 26 | return np.dot(rgb[...,:3], [0.299, 0.587, 0.144]) 27 | 28 | 29 | def load_image(path): 30 | """ 31 | Args: 32 | - path: string representing a filepath to an image 33 | """ 34 | return im2single(cv2.imread(path))[:, :, ::-1] 35 | 36 | def save_image(path, im): 37 | """ 38 | Args: 39 | - path: 40 | - im: A numpy array of shape 41 | """ 42 | return cv2.imwrite(path, single2im(im.copy())[:, :, ::-1]) 43 | 44 | def cheat_interest_points(eval_file, scale_factor): 45 | """ 46 | This function is provided for development and debugging but cannot be used in 47 | the final handin. It 'cheats' by generating interest points from known 48 | correspondences. It will only work for the 3 image pairs with known 49 | correspondences. 50 | 51 | Args: 52 | - eval_file: string representing the file path to the list of known correspondences 53 | - scale_factor: Python float representing the scale needed to map from the original 54 | image coordinates to the resolution being used for the current experiment. 
55 | 56 | Returns: 57 | - x1: A numpy array of shape (k,) containing ground truth x-coordinates of imgA correspondence pts 58 | - y1: A numpy array of shape (k,) containing ground truth y-coordinates of imgA correspondence pts 59 | - x2: A numpy array of shape (k,) containing ground truth x-coordinates of imgB correspondence pts 60 | - y2: A numpy array of shape (k,) containing ground truth y-coordinates of imgB correspondence pts 61 | """ 62 | with open(eval_file, 'rb') as f: 63 | d = pickle.load(f, encoding='latin1') 64 | 65 | return d['x1'] * scale_factor, d['y1'] * scale_factor, d['x2'] * scale_factor,\ 66 | d['y2'] * scale_factor 67 | 68 | def hstack_images(imgA, imgB): 69 | """ 70 | Stacks 2 images side-by-side and creates one combined image. 71 | 72 | Args: 73 | - imgA: A numpy array of shape (M,N,3) representing rgb image 74 | - imgB: A numpy array of shape (D,E,3) representing rgb image 75 | 76 | Returns: 77 | - newImg: A numpy array of shape (max(M,D), N+E, 3) 78 | """ 79 | Height = max(imgA.shape[0], imgB.shape[0]) 80 | Width = imgA.shape[1] + imgB.shape[1] 81 | 82 | newImg = np.zeros((Height, Width, 3), dtype=imgA.dtype) 83 | newImg[:imgA.shape[0], :imgA.shape[1], :] = imgA 84 | newImg[:imgB.shape[0], imgA.shape[1]:, :] = imgB 85 | 86 | return newImg 87 | 88 | def show_interest_points(img, X, Y): 89 | """ 90 | Visualized interest points on an image with random colors 91 | 92 | Args: 93 | - img: A numpy array of shape (M,N,C) 94 | - X: A numpy array of shape (k,) containing x-locations of interest points 95 | - Y: A numpy array of shape (k,) containing y-locations of interest points 96 | 97 | Returns: 98 | - newImg: A numpy array of shape (M,N,C) showing the original image with 99 | colored circles at keypoints plotted on top of it 100 | """ 101 | newImg = img.copy() 102 | for x, y in zip(X.astype(int), Y.astype(int)): 103 | cur_color = np.random.rand(3) 104 | newImg = cv2.circle(newImg, (x, y), 10, cur_color, -1, cv2.LINE_AA) 105 | 106 | return newImg 107 | 108 | def show_correspondence_circles(imgA, imgB, X1, Y1, X2, Y2): 109 | """ 110 | Visualizes corresponding points between two images by plotting circles at 111 | each correspondence location. Corresponding points will have the same random color. 
112 | 113 | Args: 114 | - imgA: A numpy array of shape (M,N,3) 115 | - imgB: A numpy array of shape (D,E,3) 116 | - x1: A numpy array of shape (k,) containing x-locations of keypoints in imgA 117 | - y1: A numpy array of shape (k,) containing y-locations of keypoints in imgA 118 | - x2: A numpy array of shape (j,) containing x-locations of keypoints in imgB 119 | - y2: A numpy array of shape (j,) containing y-locations of keypoints in imgB 120 | 121 | Returns: 122 | - newImg: A numpy array of shape (max(M,D), N+E, 3) 123 | """ 124 | newImg = hstack_images(imgA, imgB) 125 | shiftX = imgA.shape[1] 126 | X1 = X1.astype(np.int) 127 | Y1 = Y1.astype(np.int) 128 | X2 = X2.astype(np.int) 129 | Y2 = Y2.astype(np.int) 130 | 131 | for x1, y1, x2, y2 in zip(X1, Y1, X2, Y2): 132 | cur_color = np.random.rand(3) 133 | green = (0, 1, 0) 134 | newImg = cv2.circle(newImg, (x1, y1), 10, cur_color, -1, cv2.LINE_AA) 135 | newImg = cv2.circle(newImg, (x1, y1), 10, green, 2, cv2.LINE_AA) 136 | newImg = cv2.circle(newImg, (x2+shiftX, y2), 10, cur_color, -1, cv2.LINE_AA) 137 | newImg = cv2.circle(newImg, (x2+shiftX, y2), 10, green, 2, cv2.LINE_AA) 138 | 139 | return newImg 140 | 141 | def show_correspondence_lines(imgA, imgB, X1, Y1, X2, Y2, line_colors=None): 142 | """ 143 | Visualizes corresponding points between two images by drawing a line segment 144 | between the two images for each (x1,y1) (x2,y2) pair. 145 | 146 | Args: 147 | - imgA: A numpy array of shape (M,N,3) 148 | - imgB: A numpy array of shape (D,E,3) 149 | - x1: A numpy array of shape (k,) containing x-locations of keypoints in imgA 150 | - y1: A numpy array of shape (k,) containing y-locations of keypoints in imgA 151 | - x2: A numpy array of shape (j,) containing x-locations of keypoints in imgB 152 | - y2: A numpy array of shape (j,) containing y-locations of keypoints in imgB 153 | - line_colors: A numpy array of shape (N x 3) with colors of correspondence lines (optional) 154 | 155 | Returns: 156 | - newImg: A numpy array of shape (max(M,D), N+E, 3) 157 | """ 158 | newImg = hstack_images(imgA, imgB) 159 | shiftX = imgA.shape[1] 160 | X1 = X1.astype(np.int) 161 | Y1 = Y1.astype(np.int) 162 | X2 = X2.astype(np.int) 163 | Y2 = Y2.astype(np.int) 164 | 165 | dot_colors = np.random.rand(len(X1), 3) 166 | if line_colors is None: 167 | line_colors = dot_colors 168 | 169 | for x1, y1, x2, y2, dot_color, line_color in zip(X1, Y1, X2, Y2, dot_colors, 170 | line_colors): 171 | newImg = cv2.circle(newImg, (x1, y1), 5, dot_color, -1) 172 | newImg = cv2.circle(newImg, (x2+shiftX, y2), 5, dot_color, -1) 173 | newImg = cv2.line(newImg, (x1, y1), (x2+shiftX, y2), line_color, 2, 174 | cv2.LINE_AA) 175 | return newImg 176 | 177 | def show_ground_truth_corr(imgA, imgB, corr_file, show_lines=True): 178 | """ 179 | Show the ground truth correspondeces 180 | 181 | Args: 182 | - imgA: string, representing the filepath to the first image 183 | - imgB: string, representing the filepath to the second image 184 | - corr_file: filepath to pickle (.pkl) file containing the correspondences 185 | - show_lines: boolean, whether to visualize the correspondences as line segments 186 | """ 187 | imgA = load_image(imgA) 188 | imgB = load_image(imgB) 189 | with open(corr_file, 'rb') as f: 190 | d = pickle.load(f) 191 | if show_lines: 192 | return show_correspondence_lines(imgA, imgB, d['x1'], d['y1'], d['x2'], d['y2']) 193 | else: 194 | # show circles 195 | return show_correspondence_circles(imgA, imgB, d['x1'], d['y1'], d['x2'], d['y2']) 196 | 197 | def 
load_corr_pkl_file(corr_fpath): 198 | """ Load ground truth correspondences from a pickle (.pkl) file. """ 199 | with open(corr_fpath, 'rb') as f: 200 | d = pickle.load(f, encoding='latin1') 201 | x1 = d['x1'].squeeze() 202 | y1 = d['y1'].squeeze() 203 | x2 = d['x2'].squeeze() 204 | y2 = d['y2'].squeeze() 205 | 206 | return x1,y1,x2,y2 207 | 208 | 209 | def evaluate_correspondence(imgA, imgB, corr_fpath, scale_factor, x1_est, y1_est, 210 | x2_est, y2_est, confidences=None, num_req_matches=100): 211 | """ 212 | Function to evaluate estimated correspondences against ground truth. 213 | 214 | The evaluation requires 100 matches to receive full credit 215 | when num_req_matches=100 because we define accuracy as: 216 | 217 | Accuracy = (true_pos)/(true_pos+false_pos) * min(num_matches,num_req_matches)/num_req_matches 218 | 219 | Args: 220 | - imgA: A numpy array of shape (M,N,C) representing a first image 221 | - imgB: A numpy array of shape (M,N,C) representing a second image 222 | - corr_fpath: string, representing a filepath to a .pkl file containing ground truth correspondences 223 | - scale_factor: scale factor on the size of the images 224 | - x1_est: A numpy array of shape (k,) containing estimated x-coordinates of imgA correspondence pts 225 | - y1_est: A numpy array of shape (k,) containing estimated y-coordinates of imgA correspondence pts 226 | - x2_est: A numpy array of shape (k,) containing estimated x-coordinates of imgB correspondence pts 227 | - y2_est: A numpy array of shape (k,) containing estimated y-coordinates of imgB correspondence pts 228 | - confidences: (optional) confidence values in the matches 229 | """ 230 | if confidences is None: 231 | confidences = np.random.rand(len(x1_est)) 232 | confidences /= np.max(confidences) 233 | 234 | x1_est = x1_est.squeeze() / scale_factor 235 | y1_est = y1_est.squeeze() / scale_factor 236 | x2_est = x2_est.squeeze() / scale_factor 237 | y2_est = y2_est.squeeze() / scale_factor 238 | 239 | num_matches = x1_est.shape[0] 240 | 241 | x1,y1,x2,y2 = load_corr_pkl_file(corr_fpath) 242 | 243 | good_matches = [False for _ in range(len(x1_est))] 244 | # array marking which GT pairs are already matched 245 | matched = [False for _ in range(len(x1))] 246 | 247 | # iterate through estimated pairs in decreasing order of confidence 248 | priority = np.argsort(-confidences) 249 | for i in priority: 250 | # print('Examining ({:4.0f}, {:4.0f}) to ({:4.0f}, {:4.0f})'.format( 251 | # x1_est[i], y1_est[i], x2_est[i], y2_est[i])) 252 | cur_offset = np.asarray([x1_est[i]-x2_est[i], y1_est[i]-y2_est[i]]) 253 | # for each x1_est find nearest ground truth point in x1 254 | dists = np.linalg.norm(np.vstack((x1_est[i]-x1, y1_est[i]-y1)), axis=0) 255 | best_matches = np.argsort(dists) 256 | 257 | # find the best match that is not taken yet 258 | for match_idx in best_matches: 259 | if not matched[match_idx]: 260 | break 261 | else: 262 | continue 263 | 264 | # A match is good only if 265 | # (1) An unmatched GT point exists within 150 pixels, and 266 | # (2) GT correspondence offset is within 25 pixels of estimated 267 | # correspondence offset 268 | gt_offset = np.asarray([x1[match_idx]-x2[match_idx], 269 | y1[match_idx]-y2[match_idx]]) 270 | offset_dist = np.linalg.norm(cur_offset-gt_offset) 271 | if (dists[match_idx] < 150.0) and (offset_dist < 25): 272 | good_matches[i] = True 273 | print('Correct') 274 | else: 275 | print('Incorrect') 276 | 277 | print('You found {}/{} required matches'.format(num_matches, num_req_matches)) 278 | accuracy = 
np.mean(good_matches) * min(num_matches, num_req_matches)*1./num_req_matches 279 | print('Accuracy = {:f}'.format(accuracy)) 280 | green = np.asarray([0, 1, 0], dtype=float) 281 | red = np.asarray([1, 0, 0], dtype=float) 282 | line_colors = np.asarray([green if m else red for m in good_matches]) 283 | 284 | return accuracy, show_correspondence_lines(imgA, imgB, 285 | x1_est*scale_factor, y1_est*scale_factor, 286 | x2_est*scale_factor, y2_est*scale_factor, 287 | line_colors) 288 | -------------------------------------------------------------------------------- /project/proj2/data/Episcopal Gaudi/3743214471_1b5bbfda98_o.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj2/data/Episcopal Gaudi/3743214471_1b5bbfda98_o.jpg -------------------------------------------------------------------------------- /project/proj2/data/Episcopal Gaudi/4386465943_8cf9776378_o.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj2/data/Episcopal Gaudi/4386465943_8cf9776378_o.jpg -------------------------------------------------------------------------------- /project/proj2/data/Mount Rushmore/9021235130_7c2acd9554_o.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj2/data/Mount Rushmore/9021235130_7c2acd9554_o.jpg -------------------------------------------------------------------------------- /project/proj2/data/Mount Rushmore/9318872612_a255c874fb_o.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj2/data/Mount Rushmore/9318872612_a255c874fb_o.jpg -------------------------------------------------------------------------------- /project/proj2/data/Notre Dame/4191453057_c86028ce1f_o.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj2/data/Notre Dame/4191453057_c86028ce1f_o.jpg -------------------------------------------------------------------------------- /project/proj2/data/Notre Dame/921919841_a30df938f2_o.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj2/data/Notre Dame/921919841_a30df938f2_o.jpg -------------------------------------------------------------------------------- /project/proj2/results/circles0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj2/results/circles0.jpg -------------------------------------------------------------------------------- /project/proj2/results/circles1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj2/results/circles1.jpg 
-------------------------------------------------------------------------------- /project/proj2/results/circles2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj2/results/circles2.jpg -------------------------------------------------------------------------------- /project/proj2/results/eval.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj2/results/eval.jpg -------------------------------------------------------------------------------- /project/proj2/results/lines0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj2/results/lines0.jpg -------------------------------------------------------------------------------- /project/proj2/results/lines1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj2/results/lines1.jpg -------------------------------------------------------------------------------- /project/proj2/results/lines2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj2/results/lines2.jpg -------------------------------------------------------------------------------- /project/proj2/results/vis_circles.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj2/results/vis_circles.jpg -------------------------------------------------------------------------------- /project/proj2/results/vis_lines.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj2/results/vis_lines.jpg -------------------------------------------------------------------------------- /project/proj3/Assigment3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj3/Assigment3.pdf -------------------------------------------------------------------------------- /project/proj3/code/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /project/proj3/code/student_code.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import pickle 4 | from utils import load_image, load_image_gray 5 | import cyvlfeat as vlfeat 6 | import sklearn.metrics.pairwise as sklearn_pairwise 7 | from sklearn.svm import LinearSVC 8 | from IPython.core.debugger import set_trace 9 | from cyvlfeat.sift.dsift import dsift 10 | from cyvlfeat.kmeans import kmeans 11 | from time import time 12 | from joblib import Parallel, delayed, 
parallel_backend 13 | 14 | 15 | def get_tiny_images(image_paths): 16 | """ 17 | This feature is inspired by the simple tiny images used as features in 18 | 80 million tiny images: a large dataset for non-parametric object and 19 | scene recognition. A. Torralba, R. Fergus, W. T. Freeman. IEEE 20 | Transactions on Pattern Analysis and Machine Intelligence, vol.30(11), 21 | pp. 1958-1970, 2008. http://groups.csail.mit.edu/vision/TinyImages/ 22 | 23 | To build a tiny image feature, simply resize the original image to a very 24 | small square resolution, e.g. 16x16. You can either resize the images to 25 | square while ignoring their aspect ratio or you can crop the center 26 | square portion out of each image. Making the tiny images zero mean and 27 | unit length (normalizing them) will increase performance modestly. 28 | 29 | Useful functions: 30 | - cv2.resize 31 | - use load_image(path) to load a RGB images and load_image_gray(path) to 32 | load grayscale images 33 | 34 | Args: 35 | - image_paths: list of N elements containing image paths 36 | 37 | Returns: 38 | - feats: N x d numpy array of resized and then vectorized tiny images 39 | e.g. if the images are resized to 16x16, d would be 256 40 | """ 41 | # parameter 42 | width = 16 43 | 44 | N = len(image_paths) 45 | d = width * width 46 | # dummy feats variable 47 | feats = np.zeros((N, d)) 48 | for i in range(N): 49 | image = load_image_gray(image_paths[i]) 50 | image = cv2.resize(image, (width, width), 51 | interpolation=cv2.INTER_LINEAR) 52 | image = np.reshape(image, (1, d)) 53 | image -= np.mean(image) 54 | image_normalized = image / np.std(image) 55 | feats[i, :] = image_normalized 56 | return feats 57 | 58 | 59 | def build_vocabulary(image_paths, vocab_size): 60 | """ 61 | This function will sample SIFT descriptors from the training images, 62 | cluster them with kmeans, and then return the cluster centers. 63 | 64 | Useful functions: 65 | - Use load_image(path) to load RGB images and load_image_gray(path) to load 66 | grayscale images 67 | - frames, descriptors = vlfeat.sift.dsift(img) 68 | http://www.vlfeat.org/matlab/vl_dsift.html 69 | - frames is a N x 2 matrix of locations, which can be thrown away 70 | here (but possibly used for extra credit in get_bags_of_sifts if 71 | you're making a "spatial pyramid"). 72 | - descriptors is a N x 128 matrix of SIFT features 73 | Note: there are step, bin size, and smoothing parameters you can 74 | manipulate for dsift(). We recommend debugging with the 'fast' 75 | parameter. This approximate version of SIFT is about 20 times faster to 76 | compute. Also, be sure not to use the default value of step size. It 77 | will be very slow and you'll see relatively little performance gain 78 | from extremely dense sampling. You are welcome to use your own SIFT 79 | feature code! It will probably be slower, though. 80 | - cluster_centers = vlfeat.kmeans.kmeans(X, K) 81 | http://www.vlfeat.org/matlab/vl_kmeans.html 82 | - X is a N x d numpy array of sampled SIFT features, where N is 83 | the number of features sampled. N should be pretty large! 84 | - K is the number of clusters desired (vocab_size) 85 | cluster_centers is a K x d matrix of cluster centers. This is 86 | your vocabulary. 87 | 88 | Args: 89 | - image_paths: list of image paths. 90 | - vocab_size: size of vocabulary 91 | 92 | Returns: 93 | - vocab: This is a vocab_size x d numpy array (vocabulary). Each row is a 94 | cluster center / visual word 95 | """ 96 | # Load images from the training set. 
To save computation time, you don't 97 | # necessarily need to sample from all images, although it would be better 98 | # to do so. You can randomly sample the descriptors from each image to save 99 | # memory and speed up the clustering. Or you can simply call vl_dsift with 100 | # a large step size here, but a smaller step size in get_bags_of_sifts. 101 | # 102 | # For each loaded image, get some SIFT features. You don't have to get as 103 | # many SIFT features as you will in get_bags_of_sift, because you're only 104 | # trying to get a representative sample here. 105 | # 106 | # Once you have tens of thousands of SIFT features from many training 107 | # images, cluster them with kmeans. The resulting centroids are now your 108 | # visual word vocabulary. 109 | 110 | # length of the SIFT descriptors that you are going to compute. 111 | dim = 128 112 | vocab = np.zeros((vocab_size, dim)) 113 | 114 | # parameters 115 | step = 10 116 | sample = 200 117 | 118 | N = len(image_paths) 119 | features = np.zeros((sample * N, dim)) 120 | 121 | def parallel_func(image_path, sample): 122 | image = load_image_gray(image_path) 123 | _, descriptors = vlfeat.sift.dsift(image, fast=True, step=step) 124 | sample_idx = np.random.permutation(descriptors.shape[0]) 125 | return descriptors[sample_idx[:sample], :] 126 | 127 | results = Parallel(n_jobs=-1)(delayed(parallel_func)(image_path, sample) 128 | for image_path in image_paths) 129 | 130 | idx = 0 131 | for result in results: 132 | features[idx:sample + idx, :] = result 133 | idx += sample 134 | 135 | vocab = vlfeat.kmeans.kmeans(features, vocab_size) 136 | 137 | return vocab 138 | 139 | 140 | def get_bags_of_sifts(image_paths, vocab_filename): 141 | """ 142 | This feature representation is described in the handout, lecture 143 | materials, and Szeliski chapter 14. 144 | You will want to construct SIFT features here in the same way you 145 | did in build_vocabulary() (except for possibly changing the sampling 146 | rate) and then assign each local feature to its nearest cluster center 147 | and build a histogram indicating how many times each cluster was used. 148 | Don't forget to normalize the histogram, or else a larger image with more 149 | SIFT features will look very different from a smaller version of the same 150 | image. 151 | 152 | Useful functions: 153 | - Use load_image(path) to load RGB images and load_image_gray(path) to load 154 | grayscale images 155 | - frames, descriptors = vlfeat.sift.dsift(img) 156 | http://www.vlfeat.org/matlab/vl_dsift.html 157 | frames is a M x 2 matrix of locations, which can be thrown away here 158 | (but possibly used for extra credit in get_bags_of_sifts if you're 159 | making a "spatial pyramid"). 160 | descriptors is a M x 128 matrix of SIFT features 161 | note: there are step, bin size, and smoothing parameters you can 162 | manipulate for dsift(). We recommend debugging with the 'fast' 163 | parameter. This approximate version of SIFT is about 20 times faster 164 | to compute. Also, be sure not to use the default value of step size. 165 | It will be very slow and you'll see relatively little performance 166 | gain from extremely dense sampling. You are welcome to use your own 167 | SIFT feature code! It will probably be slower, though. 
168 | - assignments = vlfeat.kmeans.kmeans_quantize(data, vocab) 169 | finds the cluster assigments for features in data 170 | - data is a M x d matrix of image features 171 | - vocab is the vocab_size x d matrix of cluster centers 172 | (vocabulary) 173 | - assignments is a Mx1 array of assignments of feature vectors to 174 | nearest cluster centers, each element is an integer in 175 | [0, vocab_size) 176 | 177 | Args: 178 | - image_paths: paths to N images 179 | - vocab_filename: Path to the precomputed vocabulary. 180 | This function assumes that vocab_filename exists and contains an 181 | vocab_size x 128 ndarray 'vocab' where each row is a kmeans centroid 182 | or visual word. This ndarray is saved to disk rather than passed in 183 | as a parameter to avoid recomputing the vocabulary every run. 184 | 185 | Returns: 186 | - image_feats: N x d matrix, where d is the dimensionality of the 187 | feature representation. In this case, d will equal the number of 188 | clusters or equivalently the number of entries in each image's 189 | histogram (vocab_size) below. 190 | """ 191 | # load vocabulary 192 | with open(vocab_filename, 'rb') as f: 193 | vocab = pickle.load(f) 194 | 195 | # parameter 196 | step = 10 197 | 198 | N = len(image_paths) 199 | vocab_size = vocab.shape[0] 200 | 201 | # dummy features variable 202 | feats = np.zeros((N, vocab_size)) 203 | 204 | def parallel_func(i, image_paths, step, vocab, vocab_size): 205 | image = load_image_gray(image_paths[i]) 206 | _, descriptors = vlfeat.sift.dsift(image, fast=True, step=step) 207 | assignments = vlfeat.kmeans.kmeans_quantize( 208 | descriptors.astype('float64'), vocab) 209 | bags_of_sifts = np.zeros((1, vocab_size)) 210 | for assignment in assignments: 211 | bags_of_sifts[0, assignment] += 1 212 | return bags_of_sifts / np.linalg.norm(bags_of_sifts) 213 | 214 | result = Parallel(n_jobs=-1)(delayed(parallel_func)(i, image_paths, step, vocab, vocab_size) 215 | for i in range(N)) 216 | 217 | for i in range(N): 218 | feats[i, :] = result[i] 219 | 220 | return feats 221 | 222 | 223 | def nearest_neighbor_classify(train_image_feats, train_labels, test_image_feats, 224 | metric='euclidean'): 225 | """ 226 | This function will predict the category for every test image by finding 227 | the training image with most similar features. Instead of 1 nearest 228 | neighbor, you can vote based on k nearest neighbors which will increase 229 | performance (although you need to pick a reasonable value for k). 230 | 231 | Useful functions: 232 | - D = sklearn_pairwise.pairwise_distances(X, Y) 233 | computes the distance matrix D between all pairs of rows in X and Y. 234 | - X is a N x d numpy array of d-dimensional features arranged along 235 | N rows 236 | - Y is a M x d numpy array of d-dimensional features arranged along 237 | N rows 238 | - D is a N x M numpy array where d(i, j) is the distance between row 239 | i of X and row j of Y 240 | 241 | Args: 242 | - train_image_feats: N x d numpy array, where d is the dimensionality of 243 | the feature representation 244 | - train_labels: N element list, where each entry is a string indicating 245 | the ground truth category for each training image 246 | - test_image_feats: M x d numpy array, where d is the dimensionality of the 247 | feature representation. You can assume N = M, unless you have changed 248 | the starter code 249 | - metric: (optional) metric to be used for nearest neighbor. 250 | Can be used to select different distance functions. 
The default 251 | metric, 'euclidean' is fine for tiny images. 'chi2' tends to work 252 | well for histograms 253 | 254 | Returns: 255 | - test_labels: M element list, where each entry is a string indicating the 256 | predicted category for each testing image 257 | """ 258 | 259 | # parameter 260 | k = 5 261 | 262 | N = len(train_labels) 263 | M = test_image_feats.shape[0] 264 | test_labels = [None] * M 265 | 266 | # distance 267 | D = sklearn_pairwise.pairwise_distances( 268 | test_image_feats, train_image_feats, metric, n_jobs=2) 269 | 270 | string_int_dict = {} 271 | int_to_string_dict = {} 272 | int_train_labels = [0] * N 273 | counter = 0 274 | for i in range(N): 275 | key = train_labels[i] 276 | if key in string_int_dict: 277 | _id = string_int_dict[key] 278 | else: 279 | _id = counter 280 | counter += 1 281 | string_int_dict[key] = _id 282 | int_to_string_dict[_id] = train_labels[i] 283 | int_train_labels[i] = _id 284 | 285 | # find knn 286 | for i in range(M): 287 | knn = np.argpartition(D[i, :], k, axis=0)[:k] 288 | knn_labels = np.take(int_train_labels, knn) 289 | test_labels[i] = int_to_string_dict[np.argmax(np.bincount(knn_labels))] 290 | 291 | return test_labels 292 | 293 | 294 | def svm_classify(train_image_feats, train_labels, test_image_feats): 295 | """ 296 | This function will train a linear SVM for every category (i.e. one vs all) 297 | and then use the learned linear classifiers to predict the category of 298 | every test image. Every test feature will be evaluated with all 15 SVMs 299 | and the most confident SVM will "win". Confidence, or distance from the 300 | margin, is W*X + B where '*' is the inner product or dot product and W and 301 | B are the learned hyperplane parameters. 302 | 303 | Useful functions: 304 | - sklearn LinearSVC 305 | http://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html 306 | - svm.fit(X, y) 307 | - set(l) 308 | 309 | Args: 310 | - train_image_feats: N x d numpy array, where d is the dimensionality of 311 | the feature representation 312 | - train_labels: N element list, where each entry is a string indicating the 313 | ground truth category for each training image 314 | - test_image_feats: M x d numpy array, where d is the dimensionality of the 315 | feature representation. 
You can assume N = M, unless you have changed 316 | the starter code 317 | Returns: 318 | - test_labels: M element list, where each entry is a string indicating the 319 | predicted category for each testing image 320 | """ 321 | # categories 322 | categories = list(set(train_labels)) 323 | 324 | # construct 1 vs all SVMs for each category 325 | svms = {cat: LinearSVC(random_state=0, tol=1e-3, loss='hinge', C=5) 326 | for cat in categories} 327 | 328 | test_labels = [] 329 | 330 | N = train_image_feats.shape[0] 331 | M = test_image_feats.shape[0] 332 | C = len(categories) 333 | confidence_scores = np.zeros((M, C)) 334 | 335 | for i in range(C): 336 | cat = categories[i] 337 | y_train = np.zeros(N) 338 | for j in range(N): 339 | if cat == train_labels[j]: 340 | y_train[j] = 1 341 | svms[cat].fit(train_image_feats, y_train) 342 | confidence_scores[:, i] = svms[cat].decision_function(test_image_feats) 343 | 344 | for i in range(M): 345 | test_labels.append(categories[np.argmax(confidence_scores[i, :])]) 346 | 347 | return test_labels 348 | -------------------------------------------------------------------------------- /project/proj3/code/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | from sklearn.metrics import confusion_matrix 4 | import matplotlib.pyplot as plt 5 | import os.path as osp 6 | from glob import glob 7 | from random import shuffle 8 | 9 | 10 | def im2single(im): 11 | im = im.astype(np.float32) / 255 12 | return im 13 | 14 | 15 | def single2im(im): 16 | im *= 255 17 | im = im.astype(np.uint8) 18 | return im 19 | 20 | 21 | def load_image(path): 22 | return im2single(cv2.imread(path))[:, :, ::-1] 23 | 24 | 25 | def load_image_gray(path): 26 | img = load_image(path) 27 | return cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) 28 | 29 | 30 | def get_image_paths(data_path, categories, num_train_per_cat=100, fmt='jpg'): 31 | """ 32 | This function returns lists containing the file path for each train 33 | and test image, as well as listss with the label of each train and 34 | test image. By default all four of these arrays will have 1500 35 | elements where each element is a string. 
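    Editor's note (added for clarity; not part of the original docstring): the
    implementation below expects images laid out as
    <data_path>/train/<category>/*.<fmt> and <data_path>/test/<category>/*.<fmt>,
    and it draws at most num_train_per_cat shuffled image paths per category
    from each of the two splits.
    Example (editor's sketch; the path and category names are placeholders,
    not values taken from this repository):
        train_paths, test_paths, train_labels, test_labels = get_image_paths(
            '../data', ['Kitchen', 'Bedroom'], num_train_per_cat=100)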
36 | :param data_path: path to the 'test' and 'train' directories 37 | :param categories: list of category names 38 | :param num_train_per_cat: max number of training images to use (per category) 39 | :param fmt: file extension of the images 40 | :return: lists: train_image_paths, test_image_paths, train_labels, test_labels 41 | """ 42 | train_image_paths = [] 43 | test_image_paths = [] 44 | train_labels = [] 45 | test_labels = [] 46 | 47 | for cat in categories: 48 | # train 49 | pth = osp.join(data_path, 'train', cat, '*.{:s}'.format(fmt)) 50 | pth = glob(pth) 51 | shuffle(pth) 52 | pth = pth[:num_train_per_cat] 53 | train_image_paths.extend(pth) 54 | train_labels.extend([cat]*len(pth)) 55 | 56 | # test 57 | pth = osp.join(data_path, 'test', cat, '*.{:s}'.format(fmt)) 58 | pth = glob(pth) 59 | shuffle(pth) 60 | pth = pth[:num_train_per_cat] 61 | test_image_paths.extend(pth) 62 | test_labels.extend([cat]*len(pth)) 63 | 64 | return train_image_paths, test_image_paths, train_labels, test_labels 65 | 66 | 67 | def show_results(train_image_paths, test_image_paths, train_labels, test_labels, 68 | categories, abbr_categories, predicted_categories): 69 | """ 70 | shows the results 71 | :param train_image_paths: 72 | :param test_image_paths: 73 | :param train_labels: 74 | :param test_labels: 75 | :param categories: 76 | :param abbr_categories: 77 | :param predicted_categories: 78 | :return: 79 | """ 80 | cat2idx = {cat: idx for idx, cat in enumerate(categories)} 81 | 82 | # confusion matrix 83 | y_true = [cat2idx[cat] for cat in test_labels] 84 | y_pred = [cat2idx[cat] for cat in predicted_categories] 85 | cm = confusion_matrix(y_true, y_pred) 86 | cm = cm.astype(np.float) / cm.sum(axis=1)[:, np.newaxis] 87 | acc = np.mean(np.diag(cm)) 88 | plt.figure() 89 | plt.imshow(cm, interpolation='nearest', cmap=plt.cm.get_cmap('jet')) 90 | plt.title('Confusion matrix. 
Mean of diagonal = {:4.2f}%'.format(acc*100)) 91 | tick_marks = np.arange(len(categories)) 92 | plt.tight_layout() 93 | plt.xticks(tick_marks, abbr_categories, rotation=45) 94 | plt.yticks(tick_marks, categories) 95 | -------------------------------------------------------------------------------- /project/proj3/code/vocab.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj3/code/vocab.pkl -------------------------------------------------------------------------------- /project/proj4/Assignment4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj4/Assignment4.pdf -------------------------------------------------------------------------------- /project/proj4/code/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj4/code/__init__.py -------------------------------------------------------------------------------- /project/proj4/code/student_code.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cyvlfeat as vlfeat 3 | from utils import * 4 | import os.path as osp 5 | from glob import glob 6 | from random import shuffle 7 | from sklearn.svm import LinearSVC 8 | 9 | 10 | def get_positive_features(train_path_pos, feature_params): 11 | """ 12 | This function should return all positive training examples (faces) from 13 | 36x36 images in 'train_path_pos'. Each face should be converted into a 14 | HoG template according to 'feature_params'. 15 | 16 | Useful functions: 17 | - vlfeat.hog.hog(im, cell_size): computes HoG features 18 | 19 | Args: 20 | - train_path_pos: (string) This directory contains 36x36 face images 21 | - feature_params: dictionary of HoG feature computation parameters. 22 | You can include various parameters in it. Two defaults are: 23 | - template_size: (default 36) The number of pixels spanned by 24 | each train/test template. 25 | - hog_cell_size: (default 6) The number of pixels in each HoG 26 | cell. template size should be evenly divisible by hog_cell_size. 27 | Smaller HoG cell sizes tend to work better, but they make things 28 | slower because the feature dimensionality increases and more 29 | importantly the step size of the classifier decreases at test time 30 | (although you don't have to make the detector step size equal a 31 | single HoG cell). 32 | 33 | Returns: 34 | - feats: N x D matrix where N is the number of faces and D is the template 35 | dimensionality, which would be (feature_params['template_size'] / 36 | feature_params['hog_cell_size'])^2 * 31 if you're using the default 37 | hog parameters. 
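    Editor's note (an illustrative calculation, not part of the original
    starter code): with the default parameters the template dimensionality is
    D = (template_size / hog_cell_size)^2 * 31 = (36 / 6)^2 * 31 = 1116,
    so the returned feats array has shape (num_face_images, 1116).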
38 | """ 39 | # params for HOG computation 40 | win_size = feature_params.get('template_size', 36) 41 | cell_size = feature_params.get('hog_cell_size', 6) 42 | 43 | positive_files = glob(osp.join(train_path_pos, '*.jpg')) 44 | 45 | ########################################################################### 46 | # TODO: YOUR CODE HERE # 47 | ########################################################################### 48 | 49 | n_cell = np.ceil(win_size/cell_size).astype('int') 50 | feats = np.zeros((len(positive_files), n_cell*n_cell*31)) 51 | 52 | for i in range(len(positive_files)): 53 | im = load_image_gray(positive_files[i]) 54 | feats[i, :] = vlfeat.hog.hog(im, cell_size).ravel() 55 | 56 | ########################################################################### 57 | # END OF YOUR CODE # 58 | ########################################################################### 59 | 60 | return feats 61 | 62 | 63 | def get_random_negative_features(non_face_scn_path, feature_params, num_samples): 64 | """ 65 | This function should return negative training examples (non-faces) from any 66 | images in 'non_face_scn_path'. Images should be loaded in grayscale because 67 | the positive training data is only available in grayscale (use 68 | load_image_gray()). 69 | 70 | Useful functions: 71 | - vlfeat.hog.hog(im, cell_size): computes HoG features 72 | 73 | Args: 74 | - non_face_scn_path: string. This directory contains many images which 75 | have no faces in them. 76 | - feature_params: dictionary of HoG feature computation parameters. See 77 | the documentation for get_positive_features() for more information. 78 | - num_samples: number of negatives to be mined. It is not important for 79 | the function to find exactly 'num_samples' non-face features. For 80 | example, you might try to sample some number from each image, but 81 | some images might be too small to find enough. 82 | 83 | Returns: 84 | - N x D matrix where N is the number of non-faces and D is the feature 85 | dimensionality, which would be (feature_params['template_size'] / 86 | feature_params['hog_cell_size'])^2 * 31 if you're using the default 87 | hog parameters. 
88 | """ 89 | # params for HOG computation 90 | win_size = feature_params.get('template_size', 36) 91 | cell_size = feature_params.get('hog_cell_size', 6) 92 | 93 | negative_files = glob(osp.join(non_face_scn_path, '*.jpg')) 94 | 95 | ########################################################################### 96 | # TODO: YOUR CODE HERE # 97 | ########################################################################### 98 | 99 | n_cell = np.ceil(win_size/cell_size).astype('int') 100 | scales = [1, 0.9, 0.8, 0.7, 0.6, 0.5] 101 | feats = [] 102 | samples_per_image = int(num_samples / len(negative_files) / len(scales)) 103 | for i in range(len(negative_files)): 104 | im = load_image_gray(negative_files[i]) 105 | for scale in scales: 106 | hog = vlfeat.hog.hog(cv2.resize( 107 | im, None, fx=scale, fy=scale), cell_size) 108 | for j in range(samples_per_image): 109 | r1 = np.random.randint(hog.shape[0]) 110 | r2 = np.random.randint(hog.shape[1]) 111 | if (r1 + n_cell < hog.shape[0]) and (r2 + n_cell < hog.shape[1]): 112 | feats.append( 113 | hog[r1:r1+n_cell, r2:r2+n_cell, :].ravel()) 114 | 115 | feats = np.array(feats) 116 | print(feats.shape) 117 | 118 | ########################################################################### 119 | # END OF YOUR CODE # 120 | ########################################################################### 121 | 122 | return feats 123 | 124 | 125 | def train_classifier(features_pos, features_neg, C): 126 | """ 127 | This function trains a linear SVM classifier on the positive and negative 128 | features obtained from the previous steps. We fit a model to the features 129 | and return the svm object. 130 | 131 | Args: 132 | - features_pos: N X D array. This contains an array of positive features 133 | extracted from get_positive_feats(). 134 | - features_neg: M X D array. This contains an array of negative features 135 | extracted from get_negative_feats(). 136 | 137 | Returns: 138 | - svm: LinearSVC object. This returns a SVM classifier object trained 139 | on the positive and negative features. 140 | """ 141 | ########################################################################### 142 | # TODO: YOUR CODE HERE # 143 | ########################################################################### 144 | 145 | svm = LinearSVC(tol=1e-5, loss='hinge', C=C) 146 | feats = np.vstack((features_pos, features_neg)) 147 | labels = np.hstack( 148 | (np.ones(len(features_pos)), -np.ones(len(features_neg)))) 149 | svm.fit(feats, labels) 150 | 151 | ########################################################################### 152 | # END OF YOUR CODE # 153 | ########################################################################### 154 | 155 | return svm 156 | 157 | 158 | def mine_hard_negs(non_face_scn_path, svm, feature_params): 159 | """ 160 | This function is pretty similar to get_random_negative_features(). The only 161 | difference is that instead of returning all the extracted features, you only 162 | return the features with false-positive prediction. 163 | 164 | Useful functions: 165 | - vlfeat.hog.hog(im, cell_size): computes HoG features 166 | - svm.predict(feat): predict features 167 | 168 | Args: 169 | - non_face_scn_path: string. This directory contains many images which 170 | have no faces in them. 171 | - feature_params: dictionary of HoG feature computation parameters. See 172 | the documentation for get_positive_features() for more information. 
173 | - svm: LinearSVC object 174 | 175 | Returns: 176 | - N x D matrix where N is the number of non-faces which are 177 | false-positive and D is the feature dimensionality. 178 | """ 179 | 180 | # params for HOG computation 181 | win_size = feature_params.get('template_size', 36) 182 | cell_size = feature_params.get('hog_cell_size', 6) 183 | 184 | negative_files = glob(osp.join(non_face_scn_path, '*.jpg')) 185 | 186 | ########################################################################### 187 | # TODO: YOUR CODE HERE # 188 | ########################################################################### 189 | 190 | num_samples = 5000 191 | feats = get_random_negative_features( 192 | non_face_scn_path, feature_params, num_samples) 193 | feats = feats[svm.predict(feats) > 0] 194 | 195 | ########################################################################### 196 | # END OF YOUR CODE # 197 | ########################################################################### 198 | 199 | return feats 200 | 201 | 202 | def run_detector(test_scn_path, svm, feature_params, verbose=False): 203 | """ 204 | This function returns detections on all of the images in a given path. You 205 | will want to use non-maximum suppression on your detections or your 206 | performance will be poor (the evaluation counts a duplicate detection as 207 | wrong). The non-maximum suppression is done on a per-image basis. The 208 | starter code includes a call to a provided non-max suppression function. 209 | 210 | The placeholder version of this code will return random bounding boxes in 211 | each test image. It will even do non-maximum suppression on the random 212 | bounding boxes to give you an example of how to call the function. 213 | 214 | Your actual code should convert each test image to HoG feature space with 215 | a _single_ call to vlfeat.hog.hog() for each scale. Then step over the HoG 216 | cells, taking groups of cells that are the same size as your learned 217 | template, and classifying them. If the classification is above some 218 | confidence, keep the detection and then pass all the detections for an 219 | image to non-maximum suppression. For your initial debugging, you can 220 | operate only at a single scale and you can skip calling non-maximum 221 | suppression. Err on the side of having a low confidence threshold (even 222 | less than zero) to achieve high enough recall. 223 | 224 | Args: 225 | - test_scn_path: (string) This directory contains images which may or 226 | may not have faces in them. This function should work for the 227 | MIT+CMU test set but also for any other images (e.g. class photos). 228 | - svm: A trained sklearn.svm.LinearSVC object 229 | - feature_params: dictionary of HoG feature computation parameters. 230 | You can include various parameters in it. Two defaults are: 231 | - template_size: (default 36) The number of pixels spanned by 232 | each train/test template. 233 | - hog_cell_size: (default 6) The number of pixels in each HoG 234 | cell. template size should be evenly divisible by hog_cell_size. 235 | Smaller HoG cell sizes tend to work better, but they make things 236 | slower because the feature dimensionality increases and more 237 | importantly the step size of the classifier decreases at test time. 238 | - verbose: prints out debug information if True 239 | 240 | Returns: 241 | - bboxes: N x 4 numpy array. N is the number of detections. 242 | bboxes(i,:) is [x_min, y_min, x_max, y_max] for detection i. 243 | - confidences: (N, ) size numpy array. 
confidences(i) is the real-valued 244 | confidence of detection i. 245 | - image_ids: List with N elements. image_ids[i] is the image file name 246 | for detection i. (not the full path, just 'albert.jpg') 247 | """ 248 | im_filenames = sorted(glob(osp.join(test_scn_path, '*.jpg'))) 249 | bboxes = np.empty((0, 4)) 250 | confidences = np.empty(0) 251 | image_ids = [] 252 | 253 | # number of top detections to feed to NMS 254 | topk = 20 255 | 256 | # params for HOG computation 257 | win_size = feature_params.get('template_size', 36) 258 | cell_size = feature_params.get('hog_cell_size', 6) 259 | template_size = int(win_size / cell_size) 260 | 261 | for idx, im_filename in enumerate(im_filenames): 262 | print('Detecting faces in {:s}'.format(im_filename)) 263 | im = load_image_gray(im_filename) 264 | im_id = osp.split(im_filename)[-1] 265 | im_shape = im.shape 266 | # create scale space HOG pyramid and return scores for prediction 267 | 268 | ####################################################################### 269 | # TODO: YOUR CODE HERE # 270 | ####################################################################### 271 | 272 | cur_bboxes = [] 273 | cur_confidences = [] 274 | 275 | min_dim = min(im_shape[0], im_shape[1]) 276 | iteration = 0 277 | max_iter = 100 278 | scale_factor = 1 279 | step_size = 1 280 | 281 | while scale_factor * min_dim > win_size and iteration < max_iter: 282 | resized_im = cv2.resize(im, None, fx=scale_factor, fy=scale_factor) 283 | hog_feats = vlfeat.hog.hog(resized_im, cell_size) 284 | for r in range(0, hog_feats.shape[0]-template_size, step_size): 285 | for c in range(0, hog_feats.shape[1]-template_size, step_size): 286 | hog = hog_feats[r:r+template_size, c:c+template_size, :] 287 | score = svm.decision_function( 288 | hog.ravel().reshape(1, -1))[0] 289 | if score >= 0: 290 | cur_bboxes.append(cell_size/scale_factor * np.array( 291 | [c, r, c+template_size, r+template_size]).astype(int)) 292 | cur_confidences.append(score) 293 | scale_factor *= 0.85 294 | iteration += 1 295 | 296 | if len(cur_bboxes) == 0: 297 | cur_bboxes = np.zeros((1, 4)) 298 | cur_confidences = np.array([0]) 299 | else: 300 | cur_bboxes = np.array(cur_bboxes) 301 | cur_confidences = np.array(cur_confidences) 302 | 303 | ####################################################################### 304 | # END OF YOUR CODE # 305 | ####################################################################### 306 | 307 | ### non-maximum suppression ### 308 | # non_max_supr_bbox() can actually get somewhat slow with thousands of 309 | # initial detections. You could pre-filter the detections by confidence, 310 | # e.g. a detection with confidence -1.1 will probably never be 311 | # meaningful. You probably _don't_ want to threshold at 0.0, though. You 312 | # can get higher recall with a lower threshold. You should not modify 313 | # anything in non_max_supr_bbox(). If you want to try your own NMS methods, 314 | # please create another function. 
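    # Editor's note (descriptive comment, not part of the original code): the
    # lines below keep only the topk most confident detections for this image,
    # run the provided non-maximum suppression, and append the surviving boxes,
    # confidences and image ids to the outputs returned by run_detector.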
315 | 316 | idsort = np.argsort(-cur_confidences)[:topk] 317 | cur_bboxes = cur_bboxes[idsort] 318 | cur_confidences = cur_confidences[idsort] 319 | is_valid_bbox = non_max_suppression_bbox(cur_bboxes, cur_confidences, 320 | im_shape, verbose=verbose) 321 | 322 | print('NMS done, {:d} detections passed'.format(sum(is_valid_bbox))) 323 | cur_bboxes = cur_bboxes[is_valid_bbox] 324 | cur_confidences = cur_confidences[is_valid_bbox] 325 | 326 | bboxes = np.vstack((bboxes, cur_bboxes)) 327 | confidences = np.hstack((confidences, cur_confidences)) 328 | image_ids.extend([im_id] * len(cur_confidences)) 329 | 330 | return bboxes, confidences, image_ids 331 | -------------------------------------------------------------------------------- /project/proj4/code/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | import os.path as osp 4 | from glob import glob 5 | import matplotlib.pyplot as plt 6 | from skimage import draw 7 | 8 | plt.rcParams.update({'figure.max_open_warning': 0}) 9 | 10 | 11 | def load_image(path): 12 | im = cv2.imread(path) 13 | im = im[:, :, ::-1] # BGR -> RGB 14 | im = im.astype(np.float32) # for vlfeat functions 15 | return im 16 | 17 | 18 | def load_image_gray(path): 19 | im = cv2.imread(path, cv2.IMREAD_GRAYSCALE) 20 | im = im.astype(np.float32) # for vlfeat functions 21 | return im 22 | 23 | 24 | def report_accuracy(confidences, label_vector): 25 | """ 26 | Calculates various accuracy metrics on the given predictions 27 | :param confidences: 1D numpy array holding predicted confidence scores 28 | :param label_vector: 1D numpy array holding ground truth labels (same size 29 | as confidences 30 | :return: tp_rate, fp_rate, tn_rate, fn_rate 31 | """ 32 | preds = confidences.copy() 33 | preds[preds >= 0] = 1 34 | preds[preds < 0] = -1 35 | 36 | tp = np.logical_and(preds > 0, preds == label_vector) 37 | fp = np.logical_and(preds > 0, preds != label_vector) 38 | tn = np.logical_and(preds < 0, preds == label_vector) 39 | fn = np.logical_and(preds < 0, preds != label_vector) 40 | 41 | N = len(label_vector) 42 | 43 | tp_rate = sum(tp) / (sum(tp) + sum(fn)) * 100 44 | fp_rate = sum(fp) / (sum(fp) + sum(tn)) * 100 45 | tn_rate = 100 - fp_rate 46 | fn_rate = 100 - tp_rate 47 | accuracy = (sum(tp) + sum(tn)) / N * 100 48 | 49 | print('Accuracy = {:4.3f}%\n' 50 | 'True Positive rate = {:4.3f}%\nFalse Positive rate = {:4.3f}%\n' 51 | 'True Negative rate = {:4.3f}%\nFalse Negative rate = {:4.3f}%'. 52 | format(accuracy, tp_rate, fp_rate, tn_rate, fn_rate)) 53 | 54 | return tp_rate, fp_rate, tn_rate, fn_rate 55 | 56 | 57 | def non_max_suppression_bbox(bboxes, confidences, img_size, verbose=False): 58 | """ 59 | high confidence detections suppress all overlapping detections (including 60 | detections at other scales). Detections can partially overlap, but the 61 | center of one detection can not be within another detection. 62 | 63 | :param bboxes: Nx4 numpy array, where N is the number of bounding boxes. Each 64 | row is [xmin, ymin, xmax, ymax] 65 | :param confidences: size (N, ) numpy array, holding the final confidence of 66 | each detection 67 | :param img_size: the [height, width] of the image 68 | :param verbose: boolean 69 | :return: size (N, ) numpy logical array. Element i indicates if the i'th 70 | bounding box survives non-maximum suppression. 
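    Editor's note (summarises the logic below; not part of the original
    docstring): detections are visited in decreasing order of confidence, and
    a detection is suppressed when its overlap with an already accepted box
    exceeds 0.3, where overlap = intersection_area / (area_a + area_b -
    intersection_area), or when its centre lies inside an already accepted box.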
71 | """ 72 | # truncate the bounding boxes to image dimensions 73 | bboxes[:, 2] = np.minimum(bboxes[:, 2], img_size[1]) 74 | bboxes[:, 3] = np.minimum(bboxes[:, 3], img_size[0]) 75 | 76 | # higher confidence detections get priority 77 | order = np.argsort(-confidences) 78 | confidences = confidences[order] 79 | bboxes = bboxes[order] 80 | 81 | # output indicator vector 82 | is_valid_bbox = np.asarray([False] * len(confidences)) 83 | 84 | # overlap threshold above which the less confident detection is suppressed 85 | overlap_thresh = 0.3 86 | 87 | for i in range(len(confidences)): 88 | cur_bb = bboxes[i] 89 | cur_bb_is_valid = True 90 | 91 | for j in np.where(is_valid_bbox)[0]: 92 | prev_bb = bboxes[j] 93 | bi = [max(cur_bb[0], prev_bb[0]), max(cur_bb[1], prev_bb[1]), 94 | min(cur_bb[2], prev_bb[2]), min(cur_bb[3], prev_bb[3])] 95 | iw = bi[2] - bi[0] + 1 96 | ih = bi[3] - bi[1] + 1 97 | if (iw > 0) and (ih > 0): 98 | # overlap = area of intersection / area of union 99 | ua = (cur_bb[2] - cur_bb[0] + 1) * (cur_bb[3] - cur_bb[1] + 1) + \ 100 | (prev_bb[2] - prev_bb[0] + 1) * (prev_bb[3] - prev_bb[1] + 1) - \ 101 | iw * ih 102 | ov = (iw * ih) / ua 103 | 104 | if ov > overlap_thresh: 105 | cur_bb_is_valid = False 106 | 107 | # special case: center coordinate of current bbox is inside the previous 108 | # bbox 109 | cx = (cur_bb[0] + cur_bb[2]) / 2 110 | cy = (cur_bb[1] + cur_bb[3]) / 2 111 | if (cx > prev_bb[0]) and (cx < prev_bb[2]) and (cy > prev_bb[1]) and \ 112 | (cy < prev_bb[3]): 113 | cur_bb_is_valid = False 114 | 115 | if verbose: 116 | print('Detection {:d}, bbox = [{:d}, {:d}, {:d}, {:d}], {:f} overlap ' 117 | 'with detection {:d} [{:d}, {:d}, {:d}, {:d}]' 118 | .format(i, cur_bb[0], cur_bb[1], cur_bb[2], cur_bb[3], ov, j, 119 | prev_bb[0], prev_bb[1], prev_bb[2], prev_bb[3])) 120 | 121 | if not cur_bb_is_valid: 122 | break 123 | 124 | is_valid_bbox[i] = cur_bb_is_valid 125 | 126 | # return back to the original order 127 | order = np.argsort(order) 128 | is_valid_bbox = is_valid_bbox[order] 129 | 130 | return is_valid_bbox 131 | 132 | 133 | def voc_ap(rec, prec): 134 | mrec = np.hstack((0, rec, 1)) 135 | mpre = np.hstack((0, prec, 0)) 136 | 137 | for i in reversed(range(len(mpre) - 1)): 138 | mpre[i] = max(mpre[i], mpre[i + 1]) 139 | 140 | i = np.where(mrec[1:] != mrec[:-1])[0] + 1 141 | ap = sum((mrec[i] - mrec[i - 1]) * mpre[i]) 142 | return ap 143 | 144 | 145 | def visualize_hog(svm, feature_params): 146 | win_size = feature_params.get('template_size', 36) 147 | cell_size = feature_params.get('hog_cell_size', 6) 148 | n_cell = np.ceil(win_size / cell_size).astype('int') 149 | 150 | test_feat = svm.coef_ - np.min(svm.coef_) 151 | test_feat = np.reshape(test_feat, [n_cell, n_cell, 31]) 152 | 153 | radius = 22 154 | orientations = 9 155 | 156 | cx, cy = 48, 48 157 | sy, sx = cy * n_cell, cx * n_cell 158 | 159 | n_cellsx = n_cell 160 | n_cellsy = n_cell 161 | 162 | orientation_histogram = test_feat 163 | 164 | orientations_arr = np.arange(orientations) 165 | dx_arr = radius * np.cos(orientations_arr / orientations * np.pi) 166 | dy_arr = radius * np.sin(orientations_arr / orientations * np.pi) 167 | hog_image = np.zeros((sy, sx), dtype=float) 168 | 169 | for x in range(n_cellsx): 170 | for y in range(n_cellsy): 171 | for o, dx, dy in zip(orientations_arr, dx_arr, dy_arr): 172 | centre = tuple([y * cy + cy // 2, x * cx + cx // 2]) 173 | wt = (orientation_histogram[y, x, 18 + o]) * 2.5 174 | 175 | xmin = int(centre[0] - dx) 176 | xmax = int(centre[0] + dx) 177 | ymin = int(centre[1] 
+ dy) 178 | ymax = int(centre[1] - dy) 179 | 180 | rr, cc = draw.line(xmin, ymin, xmax, ymax) 181 | 182 | hog_image[rr, cc] = np.maximum(hog_image[rr, cc], wt) 183 | hog_image[rr + 1, cc] = np.maximum(hog_image[rr + 1, cc], wt) 184 | hog_image[rr, cc + 1] = np.maximum(hog_image[rr, cc + 1], wt) 185 | hog_image[rr - 1, cc] = np.maximum(hog_image[rr - 1, cc], wt) 186 | hog_image[rr, cc - 1] = np.maximum(hog_image[rr, cc - 1], wt) 187 | 188 | hog_image_2 = hog_image.copy() 189 | 190 | hog_image = hog_image ** 3 / np.max(hog_image ** 3) 191 | hog_image = hog_image * 255 192 | 193 | hog_image_2[hog_image_2 == 0] = 0.5 * np.max(hog_image_2) 194 | hog_image_2 = hog_image_2 / np.max(hog_image_2) 195 | hog_image_2 = hog_image_2 * 255 196 | 197 | fig = plt.figure(figsize=[8, 4]) 198 | ax = fig.add_subplot(121) 199 | ax.imshow((hog_image).astype("uint8"), cmap="gray") 200 | ax.axis("off") 201 | 202 | ax = fig.add_subplot(122) 203 | ax.imshow((hog_image_2).astype("uint8"), cmap="gray") 204 | ax.axis("off") 205 | 206 | 207 | def evaluate_detections(bboxes, confidences, image_ids, label_path, draw=True): 208 | """ 209 | :param bboxes: 210 | :param confidences: 211 | :param image_ids: 212 | :param label_path: 213 | :param draw: 214 | :return: 215 | """ 216 | gt_ids = [] 217 | gt_bboxes = [] 218 | with open(label_path, 'r') as f: 219 | for line in f: 220 | gt_id, xmin, ymin, xmax, ymax = line.split(' ') 221 | gt_ids.append(gt_id) 222 | gt_bboxes.append([float(xmin), float(ymin), float(xmax), float(ymax)]) 223 | gt_bboxes = np.vstack(gt_bboxes) 224 | 225 | npos = len(gt_ids) 226 | gt_isclaimed = np.asarray([False] * len(gt_ids)) 227 | 228 | # sort detections by decreasing confidence 229 | order = np.argsort(-confidences) 230 | confidences = confidences[order] 231 | image_ids = [image_ids[i] for i in order] 232 | bboxes = bboxes[order] 233 | 234 | # assign detections to GT objects 235 | nd = len(confidences) 236 | tp = np.asarray([False] * nd) 237 | fp = np.asarray([False] * nd) 238 | duplicate_detections = np.asarray([False] * nd) 239 | 240 | for d in range(nd): 241 | cur_gt_ids = [i for i, gt_id in enumerate(gt_ids) if gt_id == image_ids[d]] 242 | 243 | bb = bboxes[d] 244 | ovmax = -float('inf') 245 | 246 | for j in cur_gt_ids: 247 | bbgt = gt_bboxes[j] 248 | bi = [max(bb[0], bbgt[0]), max(bb[1], bbgt[1]), min(bb[2], bbgt[2]), 249 | min(bb[3], bbgt[3])] 250 | iw = bi[2] - bi[0] + 1 251 | ih = bi[3] - bi[1] + 1 252 | 253 | if (iw > 0) and (ih > 0): 254 | ua = (bb[2] - bb[0] + 1) * (bb[3] - bb[1] + 1) + \ 255 | (bbgt[2] - bbgt[0] + 1) * (bbgt[3] - bbgt[1] + 1) - \ 256 | iw * ih 257 | ov = iw * ih / ua 258 | if ov > ovmax: 259 | ovmax = ov 260 | jmax = j 261 | 262 | if ovmax >= 0.3: 263 | if not gt_isclaimed[jmax]: 264 | tp[d] = True 265 | gt_isclaimed[jmax] = True 266 | else: 267 | fp[d] = True 268 | duplicate_detections[d] = True 269 | else: 270 | fp[d] = True 271 | 272 | cum_fp = np.cumsum(fp) 273 | cum_tp = np.cumsum(tp) 274 | rec = cum_tp / npos 275 | prec = cum_tp / (cum_tp + cum_fp) 276 | ap = voc_ap(rec, prec) 277 | 278 | if draw: 279 | plt.figure() 280 | plt.plot(rec, prec, '-') 281 | plt.xlim(0, 1) 282 | plt.ylim(0, 1) 283 | plt.xlabel('Recall') 284 | plt.ylabel('Precision') 285 | plt.title('Average precision = {:4.3f}'.format(ap)) 286 | 287 | order = np.argsort(order) 288 | tp = tp[order] 289 | fp = fp[order] 290 | duplicate_detections = duplicate_detections[order] 291 | 292 | return gt_ids, gt_bboxes, gt_isclaimed, tp, fp, duplicate_detections 293 | 294 | 295 | def 
visualize_detections_by_image(bboxes, confidences, image_ids, tp, fp, 296 | test_scn_path, label_filename, onlytp=False): 297 | """ 298 | Visuaize the detection bounding boxes and ground truth on images 299 | :param bboxes: N x 4 numpy matrix, where N is the number of detections. Each 300 | row is [xmin, ymin, xmax, ymax] 301 | :param confidences: size (N, ) numpy array of detection confidences 302 | :param image_ids: N-element list of image names for each detection 303 | :param tp: size (N, ) numpy array of true positive indicator variables 304 | :param fp: size (N, ) numpy array of false positive indicator variables 305 | :param test_scn_path: path to directory holding test images (in .jpg format) 306 | :param label_filename: path to .txt file containing labels. Format is 307 | image_id xmin ymin xmax ymax for each row 308 | :param onlytp: show only true positives 309 | :return: 310 | """ 311 | gt_ids = [] 312 | gt_bboxes = [] 313 | with open(label_filename, 'r') as f: 314 | for line in f: 315 | gt_id, xmin, ymin, xmax, ymax = line.split(' ') 316 | gt_ids.append(gt_id) 317 | gt_bboxes.append([float(xmin), float(ymin), float(xmax), float(ymax)]) 318 | gt_bboxes = np.vstack(gt_bboxes) 319 | 320 | gt_file_list = list(set(gt_ids)) 321 | 322 | for gt_file in gt_file_list: 323 | cur_test_image = load_image(osp.join(test_scn_path, gt_file)) 324 | 325 | cur_gt_detections = [i for i, gt_id in enumerate(gt_ids) if gt_id == gt_file] 326 | cur_gt_bboxes = gt_bboxes[cur_gt_detections] 327 | 328 | cur_detections = [i for i, gt_id in enumerate(image_ids) if gt_id == gt_file] 329 | cur_bboxes = bboxes[cur_detections] 330 | cur_confidences = confidences[cur_detections] 331 | cur_tp = tp[cur_detections] 332 | cur_fp = fp[cur_detections] 333 | 334 | plt.figure() 335 | plt.imshow(cur_test_image.astype(np.uint8)) 336 | 337 | for i, bb in enumerate(cur_bboxes): 338 | if cur_tp[i]: # true positive 339 | plt.plot(bb[[0, 2, 2, 0, 0]], bb[[1, 1, 3, 3, 1]], 'g') 340 | elif cur_fp[i]: # false positive 341 | if not onlytp: 342 | plt.plot(bb[[0, 2, 2, 0, 0]], bb[[1, 1, 3, 3, 1]], 'r') 343 | else: 344 | raise AssertionError 345 | 346 | for bb in cur_gt_bboxes: 347 | plt.plot(bb[[0, 2, 2, 0, 0]], bb[[1, 1, 3, 3, 1]], 'y') 348 | 349 | plt.axis("off") 350 | plt.title('{:s} (green=true pos, red=false pos, yellow=ground truth), ' 351 | '{:d}/{:d} found'.format(gt_file, sum(cur_tp), len(cur_gt_bboxes))) 352 | 353 | 354 | def visualize_detections_by_confidence(bboxes, confidences, image_ids, 355 | test_scn_path, label_filename, onlytp=False): 356 | """ 357 | Visuaize the detection bounding boxes and ground truth on images, sorted by 358 | confidence 359 | :param bboxes: N x 4 numpy matrix, where N is the number of detections. Each 360 | row is [xmin, ymin, xmax, ymax] 361 | :param confidences: size (N, ) numpy array of detection confidences 362 | :param image_ids: N-element list of image names for each detection 363 | :param test_scn_path: path to directory holding test images (in .jpg format) 364 | :param label_filename: path to .txt file containing labels. 
Format is 365 | image_id xmin ymin xmax ymax for each row 366 | :param onlytp: show only true positives 367 | :return: 368 | """ 369 | gt_ids = [] 370 | gt_bboxes = [] 371 | with open(label_filename, 'r') as f: 372 | for line in f: 373 | gt_id, xmin, ymin, xmax, ymax = line.split(' ') 374 | gt_ids.append(gt_id) 375 | gt_bboxes.append([float(xmin), float(ymin), float(xmax), float(ymax)]) 376 | gt_bboxes = np.vstack(gt_bboxes) 377 | 378 | # sort detections by decreasing confidence 379 | order = np.argsort(-confidences) 380 | image_ids = [image_ids[i] for i in order] 381 | bboxes = bboxes[order] 382 | confidences = confidences[order] 383 | 384 | for d in range(len(confidences)): 385 | cur_gt_idxs = [i for i, gt_id in enumerate(gt_ids) if gt_id == image_ids[d]] 386 | bb = bboxes[d] 387 | ovmax = -float('inf') 388 | 389 | for j in cur_gt_idxs: 390 | bbgt = gt_bboxes[j] 391 | bi = [max(bb[0], bbgt[0]), max(bb[1], bbgt[1]), min(bb[2], bbgt[2]), 392 | min(bb[3], bbgt[3])] 393 | iw = bi[2] - bi[0] + 1 394 | ih = bi[3] - bi[1] + 1 395 | 396 | if (iw > 0) and (ih > 0): 397 | ua = (bb[2] - bb[0] + 1) * (bb[3] - bb[1] + 1) + \ 398 | (bbgt[2] - bbgt[0] + 1) * (bbgt[3] - bbgt[1] + 1) - \ 399 | iw * ih 400 | ov = iw * ih / ua 401 | if ov > ovmax: 402 | ovmax = ov 403 | jmax = j 404 | 405 | if onlytp and ovmax < 0.3: 406 | continue 407 | 408 | im = load_image(osp.join(test_scn_path, image_ids[d])) 409 | plt.figure() 410 | plt.imshow(im.astype(np.uint8)) 411 | if ovmax >= 0.3: 412 | bbgt = gt_bboxes[jmax] 413 | plt.plot(bbgt[[0, 2, 2, 0, 0]], bbgt[[1, 1, 3, 3, 1]], 'y') 414 | plt.plot(bb[[0, 2, 2, 0, 0]], bb[[1, 1, 3, 3, 1]], 'g') 415 | else: 416 | plt.plot(bb[[0, 2, 2, 0, 0]], bb[[1, 1, 3, 3, 1]], 'r') 417 | plt.title('Image {:s} [{:d}/{:d}], (green=true pos, red=false pos, ' 418 | 'yellow=ground truth)'.format(image_ids[d], d, len(confidences))) 419 | 420 | 421 | def visualize_detections_by_image_no_gt(bboxes, confidences, image_ids, 422 | test_scn_path): 423 | """ 424 | Visualize detection bounding boxes on images that don't have ground truth 425 | labels 426 | :param bboxes: N x 4 numpy matrix, where N is the number of detections. 
Each 427 | row is [xmin, ymin, xmax, ymax] 428 | :param confidences: size (N, ) numpy array of detection confidences 429 | :param image_ids: N-element list of image names for each detection 430 | :param test_scn_path: path to directory holding test images (in .jpg format) 431 | :return: 432 | """ 433 | test_filenames = glob(osp.join(test_scn_path, '*.jpg')) 434 | 435 | for im_filename in test_filenames: 436 | test_id = im_filename.split('/')[-1] 437 | test_id = test_id.split('\\')[-1] # in case the file path use backslash 438 | cur_test_image = load_image(im_filename) 439 | cur_detections = [i for i, im_id in enumerate(image_ids) if im_id == test_id] 440 | cur_bboxes = bboxes[cur_detections] 441 | cur_confidences = confidences[cur_detections] 442 | 443 | plt.figure() 444 | plt.imshow(cur_test_image.astype(np.uint8)) 445 | 446 | for bb in cur_bboxes: 447 | plt.plot(bb[[0, 2, 2, 0, 0]], bb[[1, 1, 3, 3, 1]], 'g') 448 | plt.title('{:s} green=detection'.format(test_id)) 449 | 450 | 451 | class PseudoSVM(): 452 | 453 | def __init__(self, C=10, dim=1116): 454 | self.C = C 455 | self.coef_ = np.random.rand(dim, 1) 456 | 457 | def decision_function(self, feats): 458 | return np.random.rand(len(feats)) 459 | -------------------------------------------------------------------------------- /project/proj5/Assigment5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj5/Assigment5.pdf -------------------------------------------------------------------------------- /project/proj5/code/Assignment5.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# [Deep Learning](https://www.cc.gatech.edu/~hays/compvision/proj6/)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Setup" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "pycharm": { 22 | "is_executing": false 23 | } 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "%matplotlib notebook\n", 28 | "%load_ext autoreload\n", 29 | "%autoreload 2\n", 30 | "import cv2\n", 31 | "import numpy as np\n", 32 | "import random\n", 33 | "import torch.nn as nn\n", 34 | "import torch.optim as optim\n", 35 | "import os.path as osp\n", 36 | "import matplotlib.pyplot as plt\n", 37 | "from utils import *\n", 38 | "import student_code as sc\n", 39 | "from torchvision.models import alexnet\n", 40 | "\n", 41 | "data_path = osp.join('../data', '15SceneData')\n", 42 | "num_classes = 15\n", 43 | "\n", 44 | "# If you have a good Nvidia GPU with an appropriate environment, \n", 45 | "# try setting the use_GPU flag to True (the environment provided does\n", 46 | "# not support GPUs and we will not provide any support for GPU\n", 47 | "# computation in this project). Please note that \n", 48 | "# we will evaluate your implementations only using CPU mode so even if\n", 49 | "# you use a GPU, make sure your code runs in the CPU mode with the\n", 50 | "# environment we provided. \n", 51 | "use_GPU = True\n", 52 | "if use_GPU:\n", 53 | " from utils_gpu import *" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "To train a network in PyTorch, we need 4 components:\n", 61 | "1. **Dataset** - an object which can load the data and labels given an index.\n", 62 | "2. 
**Model** - an object that contains the network architecture definition.\n", 63 | "3. **Loss function** - a function that measures how far the network output is from the ground truth label.\n", 64 | "4. **Optimizer** - an object that optimizes the network parameters to reduce the loss value." 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "This project has two main parts. In Part 1, you will train a deep network from scratch. In Part 2, you will \"fine-tune\" a trained network. " 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "## Part 1: Modifying the Dataloaders and the Simple Network create_datasets" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 2, 84 | "metadata": { 85 | "pycharm": { 86 | "is_executing": false 87 | } 88 | }, 89 | "outputs": [], 90 | "source": [ 91 | "# Fix random seeds so that results will be reproducible\n", 92 | "set_seed(0, use_GPU)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 3, 98 | "metadata": { 99 | "pycharm": { 100 | "is_executing": false 101 | } 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "# Training parameters.\n", 106 | "input_size = (64, 64)\n", 107 | "RGB = False \n", 108 | "base_lr = 1e-2 # may try a smaller lr if not using batch norm\n", 109 | "weight_decay = 5e-4\n", 110 | "momentum = 0.9" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "Now you will modify the create_datasets function from student_code. You will add random left-right mirroring and normalization to the transformations applied to the training dataset. You will also add normalization to the transformations applied to the testing dataset. " 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 4, 123 | "metadata": { 124 | "pycharm": { 125 | "is_executing": false 126 | } 127 | }, 128 | "outputs": [ 129 | { 130 | "name": "stdout", 131 | "output_type": "stream", 132 | "text": [ 133 | "Computing pixel mean and stdev...\n", 134 | "Batch 0 / 30\n", 135 | "Batch 20 / 30\n", 136 | "Done, mean = \n", 137 | "[0.45579668]\n", 138 | "std = \n", 139 | "[0.23624939]\n", 140 | "Computing pixel mean and stdev...\n", 141 | "Batch 0 / 60\n", 142 | "Batch 20 / 60\n", 143 | "Batch 40 / 60\n", 144 | "Done, mean = \n", 145 | "[0.45517009]\n", 146 | "std = \n", 147 | "[0.2350788]\n" 148 | ] 149 | } 150 | ], 151 | "source": [ 152 | "# Create the training and testing datasets.\n", 153 | "train_dataset, test_dataset = sc.create_datasets(data_path=data_path, input_size=input_size, rgb=RGB)\n", 154 | "assert test_dataset.classes == train_dataset.classes" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "Now you will modify SimpleNet by adding droppout, batch normalization, and additional convolution/maxpool/relu layers. You should achieve an accuracy of at least **50%**. Make sure your network passes this threshold--it is required for full credit on this section!\n", 162 | "\n", 163 | "You can also use the following two blocks to determine the stucture of your network." 
164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 5, 169 | "metadata": { 170 | "pycharm": { 171 | "is_executing": false 172 | }, 173 | "scrolled": true 174 | }, 175 | "outputs": [ 176 | { 177 | "name": "stdout", 178 | "output_type": "stream", 179 | "text": [ 180 | "SimpleNet(\n", 181 | " (features): Sequential(\n", 182 | " (0): Conv2d(1, 12, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", 183 | " (1): BatchNorm2d(12, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 184 | " (2): ReLU(inplace=True)\n", 185 | " (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)\n", 186 | " (4): Conv2d(12, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", 187 | " (5): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 188 | " (6): ReLU(inplace=True)\n", 189 | " (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)\n", 190 | " (8): Conv2d(24, 48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", 191 | " (9): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 192 | " (10): ReLU(inplace=True)\n", 193 | " (11): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)\n", 194 | " (12): Conv2d(48, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", 195 | " (13): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 196 | " (14): ReLU(inplace=True)\n", 197 | " (15): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)\n", 198 | " )\n", 199 | " (classifier): Sequential(\n", 200 | " (0): Linear(in_features=1536, out_features=1024, bias=True)\n", 201 | " (1): ReLU(inplace=True)\n", 202 | " (2): Dropout(p=0.8, inplace=False)\n", 203 | " (3): Linear(in_features=1024, out_features=15, bias=True)\n", 204 | " )\n", 205 | ")\n" 206 | ] 207 | } 208 | ], 209 | "source": [ 210 | "# create the network model\n", 211 | "model = sc.SimpleNet(num_classes=num_classes, rgb=False, verbose=False)\n", 212 | "if use_GPU:\n", 213 | " model = model.cuda()\n", 214 | "print(model)" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 6, 220 | "metadata": { 221 | "pycharm": { 222 | "is_executing": false 223 | } 224 | }, 225 | "outputs": [ 226 | { 227 | "name": "stdout", 228 | "output_type": "stream", 229 | "text": [ 230 | "Network output size is torch.Size([15])\n" 231 | ] 232 | } 233 | ], 234 | "source": [ 235 | "# Use this block to determine the kernel size of the conv2d layer in the classifier\n", 236 | "# first, set the kernel size of that conv2d layer to 1, and run this block\n", 237 | "# then, use that size of input to the classifier printed by this block to\n", 238 | "# go back and update the kernel size of the conv2d layer in the classifier\n", 239 | "# Finally, run this block again and verify that the network output size is a scalar\n", 240 | "# Don't forget to re-run the block above every time you update the SimpleNet class!\n", 241 | "from torch.autograd import Variable\n", 242 | "data, _ = train_dataset[0]\n", 243 | "s = data.size()\n", 244 | "data = Variable(data.view(1, *s))\n", 245 | "if use_GPU:\n", 246 | " data = data.cuda()\n", 247 | "out = model(data)\n", 248 | "print('Network output size is ', out.size())" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "Next we will create the loss function and the optimizer. 
You do not have to modify the custom_part1_trainer in student_code if you use the same loss_function, optimizer, scheduler and parameters (n_epochs, batch_size, etc.) as provided in this notebook to hit the required threshold of 50% accuracy. If you change any of these values, you must modify this function in student_code, since we will not use the notebook you submit for evaluation. " 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 7, 261 | "metadata": { 262 | "pycharm": { 263 | "is_executing": false 264 | } 265 | }, 266 | "outputs": [], 267 | "source": [ 268 | "# Set up the trainer. You can modify custom_part1_trainer in\n", 269 | "# student_code.py if you want to try different learning settings.\n", 270 | "custom_part1_trainer = sc.custom_part1_trainer(model)\n", 271 | "\n", 272 | "if custom_part1_trainer is None:\n", 273 | " # Create the loss function.\n", 274 | " # see http://pytorch.org/docs/0.3.0/nn.html#loss-functions for a list of available loss functions\n", 275 | " loss_function = nn.CrossEntropyLoss()\n", 276 | "\n", 277 | " # Create the optimizer and a learning rate scheduler.\n", 278 | " optimizer = optim.SGD(params=model.parameters(), lr=base_lr, weight_decay=weight_decay, momentum=momentum)\n", 279 | " # Currently a simple step scheduler, but you can get creative.\n", 280 | " # See http://pytorch.org/docs/0.3.0/optim.html#how-to-adjust-learning-rate for various LR schedulers\n", 281 | " # and how to use them\n", 282 | " lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=60, gamma=0.1)\n", 283 | "\n", 284 | " params = {'n_epochs': 100, 'batch_size': 50, 'experiment': 'part1'}\n", 285 | " \n", 286 | "else:\n", 287 | " if 'loss_function' in custom_part1_trainer:\n", 288 | " loss_function = custom_part1_trainer['loss_function']\n", 289 | " if 'optimizer' in custom_part1_trainer:\n", 290 | " optimizer = custom_part1_trainer['optimizer']\n", 291 | " if 'lr_scheduler' in custom_part1_trainer:\n", 292 | " lr_scheduler = custom_part1_trainer['lr_scheduler']\n", 293 | " if 'params' in custom_part1_trainer:\n", 294 | " params = custom_part1_trainer['params']" 295 | ] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "metadata": {}, 300 | "source": [ 301 | "We are ready to train our network! As before, we will start a local server to see the training progress of our network (if your server is already running, you should not start another one). Open a new terminal and activate the environment for this project. Then run the following command: **python -m visdom.server**. This will start a local server. The terminal output should print a link like: \"http://localhost:8097\". Open this link in your browser. After you run the following block, visit this link again, and you will be able to see graphs showing the progress of your training! If you do not see any graphs, select Part 1 on the top left bar where it says Environment (only select Part 1, do not check main or Part 2)."
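Before training, one note on custom_part1_trainer: if you override the defaults, it should return a dictionary containing some or all of the four keys read by the setup cell above. A hypothetical example of its shape (the values here are placeholders, not the settings used in this run):

```python
import torch.nn as nn
import torch.optim as optim

def custom_part1_trainer(model):
    # Hypothetical override: only needed if you deviate from the notebook's defaults.
    optimizer = optim.SGD(model.parameters(), lr=5e-3, weight_decay=5e-4, momentum=0.9)
    return {
        'loss_function': nn.CrossEntropyLoss(),
        'optimizer': optimizer,
        'lr_scheduler': optim.lr_scheduler.StepLR(optimizer, step_size=40, gamma=0.1),
        'params': {'n_epochs': 120, 'batch_size': 50, 'experiment': 'part1'},
    }
```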
302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 8, 307 | "metadata": { 308 | "pycharm": { 309 | "is_executing": false 310 | } 311 | }, 312 | "outputs": [ 313 | { 314 | "name": "stderr", 315 | "output_type": "stream", 316 | "text": [ 317 | "Setting up a new session...\n" 318 | ] 319 | }, 320 | { 321 | "name": "stdout", 322 | "output_type": "stream", 323 | "text": [ 324 | "---------------------------------------\n", 325 | "Experiment: part1\n", 326 | "n_epochs: 100\n", 327 | "batch_size: 50\n", 328 | "do_val: True\n", 329 | "shuffle: True\n", 330 | "num_workers: 4\n", 331 | "val_freq: 1\n", 332 | "print_freq: 100\n", 333 | "experiment: part1\n", 334 | "checkpoint_file: None\n", 335 | "resume_optim: True\n", 336 | "---------------------------------------\n", 337 | "part1 Epoch 0 / 100\n", 338 | "train part1: batch 0/29, loss 2.716, top-1 accuracy 10.000, top-5 accuracy 24.000\n", 339 | "train part1: loss 2.660641\n", 340 | "val part1: batch 0/59, loss 2.453, top-1 accuracy 10.000, top-5 accuracy 48.000\n", 341 | "val part1: loss 2.674087\n", 342 | "Checkpoint saved\n", 343 | "BEST TOP1 ACCURACY SO FAR\n", 344 | "part1 Epoch 1 / 100\n", 345 | "train part1: batch 0/29, loss 2.418, top-1 accuracy 16.000, top-5 accuracy 64.000\n", 346 | "train part1: loss 2.474717\n", 347 | "val part1: batch 0/59, loss 2.351, top-1 accuracy 22.000, top-5 accuracy 70.000\n", 348 | "val part1: loss 2.311414\n", 349 | "Checkpoint saved\n", 350 | "BEST TOP1 ACCURACY SO FAR\n", 351 | "part1 Epoch 2 / 100\n", 352 | "train part1: batch 0/29, loss 2.262, top-1 accuracy 28.000, top-5 accuracy 68.000\n", 353 | "train part1: loss 2.332324\n", 354 | "val part1: batch 0/59, loss 2.624, top-1 accuracy 22.000, top-5 accuracy 48.000\n", 355 | "val part1: loss 2.108762\n", 356 | "Checkpoint saved\n", 357 | "BEST TOP1 ACCURACY SO FAR\n", 358 | "part1 Epoch 3 / 100\n", 359 | "train part1: batch 0/29, loss 2.374, top-1 accuracy 24.000, top-5 accuracy 66.000\n", 360 | "train part1: loss 2.277820\n", 361 | "val part1: batch 0/59, loss 2.299, top-1 accuracy 30.000, top-5 accuracy 66.000\n", 362 | "val part1: loss 2.034218\n", 363 | "Checkpoint saved\n", 364 | "BEST TOP1 ACCURACY SO FAR\n", 365 | "part1 Epoch 4 / 100\n", 366 | "train part1: batch 0/29, loss 2.404, top-1 accuracy 28.000, top-5 accuracy 62.000\n", 367 | "train part1: loss 2.246646\n", 368 | "val part1: batch 0/59, loss 2.203, top-1 accuracy 2.000, top-5 accuracy 90.000\n", 369 | "val part1: loss 2.223471\n", 370 | "Checkpoint saved\n", 371 | "part1 Epoch 5 / 100\n", 372 | "train part1: batch 0/29, loss 2.088, top-1 accuracy 38.000, top-5 accuracy 82.000\n", 373 | "train part1: loss 2.194817\n", 374 | "val part1: batch 0/59, loss 2.883, top-1 accuracy 6.000, top-5 accuracy 40.000\n", 375 | "val part1: loss 2.443704\n", 376 | "Checkpoint saved\n", 377 | "part1 Epoch 6 / 100\n", 378 | "train part1: batch 0/29, loss 2.317, top-1 accuracy 24.000, top-5 accuracy 70.000\n", 379 | "train part1: loss 2.188513\n", 380 | "val part1: batch 0/59, loss 2.388, top-1 accuracy 26.000, top-5 accuracy 76.000\n", 381 | "val part1: loss 2.040629\n", 382 | "Checkpoint saved\n", 383 | "BEST TOP1 ACCURACY SO FAR\n", 384 | "part1 Epoch 7 / 100\n", 385 | "train part1: batch 0/29, loss 2.052, top-1 accuracy 30.000, top-5 accuracy 78.000\n", 386 | "train part1: loss 2.090109\n", 387 | "val part1: batch 0/59, loss 2.429, top-1 accuracy 10.000, top-5 accuracy 66.000\n", 388 | "val part1: loss 1.852616\n", 389 | "Checkpoint saved\n", 390 | "BEST TOP1 ACCURACY SO FAR\n", 391 
| "part1 Epoch 8 / 100\n", 392 | "train part1: batch 0/29, loss 2.074, top-1 accuracy 34.000, top-5 accuracy 72.000\n", 393 | "train part1: loss 2.022772\n", 394 | "val part1: batch 0/59, loss 2.426, top-1 accuracy 16.000, top-5 accuracy 64.000\n", 395 | "val part1: loss 1.945282\n", 396 | "Checkpoint saved\n", 397 | "part1 Epoch 9 / 100\n", 398 | "train part1: batch 0/29, loss 2.030, top-1 accuracy 34.000, top-5 accuracy 78.000\n", 399 | "train part1: loss 2.049587\n", 400 | "val part1: batch 0/59, loss 1.840, top-1 accuracy 46.000, top-5 accuracy 90.000\n", 401 | "val part1: loss 1.799102\n", 402 | "Checkpoint saved\n", 403 | "BEST TOP1 ACCURACY SO FAR\n", 404 | "part1 Epoch 10 / 100\n", 405 | "train part1: batch 0/29, loss 2.037, top-1 accuracy 38.000, top-5 accuracy 72.000\n", 406 | "train part1: loss 2.012847\n", 407 | "val part1: batch 0/59, loss 2.321, top-1 accuracy 16.000, top-5 accuracy 72.000\n", 408 | "val part1: loss 1.853063\n", 409 | "Checkpoint saved\n", 410 | "part1 Epoch 11 / 100\n", 411 | "train part1: batch 0/29, loss 1.784, top-1 accuracy 40.000, top-5 accuracy 84.000\n", 412 | "train part1: loss 1.918819\n", 413 | "val part1: batch 0/59, loss 2.004, top-1 accuracy 20.000, top-5 accuracy 90.000\n", 414 | "val part1: loss 1.740404\n", 415 | "Checkpoint saved\n", 416 | "BEST TOP1 ACCURACY SO FAR\n", 417 | "part1 Epoch 12 / 100\n", 418 | "train part1: batch 0/29, loss 1.655, top-1 accuracy 54.000, top-5 accuracy 86.000\n", 419 | "train part1: loss 1.881280\n", 420 | "val part1: batch 0/59, loss 2.029, top-1 accuracy 26.000, top-5 accuracy 92.000\n", 421 | "val part1: loss 1.778705\n", 422 | "Checkpoint saved\n", 423 | "part1 Epoch 13 / 100\n", 424 | "train part1: batch 0/29, loss 1.871, top-1 accuracy 42.000, top-5 accuracy 78.000\n", 425 | "train part1: loss 1.858144\n", 426 | "val part1: batch 0/59, loss 2.073, top-1 accuracy 24.000, top-5 accuracy 78.000\n", 427 | "val part1: loss 1.753376\n", 428 | "Checkpoint saved\n", 429 | "part1 Epoch 14 / 100\n", 430 | "train part1: batch 0/29, loss 1.662, top-1 accuracy 48.000, top-5 accuracy 90.000\n", 431 | "train part1: loss 1.856867\n", 432 | "val part1: batch 0/59, loss 2.117, top-1 accuracy 28.000, top-5 accuracy 82.000\n", 433 | "val part1: loss 1.657026\n", 434 | "Checkpoint saved\n", 435 | "BEST TOP1 ACCURACY SO FAR\n", 436 | "part1 Epoch 15 / 100\n", 437 | "train part1: batch 0/29, loss 1.631, top-1 accuracy 40.000, top-5 accuracy 94.000\n", 438 | "train part1: loss 1.873407\n", 439 | "val part1: batch 0/59, loss 2.191, top-1 accuracy 18.000, top-5 accuracy 82.000\n", 440 | "val part1: loss 1.590518\n", 441 | "Checkpoint saved\n", 442 | "BEST TOP1 ACCURACY SO FAR\n", 443 | "part1 Epoch 16 / 100\n", 444 | "train part1: batch 0/29, loss 1.556, top-1 accuracy 50.000, top-5 accuracy 84.000\n", 445 | "train part1: loss 1.713181\n", 446 | "val part1: batch 0/59, loss 2.037, top-1 accuracy 26.000, top-5 accuracy 84.000\n", 447 | "val part1: loss 1.500988\n", 448 | "Checkpoint saved\n", 449 | "BEST TOP1 ACCURACY SO FAR\n", 450 | "part1 Epoch 17 / 100\n", 451 | "train part1: batch 0/29, loss 1.738, top-1 accuracy 40.000, top-5 accuracy 86.000\n", 452 | "train part1: loss 1.687592\n", 453 | "val part1: batch 0/59, loss 1.510, top-1 accuracy 48.000, top-5 accuracy 98.000\n", 454 | "val part1: loss 1.448492\n", 455 | "Checkpoint saved\n", 456 | "BEST TOP1 ACCURACY SO FAR\n", 457 | "part1 Epoch 18 / 100\n", 458 | "train part1: batch 0/29, loss 1.599, top-1 accuracy 48.000, top-5 accuracy 86.000\n", 459 | "train part1: loss 
1.680492\n", 460 | "val part1: batch 0/59, loss 1.719, top-1 accuracy 34.000, top-5 accuracy 92.000\n", 461 | "val part1: loss 1.454456\n", 462 | "Checkpoint saved\n", 463 | "BEST TOP1 ACCURACY SO FAR\n", 464 | "part1 Epoch 19 / 100\n", 465 | "train part1: batch 0/29, loss 1.789, top-1 accuracy 48.000, top-5 accuracy 78.000\n", 466 | "train part1: loss 1.646169\n", 467 | "val part1: batch 0/59, loss 2.208, top-1 accuracy 26.000, top-5 accuracy 72.000\n", 468 | "val part1: loss 1.490147\n", 469 | "Checkpoint saved\n", 470 | "part1 Epoch 20 / 100\n", 471 | "train part1: batch 0/29, loss 1.923, top-1 accuracy 36.000, top-5 accuracy 86.000\n", 472 | "train part1: loss 1.595892\n", 473 | "val part1: batch 0/59, loss 2.060, top-1 accuracy 24.000, top-5 accuracy 82.000\n", 474 | "val part1: loss 1.519088\n", 475 | "Checkpoint saved\n", 476 | "part1 Epoch 21 / 100\n", 477 | "train part1: batch 0/29, loss 1.250, top-1 accuracy 62.000, top-5 accuracy 94.000\n", 478 | "train part1: loss 1.565675\n", 479 | "val part1: batch 0/59, loss 2.194, top-1 accuracy 14.000, top-5 accuracy 78.000\n", 480 | "val part1: loss 1.879126\n", 481 | "Checkpoint saved\n", 482 | "part1 Epoch 22 / 100\n", 483 | "train part1: batch 0/29, loss 1.585, top-1 accuracy 54.000, top-5 accuracy 90.000\n", 484 | "train part1: loss 1.534763\n", 485 | "val part1: batch 0/59, loss 1.503, top-1 accuracy 60.000, top-5 accuracy 90.000\n", 486 | "val part1: loss 1.521252\n", 487 | "Checkpoint saved\n", 488 | "part1 Epoch 23 / 100\n", 489 | "train part1: batch 0/29, loss 1.447, top-1 accuracy 58.000, top-5 accuracy 84.000\n", 490 | "train part1: loss 1.452563\n", 491 | "val part1: batch 0/59, loss 1.673, top-1 accuracy 42.000, top-5 accuracy 90.000\n", 492 | "val part1: loss 1.344078\n", 493 | "Checkpoint saved\n", 494 | "BEST TOP1 ACCURACY SO FAR\n", 495 | "part1 Epoch 24 / 100\n", 496 | "train part1: batch 0/29, loss 1.142, top-1 accuracy 64.000, top-5 accuracy 96.000\n", 497 | "train part1: loss 1.432810\n", 498 | "val part1: batch 0/59, loss 1.908, top-1 accuracy 28.000, top-5 accuracy 88.000\n", 499 | "val part1: loss 1.289051\n", 500 | "Checkpoint saved\n", 501 | "BEST TOP1 ACCURACY SO FAR\n", 502 | "part1 Epoch 25 / 100\n", 503 | "train part1: batch 0/29, loss 1.658, top-1 accuracy 42.000, top-5 accuracy 86.000\n", 504 | "train part1: loss 1.420957\n", 505 | "val part1: batch 0/59, loss 1.851, top-1 accuracy 38.000, top-5 accuracy 84.000\n", 506 | "val part1: loss 1.310904\n", 507 | "Checkpoint saved\n", 508 | "part1 Epoch 26 / 100\n", 509 | "train part1: batch 0/29, loss 1.580, top-1 accuracy 48.000, top-5 accuracy 86.000\n", 510 | "train part1: loss 1.440826\n", 511 | "val part1: batch 0/59, loss 1.717, top-1 accuracy 36.000, top-5 accuracy 92.000\n", 512 | "val part1: loss 1.312577\n", 513 | "Checkpoint saved\n", 514 | "part1 Epoch 27 / 100\n", 515 | "train part1: batch 0/29, loss 1.277, top-1 accuracy 54.000, top-5 accuracy 94.000\n", 516 | "train part1: loss 1.349694\n", 517 | "val part1: batch 0/59, loss 1.985, top-1 accuracy 18.000, top-5 accuracy 84.000\n", 518 | "val part1: loss 1.180214\n", 519 | "Checkpoint saved\n", 520 | "BEST TOP1 ACCURACY SO FAR\n", 521 | "part1 Epoch 28 / 100\n", 522 | "train part1: batch 0/29, loss 1.234, top-1 accuracy 64.000, top-5 accuracy 92.000\n", 523 | "train part1: loss 1.313300\n", 524 | "val part1: batch 0/59, loss 1.840, top-1 accuracy 30.000, top-5 accuracy 82.000\n", 525 | "val part1: loss 1.273597\n", 526 | "Checkpoint saved\n", 527 | "part1 Epoch 29 / 100\n", 528 | "train part1: batch 
0/29, loss 1.282, top-1 accuracy 60.000, top-5 accuracy 90.000\n", 529 | "train part1: loss 1.291305\n", 530 | "val part1: batch 0/59, loss 1.711, top-1 accuracy 38.000, top-5 accuracy 92.000\n", 531 | "val part1: loss 1.377620\n", 532 | "Checkpoint saved\n", 533 | "part1 Epoch 30 / 100\n" 534 | ] 535 | }, 536 | { 537 | "name": "stdout", 538 | "output_type": "stream", 539 | "text": [ 540 | "train part1: batch 0/29, loss 1.031, top-1 accuracy 64.000, top-5 accuracy 98.000\n", 541 | "train part1: loss 1.258324\n", 542 | "val part1: batch 0/59, loss 2.020, top-1 accuracy 24.000, top-5 accuracy 90.000\n", 543 | "val part1: loss 1.195394\n", 544 | "Checkpoint saved\n", 545 | "part1 Epoch 31 / 100\n", 546 | "train part1: batch 0/29, loss 1.137, top-1 accuracy 56.000, top-5 accuracy 96.000\n", 547 | "train part1: loss 1.216911\n", 548 | "val part1: batch 0/59, loss 1.532, top-1 accuracy 44.000, top-5 accuracy 92.000\n", 549 | "val part1: loss 1.414781\n", 550 | "Checkpoint saved\n", 551 | "part1 Epoch 32 / 100\n", 552 | "train part1: batch 0/29, loss 0.987, top-1 accuracy 66.000, top-5 accuracy 98.000\n", 553 | "train part1: loss 1.111287\n", 554 | "val part1: batch 0/59, loss 1.819, top-1 accuracy 28.000, top-5 accuracy 90.000\n", 555 | "val part1: loss 1.145157\n", 556 | "Checkpoint saved\n", 557 | "BEST TOP1 ACCURACY SO FAR\n", 558 | "part1 Epoch 33 / 100\n", 559 | "train part1: batch 0/29, loss 1.042, top-1 accuracy 64.000, top-5 accuracy 98.000\n", 560 | "train part1: loss 1.222887\n", 561 | "val part1: batch 0/59, loss 1.851, top-1 accuracy 28.000, top-5 accuracy 86.000\n", 562 | "val part1: loss 1.380957\n", 563 | "Checkpoint saved\n", 564 | "part1 Epoch 34 / 100\n", 565 | "train part1: batch 0/29, loss 1.042, top-1 accuracy 68.000, top-5 accuracy 96.000\n", 566 | "train part1: loss 1.174929\n", 567 | "val part1: batch 0/59, loss 0.674, top-1 accuracy 82.000, top-5 accuracy 100.000\n", 568 | "val part1: loss 1.449894\n", 569 | "Checkpoint saved\n", 570 | "part1 Epoch 35 / 100\n", 571 | "train part1: batch 0/29, loss 0.938, top-1 accuracy 72.000, top-5 accuracy 98.000\n", 572 | "train part1: loss 1.063462\n", 573 | "val part1: batch 0/59, loss 1.240, top-1 accuracy 62.000, top-5 accuracy 94.000\n", 574 | "val part1: loss 1.201662\n", 575 | "Checkpoint saved\n", 576 | "part1 Epoch 36 / 100\n", 577 | "train part1: batch 0/29, loss 0.953, top-1 accuracy 66.000, top-5 accuracy 98.000\n", 578 | "train part1: loss 1.067647\n", 579 | "val part1: batch 0/59, loss 1.356, top-1 accuracy 44.000, top-5 accuracy 100.000\n", 580 | "val part1: loss 1.169855\n", 581 | "Checkpoint saved\n", 582 | "part1 Epoch 37 / 100\n", 583 | "train part1: batch 0/29, loss 0.820, top-1 accuracy 74.000, top-5 accuracy 100.000\n", 584 | "train part1: loss 1.034231\n", 585 | "val part1: batch 0/59, loss 1.767, top-1 accuracy 36.000, top-5 accuracy 88.000\n", 586 | "val part1: loss 1.176197\n", 587 | "Checkpoint saved\n", 588 | "part1 Epoch 38 / 100\n", 589 | "train part1: batch 0/29, loss 1.085, top-1 accuracy 60.000, top-5 accuracy 98.000\n", 590 | "train part1: loss 1.075923\n", 591 | "val part1: batch 0/59, loss 1.035, top-1 accuracy 58.000, top-5 accuracy 100.000\n", 592 | "val part1: loss 1.250081\n", 593 | "Checkpoint saved\n", 594 | "part1 Epoch 39 / 100\n", 595 | "train part1: batch 0/29, loss 0.700, top-1 accuracy 76.000, top-5 accuracy 98.000\n", 596 | "train part1: loss 0.958608\n", 597 | "val part1: batch 0/59, loss 1.734, top-1 accuracy 30.000, top-5 accuracy 90.000\n", 598 | "val part1: loss 1.146434\n", 599 | 
"Checkpoint saved\n", 600 | "BEST TOP1 ACCURACY SO FAR\n", 601 | "part1 Epoch 40 / 100\n", 602 | "train part1: batch 0/29, loss 0.917, top-1 accuracy 72.000, top-5 accuracy 90.000\n", 603 | "train part1: loss 0.954836\n", 604 | "val part1: batch 0/59, loss 1.747, top-1 accuracy 28.000, top-5 accuracy 92.000\n", 605 | "val part1: loss 1.254555\n", 606 | "Checkpoint saved\n", 607 | "part1 Epoch 41 / 100\n", 608 | "train part1: batch 0/29, loss 0.920, top-1 accuracy 72.000, top-5 accuracy 98.000\n", 609 | "train part1: loss 1.014113\n", 610 | "val part1: batch 0/59, loss 1.905, top-1 accuracy 28.000, top-5 accuracy 84.000\n", 611 | "val part1: loss 1.315546\n", 612 | "Checkpoint saved\n", 613 | "part1 Epoch 42 / 100\n", 614 | "train part1: batch 0/29, loss 0.859, top-1 accuracy 66.000, top-5 accuracy 96.000\n", 615 | "train part1: loss 0.964082\n", 616 | "val part1: batch 0/59, loss 1.278, top-1 accuracy 48.000, top-5 accuracy 96.000\n", 617 | "val part1: loss 1.213332\n", 618 | "Checkpoint saved\n", 619 | "part1 Epoch 43 / 100\n", 620 | "train part1: batch 0/29, loss 1.106, top-1 accuracy 68.000, top-5 accuracy 100.000\n", 621 | "train part1: loss 0.900427\n", 622 | "val part1: batch 0/59, loss 2.251, top-1 accuracy 14.000, top-5 accuracy 80.000\n", 623 | "val part1: loss 1.178361\n", 624 | "Checkpoint saved\n", 625 | "part1 Epoch 44 / 100\n", 626 | "train part1: batch 0/29, loss 1.066, top-1 accuracy 66.000, top-5 accuracy 92.000\n", 627 | "train part1: loss 0.850803\n", 628 | "val part1: batch 0/59, loss 1.310, top-1 accuracy 50.000, top-5 accuracy 96.000\n", 629 | "val part1: loss 1.108495\n", 630 | "Checkpoint saved\n", 631 | "BEST TOP1 ACCURACY SO FAR\n", 632 | "part1 Epoch 45 / 100\n", 633 | "train part1: batch 0/29, loss 0.912, top-1 accuracy 68.000, top-5 accuracy 100.000\n", 634 | "train part1: loss 0.851264\n", 635 | "val part1: batch 0/59, loss 1.556, top-1 accuracy 40.000, top-5 accuracy 86.000\n", 636 | "val part1: loss 1.169370\n", 637 | "Checkpoint saved\n", 638 | "part1 Epoch 46 / 100\n", 639 | "train part1: batch 0/29, loss 0.781, top-1 accuracy 78.000, top-5 accuracy 98.000\n", 640 | "train part1: loss 0.841260\n", 641 | "val part1: batch 0/59, loss 1.468, top-1 accuracy 48.000, top-5 accuracy 92.000\n", 642 | "val part1: loss 1.093216\n", 643 | "Checkpoint saved\n", 644 | "BEST TOP1 ACCURACY SO FAR\n", 645 | "part1 Epoch 47 / 100\n", 646 | "train part1: batch 0/29, loss 0.596, top-1 accuracy 76.000, top-5 accuracy 100.000\n", 647 | "train part1: loss 0.827810\n", 648 | "val part1: batch 0/59, loss 1.288, top-1 accuracy 50.000, top-5 accuracy 96.000\n", 649 | "val part1: loss 1.070031\n", 650 | "Checkpoint saved\n", 651 | "BEST TOP1 ACCURACY SO FAR\n", 652 | "part1 Epoch 48 / 100\n", 653 | "train part1: batch 0/29, loss 0.810, top-1 accuracy 72.000, top-5 accuracy 100.000\n", 654 | "train part1: loss 0.775189\n", 655 | "val part1: batch 0/59, loss 0.921, top-1 accuracy 66.000, top-5 accuracy 98.000\n", 656 | "val part1: loss 1.524725\n", 657 | "Checkpoint saved\n", 658 | "part1 Epoch 49 / 100\n", 659 | "train part1: batch 0/29, loss 0.775, top-1 accuracy 70.000, top-5 accuracy 96.000\n", 660 | "train part1: loss 0.709857\n", 661 | "val part1: batch 0/59, loss 0.957, top-1 accuracy 62.000, top-5 accuracy 96.000\n", 662 | "val part1: loss 1.261353\n", 663 | "Checkpoint saved\n", 664 | "part1 Epoch 50 / 100\n", 665 | "train part1: batch 0/29, loss 0.730, top-1 accuracy 78.000, top-5 accuracy 98.000\n", 666 | "train part1: loss 0.777712\n", 667 | "val part1: batch 0/59, loss 
1.595, top-1 accuracy 42.000, top-5 accuracy 88.000\n", 668 | "val part1: loss 1.394617\n", 669 | "Checkpoint saved\n", 670 | "part1 Epoch 51 / 100\n", 671 | "train part1: batch 0/29, loss 0.627, top-1 accuracy 76.000, top-5 accuracy 100.000\n", 672 | "train part1: loss 0.717591\n", 673 | "val part1: batch 0/59, loss 1.419, top-1 accuracy 46.000, top-5 accuracy 88.000\n", 674 | "val part1: loss 1.157158\n", 675 | "Checkpoint saved\n", 676 | "part1 Epoch 52 / 100\n", 677 | "train part1: batch 0/29, loss 0.915, top-1 accuracy 68.000, top-5 accuracy 96.000\n", 678 | "train part1: loss 0.684333\n", 679 | "val part1: batch 0/59, loss 1.151, top-1 accuracy 62.000, top-5 accuracy 92.000\n", 680 | "val part1: loss 1.355746\n", 681 | "Checkpoint saved\n", 682 | "part1 Epoch 53 / 100\n", 683 | "train part1: batch 0/29, loss 0.533, top-1 accuracy 82.000, top-5 accuracy 100.000\n", 684 | "train part1: loss 0.650421\n", 685 | "val part1: batch 0/59, loss 0.721, top-1 accuracy 76.000, top-5 accuracy 98.000\n", 686 | "val part1: loss 1.270329\n", 687 | "Checkpoint saved\n", 688 | "part1 Epoch 54 / 100\n", 689 | "train part1: batch 0/29, loss 0.678, top-1 accuracy 78.000, top-5 accuracy 100.000\n", 690 | "train part1: loss 0.660190\n", 691 | "val part1: batch 0/59, loss 2.038, top-1 accuracy 20.000, top-5 accuracy 86.000\n", 692 | "val part1: loss 1.179264\n", 693 | "Checkpoint saved\n", 694 | "part1 Epoch 55 / 100\n", 695 | "train part1: batch 0/29, loss 0.483, top-1 accuracy 90.000, top-5 accuracy 100.000\n", 696 | "train part1: loss 0.721452\n", 697 | "val part1: batch 0/59, loss 2.422, top-1 accuracy 20.000, top-5 accuracy 84.000\n", 698 | "val part1: loss 1.309595\n", 699 | "Checkpoint saved\n", 700 | "part1 Epoch 56 / 100\n", 701 | "train part1: batch 0/29, loss 0.583, top-1 accuracy 84.000, top-5 accuracy 96.000\n", 702 | "train part1: loss 0.681370\n", 703 | "val part1: batch 0/59, loss 1.417, top-1 accuracy 48.000, top-5 accuracy 86.000\n", 704 | "val part1: loss 1.156318\n", 705 | "Checkpoint saved\n", 706 | "part1 Epoch 57 / 100\n", 707 | "train part1: batch 0/29, loss 0.540, top-1 accuracy 80.000, top-5 accuracy 98.000\n", 708 | "train part1: loss 0.670764\n", 709 | "val part1: batch 0/59, loss 1.354, top-1 accuracy 50.000, top-5 accuracy 92.000\n", 710 | "val part1: loss 1.246445\n", 711 | "Checkpoint saved\n", 712 | "part1 Epoch 58 / 100\n", 713 | "train part1: batch 0/29, loss 0.406, top-1 accuracy 84.000, top-5 accuracy 100.000\n", 714 | "train part1: loss 0.616679\n", 715 | "val part1: batch 0/59, loss 1.581, top-1 accuracy 48.000, top-5 accuracy 88.000\n", 716 | "val part1: loss 1.153193\n", 717 | "Checkpoint saved\n", 718 | "part1 Epoch 59 / 100\n", 719 | "train part1: batch 0/29, loss 0.546, top-1 accuracy 84.000, top-5 accuracy 96.000\n", 720 | "train part1: loss 0.588246\n", 721 | "val part1: batch 0/59, loss 1.056, top-1 accuracy 58.000, top-5 accuracy 98.000\n", 722 | "val part1: loss 1.489484\n", 723 | "Checkpoint saved\n", 724 | "part1 Epoch 60 / 100\n", 725 | "train part1: batch 0/29, loss 0.505, top-1 accuracy 84.000, top-5 accuracy 98.000\n", 726 | "train part1: loss 0.497765\n", 727 | "val part1: batch 0/59, loss 1.450, top-1 accuracy 48.000, top-5 accuracy 92.000\n", 728 | "val part1: loss 1.059363\n", 729 | "Checkpoint saved\n", 730 | "BEST TOP1 ACCURACY SO FAR\n", 731 | "part1 Epoch 61 / 100\n", 732 | "train part1: batch 0/29, loss 0.479, top-1 accuracy 88.000, top-5 accuracy 96.000\n", 733 | "train part1: loss 0.475674\n", 734 | "val part1: batch 0/59, loss 1.289, top-1 
accuracy 50.000, top-5 accuracy 94.000\n", 735 | "val part1: loss 1.053961\n", 736 | "Checkpoint saved\n", 737 | "BEST TOP1 ACCURACY SO FAR\n", 738 | "part1 Epoch 62 / 100\n" 739 | ] 740 | }, 741 | { 742 | "name": "stdout", 743 | "output_type": "stream", 744 | "text": [ 745 | "train part1: batch 0/29, loss 0.450, top-1 accuracy 90.000, top-5 accuracy 98.000\n", 746 | "train part1: loss 0.456607\n", 747 | "val part1: batch 0/59, loss 1.264, top-1 accuracy 50.000, top-5 accuracy 96.000\n", 748 | "val part1: loss 1.069520\n", 749 | "Checkpoint saved\n", 750 | "part1 Epoch 63 / 100\n", 751 | "train part1: batch 0/29, loss 0.425, top-1 accuracy 88.000, top-5 accuracy 100.000\n", 752 | "train part1: loss 0.444092\n", 753 | "val part1: batch 0/59, loss 1.291, top-1 accuracy 50.000, top-5 accuracy 92.000\n", 754 | "val part1: loss 1.065126\n", 755 | "Checkpoint saved\n", 756 | "part1 Epoch 64 / 100\n", 757 | "train part1: batch 0/29, loss 0.348, top-1 accuracy 92.000, top-5 accuracy 100.000\n", 758 | "train part1: loss 0.411879\n", 759 | "val part1: batch 0/59, loss 1.152, top-1 accuracy 50.000, top-5 accuracy 96.000\n", 760 | "val part1: loss 1.063277\n", 761 | "Checkpoint saved\n", 762 | "part1 Epoch 65 / 100\n", 763 | "train part1: batch 0/29, loss 0.652, top-1 accuracy 80.000, top-5 accuracy 98.000\n", 764 | "train part1: loss 0.439077\n", 765 | "val part1: batch 0/59, loss 1.234, top-1 accuracy 50.000, top-5 accuracy 94.000\n", 766 | "val part1: loss 1.065789\n", 767 | "Checkpoint saved\n", 768 | "part1 Epoch 66 / 100\n", 769 | "train part1: batch 0/29, loss 0.430, top-1 accuracy 90.000, top-5 accuracy 98.000\n", 770 | "train part1: loss 0.384757\n", 771 | "val part1: batch 0/59, loss 1.307, top-1 accuracy 50.000, top-5 accuracy 92.000\n", 772 | "val part1: loss 1.048810\n", 773 | "Checkpoint saved\n", 774 | "BEST TOP1 ACCURACY SO FAR\n", 775 | "part1 Epoch 67 / 100\n", 776 | "train part1: batch 0/29, loss 0.330, top-1 accuracy 92.000, top-5 accuracy 100.000\n", 777 | "train part1: loss 0.396477\n", 778 | "val part1: batch 0/59, loss 1.360, top-1 accuracy 50.000, top-5 accuracy 90.000\n", 779 | "val part1: loss 1.063634\n", 780 | "Checkpoint saved\n", 781 | "part1 Epoch 68 / 100\n", 782 | "train part1: batch 0/29, loss 0.282, top-1 accuracy 90.000, top-5 accuracy 100.000\n", 783 | "train part1: loss 0.399306\n", 784 | "val part1: batch 0/59, loss 1.261, top-1 accuracy 50.000, top-5 accuracy 96.000\n", 785 | "val part1: loss 1.075185\n", 786 | "Checkpoint saved\n", 787 | "part1 Epoch 69 / 100\n", 788 | "train part1: batch 0/29, loss 0.375, top-1 accuracy 90.000, top-5 accuracy 100.000\n", 789 | "train part1: loss 0.397953\n", 790 | "val part1: batch 0/59, loss 1.339, top-1 accuracy 48.000, top-5 accuracy 90.000\n", 791 | "val part1: loss 1.067044\n", 792 | "Checkpoint saved\n", 793 | "part1 Epoch 70 / 100\n", 794 | "train part1: batch 0/29, loss 0.575, top-1 accuracy 78.000, top-5 accuracy 100.000\n", 795 | "train part1: loss 0.385016\n", 796 | "val part1: batch 0/59, loss 1.199, top-1 accuracy 50.000, top-5 accuracy 96.000\n", 797 | "val part1: loss 1.080763\n", 798 | "Checkpoint saved\n", 799 | "part1 Epoch 71 / 100\n", 800 | "train part1: batch 0/29, loss 0.439, top-1 accuracy 82.000, top-5 accuracy 100.000\n", 801 | "train part1: loss 0.378764\n", 802 | "val part1: batch 0/59, loss 1.228, top-1 accuracy 50.000, top-5 accuracy 94.000\n", 803 | "val part1: loss 1.090090\n", 804 | "Checkpoint saved\n", 805 | "part1 Epoch 72 / 100\n", 806 | "train part1: batch 0/29, loss 0.336, top-1 accuracy 
90.000, top-5 accuracy 100.000\n", 807 | "train part1: loss 0.343431\n", 808 | "val part1: batch 0/59, loss 1.270, top-1 accuracy 50.000, top-5 accuracy 92.000\n", 809 | "val part1: loss 1.083456\n", 810 | "Checkpoint saved\n", 811 | "part1 Epoch 73 / 100\n", 812 | "train part1: batch 0/29, loss 0.366, top-1 accuracy 90.000, top-5 accuracy 100.000\n", 813 | "train part1: loss 0.372473\n", 814 | "val part1: batch 0/59, loss 1.341, top-1 accuracy 50.000, top-5 accuracy 90.000\n", 815 | "val part1: loss 1.078837\n", 816 | "Checkpoint saved\n", 817 | "BEST TOP1 ACCURACY SO FAR\n", 818 | "part1 Epoch 74 / 100\n", 819 | "train part1: batch 0/29, loss 0.340, top-1 accuracy 86.000, top-5 accuracy 100.000\n", 820 | "train part1: loss 0.366458\n", 821 | "val part1: batch 0/59, loss 1.193, top-1 accuracy 50.000, top-5 accuracy 94.000\n", 822 | "val part1: loss 1.083876\n", 823 | "Checkpoint saved\n", 824 | "part1 Epoch 75 / 100\n", 825 | "train part1: batch 0/29, loss 0.323, top-1 accuracy 90.000, top-5 accuracy 100.000\n", 826 | "train part1: loss 0.358552\n", 827 | "val part1: batch 0/59, loss 1.314, top-1 accuracy 50.000, top-5 accuracy 94.000\n", 828 | "val part1: loss 1.082528\n", 829 | "Checkpoint saved\n", 830 | "part1 Epoch 76 / 100\n", 831 | "train part1: batch 0/29, loss 0.336, top-1 accuracy 90.000, top-5 accuracy 100.000\n", 832 | "train part1: loss 0.370824\n", 833 | "val part1: batch 0/59, loss 1.288, top-1 accuracy 50.000, top-5 accuracy 90.000\n", 834 | "val part1: loss 1.092666\n", 835 | "Checkpoint saved\n", 836 | "part1 Epoch 77 / 100\n", 837 | "train part1: batch 0/29, loss 0.232, top-1 accuracy 94.000, top-5 accuracy 100.000\n", 838 | "train part1: loss 0.357634\n", 839 | "val part1: batch 0/59, loss 1.152, top-1 accuracy 52.000, top-5 accuracy 96.000\n", 840 | "val part1: loss 1.100708\n", 841 | "Checkpoint saved\n", 842 | "part1 Epoch 78 / 100\n", 843 | "train part1: batch 0/29, loss 0.308, top-1 accuracy 90.000, top-5 accuracy 100.000\n", 844 | "train part1: loss 0.343843\n", 845 | "val part1: batch 0/59, loss 1.376, top-1 accuracy 50.000, top-5 accuracy 90.000\n", 846 | "val part1: loss 1.085893\n", 847 | "Checkpoint saved\n", 848 | "part1 Epoch 79 / 100\n", 849 | "train part1: batch 0/29, loss 0.320, top-1 accuracy 90.000, top-5 accuracy 98.000\n", 850 | "train part1: loss 0.328905\n", 851 | "val part1: batch 0/59, loss 1.325, top-1 accuracy 50.000, top-5 accuracy 92.000\n", 852 | "val part1: loss 1.093788\n", 853 | "Checkpoint saved\n", 854 | "part1 Epoch 80 / 100\n", 855 | "train part1: batch 0/29, loss 0.201, top-1 accuracy 94.000, top-5 accuracy 100.000\n", 856 | "train part1: loss 0.318215\n", 857 | "val part1: batch 0/59, loss 1.261, top-1 accuracy 50.000, top-5 accuracy 92.000\n", 858 | "val part1: loss 1.092609\n", 859 | "Checkpoint saved\n", 860 | "part1 Epoch 81 / 100\n", 861 | "train part1: batch 0/29, loss 0.369, top-1 accuracy 92.000, top-5 accuracy 100.000\n", 862 | "train part1: loss 0.322468\n", 863 | "val part1: batch 0/59, loss 1.316, top-1 accuracy 50.000, top-5 accuracy 92.000\n", 864 | "val part1: loss 1.091750\n", 865 | "Checkpoint saved\n", 866 | "part1 Epoch 82 / 100\n", 867 | "train part1: batch 0/29, loss 0.309, top-1 accuracy 88.000, top-5 accuracy 98.000\n", 868 | "train part1: loss 0.329091\n", 869 | "val part1: batch 0/59, loss 1.280, top-1 accuracy 50.000, top-5 accuracy 94.000\n", 870 | "val part1: loss 1.110126\n", 871 | "Checkpoint saved\n", 872 | "part1 Epoch 83 / 100\n", 873 | "train part1: batch 0/29, loss 0.400, top-1 accuracy 84.000, 
top-5 accuracy 100.000\n", 874 | "train part1: loss 0.299104\n", 875 | "val part1: batch 0/59, loss 1.431, top-1 accuracy 50.000, top-5 accuracy 92.000\n", 876 | "val part1: loss 1.106963\n", 877 | "Checkpoint saved\n", 878 | "part1 Epoch 84 / 100\n", 879 | "train part1: batch 0/29, loss 0.287, top-1 accuracy 88.000, top-5 accuracy 100.000\n", 880 | "train part1: loss 0.319058\n", 881 | "val part1: batch 0/59, loss 1.241, top-1 accuracy 50.000, top-5 accuracy 92.000\n", 882 | "val part1: loss 1.117356\n", 883 | "Checkpoint saved\n", 884 | "part1 Epoch 85 / 100\n", 885 | "train part1: batch 0/29, loss 0.307, top-1 accuracy 90.000, top-5 accuracy 100.000\n", 886 | "train part1: loss 0.344792\n", 887 | "val part1: batch 0/59, loss 1.448, top-1 accuracy 50.000, top-5 accuracy 92.000\n", 888 | "val part1: loss 1.112332\n", 889 | "Checkpoint saved\n", 890 | "part1 Epoch 86 / 100\n", 891 | "train part1: batch 0/29, loss 0.312, top-1 accuracy 88.000, top-5 accuracy 100.000\n", 892 | "train part1: loss 0.310987\n", 893 | "val part1: batch 0/59, loss 1.231, top-1 accuracy 52.000, top-5 accuracy 92.000\n", 894 | "val part1: loss 1.104170\n", 895 | "Checkpoint saved\n", 896 | "part1 Epoch 87 / 100\n", 897 | "train part1: batch 0/29, loss 0.398, top-1 accuracy 92.000, top-5 accuracy 98.000\n", 898 | "train part1: loss 0.331794\n", 899 | "val part1: batch 0/59, loss 1.287, top-1 accuracy 50.000, top-5 accuracy 90.000\n", 900 | "val part1: loss 1.106151\n", 901 | "Checkpoint saved\n", 902 | "part1 Epoch 88 / 100\n", 903 | "train part1: batch 0/29, loss 0.208, top-1 accuracy 94.000, top-5 accuracy 100.000\n", 904 | "train part1: loss 0.305059\n", 905 | "val part1: batch 0/59, loss 1.418, top-1 accuracy 50.000, top-5 accuracy 90.000\n", 906 | "val part1: loss 1.113987\n", 907 | "Checkpoint saved\n", 908 | "part1 Epoch 89 / 100\n", 909 | "train part1: batch 0/29, loss 0.387, top-1 accuracy 86.000, top-5 accuracy 100.000\n", 910 | "train part1: loss 0.329037\n", 911 | "val part1: batch 0/59, loss 1.205, top-1 accuracy 52.000, top-5 accuracy 94.000\n", 912 | "val part1: loss 1.121447\n", 913 | "Checkpoint saved\n", 914 | "part1 Epoch 90 / 100\n", 915 | "train part1: batch 0/29, loss 0.536, top-1 accuracy 82.000, top-5 accuracy 98.000\n", 916 | "train part1: loss 0.317368\n", 917 | "val part1: batch 0/59, loss 1.574, top-1 accuracy 48.000, top-5 accuracy 90.000\n", 918 | "val part1: loss 1.114163\n", 919 | "Checkpoint saved\n", 920 | "part1 Epoch 91 / 100\n", 921 | "train part1: batch 0/29, loss 0.359, top-1 accuracy 90.000, top-5 accuracy 100.000\n", 922 | "train part1: loss 0.299202\n", 923 | "val part1: batch 0/59, loss 1.324, top-1 accuracy 50.000, top-5 accuracy 92.000\n", 924 | "val part1: loss 1.111513\n", 925 | "Checkpoint saved\n", 926 | "part1 Epoch 92 / 100\n", 927 | "train part1: batch 0/29, loss 0.584, top-1 accuracy 76.000, top-5 accuracy 100.000\n", 928 | "train part1: loss 0.292010\n", 929 | "val part1: batch 0/59, loss 1.230, top-1 accuracy 52.000, top-5 accuracy 92.000\n", 930 | "val part1: loss 1.121627\n", 931 | "Checkpoint saved\n", 932 | "part1 Epoch 93 / 100\n", 933 | "train part1: batch 0/29, loss 0.291, top-1 accuracy 88.000, top-5 accuracy 100.000\n", 934 | "train part1: loss 0.299472\n", 935 | "val part1: batch 0/59, loss 1.316, top-1 accuracy 50.000, top-5 accuracy 92.000\n", 936 | "val part1: loss 1.112493\n", 937 | "Checkpoint saved\n", 938 | "part1 Epoch 94 / 100\n", 939 | "train part1: batch 0/29, loss 0.269, top-1 accuracy 90.000, top-5 accuracy 100.000\n" 940 | ] 941 | }, 942 
| { 943 | "name": "stdout", 944 | "output_type": "stream", 945 | "text": [ 946 | "train part1: loss 0.301035\n", 947 | "val part1: batch 0/59, loss 1.374, top-1 accuracy 50.000, top-5 accuracy 90.000\n", 948 | "val part1: loss 1.111464\n", 949 | "Checkpoint saved\n", 950 | "part1 Epoch 95 / 100\n", 951 | "train part1: batch 0/29, loss 0.404, top-1 accuracy 92.000, top-5 accuracy 96.000\n", 952 | "train part1: loss 0.297892\n", 953 | "val part1: batch 0/59, loss 1.367, top-1 accuracy 50.000, top-5 accuracy 92.000\n", 954 | "val part1: loss 1.120183\n", 955 | "Checkpoint saved\n", 956 | "part1 Epoch 96 / 100\n", 957 | "train part1: batch 0/29, loss 0.249, top-1 accuracy 94.000, top-5 accuracy 100.000\n", 958 | "train part1: loss 0.303044\n", 959 | "val part1: batch 0/59, loss 1.392, top-1 accuracy 50.000, top-5 accuracy 92.000\n", 960 | "val part1: loss 1.126989\n", 961 | "Checkpoint saved\n", 962 | "part1 Epoch 97 / 100\n", 963 | "train part1: batch 0/29, loss 0.165, top-1 accuracy 96.000, top-5 accuracy 100.000\n", 964 | "train part1: loss 0.276733\n", 965 | "val part1: batch 0/59, loss 1.391, top-1 accuracy 52.000, top-5 accuracy 92.000\n", 966 | "val part1: loss 1.127524\n", 967 | "Checkpoint saved\n", 968 | "part1 Epoch 98 / 100\n", 969 | "train part1: batch 0/29, loss 0.160, top-1 accuracy 94.000, top-5 accuracy 100.000\n", 970 | "train part1: loss 0.280525\n", 971 | "val part1: batch 0/59, loss 1.274, top-1 accuracy 52.000, top-5 accuracy 92.000\n", 972 | "val part1: loss 1.138553\n", 973 | "Checkpoint saved\n", 974 | "BEST TOP1 ACCURACY SO FAR\n", 975 | "part1 Epoch 99 / 100\n", 976 | "train part1: batch 0/29, loss 0.119, top-1 accuracy 98.000, top-5 accuracy 100.000\n", 977 | "train part1: loss 0.270406\n", 978 | "val part1: batch 0/59, loss 1.260, top-1 accuracy 52.000, top-5 accuracy 92.000\n", 979 | "val part1: loss 1.146958\n", 980 | "Checkpoint saved\n", 981 | "Best top-1 Accuracy = 69.916\n" 982 | ] 983 | } 984 | ], 985 | "source": [ 986 | "# Train the network!\n", 987 | "trainer = Trainer(train_dataset, test_dataset, model, loss_function, optimizer, lr_scheduler, params)\n", 988 | "best_prec1 = trainer.train_val()\n", 989 | "print('Best top-1 Accuracy = {:4.3f}'.format(best_prec1))" 990 | ] 991 | }, 992 | { 993 | "cell_type": "markdown", 994 | "metadata": {}, 995 | "source": [ 996 | "Make sure you get at least 50% accuracy in this section! If you used different settings from the ones provided to reach 50%, you should modify custom_part1_trainer in student_code to return a dictionary with your changed settings. " 997 | ] 998 | }, 999 | { 1000 | "cell_type": "markdown", 1001 | "metadata": {}, 1002 | "source": [ 1003 | "## Part 2. Fine-Tuning a Pre-Trained Network" 1004 | ] 1005 | }, 1006 | { 1007 | "cell_type": "code", 1008 | "execution_count": 9, 1009 | "metadata": { 1010 | "pycharm": { 1011 | "is_executing": false 1012 | } 1013 | }, 1014 | "outputs": [], 1015 | "source": [ 1016 | "# Fix random seeds so that results will be reproducible\n", 1017 | "set_seed(0, use_GPU)" 1018 | ] 1019 | }, 1020 | { 1021 | "cell_type": "markdown", 1022 | "metadata": {}, 1023 | "source": [ 1024 | "Training a network from scratch takes a lot of time. Instead of training from scratch, we can take a pre-trained model and fine-tune it for our purposes. This is the goal of Part 2--you will fine-tune a pre-trained network and achieve at least 80% accuracy. 
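As a hedged sketch of what fine-tuning means in PyTorch (one common pattern, not the exact approach used in this notebook, which instead replaces classifier layers inside create_part2_model and hands only selected parameters to the optimizer):

```python
import torch.nn as nn
from torchvision.models import alexnet

# Sketch of the fine-tuning idea: reuse pretrained features, retrain only a new head.
net = alexnet(pretrained=True)
for p in net.parameters():
    p.requires_grad = False              # freeze the pretrained weights
net.classifier[6] = nn.Linear(4096, 15)  # new, trainable 15-way scene classifier
```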
" 1025 | ] 1026 | }, 1027 | { 1028 | "cell_type": "code", 1029 | "execution_count": 10, 1030 | "metadata": { 1031 | "pycharm": { 1032 | "is_executing": false 1033 | } 1034 | }, 1035 | "outputs": [], 1036 | "source": [ 1037 | "# training parameters\n", 1038 | "input_size = (224, 224)\n", 1039 | "RGB = True\n", 1040 | "base_lr = 1e-3\n", 1041 | "weight_decay = 5e-4\n", 1042 | "momentum = 0.9\n", 1043 | "backprop_depth = 3" 1044 | ] 1045 | }, 1046 | { 1047 | "cell_type": "code", 1048 | "execution_count": 11, 1049 | "metadata": { 1050 | "pycharm": { 1051 | "is_executing": false 1052 | }, 1053 | "scrolled": true 1054 | }, 1055 | "outputs": [ 1056 | { 1057 | "name": "stdout", 1058 | "output_type": "stream", 1059 | "text": [ 1060 | "Computing pixel mean and stdev...\n", 1061 | "Batch 0 / 30\n", 1062 | "Batch 20 / 30\n", 1063 | "Done, mean = \n", 1064 | "[0.45586014 0.45586014 0.45586014]\n", 1065 | "std = \n", 1066 | "[0.24808612 0.24808612 0.24808612]\n", 1067 | "Computing pixel mean and stdev...\n", 1068 | "Batch 0 / 60\n", 1069 | "Batch 20 / 60\n", 1070 | "Batch 40 / 60\n", 1071 | "Done, mean = \n", 1072 | "[0.45524448 0.45524448 0.45524448]\n", 1073 | "std = \n", 1074 | "[0.24719196 0.24719196 0.24719196]\n" 1075 | ] 1076 | } 1077 | ], 1078 | "source": [ 1079 | "# Create the training and testing datasets.\n", 1080 | "train_dataset, test_dataset = sc.create_datasets(data_path=data_path, input_size=input_size, rgb=RGB)\n", 1081 | "assert test_dataset.classes == train_dataset.classes" 1082 | ] 1083 | }, 1084 | { 1085 | "cell_type": "markdown", 1086 | "metadata": {}, 1087 | "source": [ 1088 | "Following block loads a pretrained AlexNet." 1089 | ] 1090 | }, 1091 | { 1092 | "cell_type": "code", 1093 | "execution_count": 12, 1094 | "metadata": { 1095 | "pycharm": { 1096 | "is_executing": false 1097 | } 1098 | }, 1099 | "outputs": [ 1100 | { 1101 | "name": "stdout", 1102 | "output_type": "stream", 1103 | "text": [ 1104 | "AlexNet(\n", 1105 | " (features): Sequential(\n", 1106 | " (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))\n", 1107 | " (1): ReLU(inplace=True)\n", 1108 | " (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)\n", 1109 | " (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))\n", 1110 | " (4): ReLU(inplace=True)\n", 1111 | " (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)\n", 1112 | " (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", 1113 | " (7): ReLU(inplace=True)\n", 1114 | " (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", 1115 | " (9): ReLU(inplace=True)\n", 1116 | " (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", 1117 | " (11): ReLU(inplace=True)\n", 1118 | " (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)\n", 1119 | " )\n", 1120 | " (avgpool): AdaptiveAvgPool2d(output_size=(6, 6))\n", 1121 | " (classifier): Sequential(\n", 1122 | " (0): Dropout(p=0.5, inplace=False)\n", 1123 | " (1): Linear(in_features=9216, out_features=4096, bias=True)\n", 1124 | " (2): ReLU(inplace=True)\n", 1125 | " (3): Dropout(p=0.5, inplace=False)\n", 1126 | " (4): Linear(in_features=4096, out_features=4096, bias=True)\n", 1127 | " (5): ReLU(inplace=True)\n", 1128 | " (6): Linear(in_features=4096, out_features=1000, bias=True)\n", 1129 | " )\n", 1130 | ")\n" 1131 | ] 1132 | } 1133 | ], 1134 | "source": [ 1135 | "# Create the network model.\n", 1136 | "model = alexnet(pretrained=True)\n", 
1137 | "print(model)" 1138 | ] 1139 | }, 1140 | { 1141 | "cell_type": "markdown", 1142 | "metadata": {}, 1143 | "source": [ 1144 | "Now, you modify create_part2_model from student code in order to fine-tune AlexNet. As you can see in the docs (https://github.com/pytorch/vision/blob/master/torchvision/models/alexnet.py) and in the model printout above, AlexNet has 2 parts: 'features', which constists of conv layers that extract feature maps from the image, and 'classifier' which consists of FC layers that classify the features. We want to replace the last Linear layer in model.classifier. " 1145 | ] 1146 | }, 1147 | { 1148 | "cell_type": "code", 1149 | "execution_count": 13, 1150 | "metadata": { 1151 | "pycharm": { 1152 | "is_executing": false 1153 | }, 1154 | "scrolled": true 1155 | }, 1156 | "outputs": [ 1157 | { 1158 | "name": "stdout", 1159 | "output_type": "stream", 1160 | "text": [ 1161 | "Linear(in_features=4096, out_features=1000, bias=True)\n", 1162 | "AlexNet(\n", 1163 | " (features): Sequential(\n", 1164 | " (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))\n", 1165 | " (1): ReLU(inplace=True)\n", 1166 | " (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)\n", 1167 | " (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))\n", 1168 | " (4): ReLU(inplace=True)\n", 1169 | " (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)\n", 1170 | " (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", 1171 | " (7): ReLU(inplace=True)\n", 1172 | " (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", 1173 | " (9): ReLU(inplace=True)\n", 1174 | " (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", 1175 | " (11): ReLU(inplace=True)\n", 1176 | " (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)\n", 1177 | " )\n", 1178 | " (avgpool): AdaptiveAvgPool2d(output_size=(6, 6))\n", 1179 | " (classifier): Sequential(\n", 1180 | " (0): Dropout(p=0.5, inplace=False)\n", 1181 | " (1): Linear(in_features=9216, out_features=4096, bias=True)\n", 1182 | " (2): ReLU(inplace=True)\n", 1183 | " (3): Dropout(p=0.5, inplace=False)\n", 1184 | " (4): Linear(in_features=4096, out_features=4096, bias=True)\n", 1185 | " (5): ReLU(inplace=True)\n", 1186 | " (6): Linear(in_features=4096, out_features=128, bias=True)\n", 1187 | " (7): ReLU(inplace=True)\n", 1188 | " (8): Linear(in_features=128, out_features=15, bias=True)\n", 1189 | " )\n", 1190 | ")\n" 1191 | ] 1192 | } 1193 | ], 1194 | "source": [ 1195 | "model = sc.create_part2_model(model, num_classes)\n", 1196 | "if use_GPU:\n", 1197 | " model = model.cuda()\n", 1198 | "print(model)" 1199 | ] 1200 | }, 1201 | { 1202 | "cell_type": "markdown", 1203 | "metadata": {}, 1204 | "source": [ 1205 | "Next we will create the loss function and the optimizer. Just as with part 1, if you modify any of the setttings to hit the required accuracy, you must modify custom_part2_trainer function to return a dictionary containing your changes. " 1206 | ] 1207 | }, 1208 | { 1209 | "cell_type": "code", 1210 | "execution_count": 14, 1211 | "metadata": { 1212 | "pycharm": { 1213 | "is_executing": false 1214 | } 1215 | }, 1216 | "outputs": [], 1217 | "source": [ 1218 | "# Set up the trainer. 
You can modify custom_part2_trainer in\n", 1219 | "# student_copy.py if you want to try different learning settings.\n", 1220 | "custom_part2_trainer = sc.custom_part2_trainer(model)\n", 1221 | "\n", 1222 | "if custom_part2_trainer is None:\n", 1223 | " # Create the loss function\n", 1224 | " # see http://pytorch.org/docs/0.3.0/nn.html#loss-functions for a list of available loss functions\n", 1225 | " loss_function = nn.CrossEntropyLoss()\n", 1226 | "\n", 1227 | " # Since we do not want to optimize the whole network, we must extract a list of parameters of interest that will be\n", 1228 | " # optimized by the optimizer.\n", 1229 | " params_to_optimize = []\n", 1230 | "\n", 1231 | " # List of modules in the network\n", 1232 | " mods = list(model.features.children()) + list(model.classifier.children())\n", 1233 | "\n", 1234 | " # Extract parameters from the last `backprop_depth` modules in the network and collect them in\n", 1235 | " # the params_to_optimize list.\n", 1236 | " for m in mods[::-1][:backprop_depth]:\n", 1237 | " params_to_optimize.extend(list(m.parameters()))\n", 1238 | "\n", 1239 | " # Construct the optimizer \n", 1240 | " optimizer = optim.SGD(params=params_to_optimize, lr=base_lr, weight_decay=weight_decay, momentum=momentum)\n", 1241 | "\n", 1242 | " # Create a scheduler, currently a simple step scheduler, but you can get creative.\n", 1243 | " # See http://pytorch.org/docs/0.3.0/optim.html#how-to-adjust-learning-rate for various LR schedulers\n", 1244 | " # and how to use them\n", 1245 | " lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)\n", 1246 | " \n", 1247 | " params = {'n_epochs': 4, 'batch_size': 10, 'experiment': 'part2'} \n", 1248 | " \n", 1249 | "else:\n", 1250 | " if 'loss_function' in custom_part2_trainer:\n", 1251 | " loss_function = custom_part2_trainer['loss_function']\n", 1252 | " if 'optimizer' in custom_part2_trainer:\n", 1253 | " optimizer = custom_part2_trainer['optimizer']\n", 1254 | " if 'lr_scheduler' in custom_part2_trainer:\n", 1255 | " lr_scheduler = custom_part2_trainer['lr_scheduler']\n", 1256 | " if 'params' in custom_part2_trainer:\n", 1257 | " params = custom_part2_trainer['params']" 1258 | ] 1259 | }, 1260 | { 1261 | "cell_type": "markdown", 1262 | "metadata": {}, 1263 | "source": [ 1264 | "We are ready to fine tune our network! Just like before, we will start a local server to see the training progress of our network. Open a new terminal and activate the environment for this project. Then run the following command: **python -m visdom.server**. This will start a local server. The terminal output should give out a link like: \"http://localhost:8097\". Open this link in your browser. After you run the following block, visit this link again, and you will be able to see graphs showing the progress of your training! If you do not see any graphs, select Part 2 on the top left bar where is says Environment (only select Part 2, do not check main or Part 1)." 
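To make the parameter selection above concrete: with backprop_depth = 3, the slice mods[::-1][:backprop_depth] walks the module list backwards and keeps the last three classifier modules. A small illustration (module contents assume the modified AlexNet printed earlier; this is not part of the assignment code):

```python
mods = list(model.features.children()) + list(model.classifier.children())
last_three = mods[::-1][:3]  # Linear(128, 15), ReLU, Linear(4096, 128) for the printed model
params_to_optimize = [p for m in last_three for p in m.parameters()]  # the ReLU contributes none
```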
1265 | ] 1266 | }, 1267 | { 1268 | "cell_type": "code", 1269 | "execution_count": 15, 1270 | "metadata": { 1271 | "pycharm": { 1272 | "is_executing": false 1273 | } 1274 | }, 1275 | "outputs": [ 1276 | { 1277 | "name": "stderr", 1278 | "output_type": "stream", 1279 | "text": [ 1280 | "Setting up a new session...\n" 1281 | ] 1282 | }, 1283 | { 1284 | "name": "stdout", 1285 | "output_type": "stream", 1286 | "text": [ 1287 | "---------------------------------------\n", 1288 | "Experiment: part2\n", 1289 | "n_epochs: 4\n", 1290 | "batch_size: 10\n", 1291 | "do_val: True\n", 1292 | "shuffle: True\n", 1293 | "num_workers: 4\n", 1294 | "val_freq: 1\n", 1295 | "print_freq: 100\n", 1296 | "experiment: part2\n", 1297 | "checkpoint_file: None\n", 1298 | "resume_optim: True\n", 1299 | "---------------------------------------\n", 1300 | "part2 Epoch 0 / 4\n", 1301 | "train part2: batch 0/149, loss 2.682, top-1 accuracy 0.000, top-5 accuracy 60.000\n", 1302 | "train part2: batch 100/149, loss 0.757, top-1 accuracy 80.000, top-5 accuracy 90.000\n", 1303 | "train part2: loss 1.222762\n", 1304 | "val part2: batch 0/298, loss 1.443, top-1 accuracy 30.000, top-5 accuracy 100.000\n", 1305 | "val part2: batch 100/298, loss 0.736, top-1 accuracy 80.000, top-5 accuracy 100.000\n", 1306 | "val part2: batch 200/298, loss 0.562, top-1 accuracy 70.000, top-5 accuracy 100.000\n", 1307 | "val part2: loss 0.561482\n", 1308 | "Checkpoint saved\n", 1309 | "BEST TOP1 ACCURACY SO FAR\n", 1310 | "part2 Epoch 1 / 4\n", 1311 | "train part2: batch 0/149, loss 0.417, top-1 accuracy 80.000, top-5 accuracy 100.000\n", 1312 | "train part2: batch 100/149, loss 0.132, top-1 accuracy 100.000, top-5 accuracy 100.000\n", 1313 | "train part2: loss 0.493316\n", 1314 | "val part2: batch 0/298, loss 1.460, top-1 accuracy 30.000, top-5 accuracy 100.000\n", 1315 | "val part2: batch 100/298, loss 0.349, top-1 accuracy 90.000, top-5 accuracy 100.000\n", 1316 | "val part2: batch 200/298, loss 0.410, top-1 accuracy 80.000, top-5 accuracy 100.000\n", 1317 | "val part2: loss 0.458179\n", 1318 | "Checkpoint saved\n", 1319 | "BEST TOP1 ACCURACY SO FAR\n", 1320 | "part2 Epoch 2 / 4\n", 1321 | "train part2: batch 0/149, loss 0.278, top-1 accuracy 90.000, top-5 accuracy 100.000\n", 1322 | "train part2: batch 100/149, loss 0.428, top-1 accuracy 80.000, top-5 accuracy 100.000\n", 1323 | "train part2: loss 0.374604\n", 1324 | "val part2: batch 0/298, loss 1.957, top-1 accuracy 20.000, top-5 accuracy 100.000\n", 1325 | "val part2: batch 100/298, loss 0.265, top-1 accuracy 90.000, top-5 accuracy 100.000\n", 1326 | "val part2: batch 200/298, loss 1.368, top-1 accuracy 60.000, top-5 accuracy 100.000\n", 1327 | "val part2: loss 0.506881\n", 1328 | "Checkpoint saved\n", 1329 | "part2 Epoch 3 / 4\n", 1330 | "train part2: batch 0/149, loss 0.123, top-1 accuracy 100.000, top-5 accuracy 100.000\n", 1331 | "train part2: batch 100/149, loss 0.152, top-1 accuracy 90.000, top-5 accuracy 100.000\n", 1332 | "train part2: loss 0.335377\n", 1333 | "val part2: batch 0/298, loss 0.604, top-1 accuracy 80.000, top-5 accuracy 100.000\n", 1334 | "val part2: batch 100/298, loss 0.329, top-1 accuracy 90.000, top-5 accuracy 100.000\n", 1335 | "val part2: batch 200/298, loss 0.570, top-1 accuracy 70.000, top-5 accuracy 100.000\n", 1336 | "val part2: loss 0.387761\n", 1337 | "Checkpoint saved\n", 1338 | "BEST TOP1 ACCURACY SO FAR\n", 1339 | "Best top-1 Accuracy = 86.533\n" 1340 | ] 1341 | } 1342 | ], 1343 | "source": [ 1344 | "# Train the network!\n", 1345 | "trainer = 
Trainer(train_dataset, test_dataset, model, loss_function, optimizer, lr_scheduler, params)\n", 1346 | "best_prec1 = trainer.train_val()\n", 1347 | "print('Best top-1 Accuracy = {:4.3f}'.format(best_prec1))" 1348 | ] 1349 | }, 1350 | { 1351 | "cell_type": "markdown", 1352 | "metadata": {}, 1353 | "source": [ 1354 | "Expect this code to take around 10 minutes on CPU or 30 seconds on GPU. You should hit 80% accuracy. " 1355 | ] 1356 | } 1357 | ], 1358 | "metadata": { 1359 | "kernelspec": { 1360 | "display_name": "Python 3", 1361 | "language": "python", 1362 | "name": "python3" 1363 | }, 1364 | "language_info": { 1365 | "codemirror_mode": { 1366 | "name": "ipython", 1367 | "version": 3 1368 | }, 1369 | "file_extension": ".py", 1370 | "mimetype": "text/x-python", 1371 | "name": "python", 1372 | "nbconvert_exporter": "python", 1373 | "pygments_lexer": "ipython3", 1374 | "version": "3.7.4" 1375 | }, 1376 | "pycharm": { 1377 | "stem_cell": { 1378 | "cell_type": "raw", 1379 | "metadata": { 1380 | "collapsed": false 1381 | }, 1382 | "source": [] 1383 | } 1384 | }, 1385 | "widgets": { 1386 | "state": {}, 1387 | "version": "1.1.2" 1388 | } 1389 | }, 1390 | "nbformat": 4, 1391 | "nbformat_minor": 2 1392 | } 1393 | -------------------------------------------------------------------------------- /project/proj5/code/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kemo-Huang/CS308-Computer-Vision/bc83936aa04bf0dca86294595632a9e1d0092f9a/project/proj5/code/__init__.py -------------------------------------------------------------------------------- /project/proj5/code/student_code.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torchvision.datasets as datasets 4 | import torchvision.transforms as transforms 5 | import os.path as osp 6 | import utils 7 | 8 | 9 | def create_datasets(data_path, input_size, rgb=False): 10 | """ 11 | This function creates and returns a training data loader and a 12 | testing/validation data loader. The dataloader should also perform some 13 | pre-processing steps on each of the datasets. Most of this function is 14 | implemented for you, you will only need to add a few additional lines. 15 | A data loader in pyTorch is a class inherited from the 16 | torch.utils.data.Dataset abstract class. In this project we will use the 17 | ImageFolder data loader. See 18 | http://pytorch.org/docs/master/torchvision/datasets.html#imagefolder for 19 | details. Although you don't need to for this project, you can also create your 20 | own custom data loader by inheriting from the abstract class and implementing 21 | the __len__() and __getitem__() methods as described in 22 | http://pytorch.org/tutorials/beginner/data_loading_tutorial.html 23 | As mentioned, the data loader should perform any necessary pre-processing 24 | steps on the data (images) and targets (labels). In pyTorch, this is done 25 | with 'transforms', which can be composed (chained together) as shown in 26 | http://pytorch.org/tutorials/beginner/data_loading_tutorial.html#transforms. 27 | While that example implements its own transforms, for this project the 28 | built-in transforms in torchvision.transforms should suffice. See 29 | http://pytorch.org/docs/master/torchvision/transforms.html for the list of 30 | available built-in transforms. 31 | Args: 32 | - data_path: (string) Path to the directory that contains the 'test' and 33 | 'train' data directories. 
34 | - input_size: (w, h) Size of input image. The images will be resized to 35 | this size. 36 | - rgb: (boolean) Flag indicating if input images are RGB or grayscale. If 37 | False, images will be converted to grayscale. 38 | Returns: 39 | - train_dataloader: Dataloader for the training dataset. 40 | - test_dataloader: Dataloader for the testing/validation dataset. 41 | """ 42 | train_data_path = osp.join(data_path, 'train') 43 | test_data_path = osp.join(data_path, 'test') 44 | # Below variables are provided for your convenience. You may or may not need 45 | # all of them. 46 | train_mean, train_std = utils.get_mean_std(train_data_path, input_size, rgb) 47 | test_mean, test_std = utils.get_mean_std(test_data_path, input_size, rgb) 48 | 49 | """ TRAIN DATA TRANSFORMS """ 50 | train_data_tforms = [] 51 | train_data_tforms.append(transforms.Resize(size=max(input_size))) 52 | train_data_tforms.append(transforms.CenterCrop(size=input_size)) 53 | if not rgb: 54 | train_data_tforms.append(transforms.Grayscale()) 55 | 56 | ####################################################################### 57 | # TODO: YOUR CODE HERE # 58 | ####################################################################### 59 | # TODO Add a transformation to you train_data_tforms that left-right mirrors 60 | # the image randomly. Which transformation should you add? 61 | # pass 62 | train_data_tforms.append(transforms.RandomHorizontalFlip(p=0.5)) 63 | # train_data_tforms.append(transforms.RandomAffine(degrees=(-30, 30))) 64 | # Do not move the position of the below line (leave it between the left-right 65 | # mirroring and normalization transformations. 66 | train_data_tforms.append(transforms.ToTensor()) 67 | 68 | # TODO Add a transformation to your train_data_tforms that normalizes the 69 | # tensor by subtracting mean and dividing by std. You may use train_mean, 70 | # test_mean, train_std, or test_std values that are already calculated for 71 | # you. Which mean and std should you use to normalize the data? 72 | # pass 73 | train_data_tforms.append(transforms.Normalize(train_mean, train_std)) 74 | ####################################################################### 75 | # END OF YOUR CODE # 76 | ####################################################################### 77 | train_data_tforms = transforms.Compose(train_data_tforms) 78 | 79 | """ TEST/VALIDATION DATA TRANSFORMS """ 80 | 81 | test_data_tforms = [] 82 | test_data_tforms.append(transforms.Resize(size=max(input_size))) 83 | test_data_tforms.append(transforms.CenterCrop(size=input_size)) 84 | if not rgb: 85 | test_data_tforms.append(transforms.Grayscale()) 86 | test_data_tforms.append(transforms.ToTensor()) 87 | ####################################################################### 88 | # TODO: YOUR CODE HERE # 89 | ####################################################################### 90 | # TODO Add a transformation to your test_data_tforms that normalizes the 91 | # tensor by subtracting mean and dividing by std. You may use train_mean, 92 | # test_mean, train_std, or test_std values that are already calculated for 93 | # you. Which mean and std should you use to normalize the data? 
94 | # pass 95 | test_data_tforms.append(transforms.Normalize(test_mean, test_std)) 96 | ####################################################################### 97 | # END OF YOUR CODE # 98 | ####################################################################### 99 | test_data_tforms = transforms.Compose(test_data_tforms) 100 | 101 | """ DATASET LOADERS """ 102 | # Creating dataset loaders using the transformations specified above. 103 | train_dset = datasets.ImageFolder(root=osp.join(data_path, 'train'), 104 | transform=train_data_tforms) 105 | test_dset = datasets.ImageFolder(root=osp.join(data_path, 'test'), 106 | transform=test_data_tforms) 107 | return train_dset, test_dset 108 | 109 | 110 | class SimpleNet(nn.Module): 111 | """ 112 | This class implements the network model needed for part 1. Network models in 113 | pyTorch are inherited from torch.nn.Module, only require implementing the 114 | __init__() and forward() methods. The backpropagation is handled automatically 115 | by pyTorch. 116 | The __init__() function defines the various operators needed for 117 | the forward pass e.g. conv, batch norm, fully connected, etc. 118 | The forward() defines how these blocks act on the input data to produce the 119 | network output. For hints on how to implement your network model, see the 120 | AlexNet example at 121 | https://github.com/pytorch/vision/blob/master/torchvision/models/alexnet.py 122 | """ 123 | 124 | def __init__(self, num_classes, droprate=0.8, rgb=False, verbose=False): 125 | """ 126 | This is where you set up and initialize your network. A basic network is 127 | already set up for you. You will need to add a few more layers to it as 128 | described. You can refer to https://pytorch.org/docs/stable/nn.html for 129 | documentation. 130 | Args: 131 | - num_classes: (int) Number of output classes. 132 | - droprate: (float) Droprate of the network (used for droppout). 133 | - rgb: (boolean) Flag indicating if input images are RGB or grayscale, used 134 | to set the number of input channels. 135 | - verbose: (boolean) If True a hook is registered to print the size of input 136 | to classifier everytime the forward function is called. 137 | """ 138 | super(SimpleNet, self).__init__() # initialize the parent class, a must 139 | in_channels = 3 if rgb else 1 140 | 141 | """ NETWORK SETUP """ 142 | ##################################################################### 143 | # TODO: YOUR CODE HERE # 144 | ##################################################################### 145 | # TODO modify the simple network 146 | # 1) add one dropout layer after the last relu layer. 147 | # 2) add more convolution, maxpool and relu layers. 148 | # 3) add one batch normalization layer after each convolution/linear layer 149 | # except the last convolution/linear layer of the WHOLE model (meaning 150 | # including the classifier). 
151 | 152 | self.features = nn.Sequential( 153 | # 64 * 64 * 1 154 | nn.Conv2d(in_channels=in_channels, out_channels=12, kernel_size=3, 155 | stride=1, padding=1), 156 | nn.BatchNorm2d(12), 157 | nn.ReLU(inplace=True), 158 | 159 | nn.MaxPool2d(kernel_size=2, stride=2, padding=0), 160 | 161 | nn.Conv2d(in_channels=12, out_channels=24, kernel_size=3, 162 | stride=1, padding=1), 163 | nn.BatchNorm2d(24), 164 | nn.ReLU(inplace=True), 165 | 166 | nn.MaxPool2d(kernel_size=2, stride=2, padding=0), 167 | 168 | nn.Conv2d(in_channels=24, out_channels=48, kernel_size=3, 169 | stride=1, padding=1), 170 | nn.BatchNorm2d(48), 171 | nn.ReLU(inplace=True), 172 | 173 | nn.MaxPool2d(kernel_size=2, stride=2, padding=0), 174 | 175 | nn.Conv2d(in_channels=48, out_channels=96, kernel_size=3, 176 | stride=1, padding=1), 177 | nn.BatchNorm2d(96), 178 | nn.ReLU(inplace=True), 179 | 180 | nn.MaxPool2d(kernel_size=2, stride=2, padding=0), 181 | ) 182 | 183 | self.classifier = nn.Sequential( 184 | nn.Linear(4 * 4 * 96, 1024), 185 | nn.ReLU(True), 186 | nn.Dropout(droprate), 187 | nn.Linear(1024, num_classes), 188 | ) 189 | 190 | ##################################################################### 191 | # END OF YOUR CODE # 192 | ##################################################################### 193 | 194 | """ NETWORK INITIALIZATION """ 195 | for name, m in self.named_modules(): 196 | if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear): 197 | # Initializing weights with randomly sampled numbers from a normal 198 | # distribution. 199 | m.weight.data.normal_(0, 1) 200 | m.weight.data.mul_(1e-2) 201 | if m.bias is not None: 202 | # Initializing biases with zeros. 203 | nn.init.constant_(m.bias.data, 0) 204 | elif isinstance(m, nn.BatchNorm2d): 205 | ################################################################# 206 | # TODO: YOUR CODE HERE # 207 | ################################################################# 208 | # TODO How should you initialize the weights and biases for BatchNorm? 209 | # Initialize them here. 210 | m.weight.data.normal_(0, 1) 211 | # m.weight.data.mul_(1e-2) 212 | if m.bias is not None: 213 | # Initializing biases with zeros. 214 | nn.init.constant_(m.bias.data, 0) 215 | 216 | ################################################################# 217 | # END OF YOUR CODE # 218 | ################################################################# 219 | 220 | if verbose: 221 | # Hook that prints the size of input to classifier every time the forward 222 | # function is called. 223 | self.classifier.register_forward_hook(utils.print_input_size_hook) 224 | 225 | def forward(self, x): 226 | """ 227 | Forward step of the network. 228 | Args: 229 | - x: input data. 230 | Returns: 231 | - x: output of the classifier. 232 | """ 233 | x = self.features(x) 234 | x = self.classifier(torch.flatten(x, 1)) 235 | return x.squeeze() 236 | 237 | 238 | def custom_part1_trainer(model): 239 | # return a dict that contains your customized learning settings. 240 | pass 241 | return None 242 | 243 | 244 | def create_part2_model(model, num_classes): 245 | """ 246 | Take the passed in model and prepare it for finetuning by following the 247 | instructions. 248 | Args: 249 | - model: The model you need to prepare for finetuning. For the purposes of 250 | this project, you will pass in AlexNet. 251 | - num_classes: number of classes the model should output. 252 | Returns: 253 | - model: The model ready to be fine tuned. 254 | """ 255 | # Getting all layers from the input model's classifier. 
256 | new_classifier = list(model.classifier.children()) 257 | print(new_classifier[-1]) 258 | new_classifier = new_classifier[:-1] 259 | ####################################################################### 260 | # TODO: YOUR CODE HERE # 261 | ####################################################################### 262 | # TODO modify the classifier of the model for finetuning. new_classifier is 263 | # now a list containing the layers of the classifier network, the last element 264 | # being the last layer of the classifier. 265 | # 1) Create a linear layer with correct in_features and out_features. What 266 | # should these values be? 267 | # 2) Initialize the weights and the bias in the new linear layer. Look at how 268 | # is the linear layer initialized in SimpleNetPart1. 269 | # 3) Append your new layer to your new_classifier. 270 | 271 | new_linear = nn.Linear(in_features=4096, out_features=128) 272 | if new_linear.bias is not None: 273 | nn.init.constant_(new_linear.bias.data, 0) 274 | new_classifier.append(new_linear) 275 | 276 | new_classifier.append(nn.ReLU(True)) 277 | 278 | new_linear = nn.Linear(in_features=128, out_features=num_classes) 279 | if new_linear.bias is not None: 280 | nn.init.constant_(new_linear.bias.data, 0) 281 | new_classifier.append(new_linear) 282 | 283 | ####################################################################### 284 | # END OF YOUR CODE # 285 | ####################################################################### 286 | # Connecting all layers to form a new classifier. 287 | model.classifier = nn.Sequential(*new_classifier) 288 | 289 | return model 290 | 291 | 292 | def custom_part2_trainer(model): 293 | # return a dict that contains your customized learning settings. 294 | pass 295 | return None 296 | -------------------------------------------------------------------------------- /project/proj5/code/utils.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import os.path as osp 4 | import shutil 5 | import time 6 | import random 7 | import numpy as np 8 | from visdom import Visdom 9 | 10 | import torch 11 | import torch.utils.data 12 | from torch.autograd import Variable 13 | from IPython.core.debugger import set_trace 14 | 15 | import torchvision.transforms as transforms 16 | import torchvision.datasets as datasets 17 | from torch.utils.data import DataLoader 18 | from sklearn.preprocessing import StandardScaler 19 | 20 | 21 | def set_seed(seed, use_GPU=False): 22 | torch.manual_seed(seed) 23 | np.random.seed(seed) 24 | random.seed(seed) 25 | if use_GPU: 26 | torch.cuda.manual_seed(seed) 27 | torch.cuda.manual_seed_all(seed) 28 | torch.backends.cudnn.deterministic = True 29 | 30 | 31 | def print_input_size_hook(self, input, output): 32 | print('Input size to classifier is', input[0].size()) 33 | 34 | 35 | class AverageMeter(object): 36 | """Computes and stores the average and current value""" 37 | 38 | def __init__(self): 39 | self.reset() 40 | 41 | def reset(self): 42 | self.val = 0 43 | self.avg = 0 44 | self.sum = 0 45 | self.count = 0 46 | 47 | def update(self, val, n=1): 48 | self.val = val 49 | self.sum += val * n 50 | self.count += n 51 | self.avg = self.sum / self.count 52 | 53 | 54 | def accuracy(output, target, topk=(1,)): 55 | """Computes the precision@k for the specified values of k""" 56 | maxk = max(topk) 57 | batch_size = target.size(0) 58 | 59 | _, pred = output.topk(maxk, 1, True, True) 60 | pred = pred.t() 61 | correct = pred.eq(target.view(1, 
-1).expand_as(pred)) 62 | 63 | res = [] 64 | for k in topk: 65 | correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) 66 | res.append(correct_k.mul_(100.0 / batch_size)) 67 | return res 68 | 69 | 70 | class Trainer(object): 71 | def __init__(self, train_dataset, val_dataset, model, loss_fn, optimizer, 72 | lr_scheduler, params): 73 | """ 74 | General purpose training script 75 | :param train_dataset: PyTorch dataset that loads training images 76 | :param val_dataset: PyTorch dataset that loads testing / validation images 77 | :param model: Network model 78 | :param optimizer: PyTorch optimizer object 79 | :param lr_scheduler: PyTorch learning rate scheduler object 80 | :param loss_fn: loss function 81 | :param params: dictionary containing parameters for the training process 82 | It can contain the following fields (fields with no default value mentioned 83 | are mandatory): 84 | n_epochs: number of epochs of training 85 | batch_size: batch size for one iteration 86 | do_val: perform validation? (default: True) 87 | shuffle: shuffle training data? (default: True) 88 | num_workers: number of CPU threads for loading data (default: 4) 89 | val_freq: frequency of validation (in number of epochs) (default: 1) 90 | print_freq: progress printing frequency (in number of iterations 91 | (default: 20) 92 | experiment: name of the experiment, used to create logs and checkpoints 93 | checkpoint_file: Name of file with saved weights. Loaded at before 94 | start of training if provided (default: None) 95 | resume_optim: whether to resume optimization from loaded weights 96 | (default: True) 97 | """ 98 | self.model = model 99 | self.loss_fn = loss_fn 100 | self.optimizer = optimizer 101 | self.lr_scheduler = lr_scheduler 102 | self.best_prec1 = -float('inf') 103 | 104 | # parse params with default values 105 | self.config = { 106 | 'n_epochs': params['n_epochs'], 107 | 'batch_size': params['batch_size'], 108 | 'do_val': params.get('do_val', True), 109 | 'shuffle': params.get('shuffle', True), 110 | 'num_workers': params.get('num_workers', 4), 111 | 'val_freq': params.get('val_freq', 1), 112 | 'print_freq': params.get('print_freq', 100), 113 | 'experiment': params['experiment'], 114 | 'checkpoint_file': params.get('checkpoint_file'), 115 | 'resume_optim': params.get('resume_optim', True) 116 | } 117 | 118 | self.logdir = osp.join(os.getcwd(), 'logs', self.config['experiment']) 119 | if not osp.isdir(self.logdir): 120 | os.makedirs(self.logdir) 121 | 122 | # visdom plots 123 | self.vis_env = self.config['experiment'] 124 | self.loss_win = 'loss_win' 125 | self.vis = Visdom() 126 | self.vis.line(X=np.zeros((1, 2)), Y=np.zeros((1, 2)), win=self.loss_win, 127 | opts={'legend': ['train_loss', 'val_loss'], 'xlabel': 'epochs', 128 | 'ylabel': 'loss'}, env=self.vis_env) 129 | self.lr_win = 'lr_win' 130 | self.vis.line(X=np.zeros(1), Y=np.zeros(1), win=self.lr_win, 131 | opts={'legend': ['learning_rate'], 'xlabel': 'epochs', 132 | 'ylabel': 'log(lr)'}, env=self.vis_env) 133 | self.top1_win = 'top1_win' 134 | self.vis.line(X=np.zeros((1, 2)), Y=np.zeros((1, 2)), win=self.top1_win, 135 | opts={'legend': ['train_top1_prec', 'val_top1_prec'], 'xlabel': 'epochs', 136 | 'ylabel': 'top1_prec (%)'}, env=self.vis_env) 137 | self.top5_win = 'top5_win' 138 | self.vis.line(X=np.zeros((1, 2)), Y=np.zeros((1, 2)), win=self.top5_win, 139 | opts={'legend': ['train_top5_prec', 'val_top5_prec'], 'xlabel': 'epochs', 140 | 'ylabel': 'top5_prec (%)'}, env=self.vis_env) 141 | 142 | # log all the command line options 143 | 
print('---------------------------------------') 144 | print('Experiment: {:s}'.format(self.config['experiment'])) 145 | for k, v in self.config.items(): 146 | print('{:s}: {:s}'.format(k, str(v))) 147 | print('---------------------------------------') 148 | 149 | self.start_epoch = int(0) 150 | checkpoint_file = self.config['checkpoint_file'] 151 | if checkpoint_file: 152 | if osp.isfile(checkpoint_file): 153 | checkpoint = torch.load(checkpoint_file) 154 | self.model.load_state_dict(checkpoint['model_state_dict']) 155 | self.best_prec1 = checkpoint['best_prec1'] 156 | if self.config['resume_optim']: 157 | self.optimizer.load_state_dict(checkpoint['optim_state_dict']) 158 | self.start_epoch = checkpoint['epoch'] 159 | print('Loaded checkpoint {:s} epoch {:d}'.format(checkpoint_file, 160 | checkpoint['epoch'])) 161 | 162 | self.train_loader = torch.utils.data.DataLoader(train_dataset, 163 | batch_size=self.config['batch_size'], 164 | shuffle=self.config['shuffle'], 165 | num_workers=self.config['num_workers']) 166 | if self.config['do_val']: 167 | self.val_loader = torch.utils.data.DataLoader(val_dataset, 168 | batch_size=self.config['batch_size'], shuffle=False, 169 | num_workers=self.config['num_workers']) 170 | else: 171 | self.val_loader = None 172 | 173 | def save_checkpoint(self, epoch, is_best): 174 | filename = osp.join(self.logdir, 'checkpoint.pth.tar') 175 | checkpoint_dict = \ 176 | {'epoch': epoch, 'model_state_dict': self.model.state_dict(), 177 | 'optim_state_dict': self.optimizer.state_dict(), 178 | 'best_prec1': self.best_prec1} 179 | torch.save(checkpoint_dict, filename) 180 | if is_best: 181 | shutil.copyfile(filename, osp.join(self.logdir, 'best_model.pth.tar')) 182 | 183 | def step_func(self, train): 184 | batch_time = AverageMeter() 185 | data_time = AverageMeter() 186 | losses = AverageMeter() 187 | top1 = AverageMeter() 188 | top5 = AverageMeter() 189 | 190 | if train: 191 | self.model.train() 192 | status = 'train' 193 | loader = self.train_loader 194 | else: 195 | self.model.eval() 196 | status = 'val' 197 | loader = self.val_loader 198 | 199 | end = time.time() 200 | 201 | for batch_idx, (data, target) in enumerate(loader): 202 | data_time.update(time.time() - end) 203 | 204 | kwargs = dict(target=target, loss_fn=self.loss_fn, 205 | optim=self.optimizer, train=train) 206 | loss, output = step_feedfwd(data, self.model, **kwargs) 207 | 208 | # measure accuracy and calculate loss 209 | prec1, prec5 = accuracy(output.data, target, topk=(1, 5)) 210 | losses.update(loss, data.size(0)) 211 | top1.update(prec1[0], data.size(0)) 212 | top5.update(prec5[0], data.size(0)) 213 | 214 | # measure batch time 215 | batch_time.update(time.time() - end) 216 | end = time.time() 217 | 218 | if batch_idx % self.config['print_freq'] == 0: 219 | print('{:s} {:s}: batch {:d}/{:d}, loss {:4.3f}, top-1 accuracy {:4.3f},' 220 | ' top-5 accuracy {:4.3f}'.format(status, self.config['experiment'], 221 | batch_idx, len(loader) - 1, loss, prec1[0], prec5[0])) 222 | 223 | print('{:s} {:s}: loss {:f}'.format(status, self.config['experiment'], 224 | losses.avg)) 225 | 226 | return losses.avg, top1.avg, top5.avg 227 | 228 | def train_val(self): 229 | for epoch in range(self.start_epoch, self.config['n_epochs']): 230 | print('{:s} Epoch {:d} / {:d}'.format(self.config['experiment'], epoch, 231 | self.config['n_epochs'])) 232 | 233 | # ADJUST LR 234 | self.lr_scheduler.step() 235 | lr = self.lr_scheduler.get_lr()[0] 236 | self.vis.line(X=np.asarray([epoch]), Y=np.asarray([np.log10(lr)]), 237 | 
win=self.lr_win, name='learning_rate', update='append', env=self.vis_env) 238 | 239 | # TRAIN 240 | loss, top1_prec, top5_prec = self.step_func(train=True) 241 | self.vis.line(X=np.asarray([epoch]), Y=np.asarray([loss]), 242 | win=self.loss_win, name='train_loss', update='append', env=self.vis_env) 243 | self.vis.line(X=np.asarray([epoch]), Y=np.asarray([top1_prec]), 244 | win=self.top1_win, name='train_top1_prec', update='append', env=self.vis_env) 245 | self.vis.line(X=np.asarray([epoch]), Y=np.asarray([top5_prec]), 246 | win=self.top5_win, name='train_top5_prec', update='append', env=self.vis_env) 247 | self.vis.save(envs=[self.vis_env]) 248 | 249 | # VALIDATION 250 | if self.config['do_val'] and ((epoch % self.config['val_freq'] == 0) or 251 | (epoch == self.config['n_epochs'] - 1)): 252 | loss, top1_prec, top5_prec = self.step_func(train=False) 253 | self.vis.line(X=np.asarray([epoch]), Y=np.asarray([loss]), 254 | win=self.loss_win, name='val_loss', update='append', env=self.vis_env) 255 | self.vis.line(X=np.asarray([epoch]), Y=np.asarray([top1_prec]), 256 | win=self.top1_win, name='val_top1_prec', update='append', env=self.vis_env) 257 | self.vis.line(X=np.asarray([epoch]), Y=np.asarray([top5_prec]), 258 | win=self.top5_win, name='val_top5_prec', update='append', env=self.vis_env) 259 | self.vis.save(envs=[self.vis_env]) 260 | 261 | # SAVE CHECKPOINT 262 | is_best = top1_prec > self.best_prec1 263 | self.best_prec1 = max(self.best_prec1, top1_prec) 264 | self.save_checkpoint(epoch, is_best) 265 | print('Checkpoint saved') 266 | if is_best: 267 | print('BEST TOP1 ACCURACY SO FAR') 268 | 269 | return self.best_prec1 270 | 271 | 272 | def step_feedfwd(data, model, target=None, loss_fn=None, optim=None, 273 | train=True): 274 | """ 275 | training/validation step for a feedforward NN 276 | :param data: 277 | :param target: 278 | :param model: 279 | :param loss_fn: 280 | :param optim: 281 | :param train: training / val stage 282 | :return: 283 | """ 284 | if train: 285 | assert loss_fn is not None 286 | 287 | with torch.no_grad(): 288 | data_var = Variable(data, requires_grad=train) 289 | output = model(data_var) 290 | 291 | if loss_fn is not None: 292 | with torch.no_grad(): 293 | target_var = Variable(target, requires_grad=False) 294 | loss = loss_fn(output, target_var) 295 | if train: 296 | # SGD step 297 | optim.zero_grad() 298 | loss.backward() 299 | optim.step() 300 | 301 | return loss.item(), output 302 | else: 303 | return 0, output 304 | 305 | 306 | def get_mean_std(data_path, input_size, rgb): 307 | tform = [] 308 | tform.append(transforms.Resize(size=input_size)) 309 | if not rgb: 310 | tform.append(transforms.Grayscale()) 311 | tform.append(transforms.ToTensor()) 312 | tform = transforms.Compose(tform) 313 | dset = datasets.ImageFolder(root=data_path, transform=tform) 314 | train_loader = DataLoader(dataset=dset, batch_size=50) 315 | scaler = StandardScaler(with_mean=True, with_std=True) 316 | print('Computing pixel mean and stdev...') 317 | for idx, (data, labels) in enumerate(train_loader): 318 | if idx % 20 == 0: 319 | print("Batch {:d} / {:d}".format(idx, len(train_loader))) 320 | data = data.numpy() 321 | n_channels = data.shape[1] 322 | # reshape into [n_pixels x 3] 323 | data = data.transpose((0, 2, 3, 1)).reshape((-1, n_channels)) 324 | # pass batch to incremental mean and stdev calculator 325 | scaler.partial_fit(data) 326 | print('Done, mean = ') 327 | pixel_mean = scaler.mean_ 328 | pixel_std = scaler.scale_ 329 | print(pixel_mean) 330 | print('std = ') 331 | 
print(pixel_std) 332 | return pixel_mean, pixel_std 333 | -------------------------------------------------------------------------------- /project/proj5/code/utils_gpu.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import os.path as osp 4 | import shutil 5 | import time 6 | import random 7 | import numpy as np 8 | from visdom import Visdom 9 | 10 | import torch 11 | import torch.utils.data 12 | from torch.autograd import Variable 13 | from IPython.core.debugger import set_trace 14 | 15 | import torchvision.transforms as transforms 16 | import torchvision.datasets as datasets 17 | from torch.utils.data import DataLoader 18 | from sklearn.preprocessing import StandardScaler 19 | 20 | 21 | def set_seed(seed, use_GPU=False): 22 | torch.manual_seed(seed) 23 | np.random.seed(seed) 24 | random.seed(seed) 25 | if use_GPU: 26 | torch.cuda.manual_seed(seed) 27 | torch.cuda.manual_seed_all(seed) 28 | torch.backends.cudnn.deterministic = True 29 | 30 | 31 | def print_input_size_hook(self, input, output): 32 | print('Input size to classifier is', input[0].size()) 33 | 34 | 35 | class AverageMeter(object): 36 | """Computes and stores the average and current value""" 37 | 38 | def __init__(self): 39 | self.reset() 40 | 41 | def reset(self): 42 | self.val = 0 43 | self.avg = 0 44 | self.sum = 0 45 | self.count = 0 46 | 47 | def update(self, val, n=1): 48 | self.val = val 49 | self.sum += val * n 50 | self.count += n 51 | self.avg = self.sum / self.count 52 | 53 | 54 | def accuracy(output, target, topk=(1,)): 55 | """Computes the precision@k for the specified values of k""" 56 | maxk = max(topk) 57 | batch_size = target.size(0) 58 | 59 | _, pred = output.topk(maxk, 1, True, True) 60 | pred = pred.t() 61 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 62 | 63 | res = [] 64 | for k in topk: 65 | correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) 66 | res.append(correct_k.mul_(100.0 / batch_size)) 67 | return res 68 | 69 | 70 | class Trainer(object): 71 | def __init__(self, train_dataset, val_dataset, model, loss_fn, optimizer, 72 | lr_scheduler, params): 73 | """ 74 | General purpose training script 75 | :param train_dataset: PyTorch dataset that loads training images 76 | :param val_dataset: PyTorch dataset that loads testing / validation images 77 | :param model: Network model 78 | :param optimizer: PyTorch optimizer object 79 | :param lr_scheduler: PyTorch learning rate scheduler object 80 | :param loss_fn: loss function 81 | :param params: dictionary containing parameters for the training process 82 | It can contain the following fields (fields with no default value mentioned 83 | are mandatory): 84 | n_epochs: number of epochs of training 85 | batch_size: batch size for one iteration 86 | do_val: perform validation? (default: True) 87 | shuffle: shuffle training data? (default: True) 88 | num_workers: number of CPU threads for loading data (default: 4) 89 | val_freq: frequency of validation (in number of epochs) (default: 1) 90 | print_freq: progress printing frequency (in number of iterations 91 | (default: 20) 92 | experiment: name of the experiment, used to create logs and checkpoints 93 | checkpoint_file: Name of file with saved weights. 
Loaded at before 94 | start of training if provided (default: None) 95 | resume_optim: whether to resume optimization from loaded weights 96 | (default: True) 97 | """ 98 | self.model = model 99 | self.loss_fn = loss_fn 100 | self.optimizer = optimizer 101 | self.lr_scheduler = lr_scheduler 102 | self.best_prec1 = -float('inf') 103 | 104 | # parse params with default values 105 | self.config = { 106 | 'n_epochs': params['n_epochs'], 107 | 'batch_size': params['batch_size'], 108 | 'do_val': params.get('do_val', True), 109 | 'shuffle': params.get('shuffle', True), 110 | 'num_workers': params.get('num_workers', 4), 111 | 'val_freq': params.get('val_freq', 1), 112 | 'print_freq': params.get('print_freq', 100), 113 | 'experiment': params['experiment'], 114 | 'checkpoint_file': params.get('checkpoint_file'), 115 | 'resume_optim': params.get('resume_optim', True) 116 | } 117 | 118 | self.logdir = osp.join(os.getcwd(), 'logs', self.config['experiment']) 119 | if not osp.isdir(self.logdir): 120 | os.makedirs(self.logdir) 121 | 122 | # visdom plots 123 | self.vis_env = self.config['experiment'] 124 | self.loss_win = 'loss_win' 125 | self.vis = Visdom() 126 | self.vis.line(X=np.zeros((1, 2)), Y=np.zeros((1, 2)), win=self.loss_win, 127 | opts={'legend': ['train_loss', 'val_loss'], 'xlabel': 'epochs', 128 | 'ylabel': 'loss'}, env=self.vis_env) 129 | self.lr_win = 'lr_win' 130 | self.vis.line(X=np.zeros(1), Y=np.zeros(1), win=self.lr_win, 131 | opts={'legend': ['learning_rate'], 'xlabel': 'epochs', 132 | 'ylabel': 'log(lr)'}, env=self.vis_env) 133 | self.top1_win = 'top1_win' 134 | self.vis.line(X=np.zeros((1, 2)), Y=np.zeros((1, 2)), win=self.top1_win, 135 | opts={'legend': ['train_top1_prec', 'val_top1_prec'], 'xlabel': 'epochs', 136 | 'ylabel': 'top1_prec (%)'}, env=self.vis_env) 137 | self.top5_win = 'top5_win' 138 | self.vis.line(X=np.zeros((1, 2)), Y=np.zeros((1, 2)), win=self.top5_win, 139 | opts={'legend': ['train_top5_prec', 'val_top5_prec'], 'xlabel': 'epochs', 140 | 'ylabel': 'top5_prec (%)'}, env=self.vis_env) 141 | 142 | # log all the command line options 143 | print('---------------------------------------') 144 | print('Experiment: {:s}'.format(self.config['experiment'])) 145 | for k, v in self.config.items(): 146 | print('{:s}: {:s}'.format(k, str(v))) 147 | print('---------------------------------------') 148 | 149 | self.start_epoch = int(0) 150 | checkpoint_file = self.config['checkpoint_file'] 151 | if checkpoint_file: 152 | if osp.isfile(checkpoint_file): 153 | checkpoint = torch.load(checkpoint_file) 154 | self.model.load_state_dict(checkpoint['model_state_dict']) 155 | self.best_prec1 = checkpoint['best_prec1'] 156 | if self.config['resume_optim']: 157 | self.optimizer.load_state_dict(checkpoint['optim_state_dict']) 158 | self.start_epoch = checkpoint['epoch'] 159 | print('Loaded checkpoint {:s} epoch {:d}'.format(checkpoint_file, 160 | checkpoint['epoch'])) 161 | 162 | self.train_loader = torch.utils.data.DataLoader(train_dataset, 163 | batch_size=self.config['batch_size'], 164 | shuffle=self.config['shuffle'], 165 | num_workers=self.config['num_workers']) 166 | if self.config['do_val']: 167 | self.val_loader = torch.utils.data.DataLoader(val_dataset, 168 | batch_size=self.config['batch_size'], shuffle=False, 169 | num_workers=self.config['num_workers']) 170 | else: 171 | self.val_loader = None 172 | 173 | def save_checkpoint(self, epoch, is_best): 174 | filename = osp.join(self.logdir, 'checkpoint.pth.tar') 175 | checkpoint_dict = \ 176 | {'epoch': epoch, 'model_state_dict': 
self.model.state_dict(), 177 | 'optim_state_dict': self.optimizer.state_dict(), 178 | 'best_prec1': self.best_prec1} 179 | torch.save(checkpoint_dict, filename) 180 | if is_best: 181 | shutil.copyfile(filename, osp.join(self.logdir, 'best_model.pth.tar')) 182 | 183 | def step_func(self, train): 184 | batch_time = AverageMeter() 185 | data_time = AverageMeter() 186 | losses = AverageMeter() 187 | top1 = AverageMeter() 188 | top5 = AverageMeter() 189 | 190 | if train: 191 | self.model.train() 192 | status = 'train' 193 | loader = self.train_loader 194 | else: 195 | self.model.eval() 196 | status = 'val' 197 | loader = self.val_loader 198 | 199 | end = time.time() 200 | 201 | for batch_idx, (data, target) in enumerate(loader): 202 | target = target.cuda() 203 | data_time.update(time.time() - end) 204 | 205 | kwargs = dict(target=target, loss_fn=self.loss_fn, 206 | optim=self.optimizer, train=train) 207 | loss, output = step_feedfwd(data, self.model, **kwargs) 208 | 209 | # measure accuracy and calculate loss 210 | prec1, prec5 = accuracy(output.data, target, topk=(1, 5)) 211 | losses.update(loss, data.size(0)) 212 | top1.update(prec1[0], data.size(0)) 213 | top5.update(prec5[0], data.size(0)) 214 | 215 | # measure batch time 216 | batch_time.update(time.time() - end) 217 | end = time.time() 218 | 219 | if batch_idx % self.config['print_freq'] == 0: 220 | print('{:s} {:s}: batch {:d}/{:d}, loss {:4.3f}, top-1 accuracy {:4.3f},' 221 | ' top-5 accuracy {:4.3f}'.format(status, self.config['experiment'], 222 | batch_idx, len(loader) - 1, loss, prec1[0], prec5[0])) 223 | 224 | print('{:s} {:s}: loss {:f}'.format(status, self.config['experiment'], 225 | losses.avg)) 226 | 227 | return losses.avg, top1.avg, top5.avg 228 | 229 | def train_val(self): 230 | for epoch in range(self.start_epoch, self.config['n_epochs']): 231 | print('{:s} Epoch {:d} / {:d}'.format(self.config['experiment'], epoch, 232 | self.config['n_epochs'])) 233 | # TRAIN 234 | loss, top1_prec, top5_prec = self.step_func(train=True) 235 | # self.vis.line(X=np.asarray([epoch]), Y=np.asarray([loss]), 236 | # win=self.loss_win, name='train_loss', update='append', env=self.vis_env) 237 | # self.vis.line(X=np.asarray([epoch]), Y=np.asarray([top1_prec]), 238 | # win=self.top1_win, name='train_top1_prec', update='append', env=self.vis_env) 239 | # self.vis.line(X=np.asarray([epoch]), Y=np.asarray([top5_prec]), 240 | # win=self.top5_win, name='train_top5_prec', update='append', env=self.vis_env) 241 | # self.vis.save(envs=[self.vis_env]) 242 | 243 | # VALIDATION 244 | if self.config['do_val'] and ((epoch % self.config['val_freq'] == 0) or 245 | (epoch == self.config['n_epochs'] - 1)): 246 | loss, top1_prec, top5_prec = self.step_func(train=False) 247 | # self.vis.line(X=np.asarray([epoch]), Y=np.asarray([loss]), 248 | # win=self.loss_win, name='val_loss', update='append', env=self.vis_env) 249 | # self.vis.line(X=np.asarray([epoch]), Y=np.asarray([top1_prec]), 250 | # win=self.top1_win, name='val_top1_prec', update='append', env=self.vis_env) 251 | # self.vis.line(X=np.asarray([epoch]), Y=np.asarray([top5_prec]), 252 | # win=self.top5_win, name='val_top5_prec', update='append', env=self.vis_env) 253 | # self.vis.save(envs=[self.vis_env]) 254 | 255 | # ADJUST LR 256 | self.lr_scheduler.step() 257 | lr = self.lr_scheduler.get_lr()[0] 258 | self.vis.line(X=np.asarray([epoch]), Y=np.asarray([np.log10(lr)]), 259 | win=self.lr_win, name='learning_rate', update='append', env=self.vis_env) 260 | 261 | # SAVE CHECKPOINT 262 | is_best = top1_prec > 
self.best_prec1 263 | self.best_prec1 = max(self.best_prec1, top1_prec) 264 | self.save_checkpoint(epoch, is_best) 265 | print('Checkpoint saved') 266 | if is_best: 267 | print('BEST TOP1 ACCURACY SO FAR') 268 | 269 | return self.best_prec1 270 | 271 | 272 | def step_feedfwd(data, model, target=None, loss_fn=None, optim=None, 273 | train=True): 274 | """ 275 | training/validation step for a feedforward NN 276 | :param data: 277 | :param target: 278 | :param model: 279 | :param loss_fn: 280 | :param optim: 281 | :param train: training / val stage 282 | :return: 283 | """ 284 | if train: 285 | assert loss_fn is not None 286 | 287 | with torch.no_grad(): 288 | data_var = Variable(data, requires_grad=train).cuda() 289 | output = model(data_var) 290 | 291 | if loss_fn is not None: 292 | with torch.no_grad(): 293 | target_var = Variable(target, requires_grad=False).cuda() 294 | loss = loss_fn(output, target_var) 295 | if train: 296 | # SGD step 297 | optim.zero_grad() 298 | loss.backward() 299 | optim.step() 300 | 301 | return loss.item(), output 302 | else: 303 | return 0, output 304 | 305 | 306 | def get_mean_std(data_path, input_size, rgb): 307 | tform = [] 308 | tform.append(transforms.Resize(size=input_size)) 309 | if not rgb: 310 | tform.append(transforms.Grayscale()) 311 | tform.append(transforms.ToTensor()) 312 | tform = transforms.Compose(tform) 313 | dset = datasets.ImageFolder(root=data_path, transform=tform) 314 | train_loader = DataLoader(dataset=dset, batch_size=50) 315 | scaler = StandardScaler(with_mean=True, with_std=True) 316 | print('Computing pixel mean and stdev...') 317 | for idx, (data, labels) in enumerate(train_loader): 318 | if idx % 20 == 0: 319 | print("Batch {:d} / {:d}".format(idx, len(train_loader))) 320 | data = data.numpy() 321 | n_channels = data.shape[1] 322 | # reshape into [n_pixels x 3] 323 | data = data.transpose((0, 2, 3, 1)).reshape((-1, n_channels)) 324 | # pass batch to incremental mean and stdev calculator 325 | scaler.partial_fit(data) 326 | print('Done, mean = ') 327 | pixel_mean = scaler.mean_ 328 | pixel_std = scaler.scale_ 329 | print(pixel_mean) 330 | print('std = ') 331 | print(pixel_std) 332 | return pixel_mean, pixel_std 333 | --------------------------------------------------------------------------------
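
The notebook cell at the top of this section builds a `Trainer` from the datasets, the model, a loss function, an optimizer and an LR scheduler, then calls `train_val()` to get the best top-1 accuracy. For reference, below is a minimal sketch of how the pieces defined in `student_code.py` and `utils.py` could be wired together for that part-1 run. The data path, class count, loss, optimizer, scheduler and the values inside `params` are illustrative assumptions, not the settings used by the assignment notebook; the sketch also assumes it is executed from `project/proj5/code/` with a Visdom server already running, since `utils.Trainer` plots to Visdom in its constructor.

```python
# Minimal wiring sketch (assumed hyperparameters; run from project/proj5/code/).
import torch.nn as nn
import torch.optim as optim

import utils                                   # Trainer, set_seed (CPU version above)
from student_code import create_datasets, SimpleNet

utils.set_seed(0)                              # seed value is arbitrary here

input_size = (64, 64)                          # matches the 64x64x1 input SimpleNet expects
data_path = '../data'                          # assumed location of the 'train'/'test' folders
num_classes = 15                               # assumed number of scene categories

# create_datasets returns two ImageFolder datasets (not DataLoaders);
# utils.Trainer wraps them in DataLoaders itself.
train_dataset, test_dataset = create_datasets(data_path, input_size, rgb=False)

model = SimpleNet(num_classes=num_classes, rgb=False)
loss_function = nn.CrossEntropyLoss()                         # assumed loss
optimizer = optim.SGD(model.parameters(), lr=1e-2,            # assumed optimizer
                      momentum=0.9, weight_decay=5e-4)
lr_scheduler = optim.lr_scheduler.StepLR(optimizer,           # assumed schedule
                                         step_size=30, gamma=0.1)

params = {
    'n_epochs': 50,                            # assumed; 'n_epochs', 'batch_size' and
    'batch_size': 64,                          # 'experiment' are the mandatory Trainer fields
    'experiment': 'part1_simplenet',
}

trainer = utils.Trainer(train_dataset, test_dataset, model, loss_function,
                        optimizer, lr_scheduler, params)
best_prec1 = trainer.train_val()
print('Best top-1 Accuracy = {:4.3f}'.format(best_prec1))
```

For the GPU variant, the same wiring applies with `utils_gpu.Trainer`, `utils_gpu.set_seed(seed, use_GPU=True)` and the model moved to CUDA before training; `utils_gpu.py` handles moving the batches itself.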