├── .gitignore ├── LICENSE ├── README.rst ├── bin ├── extract-figures.py └── locate-thumbnail.py ├── image_mining ├── __init__.py ├── figure_extraction.py └── utils.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Released into the public domain 2 | 3 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Experimental image mining using OpenCV 2 | ====================================== 3 | 4 | Attempting to build tools to mine interesting data from large collections of scanned images 5 | 6 | Current 7 | ------- 8 | 9 | * bin/locate-thumbnail: 10 | - `Reconstructing thumbnails using OpenCV: `_ 11 | - `Upgrading Image Thumbnails… Or How to Fill a Large Display Without Your Content Team Quitting `_ 12 | * bin/extract-figures: 13 | - `locate interesting non-text elements (images, figures, tables, etc.) on scanned book pages `_ 14 | 15 | Prerequisites 16 | ------------- 17 | 18 | * Python 2.6+ 19 | * OpenCV 2.4+ 20 | * numpy 21 | 22 | Using Mac Homebrew this should install cleanly:: 23 | 24 | brew install python numpy opencv 25 | 26 | On Ubuntu 12.04 Precise the following is known to work - note the need for a PPA to get OpenCV 2.4+:: 27 | 28 | sudo add-apt-repository ppa:alexei.colin/opencv 29 | sudo apt-get update 30 | sudo apt-get install python-numpy python-opencv 31 | 32 | Discussion 33 | ---------- 34 | 35 | .. image:: https://badges.gitter.im/Join%20Chat.svg 36 | :target: https://gitter.im/acdha/image-mining?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge 37 | :alt: Join Chat on Gitter.im 38 | -------------------------------------------------------------------------------- /bin/extract-figures.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import json 5 | import os 6 | import sys 7 | 8 | import cv2 9 | import numpy 10 | 11 | from image_mining.figure_extraction import FigureExtractor 12 | from image_mining.utils import open_image 13 | 14 | 15 | def display_images(extractor, files): 16 | window_name = "Controls" 17 | 18 | images = [] 19 | for f in files: 20 | print "Loading %s" % f 21 | 22 | try: 23 | images.append(open_image(f)) 24 | except StandardError as exc: 25 | print >>sys.stderr, exc 26 | continue 27 | 28 | def update_display(*args): 29 | extractor.canny_threshold = cv2.getTrackbarPos("Canny Threshold", window_name) 30 | extractor.erosion_element = cv2.getTrackbarPos("Erosion Element", window_name) 31 | extractor.erosion_size = cv2.getTrackbarPos("Erosion Size", window_name) 32 | extractor.dilation_element = cv2.getTrackbarPos("Dilation Element", window_name) 33 | extractor.dilation_size = cv2.getTrackbarPos("Dilation Size", window_name) 34 | 35 | # TODO: tame configuration hideousness: 36 | labels = ["Canny Threshold: %s" % extractor.canny_threshold, 37 | "Erosion Element: %s" % FigureExtractor.MORPH_TYPE_KEYS[extractor.erosion_element], 38 | "Erosion Size: %s" % extractor.erosion_size, 39 | "Dilation Element: %s" % FigureExtractor.MORPH_TYPE_KEYS[extractor.dilation_element], 40 | "Dilation Size: %s" % extractor.dilation_size] 41 | 42 | labels_img = numpy.zeros((30 * (len(labels) + 1), 600, 3), 
numpy.uint8) 43 | for i, label in enumerate(labels, 1): 44 | cv2.putText(labels_img, label, (0, i * 30), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (192, 192, 192)) 45 | cv2.imshow("Controls", labels_img) 46 | 47 | print "Settings:\n\t", "\n\t".join(labels) 48 | print 49 | 50 | for name, image in images: 51 | filtered_image = extractor.filter_image(image) 52 | contours, hierarchy = extractor.find_contours(filtered_image) 53 | 54 | # The filtered image will be heavily processed down to 1-bit depth. We'll convert it to RGB 55 | # so we can display the effects of the filters with full-color overlays for detected figures: 56 | output = cv2.cvtColor(filtered_image, cv2.COLOR_GRAY2RGB) 57 | 58 | print "Processing %s" % name 59 | 60 | for bbox in extractor.get_bounding_boxes_from_contours(contours, filtered_image): 61 | print "\tExtract: %s" % bbox 62 | output[bbox.image_slice] = image[bbox.image_slice] 63 | 64 | cv2.polylines(output, bbox.poly, True, (32, 192, 32), thickness=3) 65 | cv2.drawContours(output, contours, bbox.contour_index, (32, 192, 32), hierarchy=hierarchy, maxLevel=0) 66 | 67 | cv2.rectangle(output, (bbox.x1, bbox.y1), (bbox.x2, bbox.y2), color=(32, 192, 192)) 68 | 69 | cv2.imshow(name, output) 70 | 71 | cv2.namedWindow(window_name) 72 | cv2.resizeWindow(window_name, 600, 340) 73 | 74 | cv2.createTrackbar("Canny Threshold", window_name, extractor.canny_threshold, 255, update_display) 75 | cv2.createTrackbar("Erosion Element", window_name, extractor.erosion_element, len(extractor.MORPH_TYPES) - 1, update_display) 76 | cv2.createTrackbar("Erosion Size", window_name, extractor.erosion_size, 64, update_display) 77 | cv2.createTrackbar("Dilation Element", window_name, extractor.dilation_element, len(extractor.MORPH_TYPES) - 1, update_display) 78 | cv2.createTrackbar("Dilation Size", window_name, extractor.dilation_size, 64, update_display) 79 | 80 | update_display() 81 | 82 | if args.interactive: 83 | while cv2.waitKey() not in (13, 27): 84 | continue 85 | cv2.destroyAllWindows() 86 | 87 | 88 | if __name__ == "__main__": 89 | parser = argparse.ArgumentParser() 90 | 91 | parser.add_argument('--debug', action="store_true", help="Open debugger for errors") 92 | 93 | parser.add_argument('files', metavar="IMAGE_FILE", nargs="+") 94 | 95 | mode_group = parser.add_mutually_exclusive_group(required=True) 96 | mode_group.add_argument('--interactive', default=False, action="store_true", help="Display visualization windows") 97 | mode_group.add_argument('--output-directory', default=None, help="Directory to store extracted files") 98 | 99 | parser.add_argument('--save-json', action="store_true", help="Save bounding boxes as JSON files along with extracts") 100 | 101 | extraction_params = parser.add_argument_group("Extraction Parameters") 102 | extraction_params.add_argument('--canny-threshold', type=int, default=0, help="Canny edge detection threshold (%(type)s, default=%(default)s, 0 to disable)") 103 | 104 | extraction_params.add_argument('--erosion-element', default="rectangle", choices=FigureExtractor.MORPH_TYPE_KEYS, help="Erosion Element (default: %(default)s)") 105 | extraction_params.add_argument('--erosion-size', type=int, default=0, help="Erosion Size (%(type)s, default=%(default)s, 0 to disable)") 106 | 107 | extraction_params.add_argument('--dilation-element', default="rectangle", choices=FigureExtractor.MORPH_TYPE_KEYS, help="Dilation Element (default: %(default)s)") 108 | extraction_params.add_argument('--dilation-size', type=int, default=0, help="Dilation Size (%(type)s, default=%(default)s, 
0 to disable)") 109 | 110 | args = parser.parse_args() 111 | 112 | if not args.output_directory: 113 | output_dir = None 114 | else: 115 | output_dir = os.path.realpath(args.output_directory) 116 | if not os.path.isdir(output_dir): 117 | parser.error("Output directory %s does not exist" % args.output_directory) 118 | else: 119 | print "Output will be saved to %s" % output_dir 120 | 121 | if output_dir is None and not args.interactive: 122 | parser.error("Either use --interactive or specify an output directory to save results!") 123 | 124 | if args.debug: 125 | try: 126 | import bpdb as pdb 127 | except ImportError: 128 | import pdb 129 | 130 | # FIXME: we should have a way to enumerate this from FigureExtractor and feed argparse that way: 131 | param_names = [action.dest for action in extraction_params._group_actions] 132 | params = {k: v for (k, v) in args._get_kwargs() if k in param_names} 133 | 134 | try: 135 | extractor = FigureExtractor(**params) 136 | 137 | if args.interactive: 138 | display_images(extractor, args.files) 139 | else: 140 | for f in args.files: 141 | try: 142 | base_name, source_image = open_image(f) 143 | except StandardError as exc: 144 | print >>sys.stderr, exc 145 | continue 146 | 147 | output_base = os.path.join(output_dir, base_name) 148 | 149 | print "Processing %s" % f 150 | 151 | boxes = [] 152 | 153 | for i, bbox in enumerate(extractor.find_figures(source_image), 1): 154 | extracted = source_image[bbox.image_slice] 155 | extract_filename = os.path.join(output_dir, "%s-%d.jpg" % (output_base, i)) 156 | print "\tSaving %s" % extract_filename 157 | cv2.imwrite(extract_filename, extracted) 158 | 159 | boxes.append(bbox.as_dict()) 160 | 161 | if args.save_json and boxes: 162 | json_data = {"source_image": {"filename": f, 163 | "dimensions": {"width": source_image.shape[1], 164 | "height": source_image.shape[0]}}, 165 | "regions": boxes} 166 | 167 | json_filename = os.path.join(output_dir, "%s.json" % output_base) 168 | with open(json_filename, "wb") as json_f: 169 | json.dump(json_data, json_f, allow_nan=False) 170 | print "\tSaved extract information to %s" % json_filename 171 | 172 | except Exception as exc: 173 | if args.debug: 174 | print >>sys.stderr, exc 175 | pdb.pm() 176 | raise 177 | -------------------------------------------------------------------------------- /bin/locate-thumbnail.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | Detect the crop box for a thumbnail inside a larger image 5 | 6 | The thumbnail image can be cropped and scaled arbitrarily from the larger image. Rotation and other more 7 | complex transformations should work but may lower accuracy. 
8 | """ 9 | from __future__ import (absolute_import, division, print_function, 10 | unicode_literals) 11 | 12 | import argparse 13 | import json 14 | import logging 15 | import os 16 | import sys 17 | 18 | import cv 19 | import cv2 20 | import numpy 21 | from image_mining.utils import open_image 22 | 23 | 24 | def match_images(template, source): 25 | """Return filtered matches from the template and source images""" 26 | 27 | # TODO: Compare non-encumbered options – see http://docs.opencv.org/modules/features2d/doc/features2d.html 28 | detector = cv2.SURF(400, 5, 5) 29 | matcher = cv2.BFMatcher(cv2.NORM_L2) 30 | 31 | kp1, desc1 = detector.detectAndCompute(template, None) 32 | kp2, desc2 = detector.detectAndCompute(source, None) 33 | logging.debug('Features: template %d, source %d', len(kp1), len(kp2)) 34 | 35 | raw_matches = matcher.knnMatch(desc1, trainDescriptors=desc2, k=2) 36 | kp_pairs = filter_matches(kp1, kp2, raw_matches) 37 | 38 | return kp_pairs 39 | 40 | 41 | def filter_matches(kp1, kp2, matches, ratio=0.75): 42 | kp_pairs = [] 43 | 44 | for m1, m2 in matches: 45 | if m1.distance < m2.distance * ratio: 46 | kp_pairs.append((kp1[m1.queryIdx], kp2[m1.trainIdx])) 47 | 48 | return kp_pairs 49 | 50 | 51 | def autorotate_image(img, corners): 52 | corners_x, corners_y = zip(*corners) 53 | 54 | # n.b. numpy rot90 rotates 90° counter-clockwise but our terminology is clockwise 55 | # so the rotations below aren't actually flippy: 56 | 57 | print(corners_x, corners_y) 58 | 59 | if (((min(corners_x[0], corners_x[1]) > max(corners_x[2], corners_x[3])) 60 | and min(corners_y[1], corners_y[2]) > max(corners_y[0], corners_y[3]))): 61 | return 270, numpy.rot90(img) 62 | elif min(corners_x[2], corners_x[3]) > max(corners_x[0], corners_x[1]): 63 | return 90, numpy.rot90(img, 3) 64 | elif min(corners_x[0], corners_x[3]) > max(corners_x[1], corners_x[2]): 65 | return 180, cv2.flip(img, -1) 66 | else: 67 | return 0, img 68 | 69 | 70 | def fit_image_within(img, max_height, max_width): 71 | current_h, current_w = img.shape[:2] 72 | 73 | # Confirm that we need to do anything: 74 | if current_h <= max_height and current_w <= max_width: 75 | return img 76 | 77 | if current_h > current_w: 78 | scale = max_height / current_h 79 | else: 80 | scale = max_width / current_w 81 | 82 | new_dims = (int(round(current_w * scale)), int(round(current_h * scale))) 83 | 84 | # Note the flip from numpy's .shape to opencv's (x, y) format: 85 | logging.info('Resizing from %s to %s', (current_w, current_h), new_dims) 86 | 87 | return cv2.resize(img, new_dims, interpolation=cv2.INTER_AREA) 88 | 89 | 90 | def get_scaled_corners(thumbnail_image, source_image, full_source_image, kp_pairs, H): 91 | thumb_h, thumb_w = thumbnail_image.shape[:2] 92 | 93 | corners = numpy.float32([[0, 0], [thumb_w, 0], [thumb_w, thumb_h], [0, thumb_h]]) 94 | corners = numpy.int32(cv2.perspectiveTransform(corners.reshape(1, -1, 2), H).reshape(-1, 2)) 95 | 96 | # It's possible for rounding errors to produce values which are slightly outside of the image dimensions 97 | # so we'll clamp the boundaries within the source image: https://github.com/acdha/image-mining/issues/5 98 | source_h, source_w = source_image.shape[:2] 99 | 100 | # Transpose the array so we can operate on it *in-place* to clamp values: 101 | corners_x, corners_y = corners.T 102 | numpy.clip(corners_x, 0.0, source_w, out=corners_x) 103 | numpy.clip(corners_y, 0.0, source_h, out=corners_y) 104 | 105 | corners = corners.tolist() 106 | 107 | logging.info("Thumbnail bounds within analyzed 
image: %s", corners) 108 | 109 | if full_source_image is not None and full_source_image is not source_image: 110 | scale_y = full_source_image.shape[0] / source_image.shape[0] 111 | scale_x = full_source_image.shape[1] / source_image.shape[1] 112 | 113 | corners = [(int(round(x * scale_x)), int(round(y * scale_y))) for x, y in corners] 114 | 115 | logging.info("Thumbnail bounds within full-size source image: %s", corners) 116 | 117 | return corners 118 | 119 | 120 | def adjust_crop_aspect_ratio(cropbox, target_aspect_ratio, original_height=0, original_width=0, 121 | max_height=0, max_width=0): 122 | 123 | new_crop_y, new_crop_x = cropbox 124 | new_crop_height = (new_crop_y[1] - new_crop_y[0]) 125 | new_crop_width = (new_crop_x[1] - new_crop_x[0]) 126 | new_aspect_ratio = new_crop_height / new_crop_width 127 | 128 | if abs(target_aspect_ratio - new_aspect_ratio) < 0.001: 129 | return cropbox 130 | 131 | logging.info('Adjusting reconstruction to match original %0.4f aspect ratio', target_aspect_ratio) 132 | 133 | assert original_height < new_crop_height 134 | assert original_width < new_crop_width 135 | 136 | # The basic idea is that we'll adjust the crop's short axis up or down to match the input aspect 137 | # ratio. To avoid shifting the crop too much we'll attempt to evenly move both sides as long as 138 | # that won't hit the image boundaries: 139 | 140 | if new_aspect_ratio > 1.0: 141 | scale = new_crop_width / original_width 142 | else: 143 | scale = new_crop_height / original_height 144 | 145 | logging.info('Original crop box: %r (%0.4f)', cropbox, new_crop_height / new_crop_width) 146 | logging.info('Reconstructed image is %0.2f%% of the original', scale * 100) 147 | 148 | delta_y = round(original_height * scale) - new_crop_height 149 | delta_x = round(original_width * scale) - new_crop_width 150 | 151 | logging.info('Crop box needs to change by: %0.1f x, %0.1f y', delta_x, delta_y) 152 | 153 | if delta_y != 0: 154 | new_crop_y = clamp_values(delta=delta_y, max_value=max_height, *cropbox[0]) 155 | 156 | if delta_x != 0: 157 | new_crop_x = clamp_values(delta=delta_x, max_value=max_width, *cropbox[1]) 158 | 159 | cropbox = (new_crop_y, new_crop_x) 160 | 161 | logging.info('Updated crop box: %r (%0.4f)', cropbox, 162 | (new_crop_y[1] - new_crop_y[0]) / (new_crop_x[1] - new_crop_x[0])) 163 | 164 | return cropbox 165 | 166 | 167 | def clamp_values(low_value, high_value, delta, min_value=0, max_value=0): 168 | if delta == 0.0: 169 | return low_value, high_value 170 | 171 | top_pad = bottom_pad = delta / 2 172 | 173 | if delta > 0: 174 | # We'll shift the box to avoid hitting an image edge: 175 | top_pad = max(0, top_pad) 176 | bottom_pad = delta - top_pad 177 | 178 | low_value = int(round(low_value - top_pad)) 179 | 180 | if low_value < min_value: 181 | logging.warning('Clamping crop to %f instead of %f', min_value, low_value) 182 | bottom_pad += min_value - low_value 183 | low_value = min_value 184 | 185 | high_value = int(round(high_value + bottom_pad)) 186 | 187 | if high_value > max_value: 188 | logging.warning('Clamping crop to %f instead of %f', max_value, high_value) 189 | high_value = max_value 190 | 191 | return low_value, high_value 192 | 193 | 194 | def reconstruct_thumbnail(thumbnail_image, source_image, corners, downsize_reconstruction=False, 195 | max_aspect_ratio_delta=0.1, match_aspect_ratio=False): 196 | logging.info("Reconstructing thumbnail from source image") 197 | 198 | thumb_h, thumb_w = thumbnail_image.shape[:2] 199 | source_h, source_w = source_image.shape[:2] 200 
| 201 | old_aspect_ratio = thumb_h / thumb_w 202 | 203 | corners_x, corners_y = zip(*corners) 204 | new_thumb_crop = [(min(corners_y), max(corners_y)), 205 | (min(corners_x), max(corners_x))] 206 | 207 | if match_aspect_ratio: 208 | new_thumb_crop = adjust_crop_aspect_ratio(new_thumb_crop, old_aspect_ratio, 209 | original_height=thumb_h, 210 | original_width=thumb_w, 211 | max_height=source_h, max_width=source_w) 212 | 213 | new_thumb = source_image[slice(*new_thumb_crop[0]), slice(*new_thumb_crop[1])] 214 | 215 | new_thumb_rotation, new_thumb = autorotate_image(new_thumb, corners) 216 | logging.info('Detected image rotation: %d°', new_thumb_rotation) 217 | 218 | if match_aspect_ratio and new_thumb_rotation not in (0, 180): 219 | raise NotImplementedError('FIXME: refactor autorotation to work with aspect ratio matching!') 220 | 221 | new_thumb_h, new_thumb_w = new_thumb.shape[:2] 222 | 223 | if downsize_reconstruction and (new_thumb_h > thumb_h or new_thumb_w > thumb_w): 224 | new_thumb = fit_image_within(new_thumb, thumb_h, thumb_w) 225 | 226 | new_aspect_ratio = new_thumb.shape[0] / new_thumb.shape[1] 227 | logging.info('Master dimensions: width=%s, height=%s', source_image.shape[1], source_image.shape[0]) 228 | logging.info('Thumbnail dimensions: width=%s, height=%s (aspect ratio: %0.4f)', 229 | thumbnail_image.shape[1], thumbnail_image.shape[0], 230 | old_aspect_ratio) 231 | logging.info('Reconstructed thumb dimensions: width=%s, height=%s (rotation=%d°, aspect ratio: %0.4f)', 232 | new_thumb.shape[1], new_thumb.shape[0], 233 | new_thumb_rotation, new_aspect_ratio) 234 | 235 | if match_aspect_ratio: 236 | scale = thumbnail_image.shape[0] / new_thumb.shape[0] 237 | if thumbnail_image.shape[:2] != tuple(int(round(i * scale)) for i in new_thumb.shape[:2]): 238 | raise RuntimeError('Unable to match aspect ratios: %0.4f != %0.4f' % (old_aspect_ratio, 239 | new_aspect_ratio)) 240 | 241 | if abs(old_aspect_ratio - new_aspect_ratio) > max_aspect_ratio_delta: 242 | raise RuntimeError('Aspect ratios are significantly different – reconstruction likely failed!') 243 | 244 | if (new_thumb_h <= thumb_h) or (new_thumb_w <= thumb_w): 245 | raise RuntimeError("Reconstructed thumbnail wasn't larger than the original!") 246 | 247 | return new_thumb, new_thumb_crop, new_thumb_rotation 248 | 249 | 250 | def visualize_matches(source_image, original_thumbnail, reconstructed_thumbnail, corners, kp_pairs, mask): 251 | thumb_h, thumb_w = original_thumbnail.shape[:2] 252 | source_h, source_w = source_image.shape[:2] 253 | 254 | # Create a new image for the visualization: 255 | vis = numpy.zeros((max(thumb_h, source_h), thumb_w + source_w, source_image.shape[2]), numpy.uint8) 256 | # Draw the original images adjacent to each other: 257 | vis[:thumb_h, :thumb_w] = original_thumbnail 258 | vis[:source_h, thumb_w:thumb_w+source_w] = source_image 259 | 260 | if reconstructed_thumbnail is not None: 261 | # Display the reconstructed thumbnail just below the original thumbnail: 262 | reconstructed_thumbnail = fit_image_within(reconstructed_thumbnail, thumb_h, thumb_w) 263 | reconstructed_h, reconstructed_w = reconstructed_thumbnail.shape[:2] 264 | vis[thumb_h:thumb_h + reconstructed_h, :reconstructed_w] = reconstructed_thumbnail 265 | 266 | if corners is not None: 267 | # Highlight our bounding box on the source image: 268 | cv2.polylines(vis, [numpy.int32(corners) + (thumb_w, 0)], True, (255, 255, 255)) 269 | 270 | thumb_points = numpy.int32([kpp[0].pt for kpp in kp_pairs]) 271 | source_points = numpy.int32([kpp[1].pt 
for kpp in kp_pairs]) + (thumb_w, 0) 272 | 273 | # Points which fit the model will be marked in green: 274 | inlier_color = (0, 255, 0) 275 | # … while those which do not will be marked in red: 276 | outlier_color = (0, 0, 255) 277 | # Connecting lines will be less intense green: 278 | line_color = (0, 192, 0) 279 | 280 | if mask is None: 281 | mask = numpy.zeros(len(thumb_points)) 282 | 283 | for (x1, y1), (x2, y2), inlier in zip(thumb_points, source_points, mask): 284 | if inlier: 285 | cv2.line(vis, (x1, y1), (x2, y2), line_color) 286 | cv2.circle(vis, (x1, y1), 2, inlier_color, -1) 287 | cv2.circle(vis, (x2, y2), 2, inlier_color, -1) 288 | else: 289 | cv2.circle(vis, (x1, y1), 2, outlier_color, -1) 290 | cv2.circle(vis, (x2, y2), 2, outlier_color, -1) 291 | 292 | return vis 293 | 294 | 295 | def find_homography(kp_pairs): 296 | mkp1, mkp2 = zip(*kp_pairs) 297 | 298 | p1 = numpy.float32([kp.pt for kp in mkp1]) 299 | p2 = numpy.float32([kp.pt for kp in mkp2]) 300 | 301 | assert len(kp_pairs) >= 4 302 | 303 | logging.debug('finding homography') 304 | H, mask = cv2.findHomography(p1, p2, cv2.RANSAC, 5.0) 305 | logging.info('%d inliers, %d matched features', numpy.sum(mask), len(mask)) 306 | return H, mask 307 | 308 | 309 | def locate_thumbnail(thumbnail_filename, source_filename, display=False, save_visualization=False, 310 | save_reconstruction=False, reconstruction_format="jpg", 311 | max_aspect_ratio_delta=0.1, match_aspect_ratio=False, 312 | minimum_matches=10, 313 | json_output_filename=None, max_master_edge=4096, max_output_edge=2048): 314 | thumbnail_basename, thumbnail_image = open_image(thumbnail_filename) 315 | source_basename, source_image = open_image(source_filename) 316 | 317 | if (((source_image.shape[0] <= thumbnail_image.shape[0]) 318 | or (source_image.shape[1] <= thumbnail_image.shape[1]))): 319 | raise RuntimeError("Master file wasn't larger than the thumbnail: %r vs %r" % (source_image.shape, 320 | thumbnail_image.shape)) 321 | 322 | logging.info("Attempting to locate %s within %s", thumbnail_filename, source_filename) 323 | 324 | full_source_image = source_image 325 | if max_master_edge and any(i for i in source_image.shape if i > max_master_edge): 326 | logging.info("Resizing master to fit within %d pixels", max_master_edge) 327 | source_image = fit_image_within(source_image, max_master_edge, max_master_edge) 328 | 329 | logging.info('Finding common features') 330 | kp_pairs = match_images(thumbnail_image, source_image) 331 | 332 | if len(kp_pairs) >= minimum_matches: 333 | title = "Found %d matches" % len(kp_pairs) 334 | logging.info(title) 335 | 336 | H, mask = find_homography(kp_pairs) 337 | 338 | corners = get_scaled_corners(thumbnail_image, source_image, full_source_image, kp_pairs, H) 339 | 340 | new_thumbnail, corners, rotation = reconstruct_thumbnail(thumbnail_image, full_source_image, corners, 341 | match_aspect_ratio=match_aspect_ratio, 342 | max_aspect_ratio_delta=max_aspect_ratio_delta) 343 | 344 | if json_output_filename: 345 | with open(json_output_filename, mode='wb') as json_file: 346 | json.dump({ 347 | "master": { 348 | "source": source_filename, 349 | "dimensions": { 350 | "height": full_source_image.shape[0], 351 | "width": full_source_image.shape[1], 352 | } 353 | }, 354 | "thumbnail": { 355 | "source": thumbnail_filename, 356 | "dimensions": { 357 | "height": thumbnail_image.shape[0], 358 | "width": thumbnail_image.shape[1], 359 | } 360 | }, 361 | "bounding_box": { 362 | "height": corners[0][1] - corners[0][0], 363 | "width": corners[1][1] - 
corners[1][0], 364 | "x": corners[1][0], 365 | "y": corners[0][0], 366 | }, 367 | "rotation_degrees": rotation 368 | }, json_file, indent=4) 369 | 370 | if save_reconstruction: 371 | new_filename = "%s.reconstructed.%s" % (thumbnail_basename, reconstruction_format) 372 | 373 | new_thumb_img = fit_image_within(new_thumbnail, max_output_edge, max_output_edge) 374 | cv2.imwrite(new_filename, new_thumb_img) 375 | logging.info("Saved reconstructed %s thumbnail %s", new_thumb_img.shape[:2], new_filename) 376 | else: 377 | logging.warning("Found only %d matches; skipping reconstruction", len(kp_pairs)) 378 | title = "MATCH FAILED: %d pairs" % len(kp_pairs) 379 | new_thumbnail = corners = H = mask = None 380 | 381 | if display or save_visualization: 382 | vis_image = visualize_matches(source_image, thumbnail_image, new_thumbnail, corners, kp_pairs, mask) 383 | 384 | if save_visualization: 385 | vis_filename = "%s.visualized%s" % os.path.splitext(thumbnail_filename) 386 | cv2.imwrite(vis_filename, vis_image) 387 | logging.info("Saved match visualization %s", vis_filename) 388 | 389 | if display: 390 | # This may or may not exist depending on whether OpenCV was compiled using the QT backend: 391 | window_flags = getattr(cv, 'CV_WINDOW_NORMAL', cv.CV_WINDOW_AUTOSIZE) 392 | window_title = '%s - %s' % (thumbnail_basename, title) 393 | cv2.namedWindow(window_title, flags=window_flags) 394 | cv2.imshow(window_title, vis_image) 395 | cv2.waitKey() 396 | cv2.destroyAllWindows() 397 | 398 | 399 | def main(): 400 | logging.basicConfig(level=logging.INFO, format='%(levelname)s %(funcName)s: %(message)s') 401 | 402 | parser = argparse.ArgumentParser() 403 | parser.add_argument('files', metavar="THUMBNAIL MASTER", nargs="+") 404 | parser.add_argument('--save-visualization', action="store_true", help="Save match visualization") 405 | parser.add_argument('--save-thumbnail', action="store_true", 406 | help="Save reconstructed thumbnail at full size") 407 | parser.add_argument('--save-json', action="store_true", 408 | help="Save JSON file with thumbnail crop information") 409 | parser.add_argument('--thumbnail-format', default='jpg', 410 | help='Format for reconstructed thumbnails (jpg or png; default %(default)s)') 411 | parser.add_argument('--fit-master-within', type=int, default=8192, 412 | help="Resize master so the largest edge is below the specified value " 413 | "(faster but possibly less accurate)") 414 | parser.add_argument('--fit-output-within', type=int, default=2048, 415 | help="Resize output so the largest edge is below the specified value") 416 | parser.add_argument('--minimum-matches', type=int, default=20, 417 | help='Require at least this many features for a match (default %(default)s)') 418 | parser.add_argument('--max-aspect-ratio-delta', type=float, default=0.1, 419 | help='Raise an error if the reconstructed image\'s aspect ratio differs by more than ' 420 | 'this amount (default %(default)s)') 421 | parser.add_argument('--match-aspect-ratio', action='store_true', 422 | help='Adjust the reconstructed crop box to exactly match the original thumbnail') 423 | parser.add_argument('--display', action="store_true", help="Display match visualization") 424 | parser.add_argument('--debug', action="store_true", help="Open debugger for errors") 425 | args = parser.parse_args() 426 | 427 | if len(args.files) % 2 != 0: 428 | parser.error("Files must be provided in thumbnail and master pairs") 429 | 430 | if args.thumbnail_format not in ('jpg', 'png'): 431 | parser.error('Thumbnail format must be either jpg or
png') 432 | 433 | if args.debug: 434 | import pdb 435 | 436 | for i in xrange(0, len(args.files), 2): 437 | thumbnail = args.files[i] 438 | source = args.files[i + 1] 439 | 440 | if args.save_json: 441 | json_output_filename = '%s.json' % os.path.splitext(thumbnail)[0] 442 | else: 443 | json_output_filename = None 444 | 445 | try: 446 | locate_thumbnail(thumbnail, source, display=args.display, 447 | save_reconstruction=args.save_thumbnail, 448 | reconstruction_format=args.thumbnail_format, 449 | save_visualization=args.save_visualization, 450 | json_output_filename=json_output_filename, 451 | max_master_edge=args.fit_master_within, 452 | max_output_edge=args.fit_output_within, 453 | max_aspect_ratio_delta=args.max_aspect_ratio_delta, 454 | match_aspect_ratio=args.match_aspect_ratio, 455 | minimum_matches=args.minimum_matches) 456 | except Exception as e: 457 | logging.error("Error processing %s %s: %s", thumbnail, source, e) 458 | if args.debug: 459 | pdb.post_mortem() 460 | sys.exit(1) 461 | 462 | 463 | if __name__ == '__main__': 464 | main() 465 | -------------------------------------------------------------------------------- /image_mining/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/acdha/image-mining/cfe842c42f122d676924b16f8af30c2431f9cd5c/image_mining/__init__.py -------------------------------------------------------------------------------- /image_mining/figure_extraction.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import logging 4 | 5 | import cv2 6 | import numpy 7 | 8 | 9 | class ImageRegion(object): 10 | def __init__(self, x1, y1, x2, y2, poly=None, contour_index=None): 11 | assert x1 < x2 12 | assert y1 < y2 13 | self.x1 = x1 14 | self.x2 = x2 15 | self.y1 = y1 16 | self.y2 = y2 17 | 18 | self.poly = poly 19 | self.contour_index = contour_index 20 | 21 | def __repr__(self): 22 | return "({0.x1}, {0.y1})-({0.x2}, {0.y2})".format(self) 23 | 24 | @property 25 | def area(self): 26 | return (self.y2 - self.y1) * (self.x2 - self.x1) 27 | 28 | @property 29 | def height(self): 30 | return self.y2 - self.y1 31 | 32 | @property 33 | def width(self): 34 | return self.x2 - self.x1 35 | 36 | @property 37 | def image_slice(self): 38 | """Return a Python slice suitable for use on an OpenCV image (i.e. 
numpy 2D array)""" 39 | return slice(self.y1, self.y2), slice(self.x1, self.x2) 40 | 41 | def contains(self, other): 42 | """Returns True if the other ImageRegion is entirely contained by this one""" 43 | return ((other.x1 >= self.x1) and (other.x2 <= self.x2) 44 | and (other.y1 >= self.y1) and (other.y2 <= self.y2)) 45 | 46 | def overlaps(self, other): 47 | """Returns True if any part of the other ImageRegion is entirely contained by this one""" 48 | 49 | return (((self.x1 < other.x1 < self.x2) or (self.x1 < other.x2 < self.x2)) 50 | and ((self.y1 < other.y1 < self.y2) or (self.y1 < other.y2 < self.y2))) 51 | 52 | def merge(self, other): 53 | """Expand this ImageRegion to contain other""" 54 | self.x1 = min(self.x1, other.x1) 55 | self.y1 = min(self.y1, other.y1) 56 | self.x2 = max(self.x2, other.x2) 57 | self.y2 = max(self.y2, other.y2) 58 | 59 | def as_dict(self): 60 | return {"x1": self.x1, "y1": self.y1, "x2": self.x2, "y2": self.y2} 61 | 62 | 63 | class FigureExtractor(object): 64 | MORPH_TYPES = {"cross": cv2.MORPH_CROSS, 65 | "ellipse": cv2.MORPH_ELLIPSE, 66 | "rectangle": cv2.MORPH_RECT} 67 | MORPH_TYPE_KEYS = sorted(MORPH_TYPES.keys()) 68 | 69 | def __init__(self, canny_threshold=0, erosion_element=None, erosion_size=4, 70 | dilation_element=None, dilation_size=4, 71 | min_area=0.01, 72 | min_height=0.1, max_height=0.9, 73 | min_width=0.1, max_width=0.9): 74 | # TODO: reconsider whether we should split to global config + per-image extractor instances 75 | 76 | # TODO: better way to set configuration options & docs 77 | self.canny_threshold = canny_threshold 78 | self.erosion_element = self.MORPH_TYPE_KEYS.index(erosion_element) 79 | self.erosion_size = erosion_size 80 | self.dilation_element = self.MORPH_TYPE_KEYS.index(dilation_element) 81 | self.dilation_size = dilation_size 82 | 83 | self.min_area_percentage = min_area 84 | self.min_height = min_height 85 | self.max_height = max_height 86 | self.min_width = min_width 87 | self.max_width = max_width 88 | 89 | def find_figures(self, source_image): 90 | assert source_image is not None, "source_image was None. Perhaps imread() failed?" 
91 | output_image = self.filter_image(source_image) 92 | 93 | contours, hierarchy = self.find_contours(output_image) 94 | 95 | for bbox in self.get_bounding_boxes_from_contours(contours, source_image): 96 | yield bbox 97 | 98 | def _find_contours_opencv2(self, image): 99 | return cv2.findContours(image, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) 100 | 101 | def _find_contours_opencv3(self, image): 102 | _, a, b = cv2.findContours(image, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) 103 | return a, b 104 | 105 | if cv2.__version__.startswith('2.'): 106 | find_contours = _find_contours_opencv2 107 | else: 108 | find_contours = _find_contours_opencv3 109 | 110 | def filter_image(self, source_image): 111 | # TODO: Refactor this into a more reusable filter chain 112 | 113 | output_image = cv2.cvtColor(source_image, cv2.COLOR_BGR2GRAY) 114 | # TODO: make blurring configurable: 115 | # output_image = cv2.medianBlur(output_image, 5) 116 | # output_image = cv2.blur(output_image, (3, 3)) 117 | # output_image = cv2.GaussianBlur(output_image, (5, 5)) 118 | 119 | # TODO: make thresholding configurable 120 | # See http://docs.opencv.org/modules/imgproc/doc/miscellaneous_transformations.html#adaptivethreshold 121 | # output_image = cv2.adaptiveThreshold(output_image, 255.0, cv2.THRESH_BINARY_INV, cv2.ADAPTIVE_THRESH_MEAN_C, 15, 5) 122 | # threshold_rc, output_image = cv2.threshold(output_image, 192, 255, cv2.THRESH_BINARY_INV) 123 | 124 | # Otsu's binarization: see http://bit.ly/194YCPp 125 | output_image = cv2.GaussianBlur(output_image, (3, 3), 0) 126 | threshold_rc, output_image = cv2.threshold(output_image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) 127 | 128 | if self.erosion_size > 0: 129 | element_name = self.MORPH_TYPE_KEYS[self.erosion_element] 130 | element = self.MORPH_TYPES[element_name] 131 | 132 | structuring_element = cv2.getStructuringElement(element, (self.erosion_size, self.erosion_size)) 133 | output_image = cv2.erode(output_image, structuring_element) 134 | 135 | if self.dilation_size > 0: 136 | element_name = self.MORPH_TYPE_KEYS[self.dilation_element] 137 | element = self.MORPH_TYPES[element_name] 138 | 139 | structuring_element = cv2.getStructuringElement(element, (self.dilation_size, self.dilation_size)) 140 | output_image = cv2.dilate(output_image, structuring_element) 141 | 142 | if self.canny_threshold > 0: 143 | # TODO: Make all of the Canny options configurable 144 | # See http://docs.opencv.org/modules/imgproc/doc/feature_detection.html#canny 145 | output_image = cv2.Canny(output_image, self.canny_threshold, self.canny_threshold * 3, apertureSize=3) 146 | 147 | return output_image 148 | 149 | def detect_lines(self, source_image): 150 | # TODO: Make HoughLinesP a configurable option 151 | lines = cv2.HoughLinesP(source_image, rho=1, theta=numpy.pi / 180, 152 | threshold=160, minLineLength=80, maxLineGap=10) 153 | 154 | # for line in lines[0]: 155 | # cv2.line(output_image, (line[0], line[1]), (line[2], line[3]), (0, 0, 255), 2, 4) 156 | return lines 157 |
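# (Illustrative aside, not part of the original module: the commented-out loop in
# detect_lines() would be used roughly like this, given `lines` returned from
# detect_lines() and a BGR `output_image` to draw onto:
#
#     for x1, y1, x2, y2 in lines[0]:
#         cv2.line(output_image, (x1, y1), (x2, y2), (0, 0, 255), 2)
# )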
158 | def get_bounding_boxes_from_contours(self, contours, source_image): 159 | # We'll return the boxes ordered largest first to make overlaps easier to see interactively: 160 | boxes = sorted(self.filter_bounding_boxes(contours, source_image), reverse=True, 161 | key=lambda i: i.area) 162 | 163 | # This could be stored in a much more efficient structure but in testing the number 164 | # of boxes is so small that it doesn't seem worth greater effort: 165 | boxes = [i for i in boxes if not any(j.contains(i) for j in boxes if j is not i)] 166 | 167 | restart = True 168 | while restart: 169 | restart = False 170 | for i in boxes: 171 | other_boxes = [j for j in boxes if j is not i] 172 | for j in other_boxes: 173 | if j.overlaps(i): 174 | logging.info("Merging overlapping extracts: %s %s", i, j) 175 | i.merge(j) 176 | boxes.remove(j) 177 | restart = True 178 | break 179 | 180 | return boxes 181 | 182 | def filter_bounding_boxes(self, contours, source_image): 183 | # TODO: confirm that the min area check buys us anything over the bounding box min/max filtering 184 | min_area = self.min_area_percentage * source_image.size 185 | 186 | # TODO: more robust algorithm for detecting likely scan edge artifacts which can handle cropped scans of large images (e.g. http://dl.wdl.org/107_1_1.png) 187 | max_height = int(round(self.max_height * source_image.shape[0])) 188 | max_width = int(round(self.max_width * source_image.shape[1])) 189 | min_height = int(round(self.min_height * source_image.shape[0])) 190 | min_width = int(round(self.min_width * source_image.shape[1])) 191 | 192 | logging.info("Contour length & area (area: >%d pixels, box: height >%d, <%d, width >%d, <%d)", 193 | min_area, min_height, max_height, min_width, max_width) 194 | 195 | for i, contour in enumerate(contours): 196 | area = cv2.contourArea(contours[i], False) 197 | 198 | if area < min_area: 199 | logging.debug("Contour %4d: failed area check", i) 200 | continue 201 | 202 | poly = cv2.approxPolyDP(contour, 0.01 * cv2.arcLength(contour, False), False) 203 | x, y, w, h = cv2.boundingRect(poly) 204 | bbox = ImageRegion(x, y, x + w, y + h, poly=poly, contour_index=i) 205 | 206 | if w > max_width or w < min_width or h > max_height or h < min_height: 207 | logging.debug("Contour %4d: failed min/max check: %s", i, bbox) 208 | continue 209 | 210 | yield bbox 211 | -------------------------------------------------------------------------------- /image_mining/utils.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | from __future__ import absolute_import, unicode_literals, print_function 3 | 4 | from urllib import urlopen 5 | from urlparse import urlparse 6 | import os 7 | 8 | import cv2 9 | import numpy 10 | 11 | 12 | def open_image(file_or_url): 13 | """Load an OpenCV image from a filename or URL 14 | 15 | Returns a base_name, image tuple containing a processed name suitable for naming output files 16 | """ 17 | 18 | if file_or_url.startswith("http"): 19 | source_image = open_image_from_url(file_or_url, cv2_img_flag=cv2.IMREAD_COLOR) 20 | 21 | url_p = urlparse(file_or_url) 22 | 23 | base_name = os.path.splitext(os.path.basename(url_p.path))[0] 24 | else: 25 | if not os.path.exists(file_or_url): 26 | raise IOError("%s does not exist" % file_or_url) 27 | 28 | base_name = os.path.splitext(os.path.basename(file_or_url))[0] 29 | 30 | source_image = cv2.imread(file_or_url, flags=cv2.IMREAD_COLOR) 31 | 32 | if source_image is None: 33 | raise RuntimeError("%s could not be decoded as an image" % file_or_url) 34 | 35 | return base_name, source_image 36 | 37 | 38 | def open_image_from_url(url, cv2_img_flag=0): 39 | """Attempt to load an OpenCV image from a URL""" 40 | # See http://stackoverflow.com/a/13329446/59984 41 | request = urlopen(url) 42 | img_array = numpy.asarray(bytearray(request.read()), dtype=numpy.uint8) 43 | return cv2.imdecode(img_array, cv2_img_flag) 44 | --------------------------------------------------------------------------------
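For reference, ``open_image`` above is the shared entry point used by both scripts. A minimal sketch of calling it directly (the path and dimensions shown are hypothetical)::

    >>> from image_mining.utils import open_image
    >>> base_name, img = open_image("scans/page-042.jpg")
    >>> base_name
    'page-042'
    >>> img.shape   # height, width, BGR channels
    (3300, 2550, 3)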
/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup(name='image-mining', 4 | version='0.1.6', 5 | author='Chris Adams', 6 | author_email='chris@improbable.org', 7 | packages=['image_mining'], 8 | scripts=['bin/extract-figures.py', 'bin/locate-thumbnail.py'], 9 | url='https://github.com/acdha/image-mining/', 10 | license='Public Domain', 11 | description='Extract useful information from scanned images using OpenCV', 12 | long_description=open('README.rst').read(), 13 | install_requires=['numpy']) 14 | --------------------------------------------------------------------------------
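A minimal programmatic sketch of the extraction pipeline, mirroring what ``bin/extract-figures.py`` does (the filename is hypothetical; assumes OpenCV and numpy are installed as described in the README)::

    import cv2

    from image_mining.figure_extraction import FigureExtractor
    from image_mining.utils import open_image

    extractor = FigureExtractor(erosion_element="rectangle", erosion_size=4,
                                dilation_element="rectangle", dilation_size=4)

    base_name, page = open_image("scan-001.jpg")
    for i, bbox in enumerate(extractor.find_figures(page), 1):
        # bbox.image_slice is a (row, column) slice pair into the source array:
        cv2.imwrite("%s-figure-%d.jpg" % (base_name, i), page[bbox.image_slice])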