#!/usr/bin/ruby
# frozen_string_literal: true

# Picture Pages -- find_images.rb
# By Eric Larson (Code4Lib 2012 Lightning Presentation)
# Slides: http://speakerdeck.com/u/ewlarson/p/finding-images-in-book-page-images
#
# Requirements:
#   * Ruby
#   * ImageMagick (`convert` on the PATH; on a Mac: brew install imagemagick)
#
# Usage:
#   find_images.rb LIBRARY_DIRECTORY
#
# Every sub-directory of LIBRARY_DIRECTORY is treated as one book whose
# page scans are the *.jpg files directly inside it.

require 'fileutils'

# A page counts as containing a picture when MORE than this many
# consecutive rows of its squashed 1-pixel-wide rendering are black.
BLACK_RUN_THRESHOLD = 200

# Reports whether a pixel listing contains a long enough run of black rows.
#
# Each line is expected to end with a colour token (the last
# whitespace-separated field), as in the TXT: listing written by `convert`.
# Blank lines are skipped instead of raising.
#
# @param lines [Enumerable<String>] lines of the pixel listing
# @param threshold [Integer] run length that must be exceeded
# @return [Boolean] true once a run of "black" rows longer than threshold is seen
def black_run?(lines, threshold = BLACK_RUN_THRESHOLD)
  run_color = nil
  run_length = 0

  lines.each do |line|
    color = line.split.last
    next if color.nil?

    if color == run_color
      run_length += 1
    else
      run_color = color
      # The first row of a run counts as 1, so "> threshold" really means
      # "more than threshold rows in a row".
      run_length = 1
    end

    return true if run_color == 'black' && run_length > threshold
  end

  false
end

# Runs the ImageMagick steps that reduce one page scan to a text listing
# with one colour entry per row. The intermediate images (suffixes G, C, V,
# S and N) are kept in working_dir, exactly as the separate steps always did.
#
# `convert` is invoked with an argument vector rather than a shell string,
# so spaces or shell metacharacters in file names cannot break the command.
#
# @param source [String] path of the page scan
# @param working_dir [String] directory receiving the intermediate files
# @param stem [String] page file name without its .jpg extension
# @return [String, nil] path of the pixel listing, or nil if any step failed
def render_pixel_column(source, working_dir, stem)
  gray, contrasted, column, sharpened, binary =
    %w[G C V S N].map { |tag| File.join(working_dir, "#{stem}#{tag}.jpg") }
  pixel_list = File.join(working_dir, "#{stem}.txt")

  steps = [
    [source,     %w[-colorspace Gray],             gray],        # 1) desaturate
    [gray,       Array.new(8, '-contrast'),        contrasted],  # 2) contrast x 8
    [contrasted, %w[-resize 1x1500!],              column],      # 3) squash to 1px x 1500px
    [column,     %w[-sharpen 0x5],                 sharpened],   # 4) sharpen
    [sharpened,  %w[-negate -threshold 0 -negate], binary],      # 5) heavy-handed threshold
    [binary,     [],                               "TXT:#{pixel_list}"] # 6) colour list
  ]

  # Stop at the first failing step; later steps would only read a missing file.
  succeeded = steps.all? { |input, options, output| system('convert', input, *options, output) }
  succeeded ? pixel_list : nil
end

# Walks the page scans (*.jpg) of one book, detects the pages that contain
# pictures and copies those pages into the book's "images" sub-directory.
#
# Creates "working" and "images" inside directory_path if they are missing,
# so the method can be re-run on the same book. The process working
# directory is left untouched. A single summary line is printed at the end.
#
# @param directory_path [String] directory holding one book's page scans
# @return [Array<String>] file names of the pages detected as containing a picture
def image_candidates(directory_path)
  working_dir = File.join(directory_path, 'working')
  images_dir = File.join(directory_path, 'images')
  FileUtils.mkdir_p([working_dir, images_dir])

  # Only regular *.jpg files are pages; this also keeps the two
  # sub-directories created above out of the processing loop.
  pages = Dir.entries(directory_path).select do |entry|
    File.extname(entry) == '.jpg' && File.file?(File.join(directory_path, entry))
  end

  found = pages.sort.select do |page|
    stem = File.basename(page, '.jpg')
    source = File.join(directory_path, page)
    pixel_list = render_pixel_column(source, working_dir, stem)

    if pixel_list.nil?
      warn "convert failed for #{page}; skipping"
      next false
    end

    next false unless File.open(pixel_list, 'r') { |io| black_run?(io.each_line) }

    puts "IMAGE - #{stem}"
    FileUtils.cp(source, File.join(images_dir, page))
    true
  end

  # Count what is actually in images/ (earlier runs included), ignoring '.' and '..'.
  img_count = Dir.entries(images_dir).count { |entry| File.extname(entry) == '.jpg' }
  puts "\nComplete - Found #{img_count} images"
  found
end

# Lists the book directories inside a library directory.
#
# @param library_directory [String] directory whose sub-directories are books
# @return [Array<String>] sorted paths of every sub-directory except '.' and '..'
def book_directories(library_directory)
  Dir.entries(library_directory)
     .reject { |entry| %w[. ..].include?(entry) }
     .map { |entry| File.join(library_directory, entry) }
     .select { |path| File.directory?(path) }
     .sort
end

if __FILE__ == $PROGRAM_NAME
  library_directory = ARGV[0]

  if library_directory.nil?
    warn "Usage: #{File.basename($PROGRAM_NAME)} LIBRARY_DIRECTORY"
  else
    # Books are each sub-directory in the library directory
    books = book_directories(File.expand_path(library_directory))
    puts books.size

    books.each do |book_path|
      puts File.basename(book_path)
      image_candidates(book_path)
    end
  end
end
# frozen_string_literal: true

# === Author ===
# * Eric Larson
# * UW-Madison Libraries

# === License ===
# * No license

# === REQUIREMENTS ===
# * ruby
# * imagemagick
#   - brew install imagemagick
# * curl

require 'fileutils'
require 'open3'

BOOK_DIR = 'british_flora'
WORKING_DIR = File.join(BOOK_DIR, 'working')
IMAGES_DIR = File.join(BOOK_DIR, 'images')

# Example (Color): The British flora medica, or, History of the medicinal
# plants of Great Britain - (34 images)
#   - http://openlibrary.org/books/OL13997282M
# curl itself expands the [0000-0482] range and substitutes each value for
# "#1" in the output file name.
PAGES_URL = 'http://ia600309.us.archive.org/BookReader/BookReaderImages.php?zip=/33/items/britishfloramedi01bartuoft/britishfloramedi01bartuoft_jp2.zip&file=britishfloramedi01bartuoft_jp2/britishfloramedi01bartuoft_[0000-0482].jp2&scale=2&rotate=0'

# A page counts as containing a picture when MORE than this many
# consecutive rows of its squashed 1-pixel-wide rendering are black.
PICTURE_RUN_ROWS = 200

# Reports whether a pixel listing contains a long enough run of black rows.
#
# Each line is expected to end with a colour token (the last
# whitespace-separated field), as in the TXT: listing written by `convert`.
# Blank lines are skipped instead of raising.
#
# @param lines [Enumerable<String>] lines of the pixel listing
# @param threshold [Integer] run length that must be exceeded
# @return [Boolean] true once a run of "black" rows longer than threshold is seen
def black_run?(lines, threshold = PICTURE_RUN_ROWS)
  run_color = nil
  run_length = 0

  lines.each do |line|
    color = line.split.last
    next if color.nil?

    if color == run_color
      run_length += 1
    else
      run_color = color
      # The first row of a run counts as 1, so "> threshold" really means
      # "more than threshold rows in a row".
      run_length = 1
    end

    return true if run_color == 'black' && run_length > threshold
  end

  false
end

# Reduces one page scan to a text listing with one colour entry per row,
# calling convert just twice (much faster than six separate invocations):
# desaturate, contrast x 8, squash to 1px x 1500px and sharpen in the first
# process, then the heavy-handed threshold and the colour list in the second.
# The two processes are connected by a pipe without going through a shell.
#
# @param page [String] path of the page scan
# @param pixel_list [String] path of the text listing to write
# @return [Boolean] true when both convert processes succeeded
def write_pixel_list(page, pixel_list)
  squash = ['convert', page, '-colorspace', 'Gray', *Array.new(8, '-contrast'),
            '-resize', '1X1500!', '-sharpen', '0x5', 'miff:-']
  threshold = ['convert', '-', '-negate', '-threshold', '0', '-negate', "TXT:#{pixel_list}"]

  Open3.pipeline(squash, threshold).all?(&:success?)
end

# 1) Set up our directory structure
#    - british_flora
#    - british_flora/working
#    - british_flora/images
# mkdir_p is silent when the directories are already there, so the script
# can be re-run.
FileUtils.mkdir_p([WORKING_DIR, IMAGES_DIR])

# 2) Download a book from the Internet Archive
unless system('curl', PAGES_URL, '-o', File.join(BOOK_DIR, 'file_#1.jpg'))
  warn 'curl did not finish cleanly; continuing with the pages already present'
end

# 3) Walk the book page images, detect images within images
Dir.glob(File.join(BOOK_DIR, '*.jpg')).sort.each do |page|
  filename = File.basename(page, '.jpg')
  pixel_list = File.join(WORKING_DIR, "#{filename}.txt")

  unless write_pixel_list(page, pixel_list) && File.exist?(pixel_list)
    warn "convert failed for #{page}; skipping"
    next
  end

  # More than 200 black pixels in a row is an IMAGE
  next unless File.open(pixel_list, 'r') { |io| black_run?(io.each_line) }

  puts "IMAGE - #{filename}"
  FileUtils.cp(page, File.join(IMAGES_DIR, "#{filename}.jpg"))
end

# 4) Report once, after every page has been examined. Only *.jpg files are
#    counted, so '.' and '..' never inflate the total.
img_count = Dir.glob(File.join(IMAGES_DIR, '*.jpg')).size
puts "\nComplete - Found #{img_count} images / Expected 34 images"