├── testdata ├── hzgg-1a.png ├── hzgg-1b.png ├── hzgg-2a.png ├── hzgg-2b.png ├── hzgg-3a.png ├── hzgg-3b.png ├── zhjdasn-4a.png ├── zhjdasn-4b.png └── zhjdasn-extractframe250.png ├── ocr_client.py ├── extract_subtitle_test.py ├── mksrt.py ├── readme.md ├── UnionFind.py ├── extract_frame.py ├── connected_components.py └── extract_subtitle.py /testdata/hzgg-1a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gkovacs/smart-subtitles-extract-subtitle/HEAD/testdata/hzgg-1a.png -------------------------------------------------------------------------------- /testdata/hzgg-1b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gkovacs/smart-subtitles-extract-subtitle/HEAD/testdata/hzgg-1b.png -------------------------------------------------------------------------------- /testdata/hzgg-2a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gkovacs/smart-subtitles-extract-subtitle/HEAD/testdata/hzgg-2a.png -------------------------------------------------------------------------------- /testdata/hzgg-2b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gkovacs/smart-subtitles-extract-subtitle/HEAD/testdata/hzgg-2b.png -------------------------------------------------------------------------------- /testdata/hzgg-3a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gkovacs/smart-subtitles-extract-subtitle/HEAD/testdata/hzgg-3a.png -------------------------------------------------------------------------------- /testdata/hzgg-3b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gkovacs/smart-subtitles-extract-subtitle/HEAD/testdata/hzgg-3b.png -------------------------------------------------------------------------------- /testdata/zhjdasn-4a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gkovacs/smart-subtitles-extract-subtitle/HEAD/testdata/zhjdasn-4a.png -------------------------------------------------------------------------------- /testdata/zhjdasn-4b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gkovacs/smart-subtitles-extract-subtitle/HEAD/testdata/zhjdasn-4b.png -------------------------------------------------------------------------------- /testdata/zhjdasn-extractframe250.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gkovacs/smart-subtitles-extract-subtitle/HEAD/testdata/zhjdasn-extractframe250.png -------------------------------------------------------------------------------- /ocr_client.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # Sends png file to OneNote OCR service and prints output. 
4 | # 5 | # Copyright Geza Kovacs 6 | 7 | import urllib2 8 | import httplib 9 | import base64 10 | import sys 11 | 12 | def getIPAddr(): 13 | #return urllib2.urlopen('http://transgame.csail.mit.edu:9537/?varname=win7ipnetbook').read() 14 | return urllib2.urlopen('http://transgame.csail.mit.edu:9537/?varname=win7ipaddress').read() 15 | 16 | def getOCRText(png_file_to_ocr, ipaddr): 17 | httpServ = httplib.HTTPConnection(ipaddr, 8080) 18 | httpServ.connect() 19 | 20 | data = base64.b64encode(open(png_file_to_ocr).read()) 21 | 22 | httpServ.request('POST', '/', data) 23 | 24 | response = httpServ.getresponse() 25 | retv = "" 26 | if response.status == httplib.OK: 27 | retv = response.read() 28 | print retv 29 | else: 30 | print "Got error from server:", response.status 31 | httpServ.close() 32 | return retv 33 | 34 | def main(): 35 | png_file_to_ocr = sys.argv[1] 36 | print png_file_to_ocr 37 | ipaddr = getIPAddr() 38 | print ipaddr 39 | print getOCRText(png_file_to_ocr, ipaddr) 40 | 41 | if __name__ == "__main__": 42 | main() 43 | -------------------------------------------------------------------------------- /extract_subtitle_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | from extract_subtitle import * 4 | 5 | import unittest 6 | 7 | class TestExtractSubtitle(unittest.TestCase): 8 | def test_getVerticalStartEnd(self): 9 | activation = [0, 0, 0, 1, 1, 1, 0, 0, 0] 10 | start,end = getVerticalStartEnd(activation) 11 | self.assertEqual(3, start) 12 | self.assertEqual(5, end) 13 | 14 | def test_haveTransition(self): 15 | img1a = LoadImage('testdata/hzgg-1a.png') 16 | img1b = LoadImage('testdata/hzgg-1b.png') 17 | img2a = LoadImage('testdata/hzgg-2a.png') 18 | img2b = LoadImage('testdata/hzgg-2b.png') 19 | img3a = LoadImage('testdata/hzgg-3a.png') 20 | img3b = LoadImage('testdata/hzgg-3b.png') 21 | img4a = LoadImage('testdata/zhjdasn-4a.png') 22 | img4b = LoadImage('testdata/zhjdasn-4b.png') 23 | self.assertFalse(haveTransition(img1a, img1b)) 24 | self.assertFalse(haveTransition(img2a, img2b)) 25 | self.assertFalse(haveTransition(img3a, img3b)) 26 | self.assertFalse(haveTransition(img4a, img4b)) 27 | self.assertTrue(haveTransition(img1a, img2a)) 28 | self.assertTrue(haveTransition(img2a, img3a)) 29 | 30 | if __name__ == '__main__': 31 | unittest.main() 32 | 33 | -------------------------------------------------------------------------------- /mksrt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import sys 4 | from os import chdir 5 | from os import getcwd 6 | from tempfile import mkdtemp 7 | from subprocess import check_call 8 | from subprocess import check_output 9 | from ocr_client import getIPAddr, getOCRText 10 | from time import sleep 11 | from xml.etree.ElementTree import fromstring 12 | 13 | name = sys.argv[1] 14 | outf = open(sys.argv[2], 'w') 15 | origdir = getcwd() 16 | tdir = mkdtemp() 17 | print tdir 18 | chdir(tdir) 19 | check_call('cp ' + origdir + '/' + name + '*' + ' .', shell=True) 20 | ipaddr = getIPAddr() 21 | check_call('subp2png -n -s 0 ' + name, shell=True) 22 | dom = fromstring(open(name + '.xml').read()) 23 | for x in dom.findall('subtitle'): 24 | try: 25 | filename = x.find('image').text 26 | if '.png' not in filename: 27 | continue 28 | noext = filename[:filename.rindex('.png')] 29 | if len(filename) <= 0 or len(noext) <= 0: 30 | continue 31 | print noext 32 | check_call('convert %(filename)s -alpha Off %(noext)s-conv.png' % 
locals(), shell=True) 33 | ocrtext = getOCRText(noext + '-conv.png', ipaddr) 34 | print >> outf, x.attrib['id'] 35 | print >> outf, x.attrib['start'], '-->', x.attrib['stop'] 36 | print >> outf, ocrtext 37 | sleep(1.0) 38 | except: 39 | continue 40 | chdir(origdir) 41 | check_call('rm -rf ' + tdir, shell=True) 42 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # smart-subtitles-extract-subtitle 2 | 3 | ## About 4 | 5 | Code for our subtitle extraction algorithm used in our CHI 2014 paper: "Smart Subtitles for Vocabulary Learning" 6 | 7 | [Paper: CHI 2014](http://up.csail.mit.edu/other-pubs/chi2014-smartsubs.pdf) 8 | 9 | [ACM DL](https://dl.acm.org/citation.cfm?id=2557256) 10 | 11 | [Video](https://www.youtube.com/watch?v=3j-eXUB3eaA) 12 | 13 | [Web app system code](https://github.com/gkovacs/smart-subtitles-system-chi2014) 14 | 15 | [Subtitle extraction system code](https://github.com/gkovacs/smart-subtitles-extract-subtitle) 16 | 17 | [LaTeX sources for paper](https://github.com/gkovacs/smart-subtitles-paper-chi2014) 18 | 19 | ## Abstract 20 | 21 | Language learners often use subtitled videos to help them learn. However, standard subtitles are geared more towards comprehension than vocabulary learning, as translations are nonliteral and are provided only for phrases, not vocabulary. This paper presents Smart Subtitles, which are interactive subtitles tailored towards vocabulary learning. Smart Subtitles can be automatically generated from common video sources such as subtitled DVDs. They provide features such as vocabulary definitions on hover, and dialog-based video navigation. In our pilot study with intermediate learners studying Chinese, participants correctly defined over twice as many new words in a post-viewing vocabulary test when they used Smart Subtitles, compared to dual Chinese-English subtitles. Learners spent the same amount of time watching clips with each tool, and enjoyed viewing videos with Smart Subtitles as much as with dual subtitles. Learners understood videos equally well using either tool, as indicated by self-assessments and independent evaluations of their summaries. 22 | 23 | ## License 24 | 25 | MIT 26 | 27 | ## Author 28 | 29 | [Geza Kovacs](https://www.gkovacs.com) 30 | -------------------------------------------------------------------------------- /UnionFind.py: -------------------------------------------------------------------------------- 1 | """UnionFind.py 2 | 3 | Union-find data structure. Based on Josiah Carlson's code, 4 | http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/215912 5 | with significant additional changes by D. Eppstein. 6 | """ 7 | 8 | class UnionFind: 9 | """Union-find data structure. 10 | 11 | Each unionFind instance X maintains a family of disjoint sets of 12 | hashable objects, supporting the following two methods: 13 | 14 | - X[item] returns a name for the set containing the given item. 15 | Each set is named by an arbitrarily-chosen one of its members; as 16 | long as the set remains unchanged it will keep the same name. If 17 | the item is not yet part of a set in X, a new singleton set is 18 | created for it. 19 | 20 | - X.union(item1, item2, ...) merges the sets containing each item 21 | into a single larger set. If any item is not yet part of a set 22 | in X, it is added to X as one of the members of the merged set. 
23 | """ 24 | 25 | def __init__(self): 26 | """Create a new empty union-find structure.""" 27 | self.weights = {} 28 | self.parents = {} 29 | 30 | def __getitem__(self, object): 31 | """Find and return the name of the set containing the object.""" 32 | 33 | # check for previously unknown object 34 | if object not in self.parents: 35 | self.parents[object] = object 36 | self.weights[object] = 1 37 | return object 38 | 39 | # find path of objects leading to the root 40 | path = [object] 41 | root = self.parents[object] 42 | while root != path[-1]: 43 | path.append(root) 44 | root = self.parents[root] 45 | 46 | # compress the path and return 47 | for ancestor in path: 48 | self.parents[ancestor] = root 49 | return root 50 | 51 | def __iter__(self): 52 | """Iterate through all items ever found or unioned by this structure.""" 53 | return iter(self.parents) 54 | 55 | def union(self, *objects): 56 | """Find the sets containing the objects and merge them all.""" 57 | roots = [self[x] for x in objects] 58 | heaviest = max([(self.weights[r],r) for r in roots])[1] 59 | for r in roots: 60 | if r != heaviest: 61 | self.weights[heaviest] += self.weights[r] 62 | self.parents[r] = heaviest 63 | -------------------------------------------------------------------------------- /extract_frame.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import cv 4 | from cv import * 5 | from connected_components import * 6 | from UnionFind import UnionFind 7 | import random 8 | 9 | def main(): 10 | 11 | vidf = 'video.m4v' 12 | if len(sys.argv) > 1: 13 | vidf = sys.argv[1] 14 | frameno = 0 15 | if len(sys.argv) > 2: 16 | frameno = int(sys.argv[2]) 17 | metadata = getMetadata(vidf) 18 | subtitle_color = metadata['subtitle_color'] 19 | best_portion = metadata['best_portion'] 20 | vstart = metadata['vstart'] 21 | vend = metadata['vend'] 22 | ''' 23 | metadata = getMetadata('zhjdasn-part2.m4v') 24 | subtitle_color = metadata['subtitle_color'] 25 | best_portion = metadata['best_portion'] 26 | vstart = metadata['vstart'] 27 | vend = metadata['vend'] 28 | curImg = None 29 | img = LoadImage('testdata/zhjdasn-extractframe250.png') 30 | extracted_color_img = extractColor(img, subtitle_color) 31 | nimg = extracted_color_img 32 | connectedComponentOutsidePermittedRegionBlacken(extracted_color_img, vstart-5, vend+5) 33 | #connected_components = connectedComponentLabel(extracted_color_img) 34 | #blacklist = findComponentsSpanningOutsideRange(connected_components, vstart-5, vend+5) 35 | #blackenBlacklistedComponents(nimg, connected_components, blacklist) 36 | SaveImage('extractframe250.png', nimg) 37 | ''' 38 | for idx,img in iterVideo(vidf): 39 | if idx != frameno: 40 | continue 41 | if idx > frameno: 42 | break 43 | img = getBottomQuarter(img) 44 | SaveImage('extractframe' + str(idx)+'-orig.png', img) 45 | extracted_color_img = extractColor(img, subtitle_color) 46 | SaveImage('extractframe' + str(idx)+'-color.png', extracted_color_img) 47 | #extracted_color_img = extractColor(img, subtitle_color) 48 | horizontal_boundary = 5 49 | vertical_boundary = 5 50 | connectedComponentOutsidePermittedRegionBlacken(extracted_color_img, vstart-vertical_boundary, vend+vertical_boundary) 51 | SaveImage('extractframe' + str(idx)+'-blackened.png', extracted_color_img) 52 | vertical_extracted_color_img = extractVertical(extracted_color_img, vstart-vertical_boundary, vend-vstart+vertical_boundary) 53 | harris = getHarris(vertical_extracted_color_img) 54 | SaveImage('extractframe' + 
str(idx)+'-harris.png', harris) 55 | horizontalActivation = getHorizontalActivationWithHarris(vertical_extracted_color_img, harris) 56 | hstart,hend = getHorizontalStartEnd(horizontalActivation) 57 | #nimg = extractHorizontal(vertical_extracted_color_img, hstart, hend-hstart) 58 | nimg = blackenOutsideHorizontalRegion(vertical_extracted_color_img, hstart-horizontal_boundary, hend-hstart+horizontal_boundary) 59 | SaveImage('extractframe' + str(idx)+'-horizontalblackened.png', nimg) 60 | equalizedhist = toEqualizedHistGrayscale(nimg) 61 | SaveImage('extractframe' + str(idx)+'-equalizehist.png', equalizedhist) 62 | #connected_components = connectedComponentLabel(extracted_color_img) 63 | #blacklist = findComponentsSpanningOutsideRange(connected_components, vstart-5, vend+5) 64 | #nimg = visualizeConnectedComponents(connected_components, blacklist) 65 | ''' 66 | vertical_extracted_color_img = extractVertical(extracted_color_img, vstart, vend-vstart) 67 | harris = getHarris(vertical_extracted_color_img) 68 | horizontalActivation = getHorizontalActivationWithHarris(vertical_extracted_color_img, harris) 69 | hstart,hend = getHorizontalStartEnd(horizontalActivation) 70 | #nimg = extractHorizontal(vertical_extracted_color_img, hstart, hend-hstart) 71 | nimg = blackenOutsideHorizontalRegion(vertical_extracted_color_img, hstart, hend-hstart) 72 | ''' 73 | 74 | if __name__ == '__main__': 75 | main() 76 | -------------------------------------------------------------------------------- /connected_components.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import cv 4 | from cv import * 5 | from extract_subtitle import * 6 | from UnionFind import UnionFind 7 | import random 8 | 9 | def encodeNumAsColor(num): 10 | color=[0,0,0] 11 | color[0] = (num & 0x0000FF) 12 | color[1] = (num & 0x00FF00) >> 8 13 | color[2] = (num & 0xFF0000) >> 16 14 | return tuple(color) 15 | 16 | def decodeNumFromColor(color): 17 | return (color[2] << 16) + (color[1] << 8) + (color[0]) 18 | 19 | def connectedComponentLabel(img): 20 | labels = UnionFind() 21 | labeled = set() 22 | # pass 1 23 | for surroundingPoints in iterImgSurrounding1(img): 24 | currentPoint = surroundingPoints[0] 25 | y,x = currentPoint 26 | if sum(img[y,x]) <= 100: # black 27 | continue 28 | labeled.add(currentPoint) 29 | neighbors = surroundingPoints[1:] 30 | # if all 4 neighbors are black or unlabeled (ie, 0 labeled white neighbors), assign new label 31 | # if only 1 neighbor is white, assign its label to current point 32 | # if more than 1 of the neighbors are white, assign one of their labels to the current point, and note equivalence 33 | labeled_white_neighbors = [neighbor for neighbor in neighbors if sum(img[neighbor]) > 100 and neighbor in labeled] 34 | if len(labeled_white_neighbors) == 0: # assign new label 35 | z = labels[currentPoint] 36 | else: 37 | label = labels[labeled_white_neighbors[0]] 38 | labels.union(label, currentPoint) 39 | for neighbor in labeled_white_neighbors[1:]: 40 | labels.union(label, neighbor) 41 | # pass 2 42 | set_num = 1 43 | set_to_num = {} 44 | outimg = CreateImage((img.width, img.height), 8, 3) 45 | for currentPoint in iterImg(img): 46 | y,x = currentPoint 47 | if sum(img[y,x]) <= 100: # black 48 | outimg[y,x] = (0,0,0) 49 | continue 50 | curset = labels[currentPoint] 51 | #print curset 52 | if curset not in set_to_num: 53 | set_to_num[curset] = encodeNumAsColor(set_num) 54 | set_num += 1 55 | #print set_num 56 | outimg[y,x] = set_to_num[curset] 57 | return 
outimg 58 | 59 | def randomColor(): 60 | return tuple([random.randint(0,255) for i in [0,1,2]]) 61 | 62 | def visualizeConnectedComponents(connectedComponents, blacklisted_labelnums=frozenset()): 63 | outimg = CreateImage((connectedComponents.width, connectedComponents.height), 8, 3) 64 | labelnum_to_color = [(0,0,0)] 65 | for y,x in iterImg(connectedComponents): 66 | labelnum = decodeNumFromColor([int(v) for v in connectedComponents[y,x]]) 67 | while labelnum >= len(labelnum_to_color): 68 | labelnum_to_color.append(randomColor()) 69 | if labelnum in blacklisted_labelnums: 70 | labelnum = 0 71 | color = labelnum_to_color[labelnum] 72 | outimg[y,x] = color 73 | return outimg 74 | 75 | def findComponentsSpanningOutsideRange(connectedComponents, vstart, vend): 76 | blacklist = set() 77 | for y,x in iterImg(connectedComponents): 78 | if y >= vstart and y <= vend: 79 | continue 80 | labelnum = decodeNumFromColor([int(v) for v in connectedComponents[y,x]]) 81 | blacklist.add(labelnum) 82 | return blacklist 83 | 84 | def blackenBlacklistedComponents(img, connectedComponents, blacklisted_labelnums): 85 | for y,x in iterImg(connectedComponents): 86 | labelnum = decodeNumFromColor([int(v) for v in connectedComponents[y,x]]) 87 | if labelnum in blacklisted_labelnums: 88 | img[y,x] = (0,0,0) 89 | 90 | def connectedComponentOutsidePermittedRegionBlacken(img, vstart, vend): 91 | labels = UnionFind() 92 | # pass 1 93 | labeled = set() 94 | for currentPoint in iterImg(img): 95 | y,x = currentPoint 96 | if sum(img[y,x]) <= 100: # black 97 | continue 98 | labeled.add(currentPoint) 99 | neighbors = getSurrounding1(img, y, x) 100 | # if all 4 neighbors are black or unlabeled (ie, 0 labeled white neighbors), assign new label 101 | # if only 1 neighbor is white, assign its label to current point 102 | # if more than 1 of the neighbors are white, assign one of their labels to the current point, and note equivalence 103 | labeled_white_neighbors = [neighbor for neighbor in neighbors if sum(img[neighbor]) > 100 and neighbor in labeled] 104 | if len(labeled_white_neighbors) == 0: # assign new label 105 | z = labels[currentPoint] 106 | else: 107 | for neighbor in labeled_white_neighbors: 108 | labels.union(neighbor, currentPoint) 109 | # now blacklist all sets st they have a child that is in the forbidden region 110 | blacklist = set() 111 | for currentPoint in labeled: 112 | y,x = currentPoint 113 | if y < vstart or y > vend: 114 | blacklist.add(labels[currentPoint]) 115 | # pass 2 - blacken blacklisted components 116 | for currentPoint in labeled: 117 | y,x = currentPoint 118 | curset = labels[currentPoint] 119 | if curset in blacklist: 120 | img[y,x] = (0,0,0) 121 | 122 | 123 | 124 | -------------------------------------------------------------------------------- /extract_subtitle.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import cv 4 | from cv import * 5 | 6 | import collections 7 | import functools 8 | import itertools 9 | import json 10 | from UnionFind import UnionFind 11 | import random 12 | import sys 13 | 14 | from connected_components import * 15 | 16 | activation_threshold = 0.5 17 | 18 | class memoized(object): 19 | '''Decorator. Caches a function's return value each time it is called. 20 | If called later with the same arguments, the cached value is returned 21 | (not reevaluated). 
22 | ''' 23 | def __init__(self, func): 24 | self.func = func 25 | self.cache = {} 26 | def __call__(self, *args): 27 | if not isinstance(args, collections.Hashable): 28 | # uncacheable. a list, for instance. 29 | # better to not cache than blow up. 30 | return self.func(*args) 31 | if args in self.cache: 32 | return self.cache[args] 33 | else: 34 | value = self.func(*args) 35 | self.cache[args] = value 36 | return value 37 | def __repr__(self): 38 | '''Return the function's docstring.''' 39 | return self.func.__doc__ 40 | def __get__(self, obj, objtype): 41 | '''Support instance methods.''' 42 | return functools.partial(self.__call__, obj) 43 | 44 | def reducedColor(rgb): 45 | return tuple([round(x/8.0)*8.0 for x in rgb]) 46 | 47 | def colorMatch(color1, color2): 48 | abs_diff_sum = sum([abs(c1-c2) for c1,c2 in zip(color1,color2)]) 49 | return (abs_diff_sum < 100) 50 | 51 | def roughColorMatch(color1, color2): 52 | abs_diff_sum = sum([abs(c1-c2) for c1,c2 in zip(color1,color2)]) 53 | return (abs_diff_sum < 150) 54 | 55 | #def greatlyReducedColor(rgb): 56 | #S return tuple([round(x/50.0)*50.0 for x in rgb]) 57 | 58 | def getHarris(img): 59 | yuv = CreateImage(GetSize(img), 8, 3) 60 | gray = CreateImage(GetSize(img), 8, 1) 61 | CvtColor(img, yuv, CV_BGR2YCrCb) 62 | Split(yuv, gray, None, None, None) 63 | harris = CreateImage (GetSize(img), IPL_DEPTH_32F, 1) 64 | #CornerHarris(gray, harris, 9, 9, 0.1) 65 | CornerHarris(gray, harris, 7, 7, 0.1) 66 | return harris 67 | 68 | def getCanny(img): 69 | yuv = CreateImage(GetSize(img), 8, 3) 70 | gray = CreateImage(GetSize(img), 8, 1) 71 | CvtColor(img, yuv, CV_BGR2YCrCb) 72 | Split(yuv, gray, None, None, None) 73 | canny = cv.CreateImage(cv.GetSize(img), 8, 1) 74 | cv.Canny(gray, canny, 50, 200) 75 | cv.SaveImage('canny.png', canny) 76 | return canny 77 | 78 | def iterImg(img): 79 | for y in range(img.height): 80 | for x in range(img.width): 81 | yield y,x 82 | 83 | def removeUnactivated(img, activation): 84 | output = CreateImage(GetSize(img), 8, 3) 85 | for y,x in iterImg(img): 86 | if activation[y,x] >= activation_threshold: 87 | output[y,x] = img[y,x] 88 | else: 89 | output[y,x] = (0,0,0) 90 | return output 91 | 92 | def closestPoint(img, point): 93 | y,x = point 94 | if y >= img.height: 95 | y = img.height - 1 96 | if x >= img.width: 97 | x = img.width - 1 98 | if y < 0: 99 | y = 0 100 | if x < 0: 101 | x = 0 102 | return y,x 103 | 104 | def getSurrounding1(img, y, x): 105 | left = closestPoint(img, (y,x-1)) 106 | right = closestPoint(img, (y,x-1)) 107 | up = closestPoint(img, (y+1,x)) 108 | down = closestPoint(img, (y-1,x)) 109 | return left,right,up,down 110 | 111 | def iterImgSurrounding1(img): 112 | for y,x in iterImg(img): 113 | mid = y,x 114 | left = closestPoint(img, (y,x-1)) 115 | right = closestPoint(img, (y,x-1)) 116 | up = closestPoint(img, (y+1,x)) 117 | down = closestPoint(img, (y-1,x)) 118 | yield mid,left,right,up,down 119 | 120 | def iterImgSurrounding3(img): 121 | for y,x in iterImg(img): 122 | mid = y,x 123 | left = closestPoint(img, (y,x-1)) 124 | right = closestPoint(img, (y,x-1)) 125 | up = closestPoint(img, (y+1,x)) 126 | down = closestPoint(img, (y-1,x)) 127 | leftup = closestPoint(img, (y+1,x-1)) 128 | leftdown = closestPoint(img, (y-1,x-1)) 129 | rightup = closestPoint(img, (y+1,x+1)) 130 | rightdown = closestPoint(img, (y-1,x+1)) 131 | yield mid,left,right,up,down,leftup,leftdown,rightup,rightdown 132 | 133 | def iterImgSurrounding4(img): 134 | for y,x in iterImg(img): 135 | mid = y,x 136 | left = closestPoint(img, 
(y,x-1)) 137 | right = closestPoint(img, (y,x-1)) 138 | up = closestPoint(img, (y+1,x)) 139 | down = closestPoint(img, (y-1,x)) 140 | leftup = closestPoint(img, (y+1,x-1)) 141 | leftdown = closestPoint(img, (y-1,x-1)) 142 | rightup = closestPoint(img, (y+1,x+1)) 143 | rightdown = closestPoint(img, (y-1,x+1)) 144 | left2 = closestPoint(img, (y,x-2)) 145 | right2 = closestPoint(img, (y,x+2)) 146 | up2 = closestPoint(img, (y+2,x)) 147 | down2 = closestPoint(img, (y-2,x)) 148 | yield mid,left,right,up,down,leftup,leftdown,rightup,rightdown,left2,right2,up2,down2 149 | ''' 150 | def iterImgSurrounding4(img): 151 | for y,x in iterImg(img): 152 | mid = y,x 153 | coords = [] 154 | for dy in range(-2,3): 155 | for dx in range(-2,3): 156 | if abs(dx)+abs(dy) > 3: 157 | continue 158 | coords.append(closestPoint(img, (y+dy,x+dx))) 159 | yield tuple([mid] + coords) 160 | ''' 161 | 162 | def iterImgSurrounding5(img): 163 | for y,x in iterImg(img): 164 | mid = y,x 165 | coords = [] 166 | for dy in range(-2,3): 167 | for dx in range(-2,3): 168 | coords.append(closestPoint(img, (y+dy,x+dx))) 169 | yield tuple([mid] + coords) 170 | 171 | def iterImgSurrounding6(img): 172 | for y,x in iterImg(img): 173 | mid = y,x 174 | coords = [] 175 | for dy in range(-3,4): 176 | for dx in range(-3,4): 177 | if abs(dx)+abs(dy) > 5: 178 | continue 179 | coords.append(closestPoint(img, (y+dy,x+dx))) 180 | yield tuple([mid] + coords) 181 | 182 | def iterImgSurrounding7(img): 183 | for y,x in iterImg(img): 184 | mid = y,x 185 | coords = [] 186 | for dy in range(-3,4): 187 | for dx in range(-3,4): 188 | coords.append(closestPoint(img, (y+dy,x+dx))) 189 | yield tuple([mid] + coords) 190 | 191 | def rowSum(img, rownum): 192 | return sum([img[rownum,x] for x in range(img.width)]) 193 | 194 | def colSum(img, colnum): 195 | return sum([img[y,colnum] for y in range(img.height)]) 196 | 197 | def toEqualizedHistGrayscale(img): 198 | yuv = CreateImage(GetSize(img), 8, 3) 199 | gray = CreateImage(GetSize(img), 8, 1) 200 | CvtColor(img, yuv, CV_BGR2YCrCb) 201 | Split(yuv, gray, None, None, None) 202 | EqualizeHist(gray, gray) 203 | return gray 204 | 205 | 206 | num_portions = 16 207 | 208 | @memoized 209 | def getBestPortion(videofile): 210 | harrisSum = None 211 | max_portion_counts = [0.0]*num_portions 212 | for idx,img in iterVideo(videofile): 213 | if idx % 100 != 0: 214 | continue 215 | if not img: 216 | break 217 | img = getBottomQuarter(img) 218 | harris = getHarris(img) 219 | vals_and_portion = [] 220 | for i in range(num_portions): 221 | rsum = sum([rowSum(harris, rownum) for rownum in range(i*img.height/num_portions,(i+1)*img.height/num_portions)]) 222 | vals_and_portion.append((rsum, i)) 223 | best_portion_val,best_portion = max(vals_and_portion) 224 | max_portion_counts[best_portion] += best_portion_val 225 | if idx >= 1000: 226 | break 227 | best_portion = max([(x,i) for i,x in enumerate(max_portion_counts)])[1] 228 | return best_portion 229 | 230 | def addColorToHistogram(color, histogram): 231 | color = reducedColor(color) 232 | if color == (0,0,0): 233 | return 234 | if sum(color) < 200: 235 | return 236 | if color not in histogram: 237 | histogram[color] = 1 238 | else: 239 | histogram[color] += 1 240 | 241 | def addImageToColorHistogram(img, histogram): 242 | for y,x in iterImg(img): 243 | addColorToHistogram(img[y,x], histogram) 244 | 245 | def solidColorImg(color): 246 | img = CreateImage((1,1), 8, 3) 247 | for y,x in iterImg(img): 248 | img[y,x] = color 249 | return img 250 | 251 | def iterVideo(videofile): 252 | vid 
= cv.CaptureFromFile(videofile) 253 | for idx in itertools.count(0): 254 | img = cv.QueryFrame(vid) 255 | if not img: 256 | break 257 | yield idx,img 258 | 259 | def getBottomQuarter(img): 260 | sub = cv.GetSubRect(img, (0, img.height*3/4, img.width, img.height/4)) 261 | return cv.GetMat(sub) 262 | 263 | def getPortion(img, portion_num): 264 | sub = cv.GetSubRect(img, (0, img.height*portion_num/num_portions, img.width, img.height/num_portions)) 265 | return cv.GetMat(sub) 266 | 267 | def getCenterHorizontal(img): 268 | sub = cv.GetSubRect(img, (img.width/3, 0, img.width/3, img.height)) 269 | return cv.GetMat(sub) 270 | 271 | def getCenterHorizontal5(img): 272 | sub = cv.GetSubRect(img, (2*img.width/5, 0, img.width/5, img.height)) 273 | return cv.GetMat(sub) 274 | 275 | def extractVertical(img, vstart, vend): 276 | sub = cv.GetSubRect(img, (0, vstart, img.width, vend+1)) 277 | return cv.GetMat(sub) 278 | 279 | def extractHorizontal(img, hstart, hend): 280 | sub = cv.GetSubRect(img, (hstart, 0, hend, img.height)) 281 | return cv.GetMat(sub) 282 | 283 | def isSubtitleFrame(harris): 284 | max_val = harris.width * harris.height 285 | cur_val = 0 286 | for y,x in iterImg(harris): 287 | if harris[y,x] >= activation_threshold: 288 | cur_val += 1 289 | if cur_val*4 > max_val: 290 | return True 291 | return False 292 | 293 | @memoized 294 | def getSubtitleColor(videofile): 295 | best_portion = getBestPortion(videofile) 296 | print best_portion 297 | color_histogram = {} 298 | for idx,img in iterVideo(videofile): 299 | if idx % 100 != 0: 300 | continue 301 | img = getBottomQuarter(img) 302 | img = getPortion(img, best_portion) 303 | img = getCenterHorizontal5(img) 304 | harris = getHarris(img) 305 | if not isSubtitleFrame(harris): 306 | continue 307 | img_intersected = removeUnactivated(img, harris) 308 | #SaveImage(str(idx)+'.png', img) 309 | #SaveImage(str(idx)+'-harris.png', harris) 310 | #SaveImage(str(idx)+'-intersected.png', img_intersected) 311 | addImageToColorHistogram(img_intersected, color_histogram) 312 | colorcounts = [(count,color) for color,count in color_histogram.iteritems()] 313 | colorcounts.sort() 314 | colorcounts.reverse() 315 | print [(color,count) for count,color in colorcounts[0:4]] 316 | count,color = max(colorcounts) 317 | print count, color 318 | img = solidColorImg(color) 319 | SaveImage('solidcolor.png', img) 320 | return color 321 | 322 | def extractColor(origimg, color): 323 | img = CreateImage((origimg.width, origimg.height), 8, 3) 324 | for surrounding in iterImgSurrounding5(img): 325 | haveMatch = False 326 | centery,centerx = surrounding[0] 327 | if roughColorMatch(origimg[centery,centerx], color): 328 | for y,x in surrounding[1:]: 329 | if colorMatch(origimg[y,x], color): 330 | haveMatch = True 331 | img[centery,centerx] = origimg[centery,centerx] 332 | break 333 | if not haveMatch: 334 | img[centery,centerx] = (0,0,0) 335 | return img 336 | 337 | def getVerticalActivation(extracted_color_img, harris): 338 | activations = [0] * extracted_color_img.height 339 | for y,x in iterImg(extracted_color_img): 340 | if sum(extracted_color_img[y,x]) > 100: # and harris[y,x] > activation_threshold: 341 | activations[y] += 1 342 | return activations 343 | 344 | def getVerticalActivationWithHarris(extracted_color_img, harris): 345 | activations = [0] * extracted_color_img.height 346 | for y,x in iterImg(extracted_color_img): 347 | if sum(extracted_color_img[y,x]) > 100 and harris[y,x] > activation_threshold: 348 | activations[y] += 1 349 | return activations 350 | 351 | def 
getHorizontalActivationWithHarris(extracted_color_img, harris): 352 | activations = [0] * extracted_color_img.width 353 | for y,x in iterImg(extracted_color_img): 354 | if sum(extracted_color_img[y,x]) > 100 and harris[y,x] > activation_threshold: 355 | activations[x] += 1 356 | return activations 357 | 358 | # assumes centered subtitle! ie, grows left and out equally 359 | def getHorizontalStartEnd(horizontalActivation): 360 | average_activation = float(sum(horizontalActivation))/len(horizontalActivation) 361 | center = len(horizontalActivation)/2 362 | beststart,bestend = (center,center) 363 | bestval = 0 364 | curval = 0 365 | for outward in range(len(horizontalActivation)/2): 366 | start,end = (center-outward,center+outward) 367 | curval -= average_activation*2 368 | curval += horizontalActivation[start] 369 | curval += horizontalActivation[end] 370 | if curval > bestval: 371 | bestval = curval 372 | beststart = start 373 | bestend = end 374 | return beststart,bestend 375 | 376 | def getVerticalStartEnd(verticalActivation): 377 | average_activation = float(sum(verticalActivation))/len(verticalActivation) 378 | bestval = 0 379 | beststart,bestend = (0,0) 380 | for start in range(len(verticalActivation)-1): 381 | curval = 0 382 | for end in range(start, len(verticalActivation)): 383 | curval -= average_activation 384 | curval += verticalActivation[end] 385 | if curval > bestval: 386 | bestval = curval 387 | beststart = start 388 | bestend = end 389 | return beststart,bestend 390 | 391 | def getVideoSubtitleVerticalStartEnd(videofile): 392 | counts = [] 393 | subtitle_color = getSubtitleColor(videofile) 394 | best_portion = getBestPortion(videofile) 395 | verticalActivationTotal = None 396 | for idx,img in iterVideo(videofile): 397 | if idx % 100 != 0: 398 | continue 399 | img = getBottomQuarter(img) 400 | if verticalActivationTotal == None: 401 | verticalActivationTotal = [0] * img.height 402 | img = getCenterHorizontal5(img) 403 | extracted_color_img = extractColor(img, subtitle_color) 404 | harris = getHarris(img) 405 | verticalActivation = getVerticalActivationWithHarris(extracted_color_img, harris) 406 | for i,v in enumerate(verticalActivation): 407 | verticalActivationTotal[i] += v 408 | vstart,vend = getVerticalStartEnd(verticalActivationTotal) 409 | return vstart,vend 410 | 411 | def blackenOutsideHorizontalRegion(origimg, hstart, hlength): 412 | img = CreateImage((origimg.width, origimg.height), 8, 3) 413 | for y,x in iterImg(origimg): 414 | if x >= hstart and x <= hstart+hlength: 415 | img[y,x] = origimg[y,x] 416 | else: 417 | img[y,x] = (0,0,0) 418 | return img 419 | 420 | def imgDifference(img1, img2): 421 | diff = 0 422 | for y,x in iterImg(img1): 423 | if sum([abs(img1[y,x][i] - img2[y,x][i]) for i in [0,1,2]]) > 100: 424 | diff += 1 425 | return float(diff) / (img1.width * img2.height) 426 | 427 | def haveTransition(img1, img2): 428 | diff = imgDifference(img1, img2) 429 | return diff > 0.03 430 | 431 | ''' 432 | def getSubtitleHeight(videofile): 433 | num_counted = 0 434 | total_heights = 0 435 | for idx,img in iterVideo(videofile): 436 | ''' 437 | ''' 438 | def getVerticalStartEnd(verticalActivation): 439 | average_activation = float(sum(verticalActivation))/len(verticalActivation) 440 | bestval = 0 441 | beststart,bestend = (0,0) 442 | for start in range(len(verticalActivation)-1): 443 | curval = 0 444 | for end in range(start+1, len(verticalActivation)): 445 | curval -= average_activation/100 446 | curval += verticalActivation[end] 447 | if curval > bestval: 448 | 
beststart,bestend = start,end 449 | return beststart,bestend 450 | #return 0,len(verticalActivation)-1 451 | ''' 452 | 453 | ''' 454 | def getSubtitleStartEndTimes(videofile): 455 | best_portion = getBestPortion(videofile) 456 | for idx,img in iterVideo(videofile): 457 | print best_portion 458 | ''' 459 | 460 | def getMetadata(vidf): 461 | try: 462 | metadata = json.load(open(vidf+'.json')) 463 | return metadata 464 | except: 465 | subtitle_color = getSubtitleColor(vidf) 466 | best_portion = getBestPortion(vidf) 467 | vstart,vend = getVideoSubtitleVerticalStartEnd(vidf) 468 | metadata = {} 469 | metadata['subtitle_color'] = subtitle_color 470 | metadata['best_portion'] = best_portion 471 | metadata['vstart'] = vstart 472 | metadata['vend'] = vend 473 | open(vidf+'.json', 'w').write(json.dumps(metadata)) 474 | return metadata 475 | 476 | def whitenAll(curseq): 477 | img = CreateImage((curseq[0].width, curseq[0].height), 8, 3) 478 | for y,x in iterImg(img): 479 | img[y,x] = (0,0,0) 480 | for nimg in curseq: 481 | for y,x in iterImg(nimg): 482 | if sum(nimg[y,x]) > 100: 483 | img[y,x] = (255, 255, 255) 484 | return img 485 | 486 | def halfVoteImages(curseq): 487 | img = CreateImage((curseq[0].width, curseq[0].height), 8, 3) 488 | min_votes = len(curseq)/2 489 | if min_votes == 0: 490 | min_votes += 1 491 | for y,x in iterImg(img): 492 | img[y,x] = (0,0,0) 493 | num_votes = 0 494 | for nimg in curseq: 495 | if sum(nimg[y,x]) > 100: 496 | num_votes += 1 497 | if num_votes >= min_votes: 498 | img[y,x] = (255,255,255) 499 | return img 500 | 501 | def averageImages(curseq): 502 | img = CreateImage((curseq[0].width, curseq[0].height), 8, 3) 503 | lencurseq = float(len(curseq)) 504 | for y,x in iterImg(img): 505 | num_votes = 0 506 | for nimg in curseq: 507 | if sum(nimg[y,x]) > 100: 508 | num_votes += 1 509 | ratio = num_votes / lencurseq 510 | img[y,x] = (255*ratio,255*ratio,255*ratio) 511 | return img 512 | 513 | def invertImage(img): 514 | for y,x in iterImg(img): 515 | img[y,x] = tuple([255-v for v in img[y,x]]) 516 | 517 | def main(): 518 | vidf = 'video.m4v' 519 | if len(sys.argv) > 1: 520 | vidf = sys.argv[1] 521 | metadata = getMetadata(vidf) 522 | subtitle_color = metadata['subtitle_color'] 523 | best_portion = metadata['best_portion'] 524 | vstart = metadata['vstart'] 525 | vend = metadata['vend'] 526 | curImg = None 527 | curseq = [] 528 | for idx,img in iterVideo(vidf): 529 | if idx % 10 != 0: 530 | continue 531 | img = getBottomQuarter(img) 532 | #extracted_color_img = extractColor(img, subtitle_color) 533 | extracted_color_img = extractColor(img, subtitle_color) 534 | #connected_components = connectedComponentLabel(extracted_color_img) 535 | #blacklisted_components = findComponentsSpanningOutsideRange(connected_components, vstart-5, vend+5) 536 | #blackenBlacklistedComponents(extracted_color_img, connected_components, blacklisted_components) 537 | connectedComponentOutsidePermittedRegionBlacken(extracted_color_img, vstart-5, vend+5) 538 | vertical_extracted_color_img = extractVertical(extracted_color_img, vstart-5, vend-vstart+5) 539 | harris = getHarris(vertical_extracted_color_img) 540 | horizontalActivation = getHorizontalActivationWithHarris(vertical_extracted_color_img, harris) 541 | hstart,hend = getHorizontalStartEnd(horizontalActivation) 542 | #nimg = extractHorizontal(vertical_extracted_color_img, hstart, hend-hstart) 543 | nimg = blackenOutsideHorizontalRegion(vertical_extracted_color_img, hstart-5, hend-hstart+5) 544 | if len(curseq) > 0 and haveTransition(curImg, nimg): 
545 | combined_img = averageImages(curseq) 546 | invertImage(combined_img) 547 | SaveImage(str(idx)+'.png', toEqualizedHistGrayscale(combined_img)) 548 | curseq = [] 549 | curImg = nimg 550 | curseq.append(nimg) 551 | return 552 | subtitle_color = getSubtitleColor(vidf) 553 | best_portion = getBestPortion(vidf) 554 | ''' 555 | for idx,img in iterVideo(vidf): 556 | img = getBottomQuarter(img) 557 | img = getPortion(img, best_portion) 558 | SaveImage(str(idx)+'.png', img) 559 | ''' 560 | 561 | subtitle_color = getSubtitleColor(vidf) 562 | for idx,img in iterVideo(vidf): 563 | img = getBottomQuarter(img) 564 | #extracted_color_img = extractColor(img, subtitle_color) 565 | extracted_color_img = extractColor(img, subtitle_color) 566 | harris = getHarris(img) 567 | #verticalActivation = getVerticalActivation(extracted_color_img, harris) 568 | verticalActivation = getVerticalActivationWithHarris(extracted_color_img, harris) 569 | vstart,vend = getVerticalStartEnd(verticalActivation) 570 | nimg = extractVertical(extracted_color_img, vstart, vend-vstart) 571 | #blurred_harris = CreateImage (GetSize(harris), IPL_DEPTH_32F, 1) 572 | #Smooth(harris, blurred_harris, smoothtype=CV_BLUR_NO_SCALE, param1=7) 573 | #img_intersected = removeUnactivated(extracted_color_img, blurred_harris) 574 | SaveImage(str(idx)+'.png', nimg) 575 | 576 | ''' 577 | if not harrisSum: 578 | harrisSum = CreateImage (GetSize(img), IPL_DEPTH_32F, 1) 579 | Zero(harrisSum) 580 | harrisSumTemp = CreateImage (GetSize(img), IPL_DEPTH_32F, 1) 581 | Zero(harrisSumTemp) 582 | Add(harris, harrisSum, harrisSumTemp) 583 | harrisSum = harrisSumTemp 584 | scaledHarrisSum = CreateImage (GetSize(img), IPL_DEPTH_32F, 1) 585 | Zero(scaledHarrisSum) 586 | Scale(harrisSum, scaledHarrisSum, 1.0/(idx+1)) 587 | SaveImage(str(idx)+'.png', scaledHarrisSum) 588 | idx += 1 589 | ''' 590 | 591 | 592 | 593 | 594 | 595 | #SaveImage('harris.png', harris) 596 | 597 | #for y in range(harris.rows): 598 | # 599 | 600 | """ 601 | vid = cv.CaptureFromFile('video.m4v') 602 | # determine color of subtitles 603 | framenum = 0 604 | motion_stripped = None 605 | while True: 606 | frame = cv.QueryFrame(vid) 607 | if not frame: 608 | break 609 | sub = cv.GetSubRect(frame, (0, frame.height*3/4, frame.width, frame.height/4)) 610 | mat = cv.GetMat(sub) 611 | downsampled = cv.CreateMat(mat.rows / 2, mat.cols / 2, cv.CV_8UC3) 612 | print mat.type 613 | print downsampled.type 614 | cv.Resize(mat, downsampled) 615 | mat = downsampled 616 | color_changes = {} 617 | for y in range(mat.rows): 618 | prev_color = (0.0,0.0,0.0) 619 | for x in range(mat.cols): 620 | cur_color = reducedColor(mat[y,x]) 621 | if cur_color != prev_color and (max(cur_color) > 80): 622 | if cur_color not in color_changes: 623 | color_changes[cur_color] = 1 624 | else: 625 | color_changes[cur_color] += 1 626 | max_change_color = max([(v,k) for k,v in color_changes.iteritems()])[1] 627 | max_change_color_img = cv.CreateMat(1,1,cv.CV_8UC3) 628 | max_change_color_img[0,0] = max_change_color 629 | cv.SaveImage(str(framenum) + '.png', max_change_color_img) 630 | framenum += 1 631 | ''' 632 | if framenum % 5 == 0: 633 | if motion_stripped: 634 | cv.SaveImage(str(framenum/5) + '.png', motion_stripped) 635 | motion_stripped = cv.CreateMat(mat.rows, mat.cols, cv.CV_8UC3) 636 | for y in range(mat.rows): 637 | for x in range(mat.cols): 638 | motion_stripped[y,x] = reducedColor(mat[y,x]) 639 | else: 640 | for y in range(mat.rows): 641 | for x in range(mat.cols): 642 | if motion_stripped[y,x] != reducedColor(mat[y,x]): 
643 | motion_stripped[y,x] = (0.0,0.0,0.0) 644 | pass 645 | framenum += 1 646 | ''' 647 | ''' 648 | for i in range(10): 649 | frame = cv.QueryFrame(vid) 650 | sub = cv.GetSubRect(frame, (0, frame.height*3/4, frame.width, frame.height/4)) 651 | mat = cv.LoadImageM(path, cv.CV_LOAD_IMAGE_UNCHANGED) 652 | cv.SaveImage(str(i) + '.png', sub) 653 | ''' 654 | """ 655 | 656 | if __name__ == '__main__': 657 | main() 658 | --------------------------------------------------------------------------------
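For orientation, the per-frame flow that extract_frame.py and the main() of extract_subtitle.py implement condenses to the sketch below. This is a minimal single-frame walkthrough under the repository's own assumptions: the legacy Python 2 OpenCV `cv` bindings are installed, `video.m4v` is a placeholder path, and the frame index and output filename are illustrative only; getMetadata() computes and caches the subtitle color and vertical band on first use.

```python
#!/usr/bin/python
# Minimal single-frame sketch of the extraction pipeline (assumes legacy cv bindings).
from cv import *
from connected_components import *  # same import path extract_frame.py uses; re-exports the extract_subtitle helpers

vidf = 'video.m4v'    # placeholder video path
target_frame = 250    # placeholder frame index

metadata = getMetadata(vidf)    # cached per-video subtitle color / vertical band estimates
subtitle_color = metadata['subtitle_color']
vstart, vend = metadata['vstart'], metadata['vend']

for idx, img in iterVideo(vidf):
    if idx != target_frame:
        continue
    img = getBottomQuarter(img)                           # subtitles live in the bottom quarter
    colored = extractColor(img, subtitle_color)           # keep pixels near the subtitle color
    connectedComponentOutsidePermittedRegionBlacken(colored, vstart - 5, vend + 5)
    band = extractVertical(colored, vstart - 5, vend - vstart + 5)
    harris = getHarris(band)                              # corner response marks text-like texture
    hstart, hend = getHorizontalStartEnd(getHorizontalActivationWithHarris(band, harris))
    cleaned = blackenOutsideHorizontalRegion(band, hstart - 5, hend - hstart + 5)
    SaveImage('frame%d-subtitle.png' % idx, toEqualizedHistGrayscale(cleaned))
    break
```

The ±5 pixel margins mirror the boundaries hard-coded in main() and extract_frame.py.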
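The inline comments in connected_components.py describe the classic two-pass connected-component labeling scheme: scan pixels in raster order, give each bright pixel either a fresh label or the label of an already-visited bright neighbor, record label equivalences in the union-find structure, and resolve every pixel to its root label in a second pass. The toy example below replays that idea on a small hand-made binary grid using the repository's own UnionFind class; the grid contents and variable names are illustrative, not taken from the codebase.

```python
#!/usr/bin/python
# Toy illustration of the two-pass labeling idea from connected_components.py,
# using the repository's UnionFind class on a small binary grid.
from UnionFind import UnionFind

grid = [  # 1 = "white" (subtitle-colored) pixel, 0 = black background
    [1, 1, 0, 0, 1],
    [0, 1, 0, 1, 1],
    [0, 0, 0, 0, 1],
]

labels = UnionFind()
labeled = set()

# pass 1: union each white pixel with its already-visited white neighbors
for y, row in enumerate(grid):
    for x, value in enumerate(row):
        if not value:
            continue
        point = (y, x)
        labeled.add(point)
        neighbors = [(y, x - 1), (y - 1, x)]  # raster order: only these can already be labeled
        seen = [n for n in neighbors if n in labeled]
        if not seen:
            z = labels[point]          # creates a fresh singleton label
        else:
            labels.union(point, *seen)

# pass 2: map each root label to a small component id and print the result
component_id = {}
for y, row in enumerate(grid):
    line = []
    for x, value in enumerate(row):
        if not value:
            line.append('.')
        else:
            root = labels[(y, x)]
            line.append(str(component_id.setdefault(root, len(component_id) + 1)))
    print(' '.join(line))
```

Running it prints two components, the cluster in the top-left corner and the one hugging the right edge, which is exactly the equivalence-merging behavior connectedComponentOutsidePermittedRegionBlacken relies on when it blacklists any component that touches rows outside the permitted vertical band.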
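getVerticalStartEnd scores every candidate band of rows by how much its activation exceeds the per-row average and keeps the band with the largest surplus; getHorizontalStartEnd applies the same scoring but grows symmetrically outward from the center column, reflecting the stated assumption that subtitles are horizontally centered. The snippet below is a standalone pure-Python copy of the vertical version, duplicated here only so it can run without the OpenCV-dependent module, applied to the activation vector from extract_subtitle_test.py.

```python
#!/usr/bin/python
# Standalone copy of the band-scoring logic in getVerticalStartEnd, for illustration only.

def vertical_start_end(activation):
    average = float(sum(activation)) / len(activation)
    best_val, best_start, best_end = 0, 0, 0
    for start in range(len(activation) - 1):
        cur_val = 0
        for end in range(start, len(activation)):
            cur_val += activation[end] - average   # reward rows above the average activation
            if cur_val > best_val:
                best_val, best_start, best_end = cur_val, start, end
    return best_start, best_end

# Same vector as the unit test: rows 3..5 carry all of the activation.
print(vertical_start_end([0, 0, 0, 1, 1, 1, 0, 0, 0]))   # -> (3, 5)
```

The test suite asserts exactly this (3, 5) result for that vector.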