#!/usr/bin/python
# Repository layout:
#   |-- README.md
#   `-- diff.py
#
# /README.md:
#   https://raw.githubusercontent.com/aaronsw/htmldiff/f3c99f794a7f83f8e1645d4e58a89948a1939e65/README.md
#
# /diff.py:
"""HTML Diff: http://www.aaronsw.com/2002/diff
Rough code, badly documented. Send me comments and patches."""

__author__ = 'Aaron Swartz '
__copyright__ = '(C) 2003 Aaron Swartz. GNU GPL 2 or 3.'
__version__ = '0.22'

import difflib
import string


def isTag(x):
    """Return True if string x starts with "<" and ends with ">".

    Slices are used instead of indexes so that an empty string yields
    False rather than raising IndexError.
    """
    return x[:1] == "<" and x[-1:] == ">"


def textDiff(a, b):
    """Takes in strings a and b and returns a human-readable HTML diff.

    Both inputs are tokenized with html2list() and compared token by token.
    Unchanged tokens are copied through verbatim; tokens only in `a` are
    wrapped in <del class="diff">, tokens only in `b` in <ins class="diff">,
    and replaced runs are emitted as a <del>/<ins> pair carrying the class
    "diff modified".

    Raises ValueError if the sequence matcher reports an unknown opcode.
    """
    out = []
    a, b = html2list(a), html2list(b)
    try:  # autojunk can cause malformed HTML, but also speeds up processing.
        s = difflib.SequenceMatcher(None, a, b, autojunk=False)
    except TypeError:
        # SequenceMatcher did not accept the autojunk keyword; use defaults.
        s = difflib.SequenceMatcher(None, a, b)

    for tag, a_start, a_end, b_start, b_end in s.get_opcodes():
        old = ''.join(a[a_start:a_end])
        new = ''.join(b[b_start:b_end])
        # The markers below used to be empty strings, which made every
        # change invisible in the output.
        if tag == "replace":
            # @@ need to do something more complicated here
            # call textDiff but not for html, but for some html... ugh
            # gonna cop-out for now
            out.append('<del class="diff modified">' + old +
                       '</del><ins class="diff modified">' + new + "</ins>")
        elif tag == "delete":
            out.append('<del class="diff">' + old + "</del>")
        elif tag == "insert":
            out.append('<ins class="diff">' + new + "</ins>")
        elif tag == "equal":
            out.append(new)
        else:
            # String exceptions are not valid; raise a real exception type.
            raise ValueError("Um, something's broken. I didn't expect a '"
                             + repr(tag) + "'.")
    return ''.join(out)


def html2list(x, b=0):
    """Split the HTML string x into a list of tokens for diffing.

    Everything from "<" up to and including the next ">" becomes a single
    token. Outside of tags, text is cut after every whitespace character,
    so a word and the whitespace character following it form one token.
    Empty tokens are dropped.

    If b is true, the tag delimiters are written as "[" and "]" instead of
    "<" and ">".
    """
    mode = 'char'
    cur = ''
    out = []
    for c in x:
        if mode == 'tag':
            if c == '>':
                cur += ']' if b else c
                out.append(cur)
                cur = ''
                mode = 'char'
            else:
                cur += c
        elif mode == 'char':
            if c == '<':
                # Flush pending text (possibly empty; filtered out below).
                out.append(cur)
                cur = '[' if b else c
                mode = 'tag'
            elif c in string.whitespace:
                out.append(cur + c)
                cur = ''
            else:
                cur += c
    out.append(cur)
    # Compare by value, not identity, and always hand back a real list so
    # callers can slice it.
    return [token for token in out if token != '']


def _main(argv):
    """Command-line entry point: print the diff of the two files in argv.

    Returns the process exit status: 1 when fewer than two file names were
    given (after printing the usage text), 0 otherwise.
    """
    try:
        a, b = argv[1:3]
    except ValueError:
        print("htmldiff: highlight the differences between two html files")
        print("usage: " + argv[0] + " a b")
        return 1
    # Read inside `with` blocks so both file handles are closed promptly.
    with open(a) as old_file:
        old_text = old_file.read()
    with open(b) as new_file:
        new_text = new_file.read()
    print(textDiff(old_text, new_text))
    return 0


if __name__ == '__main__':
    import sys
    sys.exit(_main(sys.argv))