├── .gitignore ├── .sonarcloud.properties ├── .travis.yml ├── readme.md ├── rich_text_diff ├── __init__.py └── tests │ ├── __init__.py │ └── test_diff.py ├── setup.py └── tea.yaml /.gitignore: -------------------------------------------------------------------------------- 1 | .eggs 2 | .idea 3 | *.egg-info 4 | *.pyc 5 | dist -------------------------------------------------------------------------------- /.sonarcloud.properties: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - '2.7' 4 | - '3.5' 5 | - '3.6' 6 | sudo: false 7 | install: 8 | python setup.py install 9 | script: python setup.py test 10 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # rich_text_diff 2 | [![Build Status](https://travis-ci.org/c1ay/rich_text_diff.svg?branch=master)](https://travis-ci.org/c1ay/rich_text_diff) 3 | [![image](https://img.shields.io/pypi/v/rich-text-diff.svg)](https://pypi.org/project/rich-text-diff/) 4 | [![image](https://img.shields.io/pypi/l/rich-text-diff.svg)](https://pypi.org/project/rich-text-diff/) 5 | [![image](https://img.shields.io/pypi/pyversions/rich-text-diff.svg)](https://pypi.org/project/rich-text-diff/) 6 | 7 | > support rich text (HTML) diff 8 | 9 | ## Installation 10 | 11 | OS X & Linux & Windows 12 | 13 | ```sh 14 | pip install rich-text-diff 15 | ``` 16 | 17 | ## Usage Example 18 | 19 | ```python 20 | import rich_text_diff as diff 21 | 22 | new_content = u'

\u4eca\u5929\u5929\u6c14\u8fd8\u4e0d\u9519

\u4eca\u5929\u5929\u6c14\u8fd8\u4e0d\u9519

\u4eca\u5929\u5929\u6c14\u8fd8\u4e0d\u9519

' 23 | old_content = u'

\u4eca\u5929\u5929\u6c14\u8fd8233

\u4eca\u5929\u5929\u6c14\u8fd8\u4e0d\u9519


\u4eca\u5929\u5929\u6c14\u8fd8\u4e0d\u9519

' 24 | 25 | print(diff.ContentDiff(new_content, old_content).diff()) 26 | ``` 27 | 28 | -------------------------------------------------------------------------------- /rich_text_diff/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import copy 3 | import sys 4 | import logging 5 | import re 6 | 7 | from bidict import bidict 8 | import diff_match_patch as dmp_module 9 | from lxml.html import fromstring, tostring, fragment_fromstring 10 | from lxml import etree 11 | 12 | if sys.version_info < (3,): 13 | chr = unichr 14 | unicode_type = unicode 15 | else: 16 | unicode_type = str 17 | 18 | UNICODE_KEY = [chr(item) for item in range(0xE000, 0xFFFF + 1)] 19 | # unicode spec not in use 20 | DMP = dmp_module.diff_match_patch() 21 | 22 | 23 | TAG_RE = re.compile(']*>') 24 | 25 | 26 | class ContentDiff(object): 27 | 28 | INSERT = 1 29 | DELETE = -1 30 | EQUAL = 0 31 | 32 | def __init__(self, new_content, old_content): 33 | self.new_content = to_unicode(new_content) 34 | self.old_content = to_unicode(old_content) 35 | self.tag_map = {} 36 | self.media_url = bidict() 37 | self.code_key = copy.deepcopy(UNICODE_KEY) 38 | 39 | def _map_tag(self, content): 40 | tags = TAG_RE.findall(content) 41 | for tag in tags: 42 | if tag[1] == '/': 43 | if tags not in self.tag_map.values(): 44 | self.tag_map[self.code_key.pop()] = tag 45 | else: 46 | element = fromstring(tag) 47 | if element.tag in ('img', 'a', 'video', 'audio'): 48 | self._map_media_tag(element, tag) 49 | else: 50 | self.tag_map[self.code_key.pop()] = tag 51 | 52 | def _map_media_tag(self, element, raw_tag): 53 | tag_key = gen_tag_key(element.attrib) 54 | if tag_key in self.media_url.values(): 55 | code = self.media_url.inv[tag_key] 56 | self.tag_map[code].append(raw_tag) 57 | return 58 | code = self.code_key.pop() 59 | self.tag_map[code] = [raw_tag] 60 | self.media_url[code] = tag_key 61 | 62 | def _replace(self, new_content, old_content): 63 | 
self._map_tag(new_content) 64 | for code, tag in self.tag_map.items(): 65 | if not isinstance(tag, list): 66 | tag = [tag] 67 | for item in tag: 68 | new_content = new_content.replace(item, code) 69 | for code, tag in self.tag_map.items(): 70 | if not isinstance(tag, list): 71 | tag = [tag] 72 | for item in tag: 73 | old_content = old_content.replace(item, code) 74 | return to_unicode(new_content), to_unicode(old_content) 75 | 76 | def _recover(self, content): 77 | for code, tag in self.tag_map.items(): 78 | if isinstance(tag, list): 79 | tag = tag[0] 80 | content = content.replace(code, tag) 81 | return ensure_closed_tag(content) 82 | 83 | def diff(self): 84 | if self.new_content == self.old_content: 85 | return self.new_content 86 | new_content, old_content = self._replace(self.new_content, self.old_content) 87 | content = self._diff(old_content, new_content) 88 | return content 89 | 90 | def _diff(self, old_content, new_content): 91 | diffs = DMP.diff_main(to_unicode(old_content), to_unicode(new_content)) 92 | html = [] 93 | for (op, data) in diffs: 94 | text = self._recover(data) 95 | if op == self.INSERT: 96 | html.append(u"{}".format(text)) 97 | elif op == self.DELETE: 98 | html.append(u"{}".format(text)) 99 | elif op == self.EQUAL: 100 | html.append(text) 101 | return utf8(u"".join(html)) 102 | 103 | 104 | _TO_UNICODE_TYPES = (unicode_type, type(None)) 105 | _UTF8_TYPES = (bytes, type(None)) 106 | 107 | 108 | def to_unicode(value): 109 | if isinstance(value, _TO_UNICODE_TYPES): 110 | return value 111 | if not isinstance(value, bytes): 112 | raise TypeError( 113 | "Expected bytes, unicode, or None; got %r" % type(value) 114 | ) 115 | return value.decode("utf-8") 116 | 117 | 118 | def utf8(value): 119 | if isinstance(value, _UTF8_TYPES): 120 | return value 121 | if not isinstance(value, unicode_type): 122 | raise TypeError("Expected bytes, unicode, or None; got %r" % type(value)) 123 | return value.encode("utf-8") 124 | 125 | 126 | def ensure_closed_tag(html): 
127 | try: 128 | element = fromstring(html) 129 | except etree.ParserError as e: 130 | logging.warning('fromstring error: {}, use fragment_fromstring'.format(e)) 131 | element = fragment_fromstring(html, create_parent='div') 132 | return to_unicode(tostring(element, encoding='utf-8')) 133 | 134 | 135 | def gen_tag_key(query): 136 | l = ["{}={}".format(to_unicode(k), to_unicode(v)) for k, v in query.items()] 137 | return '&'.join(l) 138 | -------------------------------------------------------------------------------- /rich_text_diff/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- -------------------------------------------------------------------------------- /rich_text_diff/tests/test_diff.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import unittest 3 | import rich_text_diff 4 | 5 | new_content = u'

\u4eca\u5929\u5929\u6c14\u8fd8\u4e0d\u9519

\u4eca\u5929\u5929\u6c14\u8fd8\u4e0d\u9519

\u4eca\u5929\u5929\u6c14\u8fd8\u4e0d\u9519

' 6 | old_content = u'

\u4eca\u5929\u5929\u6c14\u8fd8233

\u4eca\u5929\u5929\u6c14\u8fd8\u4e0d\u9519


\u4eca\u5929\u5929\u6c14\u8fd8\u4e0d\u9519

' 7 | 8 | 9 | class TestDiff(unittest.TestCase): 10 | 11 | def test_diff(self): 12 | d = rich_text_diff.ContentDiff(new_content, old_content) 13 | d.diff() 14 | 15 | def test_ensure_closed_tag(self): 16 | ret = rich_text_diff.ensure_closed_tag(' ') 17 | self.assertEqual('
', ret) 18 | rich_text_diff.ensure_closed_tag('

') 19 | rich_text_diff.ensure_closed_tag('

') 20 | 21 | 22 | if __name__ == '__main__': 23 | unittest.main() 24 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from setuptools import setup 3 | 4 | setup( 5 | name='rich_text_diff', 6 | version='0.0.6', 7 | author='liukai', 8 | author_email='liukai@zhihu.com', 9 | description='support rich text diff', 10 | packages=['rich_text_diff'], 11 | test_suite='nose.collector', 12 | python_requires='>=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*', 13 | long_description='readme', 14 | long_description_content_type='text/markdown', 15 | url='https://github.com/c1ay/rich_text_diff', 16 | tests_require=['nose'], 17 | install_requires=[ 18 | 'lxml', 19 | 'diff_match_patch', 20 | 'bidict', 21 | ] 22 | ) 23 | -------------------------------------------------------------------------------- /tea.yaml: -------------------------------------------------------------------------------- 1 | # https://tea.xyz/what-is-this-file 2 | --- 3 | version: 1.0.0 4 | codeOwners: 5 | - '0xB36C260CA46596bEf797a0ABC4CfF0bA54574dDe' 6 | - '0x69317f8a35746e6B875d68F906875e70D46D3cDf' 7 | - '0xA37AA81AeB2F1686e1e065258d6dA3EE9421eC03' 8 | - '0x25B3D0C7fA7fb47cDE6e05F320aDcde09bd06883' 9 | - '0xdDAD13E2f2121771708e3B0B66D6B43E09e0D2e6' 10 | - '0x5Ab021Dc111BcE4328428606aFC5913Ed581E541' 11 | - '0xAdF7a16b1fc8B9b2d2469412A48fE23864a338ae' 12 | - '0x684F908F148d3F277E0747b1300d90F2760488E5' 13 | - '0x58B55F24815AA6b939fe5b4aF7F26c9431254dF7' 14 | - '0x11B86D355403459AC6AC52f78bA3bd0658624710' 15 | - '0xBAD1BACff5e7c266c0a662F8Eb570e4C6e1af63E' 16 | - '0x8AEe9023D5E812bdf1243127A4e7D15C2996Ac16' 17 | - '0xa3b262B0e5771388CD6314a675A6f063266a0D95' 18 | - '0x13D9E5596c8AF1767Bf38F1E697655A0BA77ba5A' 19 | - '0x374032F4D867c89C026544cFD97dD4575D4BcB32' 20 | - '0x477dd40Ad898B749b63cDb2a0fdBA60473df0F2F' 21 | - 
'0x58ed3b851D417c0825219ab3E5AAA1fa8618CEDF' 22 | - '0x7Cd0D51a4dF3A3C964497539d9842aeD6C613EaC' 23 | - '0xB6779802AF9f4988BE6c4375A0980AD79097C6aD' 24 | - '0x558e5975FA9fe629999541311887e1B064987C24' 25 | - '0x431d829f662304Bc4F17Ce9CAf73Bd1Ba77Ef0F8' 26 | - '0x76aebb167Bc9E1574E6953D48e222aB1bc8bEe2f' 27 | - '0x8918D4c409ac9a5358A02eAB49F9EB8a257343fa' 28 | - '0x37aC9e0e1519e5497E3d7b7dFF2A7a2FA648c2A3' 29 | - '0x9A6D47D9f963c5d756Ac37f03910f4E578028cF5' 30 | - '0xaF1C466cB0919DD8CfB6243c225e23127F7000E4' 31 | - '0xd1Cb9F26E3cFD5217B5927fD979728B98d5D0217' 32 | - '0x5B233F495B74c7111C84a5C3D105f99af38F1Fd9' 33 | - '0xE1D7296Bd71e87B534d6674A23Fd4a46F21065Cd' 34 | - '0xfD8CD6ab9fB3c1B20b8e23B80df5f02b8166EB15' 35 | quorum: 1 36 | --------------------------------------------------------------------------------