├── .github └── workflows │ └── wheels.yml ├── .gitignore ├── .gitmodules ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── README.md ├── interface.cpp ├── setup.cfg ├── setup.py └── tests ├── __init__.py ├── test_diff.py └── test_match.py /.github/workflows/wheels.yml: -------------------------------------------------------------------------------- 1 | # See https://github.com/pypa/cibuildwheel. 2 | 3 | name: Build 4 | 5 | on: [push, pull_request] 6 | 7 | jobs: 8 | build_wheels: 9 | name: Build wheels on ${{ matrix.os }} 10 | runs-on: ${{ matrix.os }} 11 | strategy: 12 | matrix: 13 | os: [ubuntu-20.04, windows-2019, macos-12] 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | 18 | - name: Checkout diff-match-patch-cpp-stl submodule 19 | run: git submodule update --init 20 | 21 | - uses: actions/setup-python@v2 22 | 23 | - name: Install cibuildwheel 24 | run: python -m pip install cibuildwheel==2.16.1 25 | 26 | - name: Build wheels 27 | run: python -m cibuildwheel --output-dir wheelhouse 28 | env: 29 | CIBW_TEST_COMMAND: "python -m unittest discover {project}/tests" 30 | CIBW_ARCHS_MACOS: "x86_64 universal2 arm64" 31 | CIBW_ARCHS_LINUX: "auto" 32 | CIBW_ARCHS_WINDOWS: "auto" 33 | 34 | - uses: actions/upload-artifact@v2 35 | with: 36 | path: ./wheelhouse/*.whl 37 | 38 | # Build the source distribution under Linux 39 | build_sdist: 40 | name: Source distribution 41 | runs-on: ubuntu-latest 42 | 43 | steps: 44 | - uses: actions/checkout@v2 45 | 46 | - name: Checkout diff-match-patch-cpp-stl submodule 47 | run: git submodule update --init 48 | 49 | - uses: actions/setup-python@v2 50 | 51 | - name: Build source distribution 52 | run: | 53 | pip install setuptools twine 54 | python setup.py sdist 55 | # Check whether the source distribution will render correctly 56 | twine check dist/*.tar.gz 57 | 58 | - name: Test source distribution 59 | run: | 60 | pip install ./dist/*.tar.gz 61 | python -c "from fast_diff_match_patch import diff" 62 | 63 | - uses: 
actions/upload-artifact@v2 64 | with: 65 | path: ./dist/*.tar.gz 66 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | dist 3 | *.so 4 | *.egg-info 5 | __pycache__ 6 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "diff-match-patch-cpp-stl"] 2 | path = diff-match-patch-cpp-stl 3 | url = https://github.com/leutloff/diff-match-patch-cpp-stl 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | - "3.4" 5 | - "3.5" 6 | - "3.6" 7 | - "3.7" 8 | - "3.8" 9 | - "3.9" 10 | - "3.10" 11 | 12 | install: 13 | - pip install -e . 14 | 15 | script: 16 | - python setup.py test 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, and 10 | distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by the copyright 13 | owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all other entities 16 | that control, are controlled by, or are under common control with that entity. 
17 | For the purposes of this definition, "control" means (i) the power, direct or 18 | indirect, to cause the direction or management of such entity, whether by 19 | contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the 20 | outstanding shares, or (iii) beneficial ownership of such entity. 21 | 22 | "You" (or "Your") shall mean an individual or Legal Entity exercising 23 | permissions granted by this License. 24 | 25 | "Source" form shall mean the preferred form for making modifications, including 26 | but not limited to software source code, documentation source, and configuration 27 | files. 28 | 29 | "Object" form shall mean any form resulting from mechanical transformation or 30 | translation of a Source form, including but not limited to compiled object code, 31 | generated documentation, and conversions to other media types. 32 | 33 | "Work" shall mean the work of authorship, whether in Source or Object form, made 34 | available under the License, as indicated by a copyright notice that is included 35 | in or attached to the work (an example is provided in the Appendix below). 36 | 37 | "Derivative Works" shall mean any work, whether in Source or Object form, that 38 | is based on (or derived from) the Work and for which the editorial revisions, 39 | annotations, elaborations, or other modifications represent, as a whole, an 40 | original work of authorship. For the purposes of this License, Derivative Works 41 | shall not include works that remain separable from, or merely link (or bind by 42 | name) to the interfaces of, the Work and Derivative Works thereof. 
43 | 44 | "Contribution" shall mean any work of authorship, including the original version 45 | of the Work and any modifications or additions to that Work or Derivative Works 46 | thereof, that is intentionally submitted to Licensor for inclusion in the Work 47 | by the copyright owner or by an individual or Legal Entity authorized to submit 48 | on behalf of the copyright owner. For the purposes of this definition, 49 | "submitted" means any form of electronic, verbal, or written communication sent 50 | to the Licensor or its representatives, including but not limited to 51 | communication on electronic mailing lists, source code control systems, and 52 | issue tracking systems that are managed by, or on behalf of, the Licensor for 53 | the purpose of discussing and improving the Work, but excluding communication 54 | that is conspicuously marked or otherwise designated in writing by the copyright 55 | owner as "Not a Contribution." 56 | 57 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf 58 | of whom a Contribution has been received by Licensor and subsequently 59 | incorporated within the Work. 60 | 61 | 2. Grant of Copyright License. 62 | 63 | Subject to the terms and conditions of this License, each Contributor hereby 64 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, 65 | irrevocable copyright license to reproduce, prepare Derivative Works of, 66 | publicly display, publicly perform, sublicense, and distribute the Work and such 67 | Derivative Works in Source or Object form. 68 | 69 | 3. Grant of Patent License. 
70 | 71 | Subject to the terms and conditions of this License, each Contributor hereby 72 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, 73 | irrevocable (except as stated in this section) patent license to make, have 74 | made, use, offer to sell, sell, import, and otherwise transfer the Work, where 75 | such license applies only to those patent claims licensable by such Contributor 76 | that are necessarily infringed by their Contribution(s) alone or by combination 77 | of their Contribution(s) with the Work to which such Contribution(s) was 78 | submitted. If You institute patent litigation against any entity (including a 79 | cross-claim or counterclaim in a lawsuit) alleging that the Work or a 80 | Contribution incorporated within the Work constitutes direct or contributory 81 | patent infringement, then any patent licenses granted to You under this License 82 | for that Work shall terminate as of the date such litigation is filed. 83 | 84 | 4. Redistribution. 
85 | 86 | You may reproduce and distribute copies of the Work or Derivative Works thereof 87 | in any medium, with or without modifications, and in Source or Object form, 88 | provided that You meet the following conditions: 89 | 90 | You must give any other recipients of the Work or Derivative Works a copy of 91 | this License; and 92 | You must cause any modified files to carry prominent notices stating that You 93 | changed the files; and 94 | You must retain, in the Source form of any Derivative Works that You distribute, 95 | all copyright, patent, trademark, and attribution notices from the Source form 96 | of the Work, excluding those notices that do not pertain to any part of the 97 | Derivative Works; and 98 | If the Work includes a "NOTICE" text file as part of its distribution, then any 99 | Derivative Works that You distribute must include a readable copy of the 100 | attribution notices contained within such NOTICE file, excluding those notices 101 | that do not pertain to any part of the Derivative Works, in at least one of the 102 | following places: within a NOTICE text file distributed as part of the 103 | Derivative Works; within the Source form or documentation, if provided along 104 | with the Derivative Works; or, within a display generated by the Derivative 105 | Works, if and wherever such third-party notices normally appear. The contents of 106 | the NOTICE file are for informational purposes only and do not modify the 107 | License. You may add Your own attribution notices within Derivative Works that 108 | You distribute, alongside or as an addendum to the NOTICE text from the Work, 109 | provided that such additional attribution notices cannot be construed as 110 | modifying the License. 
111 | You may add Your own copyright statement to Your modifications and may provide 112 | additional or different license terms and conditions for use, reproduction, or 113 | distribution of Your modifications, or for any such Derivative Works as a whole, 114 | provided Your use, reproduction, and distribution of the Work otherwise complies 115 | with the conditions stated in this License. 116 | 117 | 5. Submission of Contributions. 118 | 119 | Unless You explicitly state otherwise, any Contribution intentionally submitted 120 | for inclusion in the Work by You to the Licensor shall be under the terms and 121 | conditions of this License, without any additional terms or conditions. 122 | Notwithstanding the above, nothing herein shall supersede or modify the terms of 123 | any separate license agreement you may have executed with Licensor regarding 124 | such Contributions. 125 | 126 | 6. Trademarks. 127 | 128 | This License does not grant permission to use the trade names, trademarks, 129 | service marks, or product names of the Licensor, except as required for 130 | reasonable and customary use in describing the origin of the Work and 131 | reproducing the content of the NOTICE file. 132 | 133 | 7. Disclaimer of Warranty. 134 | 135 | Unless required by applicable law or agreed to in writing, Licensor provides the 136 | Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, 137 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, 138 | including, without limitation, any warranties or conditions of TITLE, 139 | NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are 140 | solely responsible for determining the appropriateness of using or 141 | redistributing the Work and assume any risks associated with Your exercise of 142 | permissions under this License. 143 | 144 | 8. Limitation of Liability. 
145 | 146 | In no event and under no legal theory, whether in tort (including negligence), 147 | contract, or otherwise, unless required by applicable law (such as deliberate 148 | and grossly negligent acts) or agreed to in writing, shall any Contributor be 149 | liable to You for damages, including any direct, indirect, special, incidental, 150 | or consequential damages of any character arising as a result of this License or 151 | out of the use or inability to use the Work (including but not limited to 152 | damages for loss of goodwill, work stoppage, computer failure or malfunction, or 153 | any and all other commercial damages or losses), even if such Contributor has 154 | been advised of the possibility of such damages. 155 | 156 | 9. Accepting Warranty or Additional Liability. 157 | 158 | While redistributing the Work or Derivative Works thereof, You may choose to 159 | offer, and charge a fee for, acceptance of support, warranty, indemnity, or 160 | other liability obligations and/or rights consistent with this License. However, 161 | in accepting such obligations, You may act only on Your own behalf and on Your 162 | sole responsibility, not on behalf of any other Contributor, and only if You 163 | agree to indemnify, defend, and hold each Contributor harmless for any liability 164 | incurred by, or claims asserted against, such Contributor by reason of your 165 | accepting any such warranty or additional liability. 166 | 167 | END OF TERMS AND CONDITIONS 168 | 169 | APPENDIX: How to apply the Apache License to your work 170 | 171 | To apply the Apache License to your work, attach the following boilerplate 172 | notice, with the fields enclosed by brackets "[]" replaced with your own 173 | identifying information. (Don't include the brackets!) The text should be 174 | enclosed in the appropriate comment syntax for the file format. 
We also 175 | recommend that a file or class name and description of purpose be included on 176 | the same "printed page" as the copyright notice for easier identification within 177 | third-party archives. 178 | 179 | Copyright [yyyy] [name of copyright owner] 180 | 181 | Licensed under the Apache License, Version 2.0 (the "License"); 182 | you may not use this file except in compliance with the License. 183 | You may obtain a copy of the License at 184 | 185 | http://www.apache.org/licenses/LICENSE-2.0 186 | 187 | Unless required by applicable law or agreed to in writing, software 188 | distributed under the License is distributed on an "AS IS" BASIS, 189 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 190 | See the License for the specific language governing permissions and 191 | limitations under the License. 192 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include interface.cpp diff-match-patch-cpp-stl/diff_match_patch.h 2 | include LICENSE README.md 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | fast\_diff\_match\_patch: Python package wrapping the C++ implementation of google-diff-match-patch 2 | =================================================================================================== 3 | 4 | This is a Python 3.6+ package that wraps google-diff-match-patch\'s C++ 5 | implementation for performing very fast string comparisons. This package 6 | was previously known as diff\_match\_patch\_python. 7 | 8 | google-diff-match-patch is a Google library for computing differences 9 | between text files (http://code.google.com/p/google-diff-match-patch). 10 | There are implementations in various languages. 
Although there is a Python 11 | port, it's slow on very large documents, and I have a need for speed. I 12 | wanted to use the C++ implementation, but I'm a Python guy so I'd 13 | prefer to use it from Python. 14 | 15 | Google's library depends on Qt 4, so some other folks rewrote it using 16 | the standard C++ library classes instead, making it more portable. 17 | That's at https://github.com/leutloff/diff-match-patch-cpp-stl. This 18 | package uses that library. 19 | 20 | Example 21 | ------- 22 | 23 | First: 24 | 25 | pip3 install fast_diff_match_patch 26 | 27 | Then write (this is Python 3): 28 | 29 | from fast_diff_match_patch import diff 30 | 31 | changes = diff("Hello world.", "Goodbye moon.") 32 | 33 | for op, length in changes: 34 | if op == "-": print ("next", length, "characters are deleted") 35 | if op == "=": print ("next", length, "characters are in common") 36 | if op == "+": print ("next", length, "characters are inserted") 37 | 38 | The two textual arguments can be either strings or bytes. 39 | 40 | Some keyword arguments are also available: 41 | 42 | `timelimit` (default 0) gives the maximum running time in seconds if you 43 | want to ensure the result comes quickly. According to the Google docs, 44 | the diff will stop working after the time is exceeded and will return a 45 | valid diff, but it might not be the best one. `checklines` is also a 46 | Google thing and might speed up diffs that are over line-based text 47 | like code. 48 | 49 | `checklines` (default `True`) is the same argument in the diff_main 50 | subroutine of the main library. 51 | 52 | `cleanup` (default `"Semantic"`) is `"Semantic"`, `"Efficiency"`, or `"No"` 53 | to run the corresponding cleanup subroutine after performing the diff. 54 | 55 | Set `counts_only` (default `True`) to `False` to have the returned value be an array of 56 | tuples of operations and corresponding strings rather than operations 57 | and the lengths of those strings.
58 | 59 | If `as_patch` (default `False`) is `True`, the diff is returned in patch format 60 | as a string (or as bytes if the inputs are bytes). 61 | 62 | The Global Interpreter Lock (GIL) is released while performing the diff 63 | so that this library can be used in a multi-threaded application. 64 | 65 | 66 | Changelog 67 | --------- 68 | 69 | ### Version 2.0.1 70 | 71 | * Diffs of byte strings are now null-character-safe. 72 | * Fixed `as_patch` argument. 73 | 74 | ### Version 2.0.0 75 | 76 | * The import has been renamed from `diff_match_patch` to `fast_diff_match_patch` to avoid an import naming collision with https://pypi.org/project/diff-match-patch/ and the package name has been updated to match the import name. 77 | * In previous versions of this package, separate `diff_bytes` (Py3), `diff_unicode` and `diff_str` (Py2) 78 | methods were available. They have been merged into a single `diff` method that checks the type of the arguments passed. 79 | * `cleanup_semantic` has been renamed to `cleanup`, which takes one of three options (see above). 80 | * On Windows, an exception will be thrown if a string has characters outside of the Basic Multilingual Plane. 81 | 82 | Building from source 83 | -------------------- 84 | 85 | To build from these sources, you will need: 86 | 87 | - Python development headers and the setuptools package 88 | (Debian packages `python3-dev`, `python3-setuptools`) 89 | - The diff-match-patch library, which you can clone using 90 | `git submodule update --init`.
91 | 92 | Then build/install the binary module using: 93 | 94 | python setup.py build 95 | python setup.py install 96 | 97 | 98 | For package maintainers 99 | ----------------------- 100 | 101 | To build everything (for testing): 102 | 103 | git submodule update && rm -rf build && python3 setup.py build 104 | 105 | To test without installing: 106 | 107 | PYTHONPATH=build/lib.linux-x86_64-*/ python3 -m unittest 108 | 109 | Release packages (wheels and a source distribution) are built using GitHub Actions 110 | in this repository. To upload them as a new release to PyPI, download the artifact 111 | and extract the files to a new directory, then run: 112 | 113 | ```sh 114 | python3 -m pip install --upgrade twine 115 | python3 -m twine upload -u __token__ path-to-artifact-files/* 116 | ``` 117 | -------------------------------------------------------------------------------- /interface.cpp: -------------------------------------------------------------------------------- 1 | #define PY_SSIZE_T_CLEAN 2 | #include 3 | 4 | #include "diff-match-patch-cpp-stl/diff_match_patch.h" 5 | 6 | struct BytesShim { 7 | static const char* PyArgFormat; // set below 8 | typedef Py_buffer PY_ARG_TYPE; 9 | typedef std::string STL_STRING_TYPE; 10 | 11 | // Extract the bytes data.
12 | static std::string to_string(Py_buffer& value) { 13 | auto buffer = (char*)malloc(value.len + 1); 14 | PyBuffer_ToContiguous(buffer, &value, value.len, 'C'); 15 | PyBuffer_Release(&value); 16 | auto s = std::string(buffer, value.len); 17 | free(buffer); 18 | return s; 19 | } 20 | 21 | // Create PyString from underlying char array 22 | static PyObject* from_string(std::string& value) { 23 | return PyBytes_FromStringAndSize(value.data(), value.size()); 24 | } 25 | }; 26 | 27 | const char* BytesShim::PyArgFormat = "s*"; 28 | 29 | struct UnicodeShim { 30 | static const char* PyArgFormat; // set below 31 | typedef PyObject* PY_ARG_TYPE; 32 | typedef std::u32string STL_STRING_TYPE; 33 | 34 | // Convert PyObject* to std::u32string.... 35 | static std::u32string to_string(PyObject* value) { 36 | auto str = (char32_t*)PyUnicode_AsUCS4Copy(value); 37 | auto string = std::u32string(str, PyUnicode_GetLength(value)); 38 | PyMem_Free(str); 39 | return string; 40 | } 41 | 42 | static PyObject* from_string(std::u32string value) { 43 | return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, value.c_str(), value.size()); 44 | } 45 | }; 46 | const char* UnicodeShim::PyArgFormat = "U"; 47 | 48 | // Specialization of the DMP traits class for char32_t for u32string. 49 | template <> struct diff_match_patch_traits : diff_match_patch_utf32_direct { 50 | static bool is_alnum(char32_t c) { return std::iswalnum(c)? true : false; } 51 | static bool is_digit(char32_t c) { return std::iswdigit(c)? true : false; } 52 | static bool is_space(char32_t c) { return std::iswspace(c)? true : false; } 53 | static int to_int(const char32_t* s) { 54 | std::string narrowed; 55 | while (*s && *s < CHAR_MAX) 56 | narrowed.append(static_cast(*(s++)), 1); 57 | return static_cast(std::strtol(narrowed.c_str(), NULL, 10)); 58 | } 59 | static char32_t from_wchar(wchar_t c) { return (char32_t)c; } 60 | static wchar_t to_wchar(char32_t c) { return c <= WCHAR_MAX ? 
(wchar_t)c : 0; } 61 | static std::u32string cs(const wchar_t* s) { return std::u32string(s, s + wcslen(s)); } 62 | static const char32_t eol = L'\n'; 63 | static const char32_t tab = L'\t'; 64 | }; 65 | 66 | // COMPUTATIONAL FUNCTIONS 67 | 68 | template 69 | static PyObject * 70 | diff_match_patch__diff__impl(PyObject *self, PyObject *args, PyObject *kwargs) 71 | { 72 | typename Shim::PY_ARG_TYPE a, b; 73 | float timelimit = 0.0; 74 | int checklines = 1; 75 | char* cleanupMode = NULL; 76 | int counts_only = 1; 77 | int as_patch = 0; 78 | char format_spec[64]; 79 | 80 | static char *kwlist[] = { 81 | strdup("left_document"), 82 | strdup("right_document"), 83 | strdup("timelimit"), 84 | strdup("checklines"), 85 | strdup("cleanup"), 86 | strdup("counts_only"), 87 | strdup("as_patch"), 88 | NULL }; 89 | 90 | sprintf(format_spec, "%s%s|fbzbb", Shim::PyArgFormat, Shim::PyArgFormat); 91 | if (!PyArg_ParseTupleAndKeywords(args, kwargs, format_spec, kwlist, 92 | &a, &b, 93 | &timelimit, &checklines, &cleanupMode, 94 | &counts_only, &as_patch)) 95 | return NULL; 96 | 97 | auto a_str = Shim::to_string(a), 98 | b_str = Shim::to_string(b); 99 | 100 | PyObject *ret = PyList_New(0); 101 | 102 | typedef diff_match_patch DMP; 103 | DMP dmp; 104 | 105 | PyObject *opcodes[3]; 106 | opcodes[dmp.DELETE] = PyUnicode_FromString("-"); 107 | opcodes[dmp.INSERT] = PyUnicode_FromString("+"); 108 | opcodes[dmp.EQUAL] = PyUnicode_FromString("="); 109 | 110 | typename DMP::Diffs diff; 111 | 112 | Py_BEGIN_ALLOW_THREADS /* RELEASE THE GIL */ 113 | 114 | dmp.Diff_Timeout = timelimit; 115 | diff = dmp.diff_main(a_str, b_str, checklines); 116 | 117 | if (cleanupMode == NULL || strcmp(cleanupMode, "Semantic") == 0) 118 | dmp.diff_cleanupSemantic(diff); 119 | else if (strcmp(cleanupMode, "Efficiency") == 0) 120 | dmp.diff_cleanupEfficiency(diff); 121 | 122 | Py_END_ALLOW_THREADS /* ACQUIRE THE GIL */ 123 | 124 | if (as_patch) { 125 | typename DMP::Patches patch = dmp.patch_make(a_str, diff); 126 | 
typename Shim::STL_STRING_TYPE patch_str = dmp.patch_toText(patch); 127 | 128 | return Shim::from_string(patch_str); 129 | } 130 | 131 | typename std::list::const_iterator entryiter; 132 | for (entryiter = diff.begin(); entryiter != diff.end(); entryiter++) { 133 | typename DMP::Diff entry = *entryiter; 134 | 135 | PyObject* tuple = PyTuple_New(2); 136 | 137 | Py_INCREF(opcodes[entry.operation]); // we're going to reuse the object, so don't let SetItem steal the reference 138 | PyTuple_SetItem(tuple, 0, opcodes[entry.operation]); 139 | 140 | if (counts_only) 141 | PyTuple_SetItem(tuple, 1, PyLong_FromLong(entry.text.length())); 142 | else 143 | PyTuple_SetItem(tuple, 1, Shim::from_string(entry.text)); 144 | 145 | PyList_Append(ret, tuple); 146 | Py_DECREF(tuple); // the list owns a reference now 147 | } 148 | 149 | // We're left with one extra reference. 150 | Py_DECREF(opcodes[dmp.DELETE]); 151 | Py_DECREF(opcodes[dmp.INSERT]); 152 | Py_DECREF(opcodes[dmp.EQUAL]); 153 | 154 | return ret; 155 | } 156 | 157 | template 158 | static PyObject * 159 | diff_match_patch__match__impl(PyObject *self, PyObject *args, PyObject *kwargs) 160 | { 161 | typename Shim::PY_ARG_TYPE pattern, text; 162 | int loc; 163 | int match_distance = 1000; 164 | int match_maxbits = 32; 165 | float match_threshold = 0.5; 166 | char format_spec[64]; 167 | 168 | static char *kwlist[] = { 169 | strdup("text"), 170 | strdup("pattern"), 171 | strdup("loc"), 172 | strdup("match_distance"), 173 | strdup("match_maxbits"), 174 | strdup("match_threshold"), 175 | NULL }; 176 | 177 | sprintf(format_spec, "%s%si|iif", Shim::PyArgFormat, Shim::PyArgFormat); 178 | if (!PyArg_ParseTupleAndKeywords(args, kwargs, format_spec, kwlist, 179 | &text, &pattern, &loc, 180 | &match_distance, &match_maxbits, &match_threshold)) { 181 | return NULL; 182 | } 183 | 184 | auto pattern_str = Shim::to_string(pattern), 185 | text_str = Shim::to_string(text); 186 | 187 | typedef diff_match_patch DMP; 188 | DMP dmp; 189 | 190 | 
dmp.Match_Distance = match_distance; 191 | dmp.Match_MaxBits = match_maxbits; 192 | dmp.Match_Threshold = match_threshold; 193 | 194 | try { 195 | int index = dmp.match_main(text_str, pattern_str, loc); 196 | return Py_BuildValue("i", index); 197 | } catch (std::exception& e) { 198 | PyErr_SetString(PyExc_RuntimeError, e.what()); 199 | return NULL; 200 | } catch (typename Shim::STL_STRING_TYPE& s) { 201 | PyErr_SetObject(PyExc_RuntimeError, Shim::from_string(s)); 202 | return NULL; 203 | } 204 | } 205 | 206 | // WRAPPER FUNCTIONS THAT DETERMINE WHETHER UNICODE OR BYTES ARE PASSED 207 | 208 | static PyObject * 209 | diff_match_patch__diff(PyObject *self, PyObject *args, PyObject *kwargs) 210 | { 211 | // Check if the first argument is a Unicode object, and if so, run 212 | // the Unicode version of the method. Otherwise run the bytes version. 213 | PyObject* first_arg; 214 | if (PyTuple_Size(args) > 0 && (first_arg = PyTuple_GetItem(args, 0))) 215 | if (PyUnicode_Check(first_arg)) 216 | return diff_match_patch__diff__impl(self, args, kwargs); 217 | return diff_match_patch__diff__impl(self, args, kwargs); 218 | } 219 | 220 | static PyObject * 221 | diff_match_patch__match(PyObject *self, PyObject *args, PyObject *kwargs) 222 | { 223 | // Check if the first argument is a Unicode object, and if so, run 224 | // the Unicode version of the method. Otherwise run the bytes version. 225 | PyObject* first_arg; 226 | if (PyTuple_Size(args) > 0 && (first_arg = PyTuple_GetItem(args, 0))) 227 | if (PyUnicode_Check(first_arg)) 228 | return diff_match_patch__match__impl(self, args, kwargs); 229 | return diff_match_patch__match__impl(self, args, kwargs); 230 | } 231 | 232 | // EXTENSION MODULE METADATA 233 | 234 | static PyMethodDef MyMethods[] = { 235 | {"diff", (PyCFunction)diff_match_patch__diff, METH_VARARGS|METH_KEYWORDS, 236 | "Compute the difference between two strings or bytes. 
Returns a list of tuples (OP, LEN)."}, 237 | {"match", (PyCFunction)diff_match_patch__match, METH_VARARGS|METH_KEYWORDS, 238 | "Locate the best instance of 'pattern' in 'text' near 'loc'. Returns -1 if no match found."}, 239 | {NULL, NULL, 0, NULL} /* Sentinel */ 240 | }; 241 | 242 | static struct PyModuleDef mymodule = { 243 | PyModuleDef_HEAD_INIT, 244 | "fast_diff_match_patch", /* name of module */ 245 | NULL, /* module documentation, may be NULL */ 246 | -1, /* size of per-interpreter state of the module, 247 | or -1 if the module keeps state in global variables. */ 248 | MyMethods 249 | }; 250 | 251 | PyMODINIT_FUNC 252 | PyInit_fast_diff_match_patch(void) 253 | { 254 | auto module = PyModule_Create(&mymodule); 255 | PyModule_AddIntConstant(module, "CHAR_WIDTH", sizeof(UnicodeShim::STL_STRING_TYPE::value_type)); 256 | return module; 257 | } 258 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages, Extension 2 | 3 | module1 = Extension('fast_diff_match_patch', 4 | sources = ['interface.cpp'], 5 | include_dirs = [], 6 | libraries = []) 7 | 8 | setup( 9 | name='fast_diff_match_patch', 10 | version='2.1.0', 11 | description=u'Packages the C++ implementation of google-diff-match-patch for Python for fast byte and string diffs.', 12 | long_description=open("README.md").read(), 13 | long_description_content_type='text/markdown', 14 | author=u'Joshua Tauberer', 15 | author_email=u'jt@occams.info', 16 | url='https://github.com/JoshData/fast_diff_match_patch', 17 | packages=find_packages(), 18 | license='Apache License 2.0', 19 | keywords="diff compare Google match patch 
diff_match_patch native fast", 20 | ext_modules=[module1], 21 | classifiers=[ 22 | 'Programming Language :: Python :: 3', 23 | 'Programming Language :: Python :: 3.6', 24 | 'Programming Language :: Python :: 3.7', 25 | 'Programming Language :: Python :: 3.8', 26 | 'Programming Language :: Python :: 3.9', 27 | 'Programming Language :: Python :: 3.10', 28 | 'Programming Language :: Python :: 3.11', 29 | 'Programming Language :: Python :: 3.12', 30 | ], 31 | test_suite='tests', 32 | ) 33 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JoshData/fast_diff_match_patch/2910eeedcd5433b353f54f53ce1cf508f6d64b99/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_diff.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | 3 | import sys 4 | import unittest 5 | 6 | import fast_diff_match_patch 7 | 8 | class DiffTests(unittest.TestCase): 9 | def assertDiff(self, text1, text2, expected): 10 | actual = fast_diff_match_patch.diff( 11 | text1, text2, 12 | timelimit=15, 13 | checklines=False, 14 | counts_only=False) 15 | self.assertEqual(actual, expected) 16 | 17 | actual = fast_diff_match_patch.diff( 18 | text1, text2, 19 | timelimit=15, 20 | checklines=False) 21 | self.assertEqual(actual, [(op, len(text)) for (op, text) in expected]) 22 | 23 | def test_string(self): 24 | self.assertDiff( 25 | '', 26 | '', 27 | [] 28 | ) 29 | 30 | self.assertDiff( 31 | 'this is a test', 32 | 'this is a test', 33 | [('=', 'this is a test')] 34 | ) 35 | 36 | self.assertDiff( 37 | 'this is a test', 38 | 'this program is not \u2192 a test', 39 | [ 40 | ('=', 'this '), 41 | ('-', 'is'), 42 | ('+', 'program is not \u2192'), 43 | ('=', ' a test'), 44 | ] 45 | ) 46 | 47 | self.assertDiff( 48 | 
def test_binary(self):
    """Byte-string inputs produce byte-string diff segments."""
    original = b'this is a test'
    cases = [
        # Two empty inputs give an empty diff.
        (b'', b'', []),
        # Identical inputs give a single equality covering everything.
        (original, original, [('=', original)]),
        # A replacement in the middle, with common prefix and suffix.
        (
            original,
            b'this program is not ==> a test',
            [
                ('=', b'this '),
                ('-', b'is'),
                ('+', b'program is not ==>'),
                ('=', b' a test'),
            ],
        ),
    ]
    for before, after, expected in cases:
        self.assertDiff(before, after, expected)


def test_binary_safe(self):
    """An embedded NUL must not truncate the input, for text or bytes."""
    expected_text = [('=', '1\x00'), ('-', '2'), ('+', '3')]
    self.assertDiff('1\u00002', '1\u00003', expected_text)

    # In a bytes literal, \000 is the octal escape for NUL followed by a digit.
    expected_bytes = [('=', b'1\x00'), ('-', b'2'), ('+', b'3')]
    self.assertDiff(b'1\0002', b'1\0003', expected_bytes)
def test_patch(self):
    """``diff(..., as_patch=True)`` on text input returns the patch as text.

    The expected patch is written one line per literal so the trailing
    spaces on the context lines are explicit and survive editors and
    linters that strip trailing whitespace.
    """
    actual = fast_diff_match_patch.diff(
        "Text 1\nLine 1\nLine 2\nLine 3",
        "Text 2\nLine 1\nLine 3\nLine 2",
        as_patch=True)
    expected = (
        "@@ -1,14 +1,14 @@\n"
        " Text \n"            # context line: leading space, then "Text "
        "-1\n"
        "+2\n"
        " %0ALine 1%0A\n"
        "@@ -16,12 +16,12 @@\n"
        " ine \n"             # context line: leading space, then "ine "
        "-2\n"
        "+3\n"
        " %0ALine \n"         # context line: trailing space is significant
        "-3\n"
        "+2\n"
    )
    self.assertEqual(actual, expected)


def test_patch_binary(self):
    """``diff(..., as_patch=True)`` on bytes input returns the patch as bytes.

    Same inputs and same patch as ``test_patch``, expressed as byte strings.
    """
    actual = fast_diff_match_patch.diff(
        b"Text 1\nLine 1\nLine 2\nLine 3",
        b"Text 2\nLine 1\nLine 3\nLine 2",
        as_patch=True)
    expected = (
        b"@@ -1,14 +1,14 @@\n"
        b" Text \n"           # context line: leading space, then b"Text "
        b"-1\n"
        b"+2\n"
        b" %0ALine 1%0A\n"
        b"@@ -16,12 +16,12 @@\n"
        b" ine \n"            # context line: leading space, then b"ine "
        b"-2\n"
        b"+3\n"
        b" %0ALine \n"        # context line: trailing space is significant
        b"-3\n"
        b"+2\n"
    )
    self.assertEqual(actual, expected)
It works in both Python 2 and Python 3.""" 169 | 170 | text2 = """fast\_diff\_match\_patch: Python package wrapping the C++ implementation of google-diff-match-patch 171 | =================================================================================================== 172 | 173 | This is a Python 3.6+ package that wraps google-diff-match-patch\'s C++ 174 | implementation for performing very fast string comparisons. This package 175 | was previously known as diff\_match\_patch\_python. 176 | 177 | google-diff-match-patch is a Google library for computing differences 178 | between text files (http://code.google.com/p/google-diff-match-patch). 179 | There are implementations in various languages. Although there is a Python 180 | port, it's slow on very large documents, and I have a need for speed. I 181 | wanted to use the C++ implementation, but I'm a Python guy so I'd 182 | prefer to use it from Python. 183 | 184 | Google's library depends on Qt 4, so some other folks rewrote it using 185 | the standard C++ library classes instead, making it more portable. 186 | That's at https://github.com/leutloff/diff-match-patch-cpp-stl. 
This 187 | package uses that library.""" 188 | 189 | actual = fast_diff_match_patch.diff(text1, text2) 190 | 191 | self.assertEqual(actual, 192 | [('+', 6), ('=', 4), ('+', 1), ('=', 6), ('+', 1), 193 | ('=', 6), ('-', 2), ('+', 3), ('=', 5), ('-', 26), 194 | ('+', 197), ('=', 42), ('-', 1), ('+', 1), ('=', 59), 195 | ('+', 65), ('=', 71), ('-', 1), ('+', 1), ('=', 20), 196 | ('-', 1), ('=', 48), ('-', 1), ('=', 2), ('-', 1), 197 | ('+', 1), ('=', 2), ('-', 1), ('+', 1), ('=', 71), 198 | ('-', 1), ('+', 1), ('=', 71), ('-', 1), ('+', 1), 199 | ('=', 65), ('-', 1), ('+', 1), ('=', 31), ('-', 159), 200 | ('+', 144), ('=', 4), ('-', 1), ('=', 52), ('-', 171), 201 | ('+', 32), ('=', 1)]) 202 | 203 | actual = fast_diff_match_patch.diff(text1, text2, as_patch=True) 204 | 205 | self.assertEqual(actual, """@@ -1,53 +1,233 @@ 206 | +fast%5C_ 207 | diff 208 | +%5C 209 | _match 210 | +%5C 211 | _patch 212 | --p 213 | +: P 214 | ython 215 | -%0A%0AA Python extension modul 216 | + package wrapping the C++ implementation of google-diff-match-patch%0A===================================================================================================%0A%0AThis is a Python 3.6+ packag 217 | e th 218 | @@ -260,25 +260,25 @@ 219 | -patch's C++ 220 | - 221 | +%0A 222 | implementati 223 | @@ -324,16 +324,81 @@ 224 | arisons. 225 | + This package%0Awas previously known as diff%5C_match%5C_patch%5C_python. 226 | %0A%0Agoogle 227 | @@ -460,17 +460,17 @@ 228 | ferences 229 | - 230 | +%0A 231 | between 232 | @@ -481,17 +481,16 @@ 233 | files ( 234 | -%3C 235 | http://c 236 | @@ -533,15 +533,14 @@ 237 | atch 238 | -%3E 239 | ). 240 | - 241 | +%0A 242 | Th 243 | -a 244 | +e 245 | re a 246 | @@ -606,17 +606,17 @@ 247 | a Python 248 | - 249 | +%0A 250 | port, it 251 | @@ -678,17 +678,17 @@ 252 | speed. 
I 253 | - 254 | +%0A 255 | wanted t 256 | @@ -744,17 +744,17 @@ 257 | y so I'd 258 | - 259 | +%0A 260 | prefer t 261 | @@ -780,172 +780,156 @@ 262 | n.%0A%0A 263 | -@leutloff determined that the C++ port could be even faster by replacing the Qt 4 dependency with the standard C++ library primitives. So he rewrote the module 264 | +Google's library depends on Qt 4, so some other folks rewrote it using%0Athe standard C++ library classes instead, making it more portable.%0AThat's 265 | at 266 | -%3C 267 | http 268 | @@ -980,176 +980,37 @@ 269 | -stl 270 | -%3E.%0A%0AThis project is an extension module for Python using @leutloff's library so Python code can call into the native library easily. It works in both Python 2 and Python 3 271 | +. This%0Apackage uses that library 272 | . 273 | """) 274 | 275 | pattern = """This is a Python 3.6+ package.""" 276 | actual = fast_diff_match_patch.match(text2, pattern, loc=20) 277 | self.assertEqual(actual, 201) 278 | -------------------------------------------------------------------------------- /tests/test_match.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | 3 | import unittest 4 | import sys 5 | 6 | from fast_diff_match_patch import match 7 | 8 | class MatchTests(unittest.TestCase): 9 | def test_unicode(self): 10 | self.assertEqual(0, match('abcdef', 'abcdef', 1000)) 11 | 12 | self.assertEqual(-1, match('', 'abcdef', 1)) 13 | 14 | self.assertEqual(3, match('abcdef', '', 3)) 15 | 16 | self.assertEqual(3, match('abcdef', 'de', 3)) 17 | 18 | self.assertEqual(3, match('abcdef', 'defy', 4)) 19 | 20 | self.assertEqual(0, match('abcdef', 'abcdefy', 0)) 21 | 22 | self.assertEqual(2, match('abc\u2192def', 'c\u2192defy', 0)) 23 | 24 | def test_bytes(self): 25 | self.assertEqual(0, match(b'abcdef', b'abcdef', 1000)) 26 | 27 | self.assertEqual(-1, match(b'', b'abcdef', 1)) 28 | 29 | self.assertEqual(3, match(b'abcdef', b'', 3)) 30 | 31 | 
self.assertEqual(3, match(b'abcdef', b'de', 3)) 32 | 33 | self.assertEqual(3, match(b'abcdef', b'defy', 4)) 34 | 35 | self.assertEqual(0, match(b'abcdef', b'abcdefy', 0)) 36 | 37 | self.assertEqual(2, match(b'abc\xe2\x86\x92def', b'c\xe2\x86\x92defy', 0)) 38 | --------------------------------------------------------------------------------