├── test ├── __init__.py ├── test_homonymns.py ├── run_sentence_diff.py ├── run_word_diff.py ├── pivotable_parse.py ├── test_word_diff.py └── test_differencer.py ├── requirements.txt ├── setup.cfg ├── sentence_diff ├── __init__.py ├── worddiff.py └── sentencediff.py ├── setup.py ├── .gitignore └── README.md /test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | inflect 3 | pytest 4 | better_profanity -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | # Inside of setup.cfg 2 | [metadata] 3 | description-file = README.md -------------------------------------------------------------------------------- /sentence_diff/__init__.py: -------------------------------------------------------------------------------- 1 | from .sentencediff import SentenceDiff 2 | from .worddiff import WordDiff -------------------------------------------------------------------------------- /test/test_homonymns.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | from sentence_diff import SentenceDiff 3 | 4 | class TestHomonyms(TestCase): 5 | 6 | def test_sentence_homonymsdeserts(self): 7 | result = SentenceDiff._homonyms("I love desert") 8 | assert result == ["I love desert", "I love dessert"] 9 | 10 | def test_substitutions(self): 11 | list_of_lists = [["a","b"], 12 | ["x","y","z"]] 13 | result = SentenceDiff._all_substitutions(list_of_lists) 14 | assert result == \ 15 | [("a","b"), 16 | ("b","a"), 17 | ("x", "y"), 18 | ("x", "z"), 19 | ("y", "x"), 20 | ("y", "z"), 21 | ("z", "x"), 22 | ("z", "y")] 23 | 24 | 
-------------------------------------------------------------------------------- /test/run_sentence_diff.py: -------------------------------------------------------------------------------- 1 | from sentence_diff import SentenceDiff 2 | import csv 3 | 4 | with open('edit_score_highlights.csv', newline='') as csv_file_in: 5 | with open('edit_score__highlights_out.csv', 'w', newline='') as csv_file_out: 6 | writer = csv.writer(csv_file_out, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) 7 | writer.writerow(["Actual Text","Target Text","Score By Words","Score By Letters", "New Score", "Attn", "New Score 2"]) #"Updated Wer", "Wer Score" 8 | reader = csv.reader(csv_file_in, delimiter=',', quotechar='"') 9 | reader.__next__() 10 | for row in reader: 11 | actual_sentence = row[0] 12 | target_sentence = row[1] 13 | 14 | if len(actual_sentence.strip()) == 0: 15 | continue 16 | if len(target_sentence.strip()) == 0: 17 | continue 18 | 19 | print("{}-{}".format(actual_sentence, target_sentence)) 20 | 21 | diff = SentenceDiff(actual_sentence, target_sentence) 22 | row.append(diff.chatterize_score() * 100) 23 | writer.writerow(row) 24 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | setup( 3 | name = 'sentence_diff', 4 | packages = ['sentence_diff'], 5 | version = '0.1', 6 | license='MIT', 7 | description = 'Difference English sentences via Liechtenstein distance, calculate word error rate, and list out word by word differences', 8 | author = 'Miles Thompson', 9 | author_email = 'utunga@gmail.com', 10 | url = 'https://github.com/utunga/sentence_diff', 11 | download_url = 'https://github.com/utunga/sentence_diff/archive/v_01.tar.gz', 12 | keywords = ['Levenshtein', 'English', 'Text', 'WER', 'Diff'], 13 | install_requires=[ 14 | 'numpy', 15 | 'inflect', 16 | ], 17 | classifiers=[ 18 | 'Development Status :: 
4 - Beta', 19 | 'Intended Audience :: Developers', 20 | 'License :: OSI Approved :: MIT License', 21 | 'Programming Language :: Python :: 3', 22 | 'Programming Language :: Python :: 3.4', 23 | 'Programming Language :: Python :: 3.5', 24 | 'Programming Language :: Python :: 3.6', 25 | 'Programming Language :: Python :: 3.7', 26 | ], 27 | ) -------------------------------------------------------------------------------- /test/run_word_diff.py: -------------------------------------------------------------------------------- 1 | from sentence_diff import WordDiff 2 | import csv 3 | 4 | with open('keyword_diffs.csv', newline='', encoding="utf-8") as csv_file_in: 5 | with open('keyword_diffs_out.csv', 'w', newline='', encoding="utf-8") as csv_file_out: 6 | writer = csv.writer(csv_file_out, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) 7 | writer.writerow(["target_text","recognized_text","confidence_score","audio_mp3_file", "similarity", "pass_fail"]) 8 | reader = csv.reader(csv_file_in, delimiter=',', quotechar='"') 9 | reader.__next__() 10 | for row in reader: 11 | actual = row[0] 12 | target = row[1] 13 | confidence = float(row[2]) 14 | 15 | if len(actual.strip()) == 0: 16 | continue 17 | if len(target.strip()) == 0: 18 | continue 19 | 20 | print("{}-{}".format(actual, target)) 21 | 22 | diff = WordDiff(actual, target) 23 | pass_fail, similarity = diff.chatterize_score() 24 | row.append(similarity * 100) 25 | row.append(pass_fail) 26 | writer.writerow(row) 27 | -------------------------------------------------------------------------------- /test/pivotable_parse.py: -------------------------------------------------------------------------------- 1 | from sentence_diff import SentenceDiff 2 | import csv 3 | 4 | 5 | def process_row(row, writer): 6 | actual_sentence = row['transcript'] 7 | target_sentence = row['target'] 8 | 9 | if len(actual_sentence.strip()) == 0: 10 | return 11 | if len(target_sentence.strip()) == 0: 12 | return 13 | 14 | 
print("{}-{}".format(actual_sentence, target_sentence)) 15 | 16 | diff = SentenceDiff(actual_sentence, target_sentence) 17 | row['wer'] = diff.wer() 18 | row['score'] = diff.chatterize_score() * 100 19 | writer.writerow(row) 20 | 21 | 22 | with open('pivotable.csv', encoding='utf-8') as csv_file_in: 23 | with open('pivotable_out.csv', 'w', newline='', encoding='utf-8') as csv_file_out: 24 | reader = csv.DictReader(csv_file_in) 25 | first_row = reader.__next__() 26 | field_names = list(first_row.keys()) 27 | field_names.append('score') 28 | writer = csv.DictWriter(csv_file_out, fieldnames=field_names, 29 | delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) 30 | writer.writeheader() 31 | process_row(first_row, writer) 32 | 33 | for row in reader: 34 | process_row(row, writer) 35 | 36 | -------------------------------------------------------------------------------- /sentence_diff/worddiff.py: -------------------------------------------------------------------------------- 1 | import re 2 | import string 3 | import numpy as np 4 | import inflect 5 | import difflib 6 | from better_profanity import profanity 7 | from sentence_diff import SentenceDiff 8 | 9 | THRESHOLD_PASS = .4 10 | THRESHOLD_SUPER_PASS = .7 11 | 12 | class WordDiff: 13 | 14 | def __init__(self, actual, target): 15 | SentenceDiff._assert_not_empty(actual,target) 16 | self.actual = actual 17 | self.target = target 18 | self.actual_lower = self.normalize(actual) 19 | self.target_lower = self.normalize(target) 20 | 21 | def chatterize_score(self): 22 | homonyms = SentenceDiff._homonyms(self.actual_lower) 23 | max_similarity = -1 24 | for homonym in homonyms: 25 | similarity = self.similarity(homonym, self.target_lower) 26 | if similarity > max_similarity: 27 | max_similarity = similarity 28 | 29 | pass_fail = "SUPER PASS" if max_similarity > THRESHOLD_SUPER_PASS \ 30 | else "PASS" if max_similarity > THRESHOLD_PASS \ 31 | else "FAIL" 32 | 33 | return pass_fail, max_similarity 34 | 35 | def 
normalize(self, text): 36 | return \ 37 | SentenceDiff._remove_punctuation( 38 | SentenceDiff._spell_out_numbers_in_word( 39 | SentenceDiff._sound_out_dollars( 40 | profanity.censor(text.lower(), 'x')))) 41 | 42 | def similarity(self, wordA, wordB): 43 | # work substitution cost 44 | # similar words cost close to 0 different words cost 1 45 | denominator = 0 46 | numerator = 0 47 | for i, s in enumerate(difflib.ndiff(wordA, wordB)): 48 | denominator += 1 49 | if s[0] == '-' or s[0] == '+': 50 | numerator += 1 51 | return 1 - numerator/denominator 52 | 53 | 54 | -------------------------------------------------------------------------------- /test/test_word_diff.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | from sentence_diff import WordDiff 3 | 4 | def word_diff(actual, target): 5 | d = WordDiff(actual,target) 6 | return d.chatterize_score() 7 | 8 | class TestDifferencer(TestCase): 9 | 10 | def test_big(self): 11 | target = "big" 12 | actual = "big" 13 | pass_fail, score = word_diff(actual, target) 14 | assert score == 1 15 | assert pass_fail == "SUPER PASS" 16 | 17 | def test_food(self): 18 | target = "order food" 19 | actual = "all the food i ate food" 20 | pass_fail, score = word_diff(actual, target) 21 | assert score == .2222222222222222 22 | assert pass_fail == "FAIL" 23 | 24 | def test_dog(self): 25 | target = "dog" 26 | actual = "tall dog poke bo suck my mother nature" 27 | pass_fail, score = word_diff(actual, target) 28 | assert score == .07894736842105265 29 | assert pass_fail == "FAIL" 30 | 31 | def test_superhero(self): 32 | target = "superhero" 33 | actual = "superheroes" 34 | pass_fail, score = word_diff(actual, target) 35 | assert score == .8181818181818181 36 | assert pass_fail == "SUPER PASS" 37 | 38 | def test_meat(self): 39 | target = "meat" 40 | actual = "meet" 41 | pass_fail, score = word_diff(actual, target) 42 | assert score == 1 43 | assert pass_fail == "SUPER PASS" 
44 | 45 | 46 | def test_shirt(self): 47 | target = "shirt" 48 | actual = "sharks" 49 | pass_fail, score = word_diff(actual, target) 50 | assert score == .375 51 | assert pass_fail == "FAIL" 52 | 53 | def test_pirates(self): 54 | target = "shirt" 55 | actual = "shut" 56 | pass_fail, score = word_diff(actual, target) 57 | assert score == .5 58 | assert pass_fail == "PASS" 59 | 60 | def test_number(self): 61 | target = "one" 62 | actual = "1" 63 | pass_fail, score = word_diff(actual, target) 64 | assert score == 1 65 | assert pass_fail == "SUPER PASS" 66 | 67 | def test_dollars(self): 68 | target = "100 dollars" 69 | actual = "$100" 70 | pass_fail, score = word_diff(actual, target) 71 | assert score == 1 72 | assert pass_fail == "SUPER PASS" 73 | 74 | 75 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | .idea 132 | 133 | *.csv 134 | *.xlsx -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Sentence Differences - sentence_diff 2 | Package to difference English sentences via Liechtenstein distance, calculate word error rate, and list out word by word differences 3 | 4 | # Basic usage 5 | 6 | ```python 7 | 8 | from sentence_diff import SentenceDiff 9 | 10 | d = SentenceDiff("can i has 7 loaves of bread please ", "Can I have seven loaves, please?") 11 | assert d.mistakes() == [ 12 | ('has', 'have', 2, 'changed'), 13 | ('of', None, 5, 'added'), 14 | ('bread', None, 6, 'added')] 15 | 16 | ``` 17 | 18 | ### Word Error Rate - wer() 19 | 20 | ```python 21 | d = SentenceDiff("I like to meet people", "I really like to meet people") 22 | assert d.wer() == 1/6 23 | ``` 24 | 25 | ```python 26 | d = SentenceDiff("I really like to meet people", "I like to meet people") 27 | assert d.wer() == 1/5 28 | ``` 29 | 30 | ### Changes - mistakes() 31 | 32 | Added words 33 | ```python 34 | d = SentenceDiff("I like Like to eat people", "I like to eat people") 35 | assert d.mistakes() == [ 36 | ("Like", None, 2,'added')] 37 | ``` 38 | 39 | Changed words 40 | ```python 41 | d = SentenceDiff("How do you", "how are you") 42 | assert d.mistakes() == [ 43 | ("do", "are", 1, 
'changed')] 44 | ``` 45 | 46 | Skipped words 47 | ```python 48 | d = SentenceDiff("How see you", "how good to see you") 49 | assert d.mistakes() == [ 50 | (None, "good", 1, 'skipped'), 51 | (None, "to", 1, 'skipped')] 52 | ``` 53 | 54 | No differences (ignores punctuation and case) 55 | ```python 56 | d = SentenceDiff("my name is joe", "My name is Joe!") 57 | assert d.mistakes() == [] 58 | ``` 59 | 60 | ### What words from original are OK - yes_no_words() 61 | 62 | ```python 63 | d = SentenceDiff("can i have 7 loaves please", "Can I have seven loaves, please?") 64 | assert d.yes_no_words() == [ 65 | ("can", True), 66 | ("i", True), 67 | ("have", True), 68 | ("7", True), 69 | ("loaves", True), 70 | ("please", True)] 71 | ``` 72 | 73 | ### What words from original are OK or not? - yes_no_words() 74 | 75 | ```python 76 | d = SentenceDiff("can i have 7 loaves please", "Can I have seven loaves, please?") 77 | assert d.yes_no_words() == [ 78 | ("can", True), 79 | ("i", True), 80 | ("have", True), 81 | ("7", True), 82 | ("loaves", True), 83 | ("please", True)] 84 | ``` 85 | 86 | ### Full list of changes - scored_words() 87 | 88 | ```python 89 | d = SentenceDiff("can i has 7 loaves of bread please ", "Can I have seven loaves, please?") 90 | assert d.scored_words() == [ 91 | ('can', 'Can', 0, None), 92 | ('i', 'I', 1, None), 93 | ('has', 'have', 2, 'changed'), 94 | ('7', 'seven', 3, None), 95 | ('loaves', 'loaves', 4, None), 96 | ('of', None, 5, 'added'), 97 | ('bread', None, 6, 'added'), 98 | ('please', 'please', 7, None)] 99 | ``` 100 | -------------------------------------------------------------------------------- /test/test_differencer.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | from sentence_diff import SentenceDiff 3 | 4 | 5 | def diff(actual_sentence, target_sentence): 6 | return SentenceDiff(actual_sentence=actual_sentence, target_sentence=target_sentence) 7 | 8 | 9 | def 
chatterize_score(actual_sentence, target_sentence): 10 | diff = SentenceDiff(actual_sentence, target_sentence) 11 | return diff.chatterize_score() 12 | 13 | 14 | def assert_chatterize_score(actual_sentence, target_sentence, expected): 15 | score = chatterize_score(actual_sentence, target_sentence) 16 | assert score == expected 17 | 18 | 19 | class TestDifferencer(TestCase): 20 | 21 | def test_x_v_y_wer(self): 22 | d = diff("I like to eat people", "I like like to eat people") 23 | assert d.wer() == 1/6 24 | 25 | def test_y_v_x_wer(self): 26 | d = diff("I like like to eat people", "I like to eat people") 27 | assert d.wer() == 1/5 28 | 29 | def test_words_added(self): 30 | d = diff("I like Like to eat people", "I like to eat people") 31 | assert d.mistakes() == [ 32 | ("Like", None, 2,'added')] 33 | 34 | def test_words_changed(self): 35 | d = diff("How do you", "how are you") 36 | assert d.mistakes() == [ 37 | ("do", "are", 1, 'changed')] 38 | 39 | def test_words_skipped(self): 40 | d = diff("How see you", "how good to see you") 41 | assert d.mistakes() == [ 42 | (None, "good", 1, 'skipped'), 43 | (None, "to", 1, 'skipped')] 44 | 45 | def test_combined(self): 46 | d = diff("can i has 7 loaves of bread please ", "Can I have seven loaves, please?") 47 | assert d.mistakes() == \ 48 | [('has', 'have', 2, 'changed'), 49 | ('of', None, 5, 'added'), 50 | ('bread', None, 6, 'added')] 51 | 52 | def test_no_mistakes(self): 53 | d = diff("my name is leaf", "My name is leaf!") 54 | assert d.mistakes() == [] 55 | 56 | def test_yes_no_words(self): 57 | d = diff("How about a good bath", "Would you like a good bath?") 58 | print(d.scored_words()) 59 | assert d.yes_no_words() == [ 60 | ("How", False), 61 | ("about", False), 62 | ("a", False), 63 | ("good", True), 64 | ("bath", True)] 65 | 66 | def test_yes_no_numbers(self): 67 | d = diff("can i have 7 loaves please", "Can I have seven loaves, please?") 68 | assert d.yes_no_words() == [ 69 | ("can", True), 70 | ("i", True), 71 | 
("have", True), 72 | ("7", True), 73 | ("loaves", True), 74 | ("please", True)] 75 | 76 | def test_numbers_mistake(self): 77 | d = diff("can i have 62 loaves please", "Can I have seven loaves, please?") 78 | assert d.mistakes() == [ 79 | ("62", "seven", 3, "changed")] 80 | 81 | def test_numbers_mistake_logic_fail(self): 82 | # this shows the limitations of the current system 83 | # id say its not really want you want but it sort of works 84 | # and as long as you stick to single digits we're fine 85 | d = diff("can i have 27 loaves please", "Can I have twenty six loaves, please?") 86 | assert d.mistakes() == [ 87 | (None, 'twenty', 3, 'skipped'), 88 | ('27', 'six', 3, 'changed')] 89 | 90 | def test_scored_words(self): 91 | d = diff("can i has 7 Loaves of bread please ", "Can I have seven Loaves, please?") 92 | assert d.scored_words() == [ 93 | ('can', 'Can', 0, None), 94 | ('i', 'I', 1, None), 95 | ('has', 'have', 2, 'changed'), 96 | ('7', 'seven', 3, None), 97 | ('Loaves', 'Loaves,', 4, None), 98 | ('of', None, 5, 'added'), 99 | ('bread', None, 6, 'added'), 100 | ('please', 'please?', 7, None)] 101 | 102 | def test_ex_miss_mary(self): 103 | d = chatterize_score("Nice to meet you Miss Mary.", "nice to meet you, Ms Mary!") 104 | assert d == 1 105 | 106 | def test_ex_meet_at_church(self): 107 | d = chatterize_score("Let's meat at the church.", "lets meet at the church") 108 | assert d == 1 109 | 110 | def test_ex_wow_100(self): 111 | d = chatterize_score("wow, 100 dollars", "Wow, $100?") 112 | assert d == 1 113 | 114 | def test_backtrace_ex(self): 115 | d = diff("Hi.", "hello tim my name is scott") 116 | assert d.wer() == 1 117 | 118 | def test_backtrace_ex2(self): 119 | d = chatterize_score("let's climb the rockwall", "Let's climb the rock wall.") 120 | #print(d.mistakes()) 121 | assert d == 1 122 | 123 | def test_normalize_100_dollars(self): 124 | d = SentenceDiff("xx","xx") 125 | assert d._normalize("$100") == "100 dollars" 126 | 127 | def 
test_normalize_1_dollar(self): 128 | d = SentenceDiff("xx","xx") 129 | assert d._normalize("here is $1 for you") == "here is 1 dollar for you" 130 | 131 | def test_ex_silverware(self): 132 | d = chatterize_score(actual_sentence="i need silver ware", target_sentence="I need silverware.") 133 | assert d == 1 134 | 135 | def test_ex_dog_house(self): 136 | d = chatterize_score("hawaii tim", "Hi Tim.") 137 | assert d == 1 138 | 139 | def test_ex_miss_mary(self): 140 | d = diff("hi miss mary", "Hi Ms. Mary!") 141 | assert d.wer() == 0 142 | 143 | def test_chatterize_score_dont_drop_apostrophe(self): 144 | score = chatterize_score("You're welcome","You're welcome") 145 | assert score == 1 146 | d = diff("You're welcome","You're welcome") 147 | scored = d.scored_words() 148 | assert scored[0][0] == "You're" 149 | 150 | def test_chatterize_score_dont_mess_up_lets(self): 151 | score = chatterize_score("Let's pretend we're pirates.", "Let's pretend we're pirates.") 152 | assert score == 1 153 | d = diff("Let's pretend we're pirates.", "Let's pretend we're pirates.") 154 | scored = d.scored_words() 155 | print(scored) 156 | assert scored[0][0] == "Let's" 157 | 158 | def test_complex_backtrace_ex(self): 159 | actual = "Do you want to have a sleepover?" 160 | target = "want to have a sleep over you want to have a sleepover" 161 | d = diff(actual, target) 162 | print(d.scored_words()) 163 | assert d.chatterize_score() == 0.4708333333333334 164 | 165 | def test_complex_backtrace_ex_2(self): 166 | actual = "where i like to dress as a superhero" 167 | target = "I like to dress as a superhero." 
168 | d = diff(actual, target) 169 | assert d.chatterize_score() == 0.8 170 | 171 | def test_complex_backtrace_ex_test(self): 172 | actual = "x a b" 173 | target = "a b c a b" 174 | d = diff(actual, target) 175 | print(d.scored_words()) 176 | assert d.chatterize_score() == 0.6 177 | 178 | def test_profanity(self): 179 | d = diff("two fucking loaves", "two more loaves") 180 | assert d.mistakes() == [ 181 | ("xxxx", 'more', 1, 'changed')] 182 | 183 | def test_i_want_water(self): 184 | actual = "I want water, please" 185 | target = "I want please" 186 | d = diff(actual, target) 187 | assert d.yes_no_words() ==[ 188 | ('I', True), 189 | ('want', True), 190 | ('water,', False), 191 | ('please', True)] 192 | assert d.chatterize_score() == .75 193 | 194 | def test_chatterize_score_partial_word(self): 195 | assert_chatterize_score("I like superheroes.", "i like superhero", 0.9166666666666666) 196 | 197 | def test_chatterize_score_partial_word_round_up(self): 198 | assert_chatterize_score("superhero", "superheros", 0.9) 199 | 200 | def test_chatterize_score_fail(self): 201 | assert_chatterize_score("how you gorger hydra","I am a girl.", 0) 202 | 203 | def test_chatterize_score_pass(self): 204 | assert_chatterize_score("i want corn please","I want corn, please.", 1) 205 | 206 | def test_dad_birthday(self): 207 | assert_chatterize_score("it's my dad's birthday", "It's my dad's birthday", 1) 208 | 209 | def test_mom_birthday(self): 210 | assert_chatterize_score("is my mom's birthday", "It's my mom's birthday", .8) 211 | 212 | def test_some_flower(self): 213 | assert_chatterize_score("some flower please", "Some flour, please.", 1) 214 | 215 | def test_I_love_desert(self): 216 | assert_chatterize_score("I love desert", "I love dessert!", 1) 217 | 218 | def test_blue_bell(self): 219 | assert_chatterize_score("I like the name blue bell", "I like the name Bluebell", 1) 220 | 221 | def test_whats_jumpin(self): 222 | assert_chatterize_score("what's jumping", "Whats jumpin?", 1) 223 | 
224 | def test_whats_cookin(self): 225 | assert_chatterize_score("what's cooking", "Whats cookin?", 1) 226 | 227 | def test_meat_meet(self): 228 | assert_chatterize_score("chickens give us meet", "Chickens give us meat.", 1) 229 | 230 | def test_merry_marry(self): 231 | assert_chatterize_score("hi miss marry", "Hi, Ms. Marry", 1) 232 | 233 | def test_talk_town(self): 234 | assert_chatterize_score("talk town school is great", "TalkTown school is great", 1) 235 | 236 | def test_to_please(self): 237 | assert_chatterize_score("I'd like to please", "I'd like 2 please", 1) 238 | 239 | def test_for_please(self): 240 | assert_chatterize_score("I'd like for please", "I'd like 4 please", 1) 241 | 242 | def test_chefs(self): 243 | assert_chatterize_score("can I have a chefs hat", "can I have a chef's hat", 1) 244 | 245 | def test_hi_sally(self): 246 | assert_chatterize_score("hi sally", "Hi, Sally", 1) 247 | 248 | def test_by_bob(self): 249 | assert_chatterize_score("by bob", "Bye, Bob", 1) -------------------------------------------------------------------------------- /sentence_diff/sentencediff.py: -------------------------------------------------------------------------------- 1 | import re 2 | import string 3 | import numpy as np 4 | import inflect 5 | import difflib 6 | import itertools 7 | import functools 8 | from better_profanity import profanity 9 | 10 | 11 | class SentenceDiff: 12 | 13 | def __init__(self, actual_sentence, target_sentence): 14 | self._assert_not_empty(actual_sentence, target_sentence) 15 | 16 | # lowercase, normalize, tokenize 17 | self.actual_sentence = actual_sentence 18 | self.actual = self._tokenize(actual_sentence) 19 | self.target = self._tokenize(target_sentence) 20 | 21 | # split words without lower casing 22 | self.actual_words = self._tokenize_for_end_user(actual_sentence) 23 | self.target_words = self._tokenize_for_end_user(target_sentence) 24 | 25 | # public methods 26 | def wer(self): 27 | self._compare() 28 | return self.error 29 | 30 
| def scored_words(self): 31 | self._compare() 32 | self._backtrace() 33 | return self.scored_words 34 | 35 | def mistakes(self): 36 | self._compare() 37 | self._backtrace() 38 | return [tupl for tupl in self.scored_words if tupl[3]] 39 | 40 | def yes_no_words(self): 41 | self._compare() 42 | self._backtrace() 43 | res = [] 44 | for scored in self.scored_words: 45 | if scored[0]: 46 | res.append((scored[0], scored[3] is None)) 47 | return res 48 | 49 | def chatterize_score(self): 50 | 51 | actual_homonyms = SentenceDiff._homonyms(self.actual_sentence) 52 | 53 | wer1 = 99 54 | matrix1 = None 55 | actual_tokenized1 = None 56 | actual_words1 = None 57 | for tmp_actual in actual_homonyms: 58 | tmp_actual_tokenized = self._tokenize(tmp_actual) 59 | tmp_wer, tmp_matrix = self._do_compare(tmp_actual_tokenized, self.target) 60 | if tmp_wer < wer1: 61 | wer1 = tmp_wer 62 | matrix1 = tmp_matrix 63 | actual_tokenized1 = tmp_actual_tokenized 64 | actual_words1 = self._tokenize_for_end_user(tmp_actual) 65 | 66 | wer2 = 99 67 | matrix2 = None 68 | actual_tokenized2 = None 69 | actual_words2 = None 70 | for tmp_actual in actual_homonyms: 71 | tmp_actual_tokenized = self._tokenize(tmp_actual) 72 | tmp_wer, tmp_matrix = self._do_compare(self.target, tmp_actual_tokenized) 73 | if tmp_wer < wer2: 74 | wer2 = tmp_wer 75 | matrix2 = tmp_matrix 76 | actual_tokenized2 = tmp_actual_tokenized 77 | actual_words2 = self._tokenize_for_end_user(tmp_actual) 78 | 79 | if wer1 <= wer2: 80 | scored_words, alignment = \ 81 | self._do_backtrace(actual_tokenized1, self.target, matrix1, actual_words1, self.target_words) 82 | else: 83 | scored_words, alignment = \ 84 | self._do_backtrace(self.target, actual_tokenized2, matrix2, self.target_words, actual_words2) 85 | 86 | cost = 0 87 | word_count = 0 88 | for tuple in scored_words: 89 | word_count += 1 90 | actual = SentenceDiff._remove_punctuation(tuple[0]) 91 | target = SentenceDiff._remove_punctuation(tuple[1]) 92 | action = tuple[3] 93 | if action 
class SentenceDiff:
    # NOTE(review): this chunk is the visible tail of class SentenceDiff from a
    # collapsed source dump. The class header, __init__, and the opening of
    # chatterize_score are cut off above the visible region and are not
    # reproduced here.

    def print_debug(self):
        """Run the diff end-to-end and print the intermediate state."""
        self._compare()
        self._backtrace()
        print("actual")
        print(self.actual)
        print("target")
        print(self.target)
        print("wer")
        print(self.error)
        print(self.alignment)
        print("")
        print(self.scored_words)
        print("")

    def _init_matrix(self, actual, target):
        """Build the Levenshtein DP matrix with row/column 0 seeded 0..n."""
        shape = (len(target) + 1, len(actual) + 1)
        matrix = np.zeros(shape, dtype=np.uint32)
        matrix[0, :] = np.arange(shape[1])  # cost of inserting j actual words
        matrix[:, 0] = np.arange(shape[0])  # cost of deleting i target words
        return matrix

    def _compare(self):
        """Compute the word error rate and DP matrix; cache both on self."""
        self.error, self.matrix = self._do_compare(self.actual, self.target)

    def _do_compare(self, actual, target):
        """Return ``(wer, matrix)`` for two token lists.

        ``wer`` is the word-level Levenshtein distance divided by
        ``len(target)``; an empty target scores a flat 1.0.

        Bug fix: the original returned a bare ``1`` (not a 2-tuple) for an
        empty target — and only after filling the whole matrix — so callers
        that unpack ``wer, matrix`` (e.g. ``_compare``) crashed. Guard up
        front and keep the return shape consistent.
        """
        matrix = self._init_matrix(actual, target)
        if len(target) == 0:
            return 1.0, matrix
        for trgt_pos, rw in enumerate(target):
            for actual_pos, hw in enumerate(actual):
                insert = matrix[trgt_pos + 1, actual_pos] + 1
                delete = matrix[trgt_pos, actual_pos + 1] + 1
                subst = matrix[trgt_pos, actual_pos] + (0 if rw == hw else 1)
                matrix[trgt_pos + 1, actual_pos + 1] = min(insert, delete, subst)
        wer = matrix[-1, -1] / len(target)
        return wer, matrix

    def _do_backtrace(self, actuals, targets, matrix, actual_words, target_words, safe_mode=False):
        """Walk the filled DP matrix backwards to recover the word alignment.

        Returns ``(scored_words, alignment)`` where each scored_words entry is
        ``(actual, target, actual_index, action)`` with action one of
        ``None`` (match), ``"added"``, ``"skipped"`` or ``"changed"``.
        """
        i = len(targets) - 1
        j = len(actuals) - 1

        alignment = []
        path = []
        inserts = 0
        deletions = 0
        substitns = 0
        matched = 0

        while i >= 0 or j >= 0:
            path.append((i + 1, j + 1))
            start = matrix[i + 1, j + 1]
            insert = matrix[i + 1, j]
            delete = matrix[i, j + 1]
            subst = matrix[i, j]
            best = min(start, subst)

            if j < 0:
                # Walked off the actual-word axis: restart in safe_mode, which
                # also takes deletions on ties so the walk can terminate.
                return self._do_backtrace(actuals, targets, matrix,
                                          actual_words, target_words, safe_mode=True)

            if insert < best:
                alignment.append((None, actuals[j]))  # extra word in actual
                inserts += 1
                j -= 1

            elif delete < best or (safe_mode and delete == best):
                alignment.append((targets[i], None))  # word missing from actual
                deletions += 1
                i -= 1

            else:
                if start == subst:  # no cost increase -> the words matched
                    matched += 1
                else:
                    substitns += 1
                alignment.append((targets[i], actuals[j]))
                j -= 1
                i -= 1

        alignment.reverse()
        path.reverse()

        # The index stored in scored_words is relative to the *actual*
        # sentence, but both counters are tracked so the un-normalized word
        # forms can be looked up on either side.
        scored_words = []
        a_idx = 0
        t_idx = 0
        for pair in alignment:
            if pair[0] == pair[1]:  # exact match
                scored_words.append((actual_words[a_idx], target_words[t_idx], a_idx, None))
                a_idx += 1
                t_idx += 1
            elif pair[0] is None:  # insertion: word appears only in actual
                scored_words.append((actual_words[a_idx], None, a_idx, "added"))
                a_idx += 1
            elif pair[1] is None:  # deletion: word appears only in target
                scored_words.append((None, target_words[t_idx], a_idx, "skipped"))
                t_idx += 1
            else:  # substitution
                scored_words.append((actual_words[a_idx], target_words[t_idx], a_idx, "changed"))
                a_idx += 1
                t_idx += 1

        return scored_words, alignment

    def _backtrace(self):
        """Recover and cache the alignment for the already-computed matrix."""
        scored_words, alignment = self._do_backtrace(
            self.actual, self.target, self.matrix, self.actual_words, self.target_words)
        self.scored_words = scored_words
        self.alignment = alignment

    def _tokenize(self, sentence):
        """Normalize, lowercase, split, expand titles, spell out integers."""
        normalized_lower = self._normalize(sentence).lower()
        words = normalized_lower.split()
        words = SentenceDiff._single_word_subs(words)
        words = self._spell_out_numbers(words)
        return words

    def _tokenize_for_end_user(self, text):
        """Split censored, dollar-expanded text, dropping punctuation-only tokens."""
        text = SentenceDiff._sound_out_dollars(
            profanity.censor(text, 'x'))
        words = str(text).strip().split()
        return [word for word in words
                if len(self._remove_punctuation(word).strip()) > 0]

    def _normalize(self, text):
        """Censor profanity, expand dollar amounts, strip punctuation."""
        return self._remove_punctuation(
            SentenceDiff._sound_out_dollars(
                profanity.censor(text, 'x')))

    @staticmethod
    def _assert_not_empty(actual_sentence, target_sentence):
        """Raise if either sentence is None or empty.

        Bug fix: the original condition ``t == 0 or a == 0 and a == t``
        parsed as ``t == 0 or (a == 0 and a == t)`` due to operator
        precedence, so an empty actual with a non-empty target slipped
        through. Either side being empty now raises.
        """
        assert target_sentence is not None
        assert actual_sentence is not None
        if len(target_sentence) == 0 or len(actual_sentence) == 0:
            raise Exception("cannot compare empty sentences")

    @staticmethod
    def _spell_out_numbers(words):
        """Replace integer tokens with their spelled-out form (via inflect)."""
        p = inflect.engine()
        return [p.number_to_words(int(word)) if SentenceDiff._check_int(word) else word
                for word in words]

    @staticmethod
    def _spell_out_numbers_in_word(word):
        """Spell out a single token if it is an integer, else return it as-is."""
        if SentenceDiff._check_int(word):
            p = inflect.engine()
            return p.number_to_words(int(word))
        return word

    @staticmethod
    def _check_int(s):
        """Return True iff ``s`` is an optionally signed decimal integer.

        Bug fix: an empty string used to raise IndexError on ``s[0]``.
        """
        if not s:
            return False
        if s[0] in ('-', '+'):
            return s[1:].isdigit()
        return s.isdigit()

    @staticmethod
    def _remove_punctuation(text):
        """Strip all ASCII punctuation; passes None through unchanged."""
        if text is None:
            return None
        return text.translate(str.maketrans('', '', string.punctuation))

    @staticmethod
    def _sound_out_dollars(text):
        """Rewrite "$1" as "1 dollar" and "$N" as "N dollars"."""
        text = re.sub(r"\$1\b", "1 dollar", text)
        _subst = "\\2 dollars"
        _regex = r"(\$)(\d*)\b"
        return re.sub(_regex, _subst, text)

    @staticmethod
    def _single_word_subs(words):
        """Apply the title-abbreviation expansion to every token."""
        return [SentenceDiff._single_word_sub(word) for word in words]

    @staticmethod
    def _single_word_sub(word):
        """Expand a title abbreviation when the whole token IS the abbreviation.

        Bug fix: the original used unanchored ``re.sub`` so the "mr" rule
        fired inside other words ("mrs" -> "misters", "comrade" ->
        "comisterade", "drive" -> "doctorive"), and the "mrs" -> "mrs"
        rule was a no-op. Exact-token lookup preserves the intent
        (case-insensitive, like the original IGNORECASE flags) without the
        substring damage; "mrs" now expands to "missus".
        """
        subs = {"mr": "mister", "mrs": "missus", "ms": "miss", "dr": "doctor"}
        return subs.get(word.lower(), word)

    @staticmethod
    def _word_diff_cost(wordA, wordB):
        """Substitution cost in [0, 1]: char-level similarity via difflib.ndiff.

        Similar words cost close to 0, unrelated words cost 1.
        """
        denominator = 0
        numerator = 0
        for s in difflib.ndiff(wordA, wordB):
            denominator += 1
            if s[0] == '-' or s[0] == '+':
                numerator += 1

        cost = numerator / denominator
        # Round near-total mismatches up to a full substitution so a single
        # coincidentally shared letter (e.g. 'hydra' vs 'girl') still costs 1.
        if cost >= 0.85 and numerator > 2:
            return 1
        return cost

    @staticmethod
    def _word_add_rm_cost(wordA, wordB):
        """Insertion/deletion cost: short function words are discounted."""
        word = wordA if wordA is not None else wordB
        if word == "a" or word == "the" or len(word) <= 2:
            return 0.6
        return 1

    @staticmethod
    def _all_substitutions(list_of_lists):
        """All ordered (old, new) pairs drawn from within each homonym group."""
        return [pair
                for lst in list_of_lists
                for pair in itertools.permutations(lst, 2)]

    @staticmethod
    @functools.lru_cache()
    def _homonyms(sentence):
        """Return ``sentence`` plus every variant made by one homonym swap."""
        result = [sentence]
        for old, new in SentenceDiff._all_word_subs():
            # NOTE(review): plain substring replace — "to" -> "too" would also
            # fire inside longer words. Pre-existing behavior, kept as-is.
            test_sentence = sentence.replace(old, new)
            if test_sentence != sentence:
                result.append(test_sentence)
        return result

    @staticmethod
    @functools.lru_cache(maxsize=None)
    def _all_word_subs():
        """Flattened, cached list of every ordered homonym substitution pair."""
        return SentenceDiff._all_substitutions(
            SentenceDiff._all_word_homonyms())

    @staticmethod
    @functools.lru_cache(maxsize=None)
    def _all_word_homonyms():
        """Groups of words/phrases treated as interchangeable homonyms."""
        # NOTE(review): the last three entries are duplicated verbatim in the
        # original; removing them would change _all_word_subs output, so they
        # are kept.
        return \
            [["there", "their", "they’re"],
             ["see", "sea"],
             ["for", "four"],
             ["by", "buy", "bye"],
             ["passed", "past"],
             ["which", "witch"],
             ["son", "sun"],
             ["who’s", "whose"],
             ["hole", "whole"],
             ["write", "right"],
             ["to", "too", "two"],
             ["threw", "through"],
             ["cereal", "serial"],
             ["desert", "dessert"],
             ["meat", "meet"],
             ["flower", "flour"],
             ["cooking", "cookin"],
             ["jumping", "jumpin"],
             ["principal", "principle"],
             ["blue bell", "bluebell"],
             ["talk town", "talktown"],
             ["silverware", "silver ware"],
             ["majong", "mah jong"],
             ["rock wall", "rockwall"],
             ["chicken soup", "chickensoup"],
             ["tomato soup", "tomatosoup"],
             ["hi tim", "hawaii team"],
             ["hi tim", "hi team"],
             ["hi", "hawaii"],
             ["hi tim", "hawaii team"],
             ["hi tim", "hi team"],
             ["hi", "hawaii"]
             ]