├── test ├── __init__.py ├── test_homonymns.py ├── run_sentence_diff.py ├── run_word_diff.py ├── pivotable_parse.py ├── test_word_diff.py └── test_differencer.py ├── requirements.txt ├── setup.cfg ├── sentence_diff ├── __init__.py ├── worddiff.py └── sentencediff.py ├── setup.py ├── .gitignore └── README.md /test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | inflect 3 | pytest 4 | better_profanity -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | # Inside of setup.cfg 2 | [metadata] 3 | description-file = README.md -------------------------------------------------------------------------------- /sentence_diff/__init__.py: -------------------------------------------------------------------------------- 1 | from .sentencediff import SentenceDiff 2 | from .worddiff import WordDiff -------------------------------------------------------------------------------- /test/test_homonymns.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | from sentence_diff import SentenceDiff 3 | 4 | class TestHomonyms(TestCase): 5 | 6 | def test_sentence_homonymsdeserts(self): 7 | result = SentenceDiff._homonyms("I love desert") 8 | assert result == ["I love desert", "I love dessert"] 9 | 10 | def test_substitutions(self): 11 | list_of_lists = [["a","b"], 12 | ["x","y","z"]] 13 | result = SentenceDiff._all_substitutions(list_of_lists) 14 | assert result == \ 15 | [("a","b"), 16 | ("b","a"), 17 | ("x", "y"), 18 | ("x", "z"), 19 | ("y", "x"), 20 | ("y", "z"), 21 | ("z", "x"), 22 | ("z", "y")] 23 | 24 | 
-------------------------------------------------------------------------------- /test/run_sentence_diff.py: -------------------------------------------------------------------------------- 1 | from sentence_diff import SentenceDiff 2 | import csv 3 | 4 | with open('edit_score_highlights.csv', newline='') as csv_file_in: 5 | with open('edit_score__highlights_out.csv', 'w', newline='') as csv_file_out: 6 | writer = csv.writer(csv_file_out, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) 7 | writer.writerow(["Actual Text","Target Text","Score By Words","Score By Letters", "New Score", "Attn", "New Score 2"]) #"Updated Wer", "Wer Score" 8 | reader = csv.reader(csv_file_in, delimiter=',', quotechar='"') 9 | reader.__next__() 10 | for row in reader: 11 | actual_sentence = row[0] 12 | target_sentence = row[1] 13 | 14 | if len(actual_sentence.strip()) == 0: 15 | continue 16 | if len(target_sentence.strip()) == 0: 17 | continue 18 | 19 | print("{}-{}".format(actual_sentence, target_sentence)) 20 | 21 | diff = SentenceDiff(actual_sentence, target_sentence) 22 | row.append(diff.chatterize_score() * 100) 23 | writer.writerow(row) 24 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | setup( 3 | name = 'sentence_diff', 4 | packages = ['sentence_diff'], 5 | version = '0.1', 6 | license='MIT', 7 | description = 'Difference English sentences via Liechtenstein distance, calculate word error rate, and list out word by word differences', 8 | author = 'Miles Thompson', 9 | author_email = 'utunga@gmail.com', 10 | url = 'https://github.com/utunga/sentence_diff', 11 | download_url = 'https://github.com/utunga/sentence_diff/archive/v_01.tar.gz', 12 | keywords = ['Levenshtein', 'English', 'Text', 'WER', 'Diff'], 13 | install_requires=[ 14 | 'numpy', 15 | 'inflect', 16 | ], 17 | classifiers=[ 18 | 'Development Status :: 
4 - Beta', 19 | 'Intended Audience :: Developers', 20 | 'License :: OSI Approved :: MIT License', 21 | 'Programming Language :: Python :: 3', 22 | 'Programming Language :: Python :: 3.4', 23 | 'Programming Language :: Python :: 3.5', 24 | 'Programming Language :: Python :: 3.6', 25 | 'Programming Language :: Python :: 3.7', 26 | ], 27 | ) -------------------------------------------------------------------------------- /test/run_word_diff.py: -------------------------------------------------------------------------------- 1 | from sentence_diff import WordDiff 2 | import csv 3 | 4 | with open('keyword_diffs.csv', newline='', encoding="utf-8") as csv_file_in: 5 | with open('keyword_diffs_out.csv', 'w', newline='', encoding="utf-8") as csv_file_out: 6 | writer = csv.writer(csv_file_out, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) 7 | writer.writerow(["target_text","recognized_text","confidence_score","audio_mp3_file", "similarity", "pass_fail"]) 8 | reader = csv.reader(csv_file_in, delimiter=',', quotechar='"') 9 | reader.__next__() 10 | for row in reader: 11 | actual = row[0] 12 | target = row[1] 13 | confidence = float(row[2]) 14 | 15 | if len(actual.strip()) == 0: 16 | continue 17 | if len(target.strip()) == 0: 18 | continue 19 | 20 | print("{}-{}".format(actual, target)) 21 | 22 | diff = WordDiff(actual, target) 23 | pass_fail, similarity = diff.chatterize_score() 24 | row.append(similarity * 100) 25 | row.append(pass_fail) 26 | writer.writerow(row) 27 | -------------------------------------------------------------------------------- /test/pivotable_parse.py: -------------------------------------------------------------------------------- 1 | from sentence_diff import SentenceDiff 2 | import csv 3 | 4 | 5 | def process_row(row, writer): 6 | actual_sentence = row['transcript'] 7 | target_sentence = row['target'] 8 | 9 | if len(actual_sentence.strip()) == 0: 10 | return 11 | if len(target_sentence.strip()) == 0: 12 | return 13 | 14 | 
print("{}-{}".format(actual_sentence, target_sentence)) 15 | 16 | diff = SentenceDiff(actual_sentence, target_sentence) 17 | row['wer'] = diff.wer() 18 | row['score'] = diff.chatterize_score() * 100 19 | writer.writerow(row) 20 | 21 | 22 | with open('pivotable.csv', encoding='utf-8') as csv_file_in: 23 | with open('pivotable_out.csv', 'w', newline='', encoding='utf-8') as csv_file_out: 24 | reader = csv.DictReader(csv_file_in) 25 | first_row = reader.__next__() 26 | field_names = list(first_row.keys()) 27 | field_names.append('score') 28 | writer = csv.DictWriter(csv_file_out, fieldnames=field_names, 29 | delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) 30 | writer.writeheader() 31 | process_row(first_row, writer) 32 | 33 | for row in reader: 34 | process_row(row, writer) 35 | 36 | -------------------------------------------------------------------------------- /sentence_diff/worddiff.py: -------------------------------------------------------------------------------- 1 | import re 2 | import string 3 | import numpy as np 4 | import inflect 5 | import difflib 6 | from better_profanity import profanity 7 | from sentence_diff import SentenceDiff 8 | 9 | THRESHOLD_PASS = .4 10 | THRESHOLD_SUPER_PASS = .7 11 | 12 | class WordDiff: 13 | 14 | def __init__(self, actual, target): 15 | SentenceDiff._assert_not_empty(actual,target) 16 | self.actual = actual 17 | self.target = target 18 | self.actual_lower = self.normalize(actual) 19 | self.target_lower = self.normalize(target) 20 | 21 | def chatterize_score(self): 22 | homonyms = SentenceDiff._homonyms(self.actual_lower) 23 | max_similarity = -1 24 | for homonym in homonyms: 25 | similarity = self.similarity(homonym, self.target_lower) 26 | if similarity > max_similarity: 27 | max_similarity = similarity 28 | 29 | pass_fail = "SUPER PASS" if max_similarity > THRESHOLD_SUPER_PASS \ 30 | else "PASS" if max_similarity > THRESHOLD_PASS \ 31 | else "FAIL" 32 | 33 | return pass_fail, max_similarity 34 | 35 | def 
normalize(self, text): 36 | return \ 37 | SentenceDiff._remove_punctuation( 38 | SentenceDiff._spell_out_numbers_in_word( 39 | SentenceDiff._sound_out_dollars( 40 | profanity.censor(text.lower(), 'x')))) 41 | 42 | def similarity(self, wordA, wordB): 43 | # work substitution cost 44 | # similar words cost close to 0 different words cost 1 45 | denominator = 0 46 | numerator = 0 47 | for i, s in enumerate(difflib.ndiff(wordA, wordB)): 48 | denominator += 1 49 | if s[0] == '-' or s[0] == '+': 50 | numerator += 1 51 | return 1 - numerator/denominator 52 | 53 | 54 | -------------------------------------------------------------------------------- /test/test_word_diff.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | from sentence_diff import WordDiff 3 | 4 | def word_diff(actual, target): 5 | d = WordDiff(actual,target) 6 | return d.chatterize_score() 7 | 8 | class TestDifferencer(TestCase): 9 | 10 | def test_big(self): 11 | target = "big" 12 | actual = "big" 13 | pass_fail, score = word_diff(actual, target) 14 | assert score == 1 15 | assert pass_fail == "SUPER PASS" 16 | 17 | def test_food(self): 18 | target = "order food" 19 | actual = "all the food i ate food" 20 | pass_fail, score = word_diff(actual, target) 21 | assert score == .2222222222222222 22 | assert pass_fail == "FAIL" 23 | 24 | def test_dog(self): 25 | target = "dog" 26 | actual = "tall dog poke bo suck my mother nature" 27 | pass_fail, score = word_diff(actual, target) 28 | assert score == .07894736842105265 29 | assert pass_fail == "FAIL" 30 | 31 | def test_superhero(self): 32 | target = "superhero" 33 | actual = "superheroes" 34 | pass_fail, score = word_diff(actual, target) 35 | assert score == .8181818181818181 36 | assert pass_fail == "SUPER PASS" 37 | 38 | def test_meat(self): 39 | target = "meat" 40 | actual = "meet" 41 | pass_fail, score = word_diff(actual, target) 42 | assert score == 1 43 | assert pass_fail == "SUPER PASS" 
44 | 45 | 46 | def test_shirt(self): 47 | target = "shirt" 48 | actual = "sharks" 49 | pass_fail, score = word_diff(actual, target) 50 | assert score == .375 51 | assert pass_fail == "FAIL" 52 | 53 | def test_pirates(self): 54 | target = "shirt" 55 | actual = "shut" 56 | pass_fail, score = word_diff(actual, target) 57 | assert score == .5 58 | assert pass_fail == "PASS" 59 | 60 | def test_number(self): 61 | target = "one" 62 | actual = "1" 63 | pass_fail, score = word_diff(actual, target) 64 | assert score == 1 65 | assert pass_fail == "SUPER PASS" 66 | 67 | def test_dollars(self): 68 | target = "100 dollars" 69 | actual = "$100" 70 | pass_fail, score = word_diff(actual, target) 71 | assert score == 1 72 | assert pass_fail == "SUPER PASS" 73 | 74 | 75 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | .idea 132 | 133 | *.csv 134 | *.xlsx -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Sentence Differences - sentence_diff 2 | Package to difference English sentences via Liechtenstein distance, calculate word error rate, and list out word by word differences 3 | 4 | # Basic usage 5 | 6 | ```python 7 | 8 | from sentence_diff import SentenceDiff 9 | 10 | d = SentenceDiff("can i has 7 loaves of bread please ", "Can I have seven loaves, please?") 11 | assert d.mistakes() == [ 12 | ('has', 'have', 2, 'changed'), 13 | ('of', None, 5, 'added'), 14 | ('bread', None, 6, 'added')] 15 | 16 | ``` 17 | 18 | ### Word Error Rate - wer() 19 | 20 | ```python 21 | d = SentenceDiff("I like to meet people", "I really like to meet people") 22 | assert d.wer() == 1/6 23 | ``` 24 | 25 | ```python 26 | d = SentenceDiff("I really like to meet people", "I like to meet people") 27 | assert d.wer() == 1/5 28 | ``` 29 | 30 | ### Changes - mistakes() 31 | 32 | Added words 33 | ```python 34 | d = SentenceDiff("I like Like to eat people", "I like to eat people") 35 | assert d.mistakes() == [ 36 | ("Like", None, 2,'added')] 37 | ``` 38 | 39 | Changed words 40 | ```python 41 | d = SentenceDiff("How do you", "how are you") 42 | assert d.mistakes() == [ 43 | ("do", "are", 1, 
'changed')] 44 | ``` 45 | 46 | Skipped words 47 | ```python 48 | d = SentenceDiff("How see you", "how good to see you") 49 | assert d.mistakes() == [ 50 | (None, "good", 1, 'skipped'), 51 | (None, "to", 1, 'skipped')] 52 | ``` 53 | 54 | No differences (ignores punctuation and case) 55 | ```python 56 | d = SentenceDiff("my name is joe", "My name is Joe!") 57 | assert d.mistakes() == [] 58 | ``` 59 | 60 | ### What words from original are OK - yes_no_words() 61 | 62 | ```python 63 | d = SentenceDiff("can i have 7 loaves please", "Can I have seven loaves, please?") 64 | assert d.yes_no_words() == [ 65 | ("can", True), 66 | ("i", True), 67 | ("have", True), 68 | ("7", True), 69 | ("loaves", True), 70 | ("please", True)] 71 | ``` 72 | 73 | ### What words from original are OK or not? - yes_no_words() 74 | 75 | ```python 76 | d = SentenceDiff("can i have 7 loaves please", "Can I have seven loaves, please?") 77 | assert d.yes_no_words() == [ 78 | ("can", True), 79 | ("i", True), 80 | ("have", True), 81 | ("7", True), 82 | ("loaves", True), 83 | ("please", True)] 84 | ``` 85 | 86 | ### Full list of changes - scored_words() 87 | 88 | ```python 89 | d = SentenceDiff("can i has 7 loaves of bread please ", "Can I have seven loaves, please?") 90 | assert d.scored_words() == [ 91 | ('can', 'Can', 0, None), 92 | ('i', 'I', 1, None), 93 | ('has', 'have', 2, 'changed'), 94 | ('7', 'seven', 3, None), 95 | ('loaves', 'loaves', 4, None), 96 | ('of', None, 5, 'added'), 97 | ('bread', None, 6, 'added'), 98 | ('please', 'please', 7, None)] 99 | ``` 100 | -------------------------------------------------------------------------------- /test/test_differencer.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | from sentence_diff import SentenceDiff 3 | 4 | 5 | def diff(actual_sentence, target_sentence): 6 | return SentenceDiff(actual_sentence=actual_sentence, target_sentence=target_sentence) 7 | 8 | 9 | def 
chatterize_score(actual_sentence, target_sentence): 10 | diff = SentenceDiff(actual_sentence, target_sentence) 11 | return diff.chatterize_score() 12 | 13 | 14 | def assert_chatterize_score(actual_sentence, target_sentence, expected): 15 | score = chatterize_score(actual_sentence, target_sentence) 16 | assert score == expected 17 | 18 | 19 | class TestDifferencer(TestCase): 20 | 21 | def test_x_v_y_wer(self): 22 | d = diff("I like to eat people", "I like like to eat people") 23 | assert d.wer() == 1/6 24 | 25 | def test_y_v_x_wer(self): 26 | d = diff("I like like to eat people", "I like to eat people") 27 | assert d.wer() == 1/5 28 | 29 | def test_words_added(self): 30 | d = diff("I like Like to eat people", "I like to eat people") 31 | assert d.mistakes() == [ 32 | ("Like", None, 2,'added')] 33 | 34 | def test_words_changed(self): 35 | d = diff("How do you", "how are you") 36 | assert d.mistakes() == [ 37 | ("do", "are", 1, 'changed')] 38 | 39 | def test_words_skipped(self): 40 | d = diff("How see you", "how good to see you") 41 | assert d.mistakes() == [ 42 | (None, "good", 1, 'skipped'), 43 | (None, "to", 1, 'skipped')] 44 | 45 | def test_combined(self): 46 | d = diff("can i has 7 loaves of bread please ", "Can I have seven loaves, please?") 47 | assert d.mistakes() == \ 48 | [('has', 'have', 2, 'changed'), 49 | ('of', None, 5, 'added'), 50 | ('bread', None, 6, 'added')] 51 | 52 | def test_no_mistakes(self): 53 | d = diff("my name is leaf", "My name is leaf!") 54 | assert d.mistakes() == [] 55 | 56 | def test_yes_no_words(self): 57 | d = diff("How about a good bath", "Would you like a good bath?") 58 | print(d.scored_words()) 59 | assert d.yes_no_words() == [ 60 | ("How", False), 61 | ("about", False), 62 | ("a", False), 63 | ("good", True), 64 | ("bath", True)] 65 | 66 | def test_yes_no_numbers(self): 67 | d = diff("can i have 7 loaves please", "Can I have seven loaves, please?") 68 | assert d.yes_no_words() == [ 69 | ("can", True), 70 | ("i", True), 71 | 
("have", True), 72 | ("7", True), 73 | ("loaves", True), 74 | ("please", True)] 75 | 76 | def test_numbers_mistake(self): 77 | d = diff("can i have 62 loaves please", "Can I have seven loaves, please?") 78 | assert d.mistakes() == [ 79 | ("62", "seven", 3, "changed")] 80 | 81 | def test_numbers_mistake_logic_fail(self): 82 | # this shows the limitations of the current system 83 | # id say its not really want you want but it sort of works 84 | # and as long as you stick to single digits we're fine 85 | d = diff("can i have 27 loaves please", "Can I have twenty six loaves, please?") 86 | assert d.mistakes() == [ 87 | (None, 'twenty', 3, 'skipped'), 88 | ('27', 'six', 3, 'changed')] 89 | 90 | def test_scored_words(self): 91 | d = diff("can i has 7 Loaves of bread please ", "Can I have seven Loaves, please?") 92 | assert d.scored_words() == [ 93 | ('can', 'Can', 0, None), 94 | ('i', 'I', 1, None), 95 | ('has', 'have', 2, 'changed'), 96 | ('7', 'seven', 3, None), 97 | ('Loaves', 'Loaves,', 4, None), 98 | ('of', None, 5, 'added'), 99 | ('bread', None, 6, 'added'), 100 | ('please', 'please?', 7, None)] 101 | 102 | def test_ex_miss_mary(self): 103 | d = chatterize_score("Nice to meet you Miss Mary.", "nice to meet you, Ms Mary!") 104 | assert d == 1 105 | 106 | def test_ex_meet_at_church(self): 107 | d = chatterize_score("Let's meat at the church.", "lets meet at the church") 108 | assert d == 1 109 | 110 | def test_ex_wow_100(self): 111 | d = chatterize_score("wow, 100 dollars", "Wow, $100?") 112 | assert d == 1 113 | 114 | def test_backtrace_ex(self): 115 | d = diff("Hi.", "hello tim my name is scott") 116 | assert d.wer() == 1 117 | 118 | def test_backtrace_ex2(self): 119 | d = chatterize_score("let's climb the rockwall", "Let's climb the rock wall.") 120 | #print(d.mistakes()) 121 | assert d == 1 122 | 123 | def test_normalize_100_dollars(self): 124 | d = SentenceDiff("xx","xx") 125 | assert d._normalize("$100") == "100 dollars" 126 | 127 | def 
test_normalize_1_dollar(self): 128 | d = SentenceDiff("xx","xx") 129 | assert d._normalize("here is $1 for you") == "here is 1 dollar for you" 130 | 131 | def test_ex_silverware(self): 132 | d = chatterize_score(actual_sentence="i need silver ware", target_sentence="I need silverware.") 133 | assert d == 1 134 | 135 | def test_ex_dog_house(self): 136 | d = chatterize_score("hawaii tim", "Hi Tim.") 137 | assert d == 1 138 | 139 | def test_ex_miss_mary(self): 140 | d = diff("hi miss mary", "Hi Ms. Mary!") 141 | assert d.wer() == 0 142 | 143 | def test_chatterize_score_dont_drop_apostrophe(self): 144 | score = chatterize_score("You're welcome","You're welcome") 145 | assert score == 1 146 | d = diff("You're welcome","You're welcome") 147 | scored = d.scored_words() 148 | assert scored[0][0] == "You're" 149 | 150 | def test_chatterize_score_dont_mess_up_lets(self): 151 | score = chatterize_score("Let's pretend we're pirates.", "Let's pretend we're pirates.") 152 | assert score == 1 153 | d = diff("Let's pretend we're pirates.", "Let's pretend we're pirates.") 154 | scored = d.scored_words() 155 | print(scored) 156 | assert scored[0][0] == "Let's" 157 | 158 | def test_complex_backtrace_ex(self): 159 | actual = "Do you want to have a sleepover?" 160 | target = "want to have a sleep over you want to have a sleepover" 161 | d = diff(actual, target) 162 | print(d.scored_words()) 163 | assert d.chatterize_score() == 0.4708333333333334 164 | 165 | def test_complex_backtrace_ex_2(self): 166 | actual = "where i like to dress as a superhero" 167 | target = "I like to dress as a superhero." 
168 | d = diff(actual, target) 169 | assert d.chatterize_score() == 0.8 170 | 171 | def test_complex_backtrace_ex_test(self): 172 | actual = "x a b" 173 | target = "a b c a b" 174 | d = diff(actual, target) 175 | print(d.scored_words()) 176 | assert d.chatterize_score() == 0.6 177 | 178 | def test_profanity(self): 179 | d = diff("two fucking loaves", "two more loaves") 180 | assert d.mistakes() == [ 181 | ("xxxx", 'more', 1, 'changed')] 182 | 183 | def test_i_want_water(self): 184 | actual = "I want water, please" 185 | target = "I want please" 186 | d = diff(actual, target) 187 | assert d.yes_no_words() ==[ 188 | ('I', True), 189 | ('want', True), 190 | ('water,', False), 191 | ('please', True)] 192 | assert d.chatterize_score() == .75 193 | 194 | def test_chatterize_score_partial_word(self): 195 | assert_chatterize_score("I like superheroes.", "i like superhero", 0.9166666666666666) 196 | 197 | def test_chatterize_score_partial_word_round_up(self): 198 | assert_chatterize_score("superhero", "superheros", 0.9) 199 | 200 | def test_chatterize_score_fail(self): 201 | assert_chatterize_score("how you gorger hydra","I am a girl.", 0) 202 | 203 | def test_chatterize_score_pass(self): 204 | assert_chatterize_score("i want corn please","I want corn, please.", 1) 205 | 206 | def test_dad_birthday(self): 207 | assert_chatterize_score("it's my dad's birthday", "It's my dad's birthday", 1) 208 | 209 | def test_mom_birthday(self): 210 | assert_chatterize_score("is my mom's birthday", "It's my mom's birthday", .8) 211 | 212 | def test_some_flower(self): 213 | assert_chatterize_score("some flower please", "Some flour, please.", 1) 214 | 215 | def test_I_love_desert(self): 216 | assert_chatterize_score("I love desert", "I love dessert!", 1) 217 | 218 | def test_blue_bell(self): 219 | assert_chatterize_score("I like the name blue bell", "I like the name Bluebell", 1) 220 | 221 | def test_whats_jumpin(self): 222 | assert_chatterize_score("what's jumping", "Whats jumpin?", 1) 223 | 
224 | def test_whats_cookin(self): 225 | assert_chatterize_score("what's cooking", "Whats cookin?", 1) 226 | 227 | def test_meat_meet(self): 228 | assert_chatterize_score("chickens give us meet", "Chickens give us meat.", 1) 229 | 230 | def test_merry_marry(self): 231 | assert_chatterize_score("hi miss marry", "Hi, Ms. Marry", 1) 232 | 233 | def test_talk_town(self): 234 | assert_chatterize_score("talk town school is great", "TalkTown school is great", 1) 235 | 236 | def test_to_please(self): 237 | assert_chatterize_score("I'd like to please", "I'd like 2 please", 1) 238 | 239 | def test_for_please(self): 240 | assert_chatterize_score("I'd like for please", "I'd like 4 please", 1) 241 | 242 | def test_chefs(self): 243 | assert_chatterize_score("can I have a chefs hat", "can I have a chef's hat", 1) 244 | 245 | def test_hi_sally(self): 246 | assert_chatterize_score("hi sally", "Hi, Sally", 1) 247 | 248 | def test_by_bob(self): 249 | assert_chatterize_score("by bob", "Bye, Bob", 1) -------------------------------------------------------------------------------- /sentence_diff/sentencediff.py: -------------------------------------------------------------------------------- 1 | import re 2 | import string 3 | import numpy as np 4 | import inflect 5 | import difflib 6 | import itertools 7 | import functools 8 | from better_profanity import profanity 9 | 10 | 11 | class SentenceDiff: 12 | 13 | def __init__(self, actual_sentence, target_sentence): 14 | self._assert_not_empty(actual_sentence, target_sentence) 15 | 16 | # lowercase, normalize, tokenize 17 | self.actual_sentence = actual_sentence 18 | self.actual = self._tokenize(actual_sentence) 19 | self.target = self._tokenize(target_sentence) 20 | 21 | # split words without lower casing 22 | self.actual_words = self._tokenize_for_end_user(actual_sentence) 23 | self.target_words = self._tokenize_for_end_user(target_sentence) 24 | 25 | # public methods 26 | def wer(self): 27 | self._compare() 28 | return self.error 29 | 30 
| def scored_words(self): 31 | self._compare() 32 | self._backtrace() 33 | return self.scored_words 34 | 35 | def mistakes(self): 36 | self._compare() 37 | self._backtrace() 38 | return [tupl for tupl in self.scored_words if tupl[3]] 39 | 40 | def yes_no_words(self): 41 | self._compare() 42 | self._backtrace() 43 | res = [] 44 | for scored in self.scored_words: 45 | if scored[0]: 46 | res.append((scored[0], scored[3] is None)) 47 | return res 48 | 49 | def chatterize_score(self): 50 | 51 | actual_homonyms = SentenceDiff._homonyms(self.actual_sentence) 52 | 53 | wer1 = 99 54 | matrix1 = None 55 | actual_tokenized1 = None 56 | actual_words1 = None 57 | for tmp_actual in actual_homonyms: 58 | tmp_actual_tokenized = self._tokenize(tmp_actual) 59 | tmp_wer, tmp_matrix = self._do_compare(tmp_actual_tokenized, self.target) 60 | if tmp_wer < wer1: 61 | wer1 = tmp_wer 62 | matrix1 = tmp_matrix 63 | actual_tokenized1 = tmp_actual_tokenized 64 | actual_words1 = self._tokenize_for_end_user(tmp_actual) 65 | 66 | wer2 = 99 67 | matrix2 = None 68 | actual_tokenized2 = None 69 | actual_words2 = None 70 | for tmp_actual in actual_homonyms: 71 | tmp_actual_tokenized = self._tokenize(tmp_actual) 72 | tmp_wer, tmp_matrix = self._do_compare(self.target, tmp_actual_tokenized) 73 | if tmp_wer < wer2: 74 | wer2 = tmp_wer 75 | matrix2 = tmp_matrix 76 | actual_tokenized2 = tmp_actual_tokenized 77 | actual_words2 = self._tokenize_for_end_user(tmp_actual) 78 | 79 | if wer1 <= wer2: 80 | scored_words, alignment = \ 81 | self._do_backtrace(actual_tokenized1, self.target, matrix1, actual_words1, self.target_words) 82 | else: 83 | scored_words, alignment = \ 84 | self._do_backtrace(self.target, actual_tokenized2, matrix2, self.target_words, actual_words2) 85 | 86 | cost = 0 87 | word_count = 0 88 | for tuple in scored_words: 89 | word_count += 1 90 | actual = SentenceDiff._remove_punctuation(tuple[0]) 91 | target = SentenceDiff._remove_punctuation(tuple[1]) 92 | action = tuple[3] 93 | if action 
class SentenceDiff:
    # NOTE(review): this chunk is the visible tail of class SentenceDiff from a
    # collapsed source dump. The class header, __init__, and the opening of
    # chatterize_score are cut off above the visible region and are not
    # reproduced here.

    def print_debug(self):
        """Run the diff end-to-end and print the intermediate state."""
        self._compare()
        self._backtrace()
        print("actual")
        print(self.actual)
        print("target")
        print(self.target)
        print("wer")
        print(self.error)
        print(self.alignment)
        print("")
        print(self.scored_words)
        print("")

    def _init_matrix(self, actual, target):
        """Build the Levenshtein DP matrix with row/column 0 seeded 0..n."""
        shape = (len(target) + 1, len(actual) + 1)
        matrix = np.zeros(shape, dtype=np.uint32)
        matrix[0, :] = np.arange(shape[1])  # cost of inserting j actual words
        matrix[:, 0] = np.arange(shape[0])  # cost of deleting i target words
        return matrix

    def _compare(self):
        """Compute the word error rate and DP matrix; cache both on self."""
        self.error, self.matrix = self._do_compare(self.actual, self.target)

    def _do_compare(self, actual, target):
        """Return ``(wer, matrix)`` for two token lists.

        ``wer`` is the word-level Levenshtein distance divided by
        ``len(target)``; an empty target scores a flat 1.0.

        Bug fix: the original returned a bare ``1`` (not a 2-tuple) for an
        empty target — and only after filling the whole matrix — so callers
        that unpack ``wer, matrix`` (e.g. ``_compare``) crashed. Guard up
        front and keep the return shape consistent.
        """
        matrix = self._init_matrix(actual, target)
        if len(target) == 0:
            return 1.0, matrix
        for trgt_pos, rw in enumerate(target):
            for actual_pos, hw in enumerate(actual):
                insert = matrix[trgt_pos + 1, actual_pos] + 1
                delete = matrix[trgt_pos, actual_pos + 1] + 1
                subst = matrix[trgt_pos, actual_pos] + (0 if rw == hw else 1)
                matrix[trgt_pos + 1, actual_pos + 1] = min(insert, delete, subst)
        wer = matrix[-1, -1] / len(target)
        return wer, matrix

    def _do_backtrace(self, actuals, targets, matrix, actual_words, target_words, safe_mode=False):
        """Walk the filled DP matrix backwards to recover the word alignment.

        Returns ``(scored_words, alignment)`` where each scored_words entry is
        ``(actual, target, actual_index, action)`` with action one of
        ``None`` (match), ``"added"``, ``"skipped"`` or ``"changed"``.
        """
        i = len(targets) - 1
        j = len(actuals) - 1

        alignment = []
        path = []
        inserts = 0
        deletions = 0
        substitns = 0
        matched = 0

        while i >= 0 or j >= 0:
            path.append((i + 1, j + 1))
            start = matrix[i + 1, j + 1]
            insert = matrix[i + 1, j]
            delete = matrix[i, j + 1]
            subst = matrix[i, j]
            best = min(start, subst)

            if j < 0:
                # Walked off the actual-word axis: restart in safe_mode, which
                # also takes deletions on ties so the walk can terminate.
                return self._do_backtrace(actuals, targets, matrix,
                                          actual_words, target_words, safe_mode=True)

            if insert < best:
                alignment.append((None, actuals[j]))  # extra word in actual
                inserts += 1
                j -= 1

            elif delete < best or (safe_mode and delete == best):
                alignment.append((targets[i], None))  # word missing from actual
                deletions += 1
                i -= 1

            else:
                if start == subst:  # no cost increase -> the words matched
                    matched += 1
                else:
                    substitns += 1
                alignment.append((targets[i], actuals[j]))
                j -= 1
                i -= 1

        alignment.reverse()
        path.reverse()

        # The index stored in scored_words is relative to the *actual*
        # sentence, but both counters are tracked so the un-normalized word
        # forms can be looked up on either side.
        scored_words = []
        a_idx = 0
        t_idx = 0
        for pair in alignment:
            if pair[0] == pair[1]:  # exact match
                scored_words.append((actual_words[a_idx], target_words[t_idx], a_idx, None))
                a_idx += 1
                t_idx += 1
            elif pair[0] is None:  # insertion: word appears only in actual
                scored_words.append((actual_words[a_idx], None, a_idx, "added"))
                a_idx += 1
            elif pair[1] is None:  # deletion: word appears only in target
                scored_words.append((None, target_words[t_idx], a_idx, "skipped"))
                t_idx += 1
            else:  # substitution
                scored_words.append((actual_words[a_idx], target_words[t_idx], a_idx, "changed"))
                a_idx += 1
                t_idx += 1

        return scored_words, alignment

    def _backtrace(self):
        """Recover and cache the alignment for the already-computed matrix."""
        scored_words, alignment = self._do_backtrace(
            self.actual, self.target, self.matrix, self.actual_words, self.target_words)
        self.scored_words = scored_words
        self.alignment = alignment

    def _tokenize(self, sentence):
        """Normalize, lowercase, split, expand titles, spell out integers."""
        normalized_lower = self._normalize(sentence).lower()
        words = normalized_lower.split()
        words = SentenceDiff._single_word_subs(words)
        words = self._spell_out_numbers(words)
        return words

    def _tokenize_for_end_user(self, text):
        """Split censored, dollar-expanded text, dropping punctuation-only tokens."""
        text = SentenceDiff._sound_out_dollars(
            profanity.censor(text, 'x'))
        words = str(text).strip().split()
        return [word for word in words
                if len(self._remove_punctuation(word).strip()) > 0]

    def _normalize(self, text):
        """Censor profanity, expand dollar amounts, strip punctuation."""
        return self._remove_punctuation(
            SentenceDiff._sound_out_dollars(
                profanity.censor(text, 'x')))

    @staticmethod
    def _assert_not_empty(actual_sentence, target_sentence):
        """Raise if either sentence is None or empty.

        Bug fix: the original condition ``t == 0 or a == 0 and a == t``
        parsed as ``t == 0 or (a == 0 and a == t)`` due to operator
        precedence, so an empty actual with a non-empty target slipped
        through. Either side being empty now raises.
        """
        assert target_sentence is not None
        assert actual_sentence is not None
        if len(target_sentence) == 0 or len(actual_sentence) == 0:
            raise Exception("cannot compare empty sentences")

    @staticmethod
    def _spell_out_numbers(words):
        """Replace integer tokens with their spelled-out form (via inflect)."""
        p = inflect.engine()
        return [p.number_to_words(int(word)) if SentenceDiff._check_int(word) else word
                for word in words]

    @staticmethod
    def _spell_out_numbers_in_word(word):
        """Spell out a single token if it is an integer, else return it as-is."""
        if SentenceDiff._check_int(word):
            p = inflect.engine()
            return p.number_to_words(int(word))
        return word

    @staticmethod
    def _check_int(s):
        """Return True iff ``s`` is an optionally signed decimal integer.

        Bug fix: an empty string used to raise IndexError on ``s[0]``.
        """
        if not s:
            return False
        if s[0] in ('-', '+'):
            return s[1:].isdigit()
        return s.isdigit()

    @staticmethod
    def _remove_punctuation(text):
        """Strip all ASCII punctuation; passes None through unchanged."""
        if text is None:
            return None
        return text.translate(str.maketrans('', '', string.punctuation))

    @staticmethod
    def _sound_out_dollars(text):
        """Rewrite "$1" as "1 dollar" and "$N" as "N dollars"."""
        text = re.sub(r"\$1\b", "1 dollar", text)
        _subst = "\\2 dollars"
        _regex = r"(\$)(\d*)\b"
        return re.sub(_regex, _subst, text)

    @staticmethod
    def _single_word_subs(words):
        """Apply the title-abbreviation expansion to every token."""
        return [SentenceDiff._single_word_sub(word) for word in words]

    @staticmethod
    def _single_word_sub(word):
        """Expand a title abbreviation when the whole token IS the abbreviation.

        Bug fix: the original used unanchored ``re.sub`` so the "mr" rule
        fired inside other words ("mrs" -> "misters", "comrade" ->
        "comisterade", "drive" -> "doctorive"), and the "mrs" -> "mrs"
        rule was a no-op. Exact-token lookup preserves the intent
        (case-insensitive, like the original IGNORECASE flags) without the
        substring damage; "mrs" now expands to "missus".
        """
        subs = {"mr": "mister", "mrs": "missus", "ms": "miss", "dr": "doctor"}
        return subs.get(word.lower(), word)

    @staticmethod
    def _word_diff_cost(wordA, wordB):
        """Substitution cost in [0, 1]: char-level similarity via difflib.ndiff.

        Similar words cost close to 0, unrelated words cost 1.
        """
        denominator = 0
        numerator = 0
        for s in difflib.ndiff(wordA, wordB):
            denominator += 1
            if s[0] == '-' or s[0] == '+':
                numerator += 1

        cost = numerator / denominator
        # Round near-total mismatches up to a full substitution so a single
        # coincidentally shared letter (e.g. 'hydra' vs 'girl') still costs 1.
        if cost >= 0.85 and numerator > 2:
            return 1
        return cost

    @staticmethod
    def _word_add_rm_cost(wordA, wordB):
        """Insertion/deletion cost: short function words are discounted."""
        word = wordA if wordA is not None else wordB
        if word == "a" or word == "the" or len(word) <= 2:
            return 0.6
        return 1

    @staticmethod
    def _all_substitutions(list_of_lists):
        """All ordered (old, new) pairs drawn from within each homonym group."""
        return [pair
                for lst in list_of_lists
                for pair in itertools.permutations(lst, 2)]

    @staticmethod
    @functools.lru_cache()
    def _homonyms(sentence):
        """Return ``sentence`` plus every variant made by one homonym swap."""
        result = [sentence]
        for old, new in SentenceDiff._all_word_subs():
            # NOTE(review): plain substring replace — "to" -> "too" would also
            # fire inside longer words. Pre-existing behavior, kept as-is.
            test_sentence = sentence.replace(old, new)
            if test_sentence != sentence:
                result.append(test_sentence)
        return result

    @staticmethod
    @functools.lru_cache(maxsize=None)
    def _all_word_subs():
        """Flattened, cached list of every ordered homonym substitution pair."""
        return SentenceDiff._all_substitutions(
            SentenceDiff._all_word_homonyms())

    @staticmethod
    @functools.lru_cache(maxsize=None)
    def _all_word_homonyms():
        """Groups of words/phrases treated as interchangeable homonyms."""
        # NOTE(review): the last three entries are duplicated verbatim in the
        # original; removing them would change _all_word_subs output, so they
        # are kept.
        return \
            [["there", "their", "they’re"],
             ["see", "sea"],
             ["for", "four"],
             ["by", "buy", "bye"],
             ["passed", "past"],
             ["which", "witch"],
             ["son", "sun"],
             ["who’s", "whose"],
             ["hole", "whole"],
             ["write", "right"],
             ["to", "too", "two"],
             ["threw", "through"],
             ["cereal", "serial"],
             ["desert", "dessert"],
             ["meat", "meet"],
             ["flower", "flour"],
             ["cooking", "cookin"],
             ["jumping", "jumpin"],
             ["principal", "principle"],
             ["blue bell", "bluebell"],
             ["talk town", "talktown"],
             ["silverware", "silver ware"],
             ["majong", "mah jong"],
             ["rock wall", "rockwall"],
             ["chicken soup", "chickensoup"],
             ["tomato soup", "tomatosoup"],
             ["hi tim", "hawaii team"],
             ["hi tim", "hi team"],
             ["hi", "hawaii"],
             ["hi tim", "hawaii team"],
             ["hi tim", "hi team"],
             ["hi", "hawaii"]
             ]