├── .github └── workflows │ └── commit.yml ├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── README.rst ├── contractions ├── __init__.py ├── data │ ├── __init__.py │ ├── contractions_dict.json │ ├── leftovers_dict.json │ └── slang_dict.json └── test___init__.py ├── deploy.py ├── requirements.txt ├── setup.cfg └── setup.py /.github/workflows/commit.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: commit 3 | 4 | on: 5 | push: 6 | 7 | jobs: 8 | lint: 9 | name: Python Lint 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: checkout 13 | uses: actions/checkout@v2 14 | - uses: actions/setup-python@v2 15 | with: 16 | python-version: '3.10' 17 | - name: Install requirements 18 | run: pip install flake8 pycodestyle 19 | - name: Run flake8 20 | run: flake8 . --count --show-source --statistics --ignore=E501 21 | 22 | test: 23 | strategy: 24 | matrix: 25 | py-version: ['3.6', '3.7', '3.8', '3.9', '3.10'] 26 | name: Pytest 27 | runs-on: ubuntu-latest 28 | needs: lint 29 | steps: 30 | - name: checkout 31 | uses: actions/checkout@v2 32 | - uses: actions/setup-python@v2 33 | with: 34 | python-version: ${{ matrix.py-version }} 35 | - name: Install requirements 36 | run: pip install pytest -r requirements.txt 37 | - name: Run pytest 38 | run: pytest 39 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *#* 3 | *.DS_STORE 4 | *.log 5 | *Data.fs* 6 | *flymake* 7 | dist/* 8 | *egg* 9 | urllist* 10 | build/ 11 | __pycache__/ 12 | /.Python 13 | /bin/ 14 | /include/ 15 | /lib/ 16 | /pip-selfcheck.json 17 | .tox/ 18 | .cache 19 | .coverage -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Pascal van Kooten 4 | 5 | 
Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include contractions/data * 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # contractions 2 | 3 | **Update**: highly advised to use a `contractions>0.0.18` as it is 50x faster. 
4 | 5 | This package resolves contractions (and slang). Examples: 6 | 7 | ``` 8 | you're -> you are 9 | i'm -> I am 10 | # uses \b boundaries for "unsafe" 11 | ima -> I am going to 12 | yall -> you all 13 | gotta -> got to 14 | ``` 15 | 16 | Note that in ambiguous cases it will fall back to the most common case: 17 | 18 | he's -> he is (instead of he has) 19 | 20 | ## Usage 21 | 22 | ```python 23 | import contractions 24 | contractions.fix("you're happy now") 25 | # "you are happy now" 26 | contractions.fix("yall're happy now", slang=False) # default: True 27 | # "yall are happy now" 28 | contractions.fix("yall're happy now") 29 | # "you all are happy now" 30 | ``` 31 | 32 | ## Easy to add your own! 33 | 34 | Since `contractions>0.0.18`, you can easily add your own: 35 | 36 | ```python 37 | import contractions 38 | contractions.add('mychange', 'my change') 39 | ``` 40 | 41 | ## Installation 42 | 43 | ```shell 44 | pip install contractions 45 | ``` 46 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | fixes contractions 2 | -------------------------------------------------------------------------------- /contractions/__init__.py: -------------------------------------------------------------------------------- 1 | from itertools import product 2 | from textsearch import TextSearch 3 | 4 | import json 5 | import pkgutil 6 | 7 | json_open = pkgutil.get_data("contractions", "data/contractions_dict.json") 8 | contractions_dict = json.loads(json_open.decode("utf-8")) 9 | 10 | json_open = pkgutil.get_data("contractions", "data/leftovers_dict.json") 11 | leftovers_dict = json.loads(json_open.decode("utf-8")) 12 | 13 | json_open = pkgutil.get_data("contractions", "data/slang_dict.json") 14 | slang_dict = json.loads(json_open.decode("utf-8")) 15 | 16 | for month in [ 17 | "january", 18 | "february", 19 | "march", 20 | "april", 21 | "june", 
22 | "july", 23 | "august", 24 | "september", 25 | "october", 26 | "november", 27 | "december", 28 | ]: 29 | contractions_dict[month[:3] + "."] = month 30 | 31 | contractions_dict.update({k.replace("'", "’"): v for k, v in contractions_dict.items()}) 32 | 33 | leftovers_dict.update({k.replace("'", "’"): v for k, v in leftovers_dict.items()}) 34 | 35 | safety_keys = set( 36 | ["he's", "he'll", "we'll", "we'd", "it's", "i'd", "we'd", "we're", "i'll", "who're", "o'"] 37 | ) 38 | 39 | 40 | def get_combinations(tokens, joiners): 41 | combs = [] 42 | combs.append(tokens) 43 | results = [] 44 | for option in combs: 45 | option = [[x] for x in option] 46 | option = intersperse(option, joiners) 47 | for c in product(*option): 48 | results.append("".join(c)) 49 | return results 50 | 51 | 52 | def intersperse(lst, item): 53 | result = [item] * (len(lst) * 2 - 1) 54 | result[0::2] = lst 55 | return result 56 | 57 | 58 | # Build apostrophe-optional spellings: each key is split on "'" and get_combinations() re-joins the pieces with every mix of "" and "'" between them, so "you're" yields both "youre" and "you're" 59 | unsafe_dict = {} 60 | for k, v in contractions_dict.items(): # keys in safety_keys are left alone: without the apostrophe they would be ordinary words (e.g. "ill", "well") 61 | if k.lower() in safety_keys: 62 | continue 63 | if "'" not in k: 64 | continue 65 | tokens = k.split("'") 66 | for comb in get_combinations(tokens, ["", "'"]): 67 | unsafe_dict[comb] = v 68 | 69 | slang_dict.update(unsafe_dict) # the variants are stored with the slang entries, so fix() only applies them when slang=True 70 | 71 | ts_leftovers = TextSearch("insensitive", "norm") 72 | ts_leftovers.add(contractions_dict) 73 | ts_leftovers.add(leftovers_dict) 74 | 75 | ts_leftovers_slang = TextSearch("insensitive", "norm") 76 | ts_leftovers_slang.add(contractions_dict) 77 | ts_leftovers_slang.add(leftovers_dict) 78 | ts_leftovers_slang.add(slang_dict) 79 | 80 | ts_slang = TextSearch("insensitive", "norm") 81 | ts_slang.add(contractions_dict) 82 | ts_slang.add(slang_dict) 83 | 84 | ts_basic = TextSearch("insensitive", "norm") 85 | ts_basic.add(contractions_dict) 86 | 87 | ts_view_window = TextSearch("insensitive", "object") 88 | ts_view_window.add(list(contractions_dict.keys())) 89 | 
ts_view_window.add(list(leftovers_dict.keys())) 90 | ts_view_window.add(list(slang_dict.keys())) 91 | 92 | replacers = { 93 | (True, False): ts_leftovers, 94 | (True, True): ts_leftovers_slang, 95 | (False, True): ts_slang, 96 | (False, False): ts_basic, 97 | } 98 | 99 | 100 | def fix(s, leftovers=True, slang=True): 101 | ts = replacers[(leftovers, slang)] 102 | return ts.replace(s) 103 | 104 | 105 | def add(key, value): 106 | for ts in replacers.values(): 107 | ts.add(key, value) 108 | 109 | 110 | def preview(text, flank): 111 | """ 112 | Return all contractions and their location before fix for manual check. Also provide a viewing window to quickly 113 | preview the contractions in the text. 114 | :param text: texture. 115 | :param flank: int number, control the size of the preview window. The window would be "flank-contraction-flank". 116 | :return: preview_items, a list includes all matched contractions and their locations. 117 | """ 118 | try: 119 | int(flank) 120 | except Exception as e: 121 | print(e) 122 | raise Exception("Argument flank must be integer!") 123 | ts = ts_view_window 124 | results = ts.findall(text) 125 | preview_items = [] 126 | for result in results: 127 | window_start = result.start - flank 128 | window_end = result.end + flank 129 | if window_start < 0: 130 | window_start = 0 131 | if window_end > len(text): 132 | window_end = len(text) 133 | preview_items.append({"match": result.match, "start": result.start, "end": result.end, 134 | "viewing_window": text[window_start: window_end]}) 135 | return preview_items 136 | -------------------------------------------------------------------------------- /contractions/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kootenpv/contractions/595188a45c472957427b3a6a07b6ce0492990fad/contractions/data/__init__.py -------------------------------------------------------------------------------- 
/contractions/data/contractions_dict.json: -------------------------------------------------------------------------------- 1 | { 2 | "I'm": "I am", 3 | "I'm'a": "I am about to", 4 | "I'm'o": "I am going to", 5 | "I've": "I have", 6 | "I'll": "I will", 7 | "I'll've": "I will have", 8 | "I'd": "I would", 9 | "I'd've": "I would have", 10 | "Whatcha": "What are you", 11 | "amn't": "am not", 12 | "ain't": "are not", 13 | "aren't": "are not", 14 | "'cause": "because", 15 | "can't": "cannot", 16 | "can't've": "cannot have", 17 | "could've": "could have", 18 | "couldn't": "could not", 19 | "couldn't've": "could not have", 20 | "daren't": "dare not", 21 | "daresn't": "dare not", 22 | "dasn't": "dare not", 23 | "didn't": "did not", 24 | "didn’t": "did not", 25 | "don't": "do not", 26 | "don’t": "do not", 27 | "doesn't": "does not", 28 | "e'er": "ever", 29 | "everyone's": "everyone is", 30 | "finna": "fixing to", 31 | "gimme": "give me", 32 | "gon't": "go not", 33 | "gonna": "going to", 34 | "gotta": "got to", 35 | "hadn't": "had not", 36 | "hadn't've": "had not have", 37 | "hasn't": "has not", 38 | "haven't": "have not", 39 | "he've": "he have", 40 | "he's": "he is", 41 | "he'll": "he will", 42 | "he'll've": "he will have", 43 | "he'd": "he would", 44 | "he'd've": "he would have", 45 | "here's": "here is", 46 | "how're": "how are", 47 | "how'd": "how did", 48 | "how'd'y": "how do you", 49 | "how's": "how is", 50 | "how'll": "how will", 51 | "isn't": "is not", 52 | "it's": "it is", 53 | "'tis": "it is", 54 | "'twas": "it was", 55 | "it'll": "it will", 56 | "it'll've": "it will have", 57 | "it'd": "it would", 58 | "it'd've": "it would have", 59 | "kinda": "kind of", 60 | "let's": "let us", 61 | "luv": "love", 62 | "ma'am": "madam", 63 | "may've": "may have", 64 | "mayn't": "may not", 65 | "might've": "might have", 66 | "mightn't": "might not", 67 | "mightn't've": "might not have", 68 | "must've": "must have", 69 | "mustn't": "must not", 70 | "mustn't've": "must not have", 71 
| "needn't": "need not", 72 | "needn't've": "need not have", 73 | "ne'er": "never", 74 | "o'": "of", 75 | "o'clock": "of the clock", 76 | "ol'": "old", 77 | "oughtn't": "ought not", 78 | "oughtn't've": "ought not have", 79 | "o'er": "over", 80 | "shan't": "shall not", 81 | "sha'n't": "shall not", 82 | "shalln't": "shall not", 83 | "shan't've": "shall not have", 84 | "she's": "she is", 85 | "she'll": "she will", 86 | "she'd": "she would", 87 | "she'd've": "she would have", 88 | "should've": "should have", 89 | "shouldn't": "should not", 90 | "shouldn't've": "should not have", 91 | "so've": "so have", 92 | "so's": "so is", 93 | "somebody's": "somebody is", 94 | "someone's": "someone is", 95 | "something's": "something is", 96 | "sux": "sucks", 97 | "that're": "that are", 98 | "that's": "that is", 99 | "that'll": "that will", 100 | "that'd": "that would", 101 | "that'd've": "that would have", 102 | "'em": "them", 103 | "there're": "there are", 104 | "there's": "there is", 105 | "there'll": "there will", 106 | "there'd": "there would", 107 | "there'd've": "there would have", 108 | "these're": "these are", 109 | "they're": "they are", 110 | "they've": "they have", 111 | "they'll": "they will", 112 | "they'll've": "they will have", 113 | "they'd": "they would", 114 | "they'd've": "they would have", 115 | "this's": "this is", 116 | "this'll": "this will", 117 | "this'd": "this would", 118 | "those're": "those are", 119 | "to've": "to have", 120 | "wanna": "want to", 121 | "wasn't": "was not", 122 | "we're": "we are", 123 | "we've": "we have", 124 | "we'll": "we will", 125 | "we'll've": "we will have", 126 | "we'd": "we would", 127 | "we'd've": "we would have", 128 | "weren't": "were not", 129 | "what're": "what are", 130 | "what'd": "what did", 131 | "what've": "what have", 132 | "what's": "what is", 133 | "what'll": "what will", 134 | "what'll've": "what will have", 135 | "when've": "when have", 136 | "when's": "when is", 137 | "where're": "where are", 138 | "where'd": 
"where did", 139 | "where've": "where have", 140 | "where's": "where is", 141 | "which's": "which is", 142 | "who're": "who are", 143 | "who've": "who have", 144 | "who's": "who is", 145 | "who'll": "who will", 146 | "who'll've": "who will have", 147 | "who'd": "who would", 148 | "who'd've": "who would have", 149 | "why're": "why are", 150 | "why'd": "why did", 151 | "why've": "why have", 152 | "why's": "why is", 153 | "will've": "will have", 154 | "won't": "will not", 155 | "won't've": "will not have", 156 | "would've": "would have", 157 | "wouldn't": "would not", 158 | "wouldn't've": "would not have", 159 | "y'all": "you all", 160 | "y'all're": "you all are", 161 | "y'all've": "you all have", 162 | "y'all'd": "you all would", 163 | "y'all'd've": "you all would have", 164 | "you're": "you are", 165 | "you've": "you have", 166 | "you'll've": "you shall have", 167 | "you'll": "you will", 168 | "you'd": "you would", 169 | "you'd've": "you would have", 170 | 171 | "to cause": "to cause", 172 | "will cause": "will cause", 173 | "should cause": "should cause", 174 | "would cause": "would cause", 175 | "can cause": "can cause", 176 | "could cause": "could cause", 177 | "must cause": "must cause", 178 | "might cause": "might cause", 179 | "shall cause": "shall cause", 180 | "may cause": "may cause" 181 | } 182 | -------------------------------------------------------------------------------- /contractions/data/leftovers_dict.json: -------------------------------------------------------------------------------- 1 | { 2 | "'all": "", 3 | "'am": "", 4 | "'cause": "because", 5 | "'d": " would", 6 | "'ll": " will", 7 | "'re": " are", 8 | "'em": " them", 9 | "doin'": "doing", 10 | "goin'": "going", 11 | "nothin'": "nothing", 12 | "somethin'": "something", 13 | "havin'": "having", 14 | "lovin'": "loving", 15 | "'coz": "because", 16 | "thats": "that is", 17 | "whats": "what is" 18 | } 19 | -------------------------------------------------------------------------------- 
/contractions/data/slang_dict.json: -------------------------------------------------------------------------------- 1 | { 2 | "'aight": "alright", 3 | "abt": "about", 4 | "acct": "account", 5 | "altho": "although", 6 | "asap": "as soon as possible", 7 | "avg": "average", 8 | "b4": "before", 9 | "bc": "because", 10 | "bday": "birthday", 11 | "btw": "by the way", 12 | "convo": "conversation", 13 | "cya": "see ya", 14 | "diff": "different", 15 | "dunno": "do not know", 16 | "g'day": "good day", 17 | "gimme": "give me", 18 | "gonna": "going to", 19 | "gotta": "got to", 20 | "howdy": "how do you do", 21 | "idk": "I do not know", 22 | "ima": "I am going to", 23 | "imma": "I am going to", 24 | "innit": "is it not", 25 | "iunno": "I do not know", 26 | "kk": "okay", 27 | "lemme": "let me", 28 | "msg": "message", 29 | "nvm": "nevermind", 30 | "ofc": "of course", 31 | "ppl": "people", 32 | "prolly": "probably", 33 | "pymnt": "payment", 34 | "r ": "are ", 35 | "rlly": "really", 36 | "rly": "really", 37 | "rn": "right now", 38 | "spk": "spoke", 39 | "tbh": "to be honest", 40 | "tho": "though", 41 | "thx": "thanks", 42 | "tlked": "talked", 43 | "tmmw": "tomorrow", 44 | "tmr": "tomorrow", 45 | "tmrw": "tomorrow", 46 | "u": "you", 47 | "ur": "you are", 48 | "wanna": "want to", 49 | "woulda": "would have" 50 | } 51 | -------------------------------------------------------------------------------- /contractions/test___init__.py: -------------------------------------------------------------------------------- 1 | import contractions 2 | 3 | 4 | def test_fix(): 5 | assert contractions.fix("you're happy now") == "you are happy now" 6 | 7 | 8 | def test_insensitivity(): 9 | assert contractions.fix("You're happier now") == "You are happier now" 10 | 11 | 12 | def test_add(): 13 | contractions.add('mychange', 'my change') 14 | assert contractions.fix('mychange') == 'my change' 15 | 16 | 17 | def test_ill(): 18 | txt = 'He is to step down at the end of the week due to ill health' 19 | 
assert contractions.fix(txt) == txt 20 | assert contractions.fix("I'll") == "I will" 21 | 22 | 23 | def test_preview(): 24 | text = "This's a simple test including two sentences. I'd use it to test preview()." 25 | preview_items = contractions.preview(text, flank=10) 26 | print(preview_items) 27 | assert len(preview_items) == 2 28 | assert preview_items[0]['match'] == "This's" 29 | assert preview_items[1]['match'] == "I'd" 30 | assert text[preview_items[0]['start']: preview_items[0]['end']] == "This's" 31 | assert text[preview_items[1]['start']: preview_items[1]['end']] == "I'd" 32 | assert "This's" in preview_items[0]["viewing_window"] 33 | assert "I'd" in preview_items[1]["viewing_window"] 34 | text2 = "" 35 | preview_items2 = contractions.preview(text2, flank=10) 36 | assert preview_items2 == [] 37 | -------------------------------------------------------------------------------- /deploy.py: -------------------------------------------------------------------------------- 1 | """ File unrelated to the package, except for convenience in deploying """ 2 | import re 3 | import sh 4 | import os 5 | 6 | commit_count = sh.git("rev-list", ["--all"]).count("\n") 7 | 8 | with open("setup.py") as f: 9 | setup = f.read() 10 | 11 | setup = re.sub('MICRO_VERSION = "[0-9]+"', 'MICRO_VERSION = "{}"'.format(commit_count), setup) 12 | 13 | major = re.search('MAJOR_VERSION = "([0-9]+)"', setup).groups()[0] 14 | minor = re.search('MINOR_VERSION = "([0-9]+)"', setup).groups()[0] 15 | micro = re.search('MICRO_VERSION = "([0-9]+)"', setup).groups()[0] 16 | version = "{}.{}.{}".format(major, minor, micro) 17 | 18 | with open("setup.py", "w") as f: 19 | f.write(setup) 20 | 21 | with open("contractions/__init__.py") as f: 22 | init = f.read() 23 | 24 | with open("contractions/__init__.py", "w") as f: 25 | f.write(re.sub('__version__ = "[0-9.]+"', '__version__ = "{}"'.format(version), init)) 26 | 27 | os.system("python setup.py sdist bdist_wheel") 28 | os.system("twine upload dist/*") 29 
| -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | textsearch>=0.0.21 2 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_rpm] 2 | doc_files = README.rst 3 | 4 | [wheel] 5 | universal = 1 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages 2 | from setuptools import setup 3 | 4 | MAJOR_VERSION = "0" 5 | MINOR_VERSION = "1" 6 | MICRO_VERSION = "72" 7 | VERSION = "{}.{}.{}".format(MAJOR_VERSION, MINOR_VERSION, MICRO_VERSION) 8 | 9 | setup( 10 | name="contractions", 11 | version=VERSION, 12 | description="Fixes contractions such as `you're` to you `are`", 13 | author="Pascal van Kooten", 14 | url="https://github.com/kootenpv/contractions", 15 | package_data={ 16 | # If any package contains *.txt or *.rst files, include them: 17 | # '': ['*.txt', '*.rst'], 18 | "contractions": ["data/*.json"] 19 | }, 20 | author_email="kootenpv@gmail.com", 21 | classifiers=[ 22 | "Intended Audience :: Developers", 23 | "Intended Audience :: Customer Service", 24 | "Intended Audience :: System Administrators", 25 | "Operating System :: Microsoft", 26 | "Operating System :: MacOS :: MacOS X", 27 | "Operating System :: Unix", 28 | "Operating System :: POSIX", 29 | "Programming Language :: Python", 30 | "Programming Language :: Python :: 2.7", 31 | "Programming Language :: Python :: 3", 32 | "Programming Language :: Python :: 3.4", 33 | "Programming Language :: Python :: 3.5", 34 | "Programming Language :: Python :: 3.6", 35 | "Topic :: Software Development", 36 | "Topic :: Software Development :: Libraries", 37 | "Topic :: Software Development :: Libraries 
:: Python Modules", 38 | "Topic :: System :: Software Distribution", 39 | "Topic :: System :: Systems Administration", 40 | "Topic :: Utilities", 41 | ], 42 | license="MIT", 43 | packages=find_packages(), 44 | zip_safe=False, 45 | platforms="any", 46 | install_requires=["textsearch>=0.0.21"], 47 | ) 48 | --------------------------------------------------------------------------------