├── .gitignore ├── .travis.yml ├── LICENSE.txt ├── README.md ├── quotefix.py ├── quotefix_test.py ├── setup.cfg └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.egg-info 3 | build 4 | dist 5 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | python: 4 | - "2.7" 5 | - "3.5" 6 | 7 | script: python setup.py test 8 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016 Peter Teichman 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | quotefix 2 | ======== 3 | 4 | This Python module inserts punctuation to balance quotation marks, 5 | parentheses, and brackets that are unmatched in a Unicode string. 6 | 7 | It allows you to use text synthesis techniques that are unaware of 8 | these internal clauses and still clean things up nicely for 9 | presentation. 10 | 11 | Like this: 12 | 13 | >>> quotefix('She said, "I ran out (to the store') 14 | 'She said, "I ran out (to the store)"' 15 | 16 | >>> quotefix("it works in reverse too)") 17 | '(it works in reverse too)' 18 | 19 | You came here because you want to... 20 | 21 | * Install the latest release: `pip install quotefix` 22 | * Read the documentation: `pydoc quotefix` 23 | * File a bug: https://github.com/pteichman/quotefix/issues 24 | * Submit a change: file a pull request at https://github.com/pteichman/quotefix/pulls 25 | 26 | Assumptions 27 | =========== 28 | 29 | The main assumption made by quotefix is that punctuation ending a 30 | nested clause (a parenthetic aside, a quotation, etc) should also end 31 | any other open clauses. Likewise, punctuation opening a clause should 32 | open any unopened clauses in the remainder of the string. 33 | 34 | A secondary assumption concerns the direction of the double quotation 35 | mark: ". If this character is closer to the beginning of a word, it's 36 | assumed to be an open quote. 
37 | 38 | History 39 | ======= 40 | 41 | quotefix started as a function in Fate, a fast & scalable trigram chat 42 | framework: https://github.com/pteichman/fate 43 | -------------------------------------------------------------------------------- /quotefix.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | # Copyright (c) 2016 Peter Teichman 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining 7 | # a copy of this software and associated documentation files (the 8 | # "Software"), to deal in the Software without restriction, including 9 | # without limitation the rights to use, copy, modify, merge, publish, 10 | # distribute, sublicense, and/or sell copies of the Software, and to 11 | # permit persons to whom the Software is furnished to do so, subject to 12 | # the following conditions: 13 | 14 | # The above copyright notice and this permission notice shall be 15 | # included in all copies or substantial portions of the Software. 16 | 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 19 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 21 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 22 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 23 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
from collections import namedtuple
import re

__all__ = ["quotefix", "quotefix_skip"]

# One character of the input together with its role (a TokenType value).
Token = namedtuple("Token", "type char")

# Opening characters and their closing counterparts: the same index in both
# strings forms a pair. The straight double quote appears in both because it
# can either open or close a quotation (see direction()).
fwdchars = u'({[“"'
revchars = u')}]”"'

# Splits text into alternating runs of whitespace and non-whitespace, so that
# joining the pieces reproduces the input exactly. Compiled once; the raw
# string avoids the invalid "\s" / "\S" escapes of a plain string literal.
_WORD_RE = re.compile(r"(\s+|\S+)", re.UNICODE)


class TokenType:
    """Roles a Token can have: plain text, clause opener, or clause closer."""
    Literal = "Literal"
    Open = "Open"
    Close = "Close"


def quotefix(text):
    """Return text with matching punctuation inserted for unmatched quotes,
    parentheses, braces and brackets.

    Equivalent to quotefix_skip() with a skip function that never skips.
    """
    def false(text):
        return False
    return quotefix_skip(text, false)


def quotefix_skip(text, skipfunc):
    """Return text with matching punctuation inserted, ignoring some words.

    text is split into runs of whitespace and non-whitespace. skipfunc is
    called with each run; when it returns a true value, every character of
    that run is treated as literal text (useful for emoticons like ":)").
    All other runs are scanned for opening and closing punctuation.
    """
    tokens = []

    for word in _WORD_RE.findall(text):
        if skipfunc(word):
            tokens.extend(literals(word))
        else:
            tokens.extend(quotetokens(word))

    return "".join(token.char for token in fixrev(fixfwd(tokens)))


def quotetokens(word):
    """Return one Token per character of word, classified by role."""
    ret = []
    for i, c in enumerate(word):
        if c == '"':
            # A straight quote is ambiguous; decide from its position.
            ret.append(Token(type=direction(word, i), char=c))
        elif c in fwdchars:
            ret.append(Token(type=TokenType.Open, char=c))
        elif c in revchars:
            ret.append(Token(type=TokenType.Close, char=c))
        else:
            ret.append(Token(type=TokenType.Literal, char=c))

    return ret


def literals(word):
    """Return one Literal Token per character of word."""
    return [Token(type=TokenType.Literal, char=c) for c in word]


def mirror(c):
    """Return the counterpart of an opening or closing character.

    Characters that are in neither fwdchars nor revchars are returned
    unchanged.
    """
    i = fwdchars.find(c)
    if i >= 0:
        return revchars[i]

    i = revchars.find(c)
    if i >= 0:
        return fwdchars[i]

    return c


def direction(word, pos):
    """Return the direction (Open/Close) of a quote char at pos in word.

    A quote in the first half of the word opens; otherwise it closes.
    """
    # Integer comparison instead of pos < len(word)/2, so Python 2 (floor
    # division) and Python 3 (true division) classify the same way.
    if 2 * pos < len(word):
        return TokenType.Open
    return TokenType.Close


def _balance(tokens, opener, closer):
    """Return a list of tokens with `closer` tokens inserted for every
    unmatched `opener` token, scanning tokens in the order given.

    Shared by fixfwd() and fixrev(); fixrev() runs it over the reversed
    token stream with the roles of Open and Close swapped.
    """
    stack = []
    ret = []

    for t in tokens:
        if t.type == opener:
            stack.append(t.char)
        elif stack and t.type == closer:
            want = mirror(t.char)
            if want in stack:
                # Ending a clause also ends every clause nested inside it:
                # unwind to the nearest matching opener, closing the
                # intermediate ones on the way.
                while True:
                    prev = stack.pop()
                    if prev == want:
                        break
                    ret.append(Token(type=closer, char=mirror(prev)))
            else:
                # Nothing on the stack pairs with t: end the innermost
                # clause and let the other pass pair up t itself.
                prev = stack.pop()
                ret.append(Token(type=closer, char=mirror(prev)))
        ret.append(t)

    # Whatever is still open at the end gets closed, innermost first.
    while stack:
        ret.append(Token(type=closer, char=mirror(stack.pop())))

    return ret


def fixfwd(tokens):
    """fixfwd inserts close tokens for unmatched opens.

    Returns a new list of tokens; close tokens that have no opener are
    left in place for fixrev() to handle.
    """
    return _balance(tokens, TokenType.Open, TokenType.Close)


def fixrev(tokens):
    """fixrev inserts open tokens for unmatched closes.

    tokens must support reversed(). Returns a new list of tokens in
    forward order.
    """
    ret = _balance(reversed(tokens), TokenType.Close, TokenType.Open)
    ret.reverse()
    return ret
import re
import unittest

# Verify that __all__ is set properly in quotefix.
from quotefix import *


class TestQuotefix(unittest.TestCase):
    """Table-driven tests for quotefix() and quotefix_skip()."""

    def test_basic(self):
        """quotefix() balances quotes, parentheses and brackets."""
        # (text, expected)
        tests = [
            (u"this is \"a test", u"this is \"a test\""),
            (u"\"this\" is \"a test", u"\"this\" is \"a test\""),
            (u"this) is \"a test", u"(this) is \"a test\""),
            (u"this)) is ((a test)", u"((this)) is ((a test))"),
            (u"this]) is ((a test)", u"([this]) is ((a test))"),
            (u"this” is “a test”", u"“this” is “a test”"),
            (u"(this is a test\"", u"\"(this is a test)\""),
        ]

        for text, expected in tests:
            fixed = quotefix(text)
            self.assertEqual(fixed, expected,
                             "quotefix(%s) -> %s, want %s" % (text, fixed, expected))

    def test_skipfunc(self):
        """quotefix_skip() leaves words accepted by the skip function alone."""
        def isemoticon(word):
            # Raw string: "\(" and "\)" are not valid escapes in a plain
            # string literal. The pattern itself is unchanged.
            return re.match(r"[:;]-*[\(\)]+", word, re.UNICODE)

        # (text, expected)
        tests = [
            (u"this is a test :)", u"this is a test :)"),
            (u":) :( :-) :-( ;)", u":) :( :-) :-( ;)"),
        ]

        for text, expected in tests:
            fixed = quotefix_skip(text, isemoticon)
            # Name the function actually under test in the failure message.
            self.assertEqual(fixed, expected,
                             "quotefix_skip(%s) -> %s, want %s" % (text, fixed, expected))


if __name__ == "__main__":
    unittest.main()
test_suite="quotefix_test" 17 | ) 18 | --------------------------------------------------------------------------------