├── depoeticizer.js
├── README.md
├── depoeticize.cgi
├── depoeticizer.css
├── depoeticizer.html
└── depoeticizer.py

/depoeticizer.js:
--------------------------------------------------------------------------------
1 | function depoeticize()
2 | {
3 |     $(".depoeticizer-output").html("Depoeticizing...");
4 |     $.ajax({
5 |         type: "GET",
6 |         url: "/apps/dp/depoeticize.cgi",
7 |         dataType: "text",
8 |         data: {
9 |             text: $("#textinput").val(),
10 |             model: $("#corpus").val(),
11 |             errorprob: $("#errorprob").val()
12 |         },
13 |         complete: function (response) {
14 |             if (response.statusText === "OK") {
15 |                 var txt = response.responseText;
16 |                 $(".depoeticizer-output").html(txt);
17 |             } else {
18 |                 $(".depoeticizer-output").html("Error: " + response.statusText);
19 |             }
20 |         }
21 |     });
22 | }
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | depoeticizer
2 | ============
3 | 
4 | This program modifies a text to bring it more into line with the expectations created by an n-gram model. It works by assuming that the text was generated by the model with a certain chance of "typos" occurring. Based on this, it determines what the author is most likely to have "really" meant, with the typos removed. It is basically a parody of autocorrect algorithms like the ones used in OCR. The main differences are that 1) this program will "correct" words even if they are proper English, and 2) it is tuned to be far more likely to make "corrections" than any real-world autocorrector would be.
5 | 
6 | To run this, you will need Python 2 with [kenlm](https://github.com/kpu/kenlm) and [NLTK](https://www.nltk.org/) installed (NLTK provides the tokenizer), and you will need to download
7 | an ARPA-format language model (you can find some [here](http://www.keithv.com/software/csr/)). The program can run as a Web application, or you can load it as a Python module and play with it on the command line.
8 | 
9 | For better performance, you may also want to convert the .arpa file into .binary format using KenLM's build_binary program.
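A minimal command-line session of the kind the README describes (a sketch, not a file in the repository; it assumes kenlm and NLTK are installed and that you have already downloaded or built a model, with the filename below standing in for whichever model you actually use):

    import depoeticizer
    depoeticizer.ERROR_PROB = 0.2  # same knob the web form exposes
    depoeticizer.load_language_model('lm_csr_5k_nvp_3gram.binary')  # illustrative filename
    print depoeticizer.depoeticize("Season of mists and mellow fruitfulness", n=3)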
--------------------------------------------------------------------------------
/depoeticize.cgi:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 | 
4 | # CGI setup
5 | 
6 | import cgi
7 | 
8 | data = cgi.FieldStorage()
9 | 
10 | text = data.getfirst('text')
11 | model = data.getfirst('model')
12 | error_prob = data.getfirst('errorprob')
13 | 
14 | print "Content-Type: text/html;charset=utf-8"
15 | print
16 | 
17 | # Data validation
18 | 
19 | import re
20 | 
21 | text = str(text)
22 | model = str(model)
23 | error_prob = float(error_prob)
24 | 
25 | if len(text) > 2000:
26 |     print "Text must be under 2000 characters!"
27 |     exit()
28 | 
29 | if model not in ('lm_csr_5k_nvp_2gram', 'lm_csr_5k_nvp_3gram',
30 |                  'lm_csr_20k_nvp_2gram', 'lm_csr_20k_nvp_3gram'):
31 |     print "Invalid language model name!"
32 |     exit()
33 | model = model + '.binary'
34 | if '2gram' in model:
35 |     n = 2
36 | else:
37 |     n = 3
38 | 
39 | if error_prob <= 0.0 or error_prob >= 1.0:
40 |     print "Probability of typo must be between 0 and 1!"
41 |     exit()
42 | 
43 | # Load settings
44 | 
45 | import depoeticizer
46 | depoeticizer.ERROR_PROB = error_prob
47 | depoeticizer.load_language_model(model)
48 | 
49 | # Depoeticize
50 | 
51 | text = depoeticizer.depoeticize(text, n=n)
52 | print text.replace('\n', '<br>').replace(' ', '&nbsp;')
--------------------------------------------------------------------------------
/depoeticizer.css:
--------------------------------------------------------------------------------
1 | select, input {
2 |     font-size: 16px !important;
3 | }
4 | 
5 | body {
6 |     font-family: Lato, sans-serif;
7 |     font-size: 14px;
8 |     margin: 1px;
9 | }
10 | 
11 | textarea {
12 |     font-family: Lato, sans-serif;
13 |     font-size: 12px;
14 | }
15 | 
16 | select {
17 |     float: right;
18 |     width: 300px;
19 | }
20 | 
21 | .select-label {
22 |     float: left;
23 |     width: 140px;
24 |     position: relative;
25 |     top: 3px;
26 | }
27 | 
28 | .details-box {
29 |     outline: 1px solid grey;
30 |     width: 290px;
31 |     padding: 5px;
32 |     margin: 10px 0px;
33 | }
34 | 
35 | .details-heading {
36 |     margin-bottom: 2px;
37 | }
38 | 
39 | .io-area {
40 |     float: left;
41 |     width: 50%;
42 | }
43 | 
44 | .depoeticizer-input {
45 |     outline: 0;
46 |     width: 100%;
47 |     box-sizing: border-box;
48 |     -moz-box-sizing: border-box;
49 |     -webkit-box-sizing: border-box;
50 |     height: 318px;
51 |     padding: 5px;
52 |     resize: none;
53 | }
54 | 
55 | .depoeticizer-output-area {
56 |     outline: 1px solid grey;
57 |     width: 100%;
58 |     box-sizing: border-box;
59 |     -moz-box-sizing: border-box;
60 |     -webkit-box-sizing: border-box;
61 |     height: 316px;
62 |     margin-top: 1px;
63 |     padding: 5px;
64 |     overflow: scroll;
65 |     font-size: 12px;
66 | }
--------------------------------------------------------------------------------
/depoeticizer.html:
--------------------------------------------------------------------------------
[Markup not recovered in this copy. The page presents a "Select a corpus:" drop-down, a "Probability of typo:" field, an "Enter your text here:" text area, and an output panel that initially reads "Your output will appear here.", driven by depoeticizer.js and styled by depoeticizer.css.]
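The form above sends its three fields to depoeticize.cgi as a GET request (see depoeticizer.js). A hypothetical smoke test of that endpoint from Python 2; the localhost host name is an assumption about where the script is deployed, while the /apps/dp/ path is the one hard-coded in depoeticizer.js:

    import urllib, urllib2
    params = urllib.urlencode({
        'text': "Season of mists and mellow fruitfulness",
        'model': 'lm_csr_5k_nvp_3gram',
        'errorprob': '0.2',
    })
    print urllib2.urlopen('http://localhost/apps/dp/depoeticize.cgi?' + params).read()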
58 | 59 | 60 | -------------------------------------------------------------------------------- /depoeticizer.py: -------------------------------------------------------------------------------- 1 | import kenlm 2 | from math import log, ceil 3 | 4 | # Maximum number of errors that a word can have based on its length. 5 | MAX_ERRORS_PER_LETTER = 1.0 / 4.0 6 | MAX_ERRORS = 2 7 | 8 | # Probability that a single character will be deleted, etc. 9 | ERROR_PROB = 0.2 10 | 11 | # ARPA language model. 12 | lm = None 13 | 14 | # Ngram probabilities computed using an ARPA-format language model. 15 | def get_ngram_prob(ngram): 16 | ngram = [tok.lower() for tok in ngram] 17 | return lm.score(' '.join(ngram)) 18 | 19 | # Reading probabilities are computed using a simple edit-distance algorithm such as a 20 | # spell-checker might use. 21 | alphabet = 'abcdefghijklmnopqrstuvwxyz' 22 | def isknown(word): 23 | return word.lower() in lm 24 | def get_reading_probs(tok): 25 | readings = {} 26 | def add_tok_readings(tok, prob, n, known): 27 | if prob == 0.0: 28 | return 29 | if n == 0: 30 | reading_prob = prob 31 | else: 32 | reading_prob = prob * (1.0 - ERROR_PROB) 33 | if known: 34 | if tok not in readings: 35 | readings[tok] = reading_prob 36 | if n == 0: 37 | return 38 | # Adapted from http://norvig.com/spell-correct.html. 39 | s = [(tok[:i], tok[i:]) for i in range(len(tok) + 1)] 40 | deletions = [a + b[1:] for a, b in s if b] 41 | transpositions = [a + b[1] + b[0] + b[2:] for a, b in s if len(b)>1] 42 | substitutions = [a + c + b[1:] for a, b in s for c in alphabet if b] 43 | insertions = [a + c + b for a, b in s for c in alphabet] 44 | edits = set(deletions + transpositions + substitutions + insertions) 45 | for edit in edits: 46 | # Determining the probabilities this way is cheating, really, since I should 47 | # be accounting for the fact that the observed token is only one of many 48 | # possible typos that could be generated from a given word. But my 49 | # intentions in this project are not serious enough to warrant that much 50 | # effort. 51 | add_tok_readings(edit, prob * ERROR_PROB, n-1, isknown(edit)) 52 | add_tok_readings(tok, 1.0, min(int(ceil(MAX_ERRORS_PER_LETTER * len(tok))), MAX_ERRORS), True) 53 | reading_list = [] 54 | for new_tok in readings: 55 | logprob = log(readings[new_tok]) 56 | if tok.isupper() and len(tok) > 1: 57 | new_tok = new_tok.upper() 58 | elif tok[0].isupper(): 59 | new_tok = new_tok[0].upper() + new_tok[1:].lower() 60 | reading_list.append((new_tok, logprob)) 61 | return reading_list 62 | 63 | def compute_ideal_token_seq(tokens, n=3, get_ngram_prob=get_ngram_prob, 64 | get_reading_probs=get_reading_probs): 65 | 66 | # Computes an "ideal" version of the specified token sequence. This 67 | # function assumes that the text was generated using a n-gram-based 68 | # Markov chain - that is, in such a way that the probability of a given 69 | # token coming next is determined based on what n-1 tokens came before - 70 | # with a certain probability of a "typo" each time a new token is 71 | # written. Two functions must be specified, one giving the probability 72 | # that a specified n-token sequence will appear, and the other 73 | # giving a list of all possible words that might have been written as 74 | # a given token, with probabilities. Based on the combination of these 75 | # two models, the function returns the form that the token sequence 76 | # is most likely to have taken before the "typos." 
77 | 78 | # This function uses a variant of the Viterbi algorithm where the possible 79 | # state sequences are stored in nested dictionaries rather than a matrix. 80 | # This is because the total number of possible states is enormous (equal 81 | # to the number of possible n-grams), but the number of states that can 82 | # produce a given token is relatively small, which would result in an 83 | # extremely sparse matrix. 84 | 85 | # The first key of d is the most recent token and the second is the token 86 | # before that, etc. The value is a pair containing the probability and the 87 | # complete token sequence up to that point, including the two tokens that 88 | # are used as keys. 89 | 90 | d = {} 91 | 92 | # The computation for the initial state works a little differently from the 93 | # later computations because we don't have values for the previous tokens. 94 | # Instead, we have to compute the probabilities for all possible readings 95 | # of the first ngram. 96 | 97 | if len(tokens) < n: 98 | print 'Text must be at least {0} tokens long!'.format(n) 99 | exit() 100 | 101 | # We want to record all possible values for tokens 2-n, and only 102 | # the optimal values for the first token. 103 | def compute_initial_values(i, toks=[], prob=1.0): 104 | if i == 0: 105 | max_prob = float("-inf") 106 | best_tok0 = None 107 | r = get_reading_probs(tokens[0]) 108 | for tok0, prob0 in r: 109 | ngram_prob = get_ngram_prob([tok0] + toks) 110 | final_prob = prob + ngram_prob 111 | if final_prob > max_prob: 112 | max_prob = final_prob 113 | best_tok0 = tok0 114 | return (max_prob, [best_tok0] + toks) 115 | else: 116 | d = {} 117 | r = get_reading_probs(tokens[i]) 118 | #print tokens[i], ':', len(r) 119 | for toki, probi in r: 120 | d[toki] = compute_initial_values(i-1, [toki] + toks, probi + prob) 121 | return d 122 | d = compute_initial_values(0) 123 | 124 | # Now proceed through the rest of the tokens. For each one we rebuild d, 125 | # keeping all options for the previous n-2 tokens and finding the optimal 126 | # values for the one before that. 127 | def iterate_possibilities(i, toks, prob, d, tok): 128 | if isinstance(d, tuple): 129 | # This should only happen near the beginning when we haven't built up a 130 | # history. 131 | prob0, seq0 = d 132 | ngram_prob = get_ngram_prob(toks) 133 | final_prob = ngram_prob + prob + prob0 134 | return (final_prob, seq0 + [tok]) 135 | if i == 0: 136 | max_prob = float("-inf") 137 | best_tok0 = None 138 | best_seq0 = None 139 | for tok0 in d: 140 | ngram_prob = get_ngram_prob([tok0] + toks) 141 | prob0, seq0 = d[tok0] 142 | final_prob = ngram_prob + prob + prob0 143 | if final_prob > max_prob: 144 | max_prob = final_prob 145 | best_tok0 = tok0 146 | best_seq0 = seq0 147 | return (max_prob, best_seq0 + [tok]) 148 | else: 149 | dnew = {} 150 | for toki in d: 151 | dnew[toki] = iterate_possibilities(i-1, [toki] + toks, prob, d[toki], tok) 152 | return dnew 153 | for tok in tokens[1:]: 154 | dnew = {} 155 | r = get_reading_probs(tok) 156 | #print tok, ':', len(r) 157 | for tokn, probn in r: 158 | dnew[tokn] = iterate_possibilities(n-2, [tokn], probn, d, tokn) 159 | d = dnew 160 | 161 | # Find the optimal sequence from all the possibilities that remain. 
162 | def extract_ideal_text(i, d, stats): 163 | if i == 0: 164 | prob, seq = d 165 | if prob > stats['max_prob']: 166 | stats['max_prob'] = prob 167 | stats['best_seq'] = seq 168 | else: 169 | for toki in d: 170 | extract_ideal_text(i-1, d[toki], stats) 171 | stats = {'max_prob': float("-inf"), 'best_seq': None} 172 | extract_ideal_text(n-1, d, stats) 173 | 174 | return stats['best_seq'] 175 | 176 | def load_language_model(filename): 177 | global lm 178 | lm = kenlm.LanguageModel(filename) 179 | 180 | def depoeticize(text, n=3, get_ngram_prob=get_ngram_prob, 181 | get_reading_probs=get_reading_probs): 182 | 183 | from nltk.tokenize import RegexpTokenizer 184 | tokenizer = RegexpTokenizer(r'[\w&]([\w&\']*[\w&])?|\S|\s') 185 | tokens = tokenizer.tokenize(text) 186 | 187 | tokens_alpha = [tok for tok in tokens if tok.isalpha()] 188 | ideal_tokens = compute_ideal_token_seq(tokens_alpha, n, get_ngram_prob, 189 | get_reading_probs) 190 | 191 | ideal_text = [] 192 | for tok in tokens: 193 | if tok.isalpha(): 194 | ideal_text.append(ideal_tokens.pop(0)) 195 | else: 196 | ideal_text.append(tok) 197 | return ''.join(ideal_text) 198 | 199 | 200 | if 0: 201 | load_language_model('lm_csr_20k_nvp_3gram.binary') 202 | print depoeticize('''She walks in beauty, like the night 203 | Of cloudless climes and starry skies; 204 | And all that's best of dark and bright 205 | Meet in her aspect and her eyes; 206 | Thus mellowed to that tender light 207 | Which heaven to gaudy day denies. 208 | 209 | One shade the more, one ray the less, 210 | Had half impaired the nameless grace 211 | Which waves in every raven tress, 212 | Or softly lightens o'er her face; 213 | Where thoughts serenely sweet express, 214 | How pure, how dear their dwelling-place. 215 | 216 | And on that cheek, and o'er that brow, 217 | So soft, so calm, yet eloquent, 218 | The smiles that win, the tints that glow, 219 | But tell of days in goodness spent, 220 | A mind at peace with all below, 221 | A heart whose love is innocent! 222 | ''') 223 | --------------------------------------------------------------------------------
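Because compute_ideal_token_seq() takes its two probability functions as parameters, the KenLM scorer can be swapped out. A toy sketch (not part of the repository; it only assumes kenlm is installed so that the module imports, and no language model needs to be loaded) in which hand-rolled scorers let the Viterbi-style search repair a single typo:

    import depoeticizer

    FAVOURED = ['the', 'cat', 'sat']

    def toy_ngram_prob(ngram):
        # Stand-in log-probability: contiguous pieces of the favoured phrase
        # score 0.0, everything else scores -10.0.
        ngram = [tok.lower() for tok in ngram]
        for i in range(len(FAVOURED) - len(ngram) + 1):
            if FAVOURED[i:i + len(ngram)] == ngram:
                return 0.0
        return -10.0

    def toy_reading_probs(tok):
        # Each observed token can be read as itself (no typo) or, at a small
        # log-probability penalty, as "cat".
        readings = [(tok, 0.0)]
        if tok != 'cat':
            readings.append(('cat', -2.0))
        return readings

    print depoeticizer.compute_ideal_token_seq(
        ['the', 'cot', 'sat'], n=3,
        get_ngram_prob=toy_ngram_prob, get_reading_probs=toy_reading_probs)
    # prints ['the', 'cat', 'sat']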