├── depoeticizer.js
├── README.md
├── depoeticize.cgi
├── depoeticizer.css
├── depoeticizer.html
└── depoeticizer.py
/depoeticizer.js:
--------------------------------------------------------------------------------
1 | function depoeticize()
2 | {
3 | $(".depoeticizer-output").html("Depoeticizing...");
4 | $.ajax({
5 | type: "GET",
6 | url: "/apps/dp/depoeticize.cgi",
7 | dataType: "text",
8 | data: {
9 | text: $("#textinput").val(),
10 | model: $("#corpus").val(),
11 | errorprob: $("#errorprob").val()
12 | },
13 | complete: function (response) {
14 | if (response.statusText == "OK") {
15 | var txt = response.responseText;
16 | $(".depoeticizer-output").html(txt);
17 | } else {
18 | }
19 | }
20 | });
21 | }
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | depoeticizer
2 | =========
3 |
4 | This program modifies a text so as to bring it more into line with the expectations created by an ngram model. It works by assuming that the text was generated by the model with a certain chance of "typos" occurring. Based on this, it determines what the author is most likely to have "really" meant, with the typos removed. It is basically a parody of autocorrect algorithms like the ones used in OCR. The main differences are 1) this program will "correct" words even if they are proper English; and 2) it is tuned so as to be far more likely to make "corrections" than any real world autocorrector would be.
5 |
6 | To run this, you will need to install [kenlm](https://github.com/kpu/kenlm) and download
7 | an ARPA-format language model (you can find some [here](http://www.keithv.com/software/csr/)). The program can run as a Web application, or you can load it as a Python module and play with it on the command line.
8 |
9 | For better performance, you may also want to convert the .arpa file into .binary format using KenLM's build_binary program.
--------------------------------------------------------------------------------
/depoeticize.cgi:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | # CGI setup
5 |
6 | import cgi
7 |
8 | data = cgi.FieldStorage()
9 |
10 | text = data.getfirst('text')
11 | model = data.getfirst('model')
12 | error_prob = data.getfirst('errorprob')
13 |
14 | print "Content-Type: text/html;charset=utf-8"
15 | print
16 |
17 | # Data validation
18 |
19 | import re
20 |
21 | text = str(text)
22 | model = str(model)
23 | error_prob = float(error_prob)
24 |
25 | if len(text) > 2000:
26 | print "Text must be under 2000 characters!"
27 | exit()
28 |
29 | if model not in ('lm_csr_5k_nvp_2gram', 'lm_csr_5k_nvp_3gram',
30 | 'lm_csr_20k_nvp_2gram', 'lm_csr_20k_nvp_3gram'):
31 | print "Invalid language model name!"
32 | exit()
33 | model = model + '.binary'
34 | if '2gram' in model:
35 | n = 2
36 | else:
37 | n = 3
38 |
39 | if error_prob <= 0.0 or error_prob >= 1.0:
40 | print "Probability of typo must be between 0 and 1!"
41 | exit()
42 |
43 | # Load settings
44 |
45 | import depoeticizer
46 | depoeticizer.ERROR_PROB = error_prob
47 | depoeticizer.load_language_model(model)
48 |
49 | # Depoeticize
50 |
51 | text = depoeticizer.depoeticize(text, n=n)
52 | print text.replace('\n', ' ').replace(' ', ' ')
53 |
--------------------------------------------------------------------------------
/depoeticizer.css:
--------------------------------------------------------------------------------
1 | select, input {
2 | font-size: 16px !important;
3 | }
4 |
5 | body {
6 | font-family: Lato, sans-serif;
7 | font-size: 14px;
8 | margin: 1px;
9 | }
10 |
11 | textarea {
12 | font-family: Lato, sans-serif;
13 | font-size: 12px;
14 | }
15 |
16 | select {
17 | float: right;
18 | width: 300px;
19 | }
20 |
21 | .select-label {
22 | float: left;
23 | width: 140px;
24 | position: relative;
25 | top: 3px;
26 | }
27 |
28 | .details-box {
29 | outline: 1px solid grey;
30 | width: 290px;
31 | padding: 5px;
32 | margin: 10px 0px;
33 | }
34 |
35 | .details-heading {
36 | margin-bottom: 2px;
37 | }
38 |
39 | .io-area {
40 | float: left;
41 | width: 50%;
42 | }
43 |
44 | .depoeticizer-input {
45 | outline: 0;
46 | width: 100%;
47 | box-sizing: border-box;
48 | -moz-box-sizing: border-box;
49 | -webkit-box-sizing: border-box;
50 | height: 318px;
51 | padding: 5px;
52 | resize: none;
53 | }
54 |
55 | .depoeticizer-output-area {
56 | outline: 1px solid grey;
57 | width: 100%;
58 | box-sizing: border-box;
59 | -moz-box-sizing: border-box;
60 | -webkit-box-sizing: border-box;
61 | height: 316px;
62 | margin-top: 1px;
63 | padding: 5px;
64 | overflow: scroll;
65 | font-size: 12px;
66 | }
67 |
--------------------------------------------------------------------------------
/depoeticizer.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
Select a corpus:
15 |
21 |
22 |
23 |
24 |
Probability of typo:
25 |
26 |
27 |
Enter your text here:
28 |
29 |
49 |
50 |
51 |
52 |
Your output will appear here.
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
--------------------------------------------------------------------------------
/depoeticizer.py:
--------------------------------------------------------------------------------
1 | import kenlm
2 | from math import log, ceil
3 |
4 | # Maximum number of errors that a word can have based on its length.
5 | MAX_ERRORS_PER_LETTER = 1.0 / 4.0
6 | MAX_ERRORS = 2
7 |
8 | # Probability that a single character will be deleted, etc.
9 | ERROR_PROB = 0.2
10 |
11 | # ARPA language model.
12 | lm = None
13 |
14 | # Ngram probabilities computed using an ARPA-format language model.
15 | def get_ngram_prob(ngram):
16 | ngram = [tok.lower() for tok in ngram]
17 | return lm.score(' '.join(ngram))
18 |
19 | # Reading probabilities are computed using a simple edit-distance algorithm such as a
20 | # spell-checker might use.
21 | alphabet = 'abcdefghijklmnopqrstuvwxyz'
22 | def isknown(word):
23 | return word.lower() in lm
24 | def get_reading_probs(tok):
25 | readings = {}
26 | def add_tok_readings(tok, prob, n, known):
27 | if prob == 0.0:
28 | return
29 | if n == 0:
30 | reading_prob = prob
31 | else:
32 | reading_prob = prob * (1.0 - ERROR_PROB)
33 | if known:
34 | if tok not in readings:
35 | readings[tok] = reading_prob
36 | if n == 0:
37 | return
38 | # Adapted from http://norvig.com/spell-correct.html.
39 | s = [(tok[:i], tok[i:]) for i in range(len(tok) + 1)]
40 | deletions = [a + b[1:] for a, b in s if b]
41 | transpositions = [a + b[1] + b[0] + b[2:] for a, b in s if len(b)>1]
42 | substitutions = [a + c + b[1:] for a, b in s for c in alphabet if b]
43 | insertions = [a + c + b for a, b in s for c in alphabet]
44 | edits = set(deletions + transpositions + substitutions + insertions)
45 | for edit in edits:
46 | # Determining the probabilities this way is cheating, really, since I should
47 | # be accounting for the fact that the observed token is only one of many
48 | # possible typos that could be generated from a given word. But my
49 | # intentions in this project are not serious enough to warrant that much
50 | # effort.
51 | add_tok_readings(edit, prob * ERROR_PROB, n-1, isknown(edit))
52 | add_tok_readings(tok, 1.0, min(int(ceil(MAX_ERRORS_PER_LETTER * len(tok))), MAX_ERRORS), True)
53 | reading_list = []
54 | for new_tok in readings:
55 | logprob = log(readings[new_tok])
56 | if tok.isupper() and len(tok) > 1:
57 | new_tok = new_tok.upper()
58 | elif tok[0].isupper():
59 | new_tok = new_tok[0].upper() + new_tok[1:].lower()
60 | reading_list.append((new_tok, logprob))
61 | return reading_list
62 |
63 | def compute_ideal_token_seq(tokens, n=3, get_ngram_prob=get_ngram_prob,
64 | get_reading_probs=get_reading_probs):
65 |
66 | # Computes an "ideal" version of the specified token sequence. This
67 | # function assumes that the text was generated using a n-gram-based
68 | # Markov chain - that is, in such a way that the probability of a given
69 | # token coming next is determined based on what n-1 tokens came before -
70 | # with a certain probability of a "typo" each time a new token is
71 | # written. Two functions must be specified, one giving the probability
72 | # that a specified n-token sequence will appear, and the other
73 | # giving a list of all possible words that might have been written as
74 | # a given token, with probabilities. Based on the combination of these
75 | # two models, the function returns the form that the token sequence
76 | # is most likely to have taken before the "typos."
77 |
78 | # This function uses a variant of the Viterbi algorithm where the possible
79 | # state sequences are stored in nested dictionaries rather than a matrix.
80 | # This is because the total number of possible states is enormous (equal
81 | # to the number of possible n-grams), but the number of states that can
82 | # produce a given token is relatively small, which would result in an
83 | # extremely sparse matrix.
84 |
85 | # The first key of d is the most recent token and the second is the token
86 | # before that, etc. The value is a pair containing the probability and the
87 | # complete token sequence up to that point, including the two tokens that
88 | # are used as keys.
89 |
90 | d = {}
91 |
92 | # The computation for the initial state works a little differently from the
93 | # later computations because we don't have values for the previous tokens.
94 | # Instead, we have to compute the probabilities for all possible readings
95 | # of the first ngram.
96 |
97 | if len(tokens) < n:
98 | print 'Text must be at least {0} tokens long!'.format(n)
99 | exit()
100 |
101 | # We want to record all possible values for tokens 2-n, and only
102 | # the optimal values for the first token.
103 | def compute_initial_values(i, toks=[], prob=1.0):
104 | if i == 0:
105 | max_prob = float("-inf")
106 | best_tok0 = None
107 | r = get_reading_probs(tokens[0])
108 | for tok0, prob0 in r:
109 | ngram_prob = get_ngram_prob([tok0] + toks)
110 | final_prob = prob + ngram_prob
111 | if final_prob > max_prob:
112 | max_prob = final_prob
113 | best_tok0 = tok0
114 | return (max_prob, [best_tok0] + toks)
115 | else:
116 | d = {}
117 | r = get_reading_probs(tokens[i])
118 | #print tokens[i], ':', len(r)
119 | for toki, probi in r:
120 | d[toki] = compute_initial_values(i-1, [toki] + toks, probi + prob)
121 | return d
122 | d = compute_initial_values(0)
123 |
124 | # Now proceed through the rest of the tokens. For each one we rebuild d,
125 | # keeping all options for the previous n-2 tokens and finding the optimal
126 | # values for the one before that.
127 | def iterate_possibilities(i, toks, prob, d, tok):
128 | if isinstance(d, tuple):
129 | # This should only happen near the beginning when we haven't built up a
130 | # history.
131 | prob0, seq0 = d
132 | ngram_prob = get_ngram_prob(toks)
133 | final_prob = ngram_prob + prob + prob0
134 | return (final_prob, seq0 + [tok])
135 | if i == 0:
136 | max_prob = float("-inf")
137 | best_tok0 = None
138 | best_seq0 = None
139 | for tok0 in d:
140 | ngram_prob = get_ngram_prob([tok0] + toks)
141 | prob0, seq0 = d[tok0]
142 | final_prob = ngram_prob + prob + prob0
143 | if final_prob > max_prob:
144 | max_prob = final_prob
145 | best_tok0 = tok0
146 | best_seq0 = seq0
147 | return (max_prob, best_seq0 + [tok])
148 | else:
149 | dnew = {}
150 | for toki in d:
151 | dnew[toki] = iterate_possibilities(i-1, [toki] + toks, prob, d[toki], tok)
152 | return dnew
153 | for tok in tokens[1:]:
154 | dnew = {}
155 | r = get_reading_probs(tok)
156 | #print tok, ':', len(r)
157 | for tokn, probn in r:
158 | dnew[tokn] = iterate_possibilities(n-2, [tokn], probn, d, tokn)
159 | d = dnew
160 |
161 | # Find the optimal sequence from all the possibilities that remain.
162 | def extract_ideal_text(i, d, stats):
163 | if i == 0:
164 | prob, seq = d
165 | if prob > stats['max_prob']:
166 | stats['max_prob'] = prob
167 | stats['best_seq'] = seq
168 | else:
169 | for toki in d:
170 | extract_ideal_text(i-1, d[toki], stats)
171 | stats = {'max_prob': float("-inf"), 'best_seq': None}
172 | extract_ideal_text(n-1, d, stats)
173 |
174 | return stats['best_seq']
175 |
176 | def load_language_model(filename):
177 | global lm
178 | lm = kenlm.LanguageModel(filename)
179 |
180 | def depoeticize(text, n=3, get_ngram_prob=get_ngram_prob,
181 | get_reading_probs=get_reading_probs):
182 |
183 | from nltk.tokenize import RegexpTokenizer
184 | tokenizer = RegexpTokenizer(r'[\w&]([\w&\']*[\w&])?|\S|\s')
185 | tokens = tokenizer.tokenize(text)
186 |
187 | tokens_alpha = [tok for tok in tokens if tok.isalpha()]
188 | ideal_tokens = compute_ideal_token_seq(tokens_alpha, n, get_ngram_prob,
189 | get_reading_probs)
190 |
191 | ideal_text = []
192 | for tok in tokens:
193 | if tok.isalpha():
194 | ideal_text.append(ideal_tokens.pop(0))
195 | else:
196 | ideal_text.append(tok)
197 | return ''.join(ideal_text)
198 |
199 |
200 | if 0:
201 | load_language_model('lm_csr_20k_nvp_3gram.binary')
202 | print depoeticize('''She walks in beauty, like the night
203 | Of cloudless climes and starry skies;
204 | And all that's best of dark and bright
205 | Meet in her aspect and her eyes;
206 | Thus mellowed to that tender light
207 | Which heaven to gaudy day denies.
208 |
209 | One shade the more, one ray the less,
210 | Had half impaired the nameless grace
211 | Which waves in every raven tress,
212 | Or softly lightens o'er her face;
213 | Where thoughts serenely sweet express,
214 | How pure, how dear their dwelling-place.
215 |
216 | And on that cheek, and o'er that brow,
217 | So soft, so calm, yet eloquent,
218 | The smiles that win, the tints that glow,
219 | But tell of days in goodness spent,
220 | A mind at peace with all below,
221 | A heart whose love is innocent!
222 | ''')
223 |
--------------------------------------------------------------------------------