├── .gitignore
├── example1.png
├── example2.png
├── README.md
├── example1.txt
├── example2.txt
├── table.py
├── table_lp.py
└── example.lyx

/.gitignore:
--------------------------------------------------------------------------------
*.*~*
*.pyc
--------------------------------------------------------------------------------
/example1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomlarkworthy/table_scraper/HEAD/example1.png
--------------------------------------------------------------------------------
/example2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomlarkworthy/table_scraper/HEAD/example2.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
table_scraper
=============

Uses mixed integer programming and probabilistic reasoning to determine the semantics of elements in plain-text tables.

For an example, run:

    ./table_lp.py example1.txt
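
You can also drive the scraper from Python. A minimal sketch of the intended usage (assuming `pulp` is installed and you are running from the repository root):

    import table, table_lp

    tbl = table.read_table("example1.txt")
    model = table_lp.TableModel(["unknown", "partnum", "ordercode"], tbl)
    model.solve(table_lp.probabilityCategoryGivenWord)
    print(tbl)  # coloured dump showing the inferred token labels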

A full write-up, including the mathematical explanations, is at http://www.edinburghhacklab.com
--------------------------------------------------------------------------------
/example1.txt:
--------------------------------------------------------------------------------
Frequency Digi-Key Cut Tape Price Each T & R Pricing† CTS
(MHz) Part No. 1 10 50 1,000 Part No.
32.7680 278LVCT-ND 2.04 1.77 1.56 761.61 CB3LV-3C-32M7680
40.0000 280LVCT-ND 2.04 1.77 1.56 761.61 CB3LV-3C-40M0000
44.7360 281LVCT-ND 2.58 2.38 1.37 CB3LV-3C-44M7360
45.0000 297LVCT-ND 2.04 1.77 1.56 761.61 CB3LV-3C-45M0000
72.0000 288LVCT-ND 2.07 1.80 1.58 771.31 CB3LV-3C-72M0000
80.0000 289LVCT-ND 2.19 1.90 1.67 816.91 CB3LV-3C-80M0000
CB3LV-3I 3.3V, ±50ppm, -40°C ~ 85°C
1.84320 945LVCT-ND 1.59 1.38 1.20 761.61 CB3LV-3I-1M8432
3.57954 929LVCT-ND 1.52 1.32 1.14 565.63 CB3LV-3I-3M579545
3.68640 946LVCT-ND 2.04 1.77 1.54 761.61 CB3LV-3I-3M6864
4.00000 930LVCT-ND 2.04 1.77 1.54 761.61 CB3LV-3I-4M0000
--------------------------------------------------------------------------------
/example2.txt:
--------------------------------------------------------------------------------
ESR Cut Tape
@ 25°C Digi-Key Price Each Digi-Key Tape and Reel Citizen
Fig. (Max.) Part No. 1 10 50 Part No. Qty. Pricing Part No.
Cylinder Typ Fork Crystals — SMT
50kΩ 8341-1-ND .47 .41 .36 8341-2-ND 2,000 174.64/M CR200T-768KDZB-UT
1 50kΩ 8736-1-ND .47 .41 .36 8736-2-ND 2,000 174.64/M CR200T-768KDZY-UT
50kΩ 8340-1-ND .47 .41 .36 8340-2-ND 2,000 174.64/M CR200T-768KDZF-UT
50kΩ 8664-1-ND .58 .50 .44 8664-2-ND 3,000 205.82/M CJ206T-768KDZB-UT
2
50kΩ 8663-1-ND .58 .50 .44 8663-2-ND 3,000 205.82/M CJ206T-768KDZF-UT
Tuning Crystals — SMT
3 65kΩ 8633-1-ND .89 .77 .68 8633-2-ND 3,000 318.09/M C130-768KDZF-UT
70kΩ 8751-1-ND 1.95 1.69 1.49 8751-2-ND 3,000 701.67/M C315-768KEZF-UT
70kΩ 8730-1-ND 1.69 1.47 1.29 8730-2-ND 3,000 608.11/M C315-768KDZY-UT
--------------------------------------------------------------------------------
/table.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# coding=utf-8
import re

NON_WHITESPACE_RE = re.compile(r'\S+')
TRAILING_CHARS = ",.~†"


class COLOR:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'


class Table(object):
    '''
    Storage model for a plain text table.
    '''
    def __init__(self, lines):
        self.lines = lines
        self.width = 0
        for line in lines:
            self.width = max(len(line), self.width)
        self.height = len(lines)
        self.rtypes = {}  # row index -> row type
        self.ctypes = {}  # column index -> column type
        self.ttypes = {}  # token id -> token type

    def get_tokens(self):
        '''Yield (start_col, end_col, row, word) for every whitespace-delimited
        token, with trailing punctuation stripped off the word.'''
        for y, row in enumerate(self.lines):
            for match in NON_WHITESPACE_RE.finditer(row):
                word = match.group(0)
                while len(word) > 0 and word[-1] in TRAILING_CHARS:
                    word = word[0:-1]

                yield (match.start(), match.start() + len(word), y, word)

    def setColType(self, x, ctype):
        self.ctypes[x] = ctype

    def setRowType(self, y, rtype):
        self.rtypes[y] = rtype

    def setTokType(self, tid, ttype):
        self.ttypes[tid] = ttype
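
    # Colourised rendering of the table: re-tokenise the text and print every
    # character, wrapping each token in an ANSI colour that reflects the label
    # the solver assigned (blue/green for ordercode declarations/values,
    # yellow/red for partnum declarations/values); characters of tokens with
    # any other label are rendered as "?".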
    def __repr__(self):
        lines = []

        token_iter = self.get_tokens()
        (x_s, x_e, y, word) = next(token_iter)
        tid = 0

        for j in range(self.height):
            row = []
            for i in range(self.width):

                color = None

                try:
                    while y < j or (y == j and x_e < i):
                        tid += 1
                        (x_s, x_e, y, word) = next(token_iter)

                    if y == j and x_s <= i and i < x_e:
                        char = str(word[i - x_s])
                        if ord(char) >= 128:
                            char = "?"
                    else:
                        char = " "

                except StopIteration:
                    char = " "

                if char != " ":
                    if self.ttypes[tid] == "ordercode_dec":
                        color = COLOR.OKBLUE
                    elif self.ttypes[tid] == "ordercode_val":
                        color = COLOR.OKGREEN
                    elif self.ttypes[tid] == "partnum_dec":
                        color = COLOR.WARNING
                    elif self.ttypes[tid] == "partnum_val":
                        color = COLOR.FAIL
                    elif char != " ":
                        char = "?"

                if color is not None:
                    row.append(color)
                row.append(char)
                if color is not None:
                    row.append(COLOR.ENDC)

            lines.append("".join(row))
        return "\n".join(lines)


def read_table(filepath):
    with open(filepath) as f:
        table_lines = []
        for line in f:
            table_lines.append(line.rstrip("\n"))
        return Table(table_lines)
--------------------------------------------------------------------------------
/table_lp.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# coding=utf-8
import sys
import math

from pulp import *

import table


def colVar(x, ctype):
    return "col%s_is_%s" % (x, ctype)


def rowVar(y, rtype):
    return "row%s_is_%s" % (y, rtype)


def tokVar(t, ttype):
    return "tok%s_is_%s" % (t, ttype)


class TableModel:

    def __init__(self, ctypes, table):
        self.ctypes = ctypes
        self.rtypes = ["unknown", "product", "header"]
        self.ttypes = ["unclassified"]
        self.ttypes_dec = []
        self.ttypes_val = []
        for ctype in ctypes:
            self.ttypes_dec.append(ctype + "_dec")
            self.ttypes_val.append(ctype + "_val")
            self.ttypes.append(ctype + "_dec")
            self.ttypes.append(ctype + "_val")

        self.table = table

        # build the LP problem
        self.lp = LpProblem("table", LpMaximize)
        self.v = {}

        # build column type variables and constrain exactly one to be active
        for x in range(table.width):
            cats = []
            for ctype in self.ctypes:
                cats.append(self.addVariable(colVar(x, ctype), 0, 1, cat='Integer'))

            self.lp += lpSum(cats) == 1

        # build row type variables and constrain exactly one to be active
        for y in range(table.height):
            cats = []
            for rtype in self.rtypes:
                cats.append(self.addVariable(rowVar(y, rtype), 0, 1, cat='Integer'))

            self.lp += lpSum(cats) == 1

        # (disabled) ensure at least one row in the top three is a header
        '''
        cats = []
        for y in range(min(3, table.height)):
            cats.append(self.v[rowVar(y, "header")])
        self.lp += lpSum(cats) >= 1
        '''

        # build token type variables and constrain exactly one to be active
        for tid, (x_s, x_e, y, word) in enumerate(table.get_tokens()):
            print(word)
            cats = []
            for ttype in self.ttypes:
                cats.append(self.addVariable(tokVar(tid, ttype), 0, 1, cat='Integer'))

            self.lp += lpSum(cats) == 1

            # furthermore, ensure declarations appear only in header rows
            # and values appear only in product rows
            decs = []
            for ttype in self.ttypes_dec:
                decs.append(self.v[tokVar(tid, ttype)])
            vals = []
            for ttype in self.ttypes_val:
                vals.append(self.v[tokVar(tid, ttype)])

            self.lp += lpSum(decs) == self.v[rowVar(y, "header")]
            self.lp += lpSum(vals) == self.v[rowVar(y, "product")]

            # furthermore, ensure a column type is only active where a token is a
            # declaration or value of that type (or is unclassified), over the
            # entire character range of the token
            for x in range(x_s, x_e):
                for ctype in self.ctypes:
                    self.lp += \
                        self.v[tokVar(tid, ctype + "_dec")] + \
                        self.v[tokVar(tid, ctype + "_val")] + \
                        self.v[tokVar(tid, "unclassified")] \
                        >= self.v[colVar(x, ctype)]
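
    # Solve the MIP.  The objective is the sum, over all tokens and candidate
    # labels, of log P(label | word) weighted by the corresponding binary
    # indicator variable, so the optimum is a maximum-likelihood labelling
    # subject to the structural constraints built in __init__.  The winning
    # column, row and token types are then written back into the Table.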
    def solve(self, token_probability_fn):
        # maximise the log probability of the labelling under the function
        # that maps words to types
        obj = []
        for tid, (x_s, x_e, y, word) in enumerate(self.table.get_tokens()):
            for ttype in self.ttypes:
                p = math.log(token_probability_fn(ttype, word))

                obj.append(p * self.v[tokVar(tid, ttype)])
        self.lp += lpSum(obj)

        print("solving")
        self.lp.solve()
        print("Status:", LpStatus[self.lp.status])

        for x in range(self.table.width):
            for ctype in self.ctypes:
                if self.v[colVar(x, ctype)].varValue > 0.5:
                    self.table.setColType(x, ctype)
                    print(colVar(x, ctype), "is true")

        for y in range(self.table.height):
            for rtype in self.rtypes:
                if self.v[rowVar(y, rtype)].varValue > 0.5:
                    self.table.setRowType(y, rtype)
                    print(rowVar(y, rtype), "is true")

        for tid, (x_s, x_e, y, word) in enumerate(self.table.get_tokens()):
            for ttype in self.ttypes:
                if self.v[tokVar(tid, ttype)].varValue > 0.5:
                    self.table.setTokType(tid, ttype)
                    print(word, tokVar(tid, ttype), "is true")

    def addVariable(self, name, LB=None, UB=None, cat='Continuous'):
        self.v[name] = LpVariable(name, LB, UB, cat)
        return self.v[name]


def probabilityCategoryGivenWord(category, word):
    if word == "Digi-Key" and category == "ordercode_dec":
        return 0.9

    if word.endswith("-ND") and category == "ordercode_val":
        return 0.8

    if "-" in word and category == "partnum_val":
        return 0.4

    if category in ("unclassified", "unknown_val", "unknown_dec"):
        return 0.3

    return 0.1


if __name__ == "__main__":
    tbl = table.read_table(sys.argv[1])
    model = TableModel(["unknown", "partnum", "ordercode"], tbl)

    model.solve(probabilityCategoryGivenWord)

    print(tbl)
--------------------------------------------------------------------------------
/example.lyx:
--------------------------------------------------------------------------------
#LyX 2.0 created this file. For more info see http://www.lyx.org/
\lyxformat 413
\begin_document
\begin_header
\textclass article
\use_default_options true
\maintain_unincluded_children false
\language english
\language_package default
\inputencoding auto
\fontencoding global
\font_roman default
\font_sans default
\font_typewriter default
\font_default_family default
\use_non_tex_fonts false
\font_sc false
\font_osf false
\font_sf_scale 100
\font_tt_scale 100

\graphics default
\default_output_format default
\output_sync 0
\bibtex_command default
\index_command default
\paperfontsize default
\use_hyperref false
\papersize default
\use_geometry false
\use_amsmath 1
\use_esint 1
\use_mhchem 1
\use_mathdots 1
\cite_engine basic
\use_bibtopic false
\use_indices false
\paperorientation portrait
\suppress_date false
\use_refstyle 1
\index Index
\shortcut idx
\color #008000
\end_index
\secnumdepth 3
\tocdepth 3
\paragraph_separation indent
\paragraph_indentation default
\quotes_language english
\papercolumns 1
\papersides 1
\paperpagestyle default
\tracking_changes false
\output_changes false
\html_math_output 0
\html_css_as_file 0
\html_be_strict false
\end_header

\begin_body

\begin_layout Section
Probabilistic Scraping of Plain Text Tables
\end_layout

\begin_layout Standard
Recently I have been banging my head trying to import a ton of data expressed in tabular form into a database.
 Anyway, I think I came up with a neat approach, using probabilistic reasoning combined with mixed integer programming, that's pretty robust to all sorts of real-world issues.
\end_layout

\begin_layout Standard
Plain text tables are quite interesting when encountered in the wild.
 They are highly compressed forms of data, but that's a double-edged sword: you can only understand the meaning of a particular table element if, and only if, you understand the meaning of the row and column it is found within.
 Unfortunately, the meaning of columns and rows varies wildly across a dataset of many independent tables.
 Consider the following abridged OCRed examples from the Digi-Key catalogue:
\end_layout

\begin_layout Standard

\family typewriter
\size tiny
\color foreground
EXAMPLES
\end_layout

\begin_layout Standard
These tables have: differing spatial layouts of header fields (e.g.
 
\begin_inset Quotes eld
\end_inset

Cut Price Tape Each
\begin_inset Quotes erd
\end_inset

), differing numbers of table header lines, different numbers of columns, and some rows that are not data but hierarchical sub-headings (e.g.
 
\begin_inset Quotes eld
\end_inset

CB3LV-3I 3.3V, ±50ppm, -40°C ~ 85°C
\begin_inset Quotes erd
\end_inset

).
 In the Digi-Key world, ending in 
\begin_inset Quotes eld
\end_inset

-ND
\begin_inset Quotes erd
\end_inset

 is strong evidence that a token is a partnum; however, it's not foolproof, as lots of non-partnums also end in -ND (it's a huge catalogue).
 To decide whether 
\begin_inset Quotes eld
\end_inset

297LVCT-ND
\begin_inset Quotes erd
\end_inset

 is a product code, you need to reason over the entire table, building up evidence.

\end_layout

\begin_layout Standard
To do the inference I represent the table structuring elements (rows and columns) and the token labels as random categorical variables.
 A single-character-wide column is assigned a column type (ordercode, partnum) or unknown.
 A row is either a header, an entity or noise.
 A token is either unclassified, a declaration of a column type (e.g.
 
\begin_inset Quotes eld
\end_inset

Part No.
\begin_inset Quotes erd
\end_inset

), or a value in a column type (e.g.
 
\begin_inset Quotes eld
\end_inset

281LVCT-ND
\begin_inset Quotes erd
\end_inset

).
\end_layout

\begin_layout Standard
\begin_inset Formula 
\[
col_{i}\in\{ordercode,\, partnum,\, unknown\}
\]

\end_inset


\end_layout

\begin_layout Standard
\begin_inset Formula 
\[
row_{j}\in\{header,\, entity,\, noise\}
\]

\end_inset


\begin_inset Formula 
\[
token_{t}\in\{unclassified,\, partnum\_dec,\, partnum\_val,\, ordercode\_dec,\, ordercode\_val,\, unknown\_dec,\, unknown\_val\}
\]

\end_inset


\end_layout

\begin_layout Standard
The important thing in a table is that 
\shape italic
values and declaration tokens have to match types, and be consistent over an entire column
\shape default
.
 We can express these hard logical constraints using mixed integer programming (MIP).
 The first step is to encode the variable states as numerical variables.
 A categorical variable is split into a one-of-n vector encoding.
 For example, every column categorical variable becomes three integer variables, each encoding a boolean state:
\end_layout

\begin_layout Standard
\begin_inset Formula 
\[
is\_ordercode_{i}\in\{0,1\}
\]

\end_inset


\end_layout

\begin_layout Standard
\begin_inset Formula 
\[
is\_partnum_{i}\in\{0,1\}
\]

\end_inset


\end_layout

\begin_layout Standard
\begin_inset Formula 
\[
is\_unknown_{i}\in\{0,1\}
\]

\end_inset


\end_layout

\begin_layout Standard
We force exactly one of the variables to be one by adding a linear constraint, 
\begin_inset Formula $is\_ordercode_{i}+is\_partnum_{i}+is\_unknown_{i}=1$
\end_inset

, for each 
\begin_inset Formula $i$
\end_inset

.
 We repeat this for the row and token categoricals.
\end_layout
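
\begin_layout Standard
As a minimal PuLP sketch of this encoding for one column (the variable names are illustrative; the real code in table_lp.py builds them in loops over every column, row and token):
\end_layout

\begin_layout LyX-Code
from pulp import LpProblem, LpVariable, LpMaximize
\end_layout

\begin_layout LyX-Code
lp = LpProblem("table", LpMaximize)
\end_layout

\begin_layout LyX-Code
is_ordercode = LpVariable("col0_is_ordercode", 0, 1, cat="Integer")
\end_layout

\begin_layout LyX-Code
is_partnum = LpVariable("col0_is_partnum", 0, 1, cat="Integer")
\end_layout

\begin_layout LyX-Code
is_unknown = LpVariable("col0_is_unknown", 0, 1, cat="Integer")
\end_layout

\begin_layout LyX-Code
lp += is_ordercode + is_partnum + is_unknown == 1
\end_layout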

\begin_layout Standard
The next thing is to ensure that only declarations appear in header rows, and values in entity rows.
 For every row, 
\begin_inset Formula $j$
\end_inset

, and every token, 
\begin_inset Formula $t$
\end_inset

, in that row:
\end_layout

\begin_layout Standard
\begin_inset Formula 
\[
is\_header_{j}=is\_partnum\_dec_{t}+is\_ordercode\_dec_{t}
\]

\end_inset


\end_layout

\begin_layout Standard
\begin_inset Formula 
\[
is\_entity_{j}=is\_partnum\_val_{t}+is\_ordercode\_val_{t}
\]

\end_inset


\end_layout

\begin_layout Standard
As each variable is either a one or a zero, these equalities mean that declaration tokens can only switch on in rows labelled as headers, and value tokens can only switch on in rows labelled as entities.
\end_layout

\begin_layout Standard
The final set of constraints ensures that each column only contains declarations and values of a specific column type.
 Each token is one or more characters long, so each token intersects several single-character-wide columns.
 So for each token, 
\begin_inset Formula $t$
\end_inset

, and for every column intersecting that token, 
\begin_inset Formula $i$
\end_inset

, we add the following constraint:
\end_layout

\begin_layout Standard
\begin_inset Formula 
\[
is\_partnum_{i}\leq is\_partnum\_val_{t}+is\_partnum\_dec_{t}+is\_unclassified_{t}
\]

\end_inset


\end_layout

\begin_layout Standard
(and the same for 
\begin_inset Formula $ordercode$
\end_inset

).
\end_layout

\begin_layout Standard
So at this point we have expressed the problem as a huge set of binary integer variables, with linear constraints between them.
 A MIP solver can now optimise any linear cost function involving those variables, subject to those constraints.
 We choose our objective function to encode the probability of a labelling, which we maximise to give the maximum-likelihood estimate (which tells us which tokens are values).
\end_layout

\begin_layout Standard
Given a number of naive, independent classifiers, their joint probability is their product, which is what we want to maximise.
\end_layout

\begin_layout Standard
\begin_inset Formula 
\[
\max\, p(v_{1}\ldots v_{n})=\max\,\prod_{i=1}^{n}p(v_{i})
\]

\end_inset


\end_layout

\begin_layout Standard
The product term is not compatible with linear programming, so we note that the result of the maximisation is not affected by taking logs, which usefully turns products into sums:
\begin_inset Formula 
\[
\max\,\log\, p(v_{1}\ldots v_{n})=\max\,\sum_{i=1}^{n}\log\, p(v_{i})
\]

\end_inset


\end_layout
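
\begin_layout Standard
Concretely, in PuLP the log-probabilities simply become the weights of the binary indicator variables in the objective.
 Continuing the little sketch from above, with made-up weights:
\end_layout

\begin_layout LyX-Code
import math
\end_layout

\begin_layout LyX-Code
lp += math.log(0.8)*is_ordercode + math.log(0.4)*is_partnum + math.log(0.3)*is_unknown
\end_layout

\begin_layout LyX-Code
lp.solve()
\end_layout

\begin_layout LyX-Code
print(is_ordercode.varValue, is_partnum.varValue, is_unknown.varValue)
\end_layout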

\begin_layout Standard
This sum is exactly the objective function we give to our MIP.
 For this example I assign a probability to a token labelling based only on the content of the token.
 The three important cases I used were:
\end_layout

\begin_layout Standard
\begin_inset Formula 
\[
p(token_{t}=partnum\_val|content.endswith("-ND"))=0.8
\]

\end_inset


\end_layout

\begin_layout Standard
\begin_inset Formula 
\[
p(token_{t}=ordercode\_val|content.contains("-"))=0.4
\]

\end_inset


\end_layout

\begin_layout Standard
\begin_inset Formula 
\[
p(token_{t}=partnum\_dec|content.contains("Digi-Key"))=0.9
\]

\end_inset


\end_layout

\begin_layout Standard
A catch-all case when no specific clues are present is
\end_layout

\begin_layout Standard
\begin_inset Formula 
\[
p(token_{t}=unclassified|none\, of\, the\, above)=0.3
\]

\end_inset


\end_layout

\begin_layout Standard
You should theoretically fill in the missing combinations to make sure the probabilities add up to one, but in practice it does not matter too much.
\end_layout

\begin_layout Standard
So now we can express the objective function for our maximum-likelihood estimate as
\begin_inset Formula 
\[
\max\,\sum_{t}\sum_{class}is\_class_{t}\,\log\, p(token_{t}=class|content_{t})
\]

\end_inset


\end_layout

\begin_layout Standard
which an integer programming package like PuLP can solve very quickly.
 I have coded up this example in Python (table.py and table_lp.py in this repository) to demonstrate how easy it is to encode once the constraint model has been worked out on paper.
\end_layout

\begin_layout Subsection
Results
\end_layout

\end_body
\end_document
--------------------------------------------------------------------------------