├── .gitignore
├── example1.png
├── example2.png
├── README.md
├── example1.txt
├── example2.txt
├── table.py
├── table_lp.py
└── example.lyx

/.gitignore:
--------------------------------------------------------------------------------
*.*~*
*.pyc
--------------------------------------------------------------------------------
/example1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomlarkworthy/table_scraper/HEAD/example1.png
--------------------------------------------------------------------------------
/example2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomlarkworthy/table_scraper/HEAD/example2.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
table_scraper
=============

Uses mixed integer programming and probabilistic reasoning to determine the semantics of elements in plain-text tables.

For an example, run:

    ./table_lp.py example1.txt
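
You can also drive the scraper from Python. A minimal sketch of the intended usage (assuming `pulp` is installed and you are running from the repository root):

    import table, table_lp

    tbl = table.read_table("example1.txt")
    model = table_lp.TableModel(["unknown", "partnum", "ordercode"], tbl)
    model.solve(table_lp.probabilityCategoryGivenWord)
    print(tbl)  # coloured dump showing the inferred token labels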

A full write-up, including the mathematical explanations, is at http://www.edinburghhacklab.com
--------------------------------------------------------------------------------
/example1.txt:
--------------------------------------------------------------------------------
Frequency Digi-Key Cut Tape Price Each T & R Pricing† CTS
(MHz) Part No. 1 10 50 1,000 Part No.
32.7680 278LVCT-ND 2.04 1.77 1.56 761.61 CB3LV-3C-32M7680
40.0000 280LVCT-ND 2.04 1.77 1.56 761.61 CB3LV-3C-40M0000
44.7360 281LVCT-ND 2.58 2.38 1.37 CB3LV-3C-44M7360
45.0000 297LVCT-ND 2.04 1.77 1.56 761.61 CB3LV-3C-45M0000
72.0000 288LVCT-ND 2.07 1.80 1.58 771.31 CB3LV-3C-72M0000
80.0000 289LVCT-ND 2.19 1.90 1.67 816.91 CB3LV-3C-80M0000
CB3LV-3I 3.3V, ±50ppm, -40°C ~ 85°C
1.84320 945LVCT-ND 1.59 1.38 1.20 761.61 CB3LV-3I-1M8432
3.57954 929LVCT-ND 1.52 1.32 1.14 565.63 CB3LV-3I-3M579545
3.68640 946LVCT-ND 2.04 1.77 1.54 761.61 CB3LV-3I-3M6864
4.00000 930LVCT-ND 2.04 1.77 1.54 761.61 CB3LV-3I-4M0000
--------------------------------------------------------------------------------
/example2.txt:
--------------------------------------------------------------------------------
ESR Cut Tape
@ 25°C Digi-Key Price Each Digi-Key Tape and Reel Citizen
Fig. (Max.) Part No. 1 10 50 Part No. Qty. Pricing Part No.
Cylinder Typ Fork Crystals — SMT
50kΩ 8341-1-ND .47 .41 .36 8341-2-ND 2,000 174.64/M CR200T-768KDZB-UT
1 50kΩ 8736-1-ND .47 .41 .36 8736-2-ND 2,000 174.64/M CR200T-768KDZY-UT
50kΩ 8340-1-ND .47 .41 .36 8340-2-ND 2,000 174.64/M CR200T-768KDZF-UT
50kΩ 8664-1-ND .58 .50 .44 8664-2-ND 3,000 205.82/M CJ206T-768KDZB-UT
2
50kΩ 8663-1-ND .58 .50 .44 8663-2-ND 3,000 205.82/M CJ206T-768KDZF-UT
Tuning Crystals — SMT
3 65kΩ 8633-1-ND .89 .77 .68 8633-2-ND 3,000 318.09/M C130-768KDZF-UT
70kΩ 8751-1-ND 1.95 1.69 1.49 8751-2-ND 3,000 701.67/M C315-768KEZF-UT
70kΩ 8730-1-ND 1.69 1.47 1.29 8730-2-ND 3,000 608.11/M C315-768KDZY-UT
--------------------------------------------------------------------------------
/table.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# coding=utf-8
import re

NON_WHITESPACE_RE = re.compile(r'\S+')
TRAILING_CHARS = ",.~†"


class COLOR:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'


class Table(object):
    '''
    Storage model for a plain text table.
    '''
    def __init__(self, lines):
        self.lines = lines
        self.width = 0
        for line in lines:
            self.width = max(len(line), self.width)
        self.height = len(lines)
        self.rtypes = {}  # row index -> row type
        self.ctypes = {}  # column index -> column type
        self.ttypes = {}  # token id -> token type

    def get_tokens(self):
        '''Yield (start_col, end_col, row, word) for every whitespace-delimited
        token, with trailing punctuation stripped off the word.'''
        for y, row in enumerate(self.lines):
            for match in NON_WHITESPACE_RE.finditer(row):
                word = match.group(0)
                while len(word) > 0 and word[-1] in TRAILING_CHARS:
                    word = word[0:-1]

                yield (match.start(), match.start() + len(word), y, word)

    def setColType(self, x, ctype):
        self.ctypes[x] = ctype

    def setRowType(self, y, rtype):
        self.rtypes[y] = rtype

    def setTokType(self, tid, ttype):
        self.ttypes[tid] = ttype
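
    # Colourised rendering of the table: re-tokenise the text and print every
    # character, wrapping each token in an ANSI colour that reflects the label
    # the solver assigned (blue/green for ordercode declarations/values,
    # yellow/red for partnum declarations/values); characters of tokens with
    # any other label are rendered as "?".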
    def __repr__(self):
        lines = []

        token_iter = self.get_tokens()
        (x_s, x_e, y, word) = next(token_iter)
        tid = 0

        for j in range(self.height):
            row = []
            for i in range(self.width):

                color = None

                try:
                    while y < j or (y == j and x_e < i):
                        tid += 1
                        (x_s, x_e, y, word) = next(token_iter)

                    if y == j and x_s <= i and i < x_e:
                        char = str(word[i - x_s])
                        if ord(char) >= 128:
                            char = "?"
                    else:
                        char = " "

                except StopIteration:
                    char = " "

                if char != " ":
                    if self.ttypes[tid] == "ordercode_dec":
                        color = COLOR.OKBLUE
                    elif self.ttypes[tid] == "ordercode_val":
                        color = COLOR.OKGREEN
                    elif self.ttypes[tid] == "partnum_dec":
                        color = COLOR.WARNING
                    elif self.ttypes[tid] == "partnum_val":
                        color = COLOR.FAIL
                    elif char != " ":
                        char = "?"

                if color is not None:
                    row.append(color)
                row.append(char)
                if color is not None:
                    row.append(COLOR.ENDC)

            lines.append("".join(row))
        return "\n".join(lines)


def read_table(filepath):
    with open(filepath) as f:
        table_lines = []
        for line in f:
            table_lines.append(line.rstrip("\n"))
        return Table(table_lines)
--------------------------------------------------------------------------------
/table_lp.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# coding=utf-8
import sys
import math

from pulp import *

import table


def colVar(x, ctype):
    return "col%s_is_%s" % (x, ctype)


def rowVar(y, rtype):
    return "row%s_is_%s" % (y, rtype)


def tokVar(t, ttype):
    return "tok%s_is_%s" % (t, ttype)


class TableModel:

    def __init__(self, ctypes, table):
        self.ctypes = ctypes
        self.rtypes = ["unknown", "product", "header"]
        self.ttypes = ["unclassified"]
        self.ttypes_dec = []
        self.ttypes_val = []
        for ctype in ctypes:
            self.ttypes_dec.append(ctype + "_dec")
            self.ttypes_val.append(ctype + "_val")
            self.ttypes.append(ctype + "_dec")
            self.ttypes.append(ctype + "_val")

        self.table = table

        # build the LP problem
        self.lp = LpProblem("table", LpMaximize)
        self.v = {}

        # build column type variables and constrain exactly one to be active
        for x in range(table.width):
            cats = []
            for ctype in self.ctypes:
                cats.append(self.addVariable(colVar(x, ctype), 0, 1, cat='Integer'))

            self.lp += lpSum(cats) == 1

        # build row type variables and constrain exactly one to be active
        for y in range(table.height):
            cats = []
            for rtype in self.rtypes:
                cats.append(self.addVariable(rowVar(y, rtype), 0, 1, cat='Integer'))

            self.lp += lpSum(cats) == 1

        # (disabled) ensure at least one row in the top three is a header
        '''
        cats = []
        for y in range(min(3, table.height)):
            cats.append(self.v[rowVar(y, "header")])
        self.lp += lpSum(cats) >= 1
        '''

        # build token type variables and constrain exactly one to be active
        for tid, (x_s, x_e, y, word) in enumerate(table.get_tokens()):
            print(word)
            cats = []
            for ttype in self.ttypes:
                cats.append(self.addVariable(tokVar(tid, ttype), 0, 1, cat='Integer'))

            self.lp += lpSum(cats) == 1

            # furthermore, ensure declarations appear only in header rows
            # and values appear only in product rows
            decs = []
            for ttype in self.ttypes_dec:
                decs.append(self.v[tokVar(tid, ttype)])
            vals = []
            for ttype in self.ttypes_val:
                vals.append(self.v[tokVar(tid, ttype)])

            self.lp += lpSum(decs) == self.v[rowVar(y, "header")]
            self.lp += lpSum(vals) == self.v[rowVar(y, "product")]

            # furthermore, ensure a column type is only active where a token is a
            # declaration or value of that type (or is unclassified), over the
            # entire character range of the token
            for x in range(x_s, x_e):
                for ctype in self.ctypes:
                    self.lp += \
                        self.v[tokVar(tid, ctype + "_dec")] + \
                        self.v[tokVar(tid, ctype + "_val")] + \
                        self.v[tokVar(tid, "unclassified")] \
                        >= self.v[colVar(x, ctype)]
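
    # Solve the MIP.  The objective is the sum, over all tokens and candidate
    # labels, of log P(label | word) weighted by the corresponding binary
    # indicator variable, so the optimum is a maximum-likelihood labelling
    # subject to the structural constraints built in __init__.  The winning
    # column, row and token types are then written back into the Table.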
    def solve(self, token_probability_fn):
        # maximise the log probability of the labelling under the function
        # that maps words to types
        obj = []
        for tid, (x_s, x_e, y, word) in enumerate(self.table.get_tokens()):
            for ttype in self.ttypes:
                p = math.log(token_probability_fn(ttype, word))

                obj.append(p * self.v[tokVar(tid, ttype)])
        self.lp += lpSum(obj)

        print("solving")
        self.lp.solve()
        print("Status:", LpStatus[self.lp.status])

        for x in range(self.table.width):
            for ctype in self.ctypes:
                if self.v[colVar(x, ctype)].varValue > 0.5:
                    self.table.setColType(x, ctype)
                    print(colVar(x, ctype), "is true")

        for y in range(self.table.height):
            for rtype in self.rtypes:
                if self.v[rowVar(y, rtype)].varValue > 0.5:
                    self.table.setRowType(y, rtype)
                    print(rowVar(y, rtype), "is true")

        for tid, (x_s, x_e, y, word) in enumerate(self.table.get_tokens()):
            for ttype in self.ttypes:
                if self.v[tokVar(tid, ttype)].varValue > 0.5:
                    self.table.setTokType(tid, ttype)
                    print(word, tokVar(tid, ttype), "is true")

    def addVariable(self, name, LB=None, UB=None, cat='Continuous'):
        self.v[name] = LpVariable(name, LB, UB, cat)
        return self.v[name]


def probabilityCategoryGivenWord(category, word):
    if word == "Digi-Key" and category == "ordercode_dec":
        return 0.9

    if word.endswith("-ND") and category == "ordercode_val":
        return 0.8

    if "-" in word and category == "partnum_val":
        return 0.4

    if category in ("unclassified", "unknown_val", "unknown_dec"):
        return 0.3

    return 0.1


if __name__ == "__main__":
    tbl = table.read_table(sys.argv[1])
    model = TableModel(["unknown", "partnum", "ordercode"], tbl)

    model.solve(probabilityCategoryGivenWord)

    print(tbl)
--------------------------------------------------------------------------------
/example.lyx:
--------------------------------------------------------------------------------
#LyX 2.0 created this file. For more info see http://www.lyx.org/
\lyxformat 413
\begin_document
\begin_header
\textclass article
\use_default_options true
\maintain_unincluded_children false
\language english
\language_package default
\inputencoding auto
\fontencoding global
\font_roman default
\font_sans default
\font_typewriter default
\font_default_family default
\use_non_tex_fonts false
\font_sc false
\font_osf false
\font_sf_scale 100
\font_tt_scale 100

\graphics default
\default_output_format default
\output_sync 0
\bibtex_command default
\index_command default
\paperfontsize default
\use_hyperref false
\papersize default
\use_geometry false
\use_amsmath 1
\use_esint 1
\use_mhchem 1
\use_mathdots 1
\cite_engine basic
\use_bibtopic false
\use_indices false
\paperorientation portrait
\suppress_date false
\use_refstyle 1
\index Index
\shortcut idx
\color #008000
\end_index
\secnumdepth 3
\tocdepth 3
\paragraph_separation indent
\paragraph_indentation default
\quotes_language english
\papercolumns 1
\papersides 1
\paperpagestyle default
\tracking_changes false
\output_changes false
\html_math_output 0
\html_css_as_file 0
\html_be_strict false
\end_header

\begin_body

\begin_layout Section
Probabilistic Scraping of Plain Text Tables
\end_layout

\begin_layout Standard
Recently I have been banging my head trying to import a ton of data expressed in tabular form into a database.
 Anyway, I think I came up with a neat approach, using probabilistic reasoning combined with mixed integer programming, that's pretty robust to all sorts of real-world issues.
\end_layout

\begin_layout Standard
Plain text tables are quite interesting when encountered in the wild.
 They are highly compressed forms of data, but that's a double-edged sword: you can only understand the meaning of a particular table element if, and only if, you understand the meaning of the row and column it is found within.
 Unfortunately, the meaning of columns and rows varies wildly across a dataset of many independent tables.
 Consider the following abridged OCRed examples from the Digi-Key catalogue:
\end_layout

\begin_layout Standard

\family typewriter
\size tiny
\color foreground
EXAMPLES
\end_layout

\begin_layout Standard
These tables have: differing spatial layouts of header fields (e.g.
 
\begin_inset Quotes eld
\end_inset

Cut Price Tape Each
\begin_inset Quotes erd
\end_inset

), differing numbers of table header lines, different numbers of columns, and some rows that are not data but hierarchical sub-headings (e.g.
 
\begin_inset Quotes eld
\end_inset

CB3LV-3I 3.3V, ±50ppm, -40°C ~ 85°C
\begin_inset Quotes erd
\end_inset

).
 In the Digi-Key world, ending in 
\begin_inset Quotes eld
\end_inset

-ND
\begin_inset Quotes erd
\end_inset

 is strong evidence that a token is a partnum; however, it's not foolproof, as lots of non-partnums also end in -ND (it's a huge catalogue).
 To decide whether 
\begin_inset Quotes eld
\end_inset

297LVCT-ND
\begin_inset Quotes erd
\end_inset

 is a product code, you need to reason over the entire table, building up evidence.

\end_layout

\begin_layout Standard
To do the inference I represent the table structuring elements (rows and columns) and the token labels as random categorical variables.
 A single-character-wide column is assigned a column type (ordercode, partnum) or unknown.
 A row is either a header, an entity or noise.
 A token is either unclassified, a declaration of a column type (e.g.
 
\begin_inset Quotes eld
\end_inset

Part No.
\begin_inset Quotes erd
\end_inset

), or a value in a column type (e.g.
 
\begin_inset Quotes eld
\end_inset

281LVCT-ND
\begin_inset Quotes erd
\end_inset

).
\end_layout

\begin_layout Standard
\begin_inset Formula 
\[
col_{i}\in\{ordercode,\, partnum,\, unknown\}
\]

\end_inset


\end_layout

\begin_layout Standard
\begin_inset Formula 
\[
row_{j}\in\{header,\, entity,\, noise\}
\]

\end_inset


\begin_inset Formula 
\[
token_{t}\in\{unclassified,\, partnum\_dec,\, partnum\_val,\, ordercode\_dec,\, ordercode\_val,\, unknown\_dec,\, unknown\_val\}
\]

\end_inset


\end_layout

\begin_layout Standard
The important thing in a table is that 
\shape italic
values and declaration tokens have to match types, and be consistent over an entire column
\shape default
.
 We can express these hard logical constraints using mixed integer programming (MIP).
 The first step is to encode the variable states as numerical variables.
 A categorical variable is split into a one-of-n vector encoding.
 For example, every column categorical variable becomes three integer variables, each encoding a boolean state:
\end_layout

\begin_layout Standard
\begin_inset Formula 
\[
is\_ordercode_{i}\in\{0,1\}
\]

\end_inset


\end_layout

\begin_layout Standard
\begin_inset Formula 
\[
is\_partnum_{i}\in\{0,1\}
\]

\end_inset


\end_layout

\begin_layout Standard
\begin_inset Formula 
\[
is\_unknown_{i}\in\{0,1\}
\]

\end_inset


\end_layout

\begin_layout Standard
We force exactly one of the variables to be one by adding a linear constraint, 
\begin_inset Formula $is\_ordercode_{i}+is\_partnum_{i}+is\_unknown_{i}=1$
\end_inset

, for each 
\begin_inset Formula $i$
\end_inset

.
 We repeat this for the row and token categoricals.
\end_layout
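
\begin_layout Standard
As a minimal PuLP sketch of this encoding for one column (the variable names are illustrative; the real code in table_lp.py builds them in loops over every column, row and token):
\end_layout

\begin_layout LyX-Code
from pulp import LpProblem, LpVariable, LpMaximize
\end_layout

\begin_layout LyX-Code
lp = LpProblem("table", LpMaximize)
\end_layout

\begin_layout LyX-Code
is_ordercode = LpVariable("col0_is_ordercode", 0, 1, cat="Integer")
\end_layout

\begin_layout LyX-Code
is_partnum = LpVariable("col0_is_partnum", 0, 1, cat="Integer")
\end_layout

\begin_layout LyX-Code
is_unknown = LpVariable("col0_is_unknown", 0, 1, cat="Integer")
\end_layout

\begin_layout LyX-Code
lp += is_ordercode + is_partnum + is_unknown == 1
\end_layout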

\begin_layout Standard
The next thing is to ensure that only declarations appear in header rows, and values in entity rows.
 For every row, 
\begin_inset Formula $j$
\end_inset

, and every token, 
\begin_inset Formula $t$
\end_inset

, in that row:
\end_layout

\begin_layout Standard
\begin_inset Formula 
\[
is\_header_{j}=is\_partnum\_dec_{t}+is\_ordercode\_dec_{t}
\]

\end_inset


\end_layout

\begin_layout Standard
\begin_inset Formula 
\[
is\_entity_{j}=is\_partnum\_val_{t}+is\_ordercode\_val_{t}
\]

\end_inset


\end_layout

\begin_layout Standard
As each variable is either a one or a zero, these equalities mean that declaration tokens can only switch on in rows labelled as headers, and value tokens can only switch on in rows labelled as entities.
\end_layout

\begin_layout Standard
The final set of constraints ensures that each column only contains declarations and values of a specific column type.
 Each token is one or more characters long, so each token intersects several single-character-wide columns.
 So for each token, 
\begin_inset Formula $t$
\end_inset

, and for every column intersecting that token, 
\begin_inset Formula $i$
\end_inset

, we add the following constraint:
\end_layout

\begin_layout Standard
\begin_inset Formula 
\[
is\_partnum_{i}\leq is\_partnum\_val_{t}+is\_partnum\_dec_{t}+is\_unclassified_{t}
\]

\end_inset


\end_layout

\begin_layout Standard
(and the same for 
\begin_inset Formula $ordercode$
\end_inset

).
\end_layout

\begin_layout Standard
So at this point we have expressed the problem as a huge set of binary integer variables, with linear constraints between them.
 A MIP solver can now optimise any linear cost function involving those variables, subject to those constraints.
 We choose our objective function to encode the probability of a labelling, which we maximise to give the maximum-likelihood estimate (which tells us which tokens are values).
\end_layout

\begin_layout Standard
Given a number of naive, independent classifiers, their joint probability is their product, which is what we want to maximise.
\end_layout

\begin_layout Standard
\begin_inset Formula 
\[
\max\, p(v_{1}\ldots v_{n})=\max\,\prod_{i=1}^{n}p(v_{i})
\]

\end_inset


\end_layout

\begin_layout Standard
The product term is not compatible with linear programming, so we note that the result of the maximisation is not affected by taking logs, which usefully turns products into sums:
\begin_inset Formula 
\[
\max\,\log\, p(v_{1}\ldots v_{n})=\max\,\sum_{i=1}^{n}\log\, p(v_{i})
\]

\end_inset


\end_layout
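
\begin_layout Standard
Concretely, in PuLP the log-probabilities simply become the weights of the binary indicator variables in the objective.
 Continuing the little sketch from above, with made-up weights:
\end_layout

\begin_layout LyX-Code
import math
\end_layout

\begin_layout LyX-Code
lp += math.log(0.8)*is_ordercode + math.log(0.4)*is_partnum + math.log(0.3)*is_unknown
\end_layout

\begin_layout LyX-Code
lp.solve()
\end_layout

\begin_layout LyX-Code
print(is_ordercode.varValue, is_partnum.varValue, is_unknown.varValue)
\end_layout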

\begin_layout Standard
This sum is exactly the objective function we give to our MIP.
 For this example I assign a probability to a token labelling based only on the content of the token.
 The three important cases I used were:
\end_layout

\begin_layout Standard
\begin_inset Formula 
\[
p(token_{t}=partnum\_val|content.endswith("-ND"))=0.8
\]

\end_inset


\end_layout

\begin_layout Standard
\begin_inset Formula 
\[
p(token_{t}=ordercode\_val|content.contains("-"))=0.4
\]

\end_inset


\end_layout

\begin_layout Standard
\begin_inset Formula 
\[
p(token_{t}=partnum\_dec|content.contains("Digi-Key"))=0.9
\]

\end_inset


\end_layout

\begin_layout Standard
A catch-all case when no specific clues are present is
\end_layout

\begin_layout Standard
\begin_inset Formula 
\[
p(token_{t}=unclassified|none\, of\, the\, above)=0.3
\]

\end_inset


\end_layout

\begin_layout Standard
You should theoretically fill in the missing combinations to make sure the probabilities add up to one, but in practice it does not matter too much.
\end_layout

\begin_layout Standard
So now we can express the objective function for our maximum-likelihood estimate as
\begin_inset Formula 
\[
\max\,\sum_{t}\sum_{class}is\_class_{t}\,\log\, p(token_{t}=class|content_{t})
\]

\end_inset


\end_layout

\begin_layout Standard
which an integer programming package like PuLP can solve very quickly.
 I have coded up this example in Python (table.py and table_lp.py in this repository) to demonstrate how easy it is to encode once the constraint model has been worked out on paper.
\end_layout

\begin_layout Subsection
Results
\end_layout

\end_body
\end_document
--------------------------------------------------------------------------------