├── .gitignore
├── README.rst
├── pip-req.txt
├── src
│   ├── dataset_manage.py
│   ├── evaluate_manage.py
│   ├── extract_manage.py
│   ├── plot_manage.py
│   ├── settings.py-template
│   ├── tee
│   └── txtexeval
│       ├── __init__.py
│       ├── data.py
│       ├── evaluation.py
│       ├── extractor.py
│       └── util
│           ├── __init__.py
│           ├── common.py
│           └── zemanta
│               ├── __init__.py
│               ├── client.py
│               ├── thrift
│               │   ├── ceservice.thrift
│               │   └── generate_thrift.sh
│               └── thriftgen
│                   ├── __init__.py
│                   └── ceservice
│                       ├── ExtractorService-remote
│                       ├── ExtractorService.py
│                       ├── __init__.py
│                       ├── constants.py
│                       └── ttypes.py
└── tests
    ├── test_evaluation.py
    ├── test_plot.py
    └── testsrunner.py

/.gitignore:
--------------------------------------------------------------------------------
*.py[co]

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
develop-eggs
.installed.cfg

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox

# Project specific
src/settings.py
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
Text extraction evaluation framework
------------------------------------

Framework for evaluating text extraction algorithms implemented as web services.


Author
------

Tomaž Kovačič


Licence
-------

Copyright (C) 2011 Tomaž Kovačič

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
--------------------------------------------------------------------------------
/pip-req.txt:
--------------------------------------------------------------------------------
BeautifulSoup==3.2.0
PyYAML==3.09
argparse==1.1
chardet==2.0.1
matplotlib==1.1.0svn
numpy==1.5.1
-e git+https://github.com/gfxmonk/python-readability.git@b5639a08225a9a6cc3ccd43f0b5c07b82958ebda#egg=python_readability-0.0.0-py2.6-dev
unittest2==0.5.1
Thrift==0.1
justext==1.1
lxml==2.3
selenium==2.2.0
--------------------------------------------------------------------------------
/src/dataset_manage.py:
--------------------------------------------------------------------------------
'''
Script for generating meta data files and preprocessing datasets.

Throughout the script we're assuming the following structure
of the directory that settings.PATH_LOCAL_DATA points to.

|-- datasets
|   |-- testdataset
|   |   |-- clean
|   |   |   `-- example.txt
|   |   |-- meta.yaml  ----> this is where the output will reside
|   |   `-- raw
|   |       `-- example.html
|-- plot-output
|   `-- ...
`-- results-cache
    `-- ...
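
An entry in meta.yaml (written by the processors below) looks roughly
like this -- an illustrative cleaneval-style sketch, all values made up:

- clean: 1-cleaned.txt
  clean_encoding: utf-8
  id: '1'
  meta:
    encoding: utf-8
    id: '1'
    title: null
  raw: 1.html
  raw_encoding: utf-8
  url: null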
'''
import os
import sys
import re
import codecs
import logging

import yaml
import argparse
import chardet
from BeautifulSoup import BeautifulSoup

from txtexeval.util import check_local_path, get_local_path

# module logger
logger = logging.getLogger()

# exceptions

class MetaGeneratorError(Exception):
    pass

class PreprocessingError(Exception):
    pass

class SkipTrigger(ValueError):
    pass

# private helpers

def _verify_args(args):
    # verify arguments provided by argparse and
    # return the path to the output directory

    # print the arguments
    print 'dataset type: %s' % args.dataset_type
    print 'dataset name: %s' % args.dataset_name

    # validate dataset name
    if not check_local_path(args.dataset_name):
        print 'error: this dataset does not exist'
        sys.exit(-1)

    # validate path argument
    if args.path and not os.path.exists(args.path):
        print 'error: path does not exist'
        sys.exit(-1)

    output_dir = args.path or get_local_path(args.dataset_name)
    print 'output directory: %s' % output_dir
    return output_dir


def _get_attribute(tag, name):
    # params: BS tag and attribute name
    # return None or the attribute value
    # takes care of encoding
    try:
        return tag[name].encode('ascii', 'ignore')
    except KeyError:
        return None
regex_BEG = re.compile(r'(?P<text_tag>^(\s*)<(\s*)text((\s*)(id|title|encoding)(\s*)=(\s*)"(.*)")*(\s*)>)')
regex_END = re.compile(r'(?P<closing_text_tag><(\s*)/(\s*)text(\s*)>(.*)$)')
def _remove_text_tag(html_string, filename):
    # Cleaneval has a <text> tag that wraps the whole html structure. This
    # function removes it with a pessimistic regular expression because we
    # don't want to mess with the rest of the structure with a parser

    # remove at the beginning
    match_start = regex_BEG.match(html_string)
    if match_start:
        logger.debug('removing text tag in %s: %s', filename, match_start.group('text_tag'))
        html_string = regex_BEG.sub('', html_string)
    else:
        raise PreprocessingError('no starting text tag in %s' % filename)

    # remove at the end
    match_end = regex_END.search(html_string)
    if match_end:
        logger.debug('removing closing text tag in %s: %s', filename, match_end.group('closing_text_tag'))
        html_string = regex_END.sub('', html_string)
    else:
        raise PreprocessingError('no closing text tag in %s' % filename)

    return html_string

def _get_charset(html_string, raw_filename):
    # based on a string that represents the html document
    # get the charset from the meta http-equiv tag, e.g.
    # <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
    # or the html5 style <meta charset="utf-8">
    # return None if no such tag was found
    # raw_filename is used only for logging
    charset = None

    soup = BeautifulSoup(html_string)
    r_ct = re.compile('[Cc]ontent-[Tt]ype|CONTENT-TYPE')
    r_cont = re.compile('\s*text\s*/\s*html\s*;\s*charset\s*=\s*(?P<charset>[a-zA-Z0-9_-]+)')

    for tag in soup.findAll('meta'):

        if tag.has_key('http-equiv') and tag.has_key('content') and r_ct.match(tag['http-equiv']):
            content = tag['content'].lower()
            match = r_cont.match(content)
            if match:
                charset = match.group('charset')
                logger.debug('charset %s found via meta http-equiv in %s', charset, raw_filename)
            else:
                logger.warn('meta http-equiv exists but it does not match the content regex in %s: %s', raw_filename, str(tag))

        elif tag.has_key('http-equiv') and not tag.has_key('content'):
            logger.warn('no content attribute in meta http-equiv tag in %s: %s', raw_filename, str(tag))

        elif tag.has_key('charset'):
            charset = tag['charset']
            logger.info('charset %s found via meta charset (html5 style) in %s', charset, raw_filename)

    if not charset:
        logger.debug('no meta tag with charset definition in %s', raw_filename)

    return charset

def _get_safe_encoding_name(encoding):
    if encoding == None:
        raise MetaGeneratorError('no encoding given')
    try:
        codec = codecs.lookup(encoding)
    except LookupError:
        raise MetaGeneratorError('no safe encoding name is found for %s' % encoding)
    else:
        return codec.name
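
# A quick illustration of what the codecs based normalization above buys us
# (interactive sketch; the exact aliases depend on your Python build):
#
#   >>> import codecs
#   >>> codecs.lookup('UTF8').name
#   'utf-8'
#   >>> codecs.lookup('latin1').name
#   'iso8859-1'
#   >>> codecs.lookup('no-such-codec')
#   Traceback (most recent call last):
#       ...
#   LookupError: unknown encoding: no-such-codec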
def _skip_file(regex, raw_filename):
    # if the filename does not match the given regular expression
    # then raise the skip trigger
    if not regex.match(raw_filename):
        logger.debug('skipping file %s', raw_filename)
        raise SkipTrigger

# decorators

def iterate_raw_filename(method):
    def wrap(self):
        for raw_filename in self._raw_filenames():
            try:
                method(self, raw_filename)
            except SkipTrigger:
                continue
    return wrap

def dump_meta_data(method):
    def wrap(self, *args, **kwargs):
        method(self, *args, **kwargs)
        self._serialize_meta_data()
    return wrap

# dataset specific processor classes

class BaseProcessor(object):

    def __init__(self, output_dir, dataset_name):
        self.dataset_name = dataset_name
        self._dataset_dir = get_local_path(dataset_name)
        self._output_dir = output_dir
        self.meta_data_list = []  # list to be serialized

    def _raw_filenames(self):
        return os.listdir(os.path.join(self._dataset_dir, 'raw'))

    def _clean_filenames(self):
        return os.listdir(os.path.join(self._dataset_dir, 'clean'))

    def _serialize_meta_data(self):
        with open(os.path.join(self._output_dir, 'meta.yaml'), 'w') as meta_file:
            meta_string = yaml.dump(self.meta_data_list, default_flow_style=False)
            meta_file.write(meta_string)


class GooglenewsProcessor(BaseProcessor):

    re_TAIL = re.compile(r'(?P<id>.+)\.html$')

    @dump_meta_data
    @iterate_raw_filename
    def generate_meta_data(self, raw_filename):
        _skip_file(self.re_TAIL, raw_filename)

        with open(os.path.join(self._dataset_dir, 'raw', raw_filename), 'r') as f:
            # check for a cleaned file counterpart
            if not os.path.exists(os.path.join(self._dataset_dir, 'clean', raw_filename)):
                raise MetaGeneratorError('No existing clean file counterpart for %s' % raw_filename)

            html_string = f.read()

        charset = _get_charset(html_string, raw_filename)
        confidence = None
        # if no charset is retrieved with document parsing
        # use the chardet library to detect the encoding
        if charset:
            raw_encoding = charset
        else:
            det = chardet.detect(html_string)
            raw_encoding = det['encoding']
            confidence = det['confidence']
            logger.debug('detected encoding %s in %s with confidence %f', raw_encoding, raw_filename, confidence)

        safe_raw_encoding = _get_safe_encoding_name(raw_encoding)

        self.meta_data_list.append(dict(
            id = self.re_TAIL.match(raw_filename).group('id'),
            url = None,
            raw_encoding = safe_raw_encoding,
            clean_encoding = safe_raw_encoding, # TODO: must verify if this is always true
            raw = raw_filename,
            clean = raw_filename,
            meta = {'encoding_confidence': confidence}
        ))


class CleanevalProcessor(BaseProcessor):

    re_BACK = re.compile(r'^(?P<id>\d+)\.html\.backup$')
    re_NEW = re.compile(r'^\d+\.html$')

    @iterate_raw_filename
    def create_backups(self, raw_filename):
        # rename every unprocessed [number].html to [number].html.backup

        raw_filename_path = os.path.join(self._dataset_dir, 'raw', raw_filename)
        backup_path = raw_filename_path + '.backup'
        logger.debug('renaming %s to %s', raw_filename, raw_filename + '.backup')
        os.rename(raw_filename_path, backup_path)
    @dump_meta_data
    @iterate_raw_filename
    def generate_meta_data(self, raw_filename):
        _skip_file(self.re_BACK, raw_filename)
        with open(os.path.join(self._dataset_dir, 'raw', raw_filename), 'r') as f:
            html_string = f.read()

        # check for an existing clean file counterpart
        # FIXME: this is a hack, because cleaneval-final uses only [number].txt
        # while cleaneval-dev uses [number]-cleaned.txt
        if self.dataset_name == 'cleaneval-final':
            clean_filename = self.re_BACK.match(raw_filename).group('id') + '.txt'
        else:
            clean_filename = self.re_BACK.match(raw_filename).group('id') + '-cleaned.txt'
        if not os.path.exists(os.path.join(self._dataset_dir, 'clean', clean_filename)):
            msg = 'No existing clean file counterpart for %s' % raw_filename
            logger.warning(msg)
            raise SkipTrigger(msg)

        # get meta data from the <text> tag
        soup = BeautifulSoup(html_string)
        text_tag = soup.find('text')
        if text_tag == None:
            raise MetaGeneratorError('No <text> tag in %s' % raw_filename)
        encoding = text_tag.get('encoding', None)

        # extract dataset specific meta data and store it in a dict with
        # the keys id, title and encoding
        # since we'll be removing the <text> tag from every document
        # we better store these attributes in their original form in meta.yaml
        cleaneval_specific = {
            'id': _get_attribute(text_tag, 'id'),
            'title': _get_attribute(text_tag, 'title'),
            'encoding': _get_attribute(text_tag, 'encoding'),
        }

        # get a safe encoding name
        try:
            safe_encoding = _get_safe_encoding_name(encoding)
        except MetaGeneratorError:
            det = chardet.detect(html_string)
            safe_encoding = _get_safe_encoding_name(det['encoding'])
            logger.info('detected encoding %s in %s with confidence %f', safe_encoding, raw_filename, det['confidence'])

        logger.debug('generating meta data for %s', raw_filename)
        self.meta_data_list.append(dict(
            id = self.re_BACK.match(raw_filename).group('id'),
            url = None,
            raw_encoding = safe_encoding,
            # according to the annotation guidelines of cleaneval
            # all cleaned text files are utf-8 encoded
            clean_encoding = 'utf-8',
            # we'll be generating [number].html in the preprocessing phase
            raw = raw_filename.replace('.backup', ''),
            clean = clean_filename,
            meta = cleaneval_specific
        ))

    @iterate_raw_filename
    def preprocess(self, raw_filename):
        # remove all <text> tags
        # add missing <html> and <body> tags where needed

        _skip_file(self.re_BACK, raw_filename)
        with open(os.path.join(self._dataset_dir, 'raw', raw_filename), 'r') as f:
            html_string = _remove_text_tag(f.read(), raw_filename)

        soup = BeautifulSoup(html_string)
        if (not soup.find('html')) and (not soup.find('body')):
            # neither an html nor a body tag
            logger.warn('appending body and html tags to %s', raw_filename)
            html_string = '<html><body>%s</body></html>' % html_string

        elif (not soup.find('html')) or (not soup.find('body')):
            # really weird case
            logger.warning('%s has html tag or body tag but not both', raw_filename)
        else:
            logger.info('no tag appending on %s', raw_filename)

        output_filename = raw_filename.replace('.backup', '')
        logger.debug('preprocessing complete: %s ---> %s', raw_filename, output_filename)
        with open(os.path.join(self._dataset_dir, 'raw', output_filename), 'w') as output:
            output.write(html_string)
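
# Example invocation (sketch; assumes a local dataset named 'cleaneval-final'
# already exists under settings.PATH_LOCAL_DATA):
#
#   $ python dataset_manage.py cleaneval cleaneval-final -v
#
# which creates the .backup copies, writes meta.yaml into the dataset
# directory and rewrites every [number].html without its <text> wrapper.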
def parse_args(args):
    # sys argument parsing using argparse
    parser = argparse.ArgumentParser(description = 'Tool for generating meta data files and preprocessing datasets')
    parser.add_argument('dataset_type', choices = ('cleaneval', 'gnews'), help = 'dataset type e.g. cleaneval')
    parser.add_argument('dataset_name', help = 'name of the dataset')
    parser.add_argument('-p', '--path', help = 'path to the meta data output file and .log file (uses the default path if not provided)')
    parser.add_argument('-v', '--verbose', action = 'store_true', help = 'print log to console')
    return parser.parse_args(args)

def main(args):
    pargs = parse_args(args)
    # get the output directory - this is where the .yaml and .log file will reside
    output_dir = _verify_args(pargs)

    # now we can initialize logging
    print 'log: %s' % os.path.join(output_dir, 'preproc.log')
    logging.basicConfig(filename = os.path.join(output_dir, 'preproc.log'), level = logging.DEBUG)

    # add a console handler to the root logger if the user provides a --verbose flag
    if pargs.verbose:
        console = logging.StreamHandler()
        formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s')
        console.setFormatter(formatter)
        console.setLevel(logging.DEBUG)
        logging.getLogger().addHandler(console)

    if pargs.dataset_type == 'cleaneval':
        processor = CleanevalProcessor(output_dir, pargs.dataset_name)
        try:
            print '[CREATE BACKUPS]'
            processor.create_backups()
            print '[GENERATING META DATA]'
            processor.generate_meta_data()
            print '[PREPROCESSING]'
            processor.preprocess()
        except MetaGeneratorError as e:
            print 'META DATA RELATED ERROR:'
            print e
            sys.exit(-1)
        except PreprocessingError as e:
            print 'PREPROCESSING ERROR:'
            print e
            sys.exit(-1)

    elif pargs.dataset_type == 'gnews':
        processor = GooglenewsProcessor(output_dir, pargs.dataset_name)
        try:
            print '[GENERATING META DATA]'
            processor.generate_meta_data()

        except MetaGeneratorError as e:
            print 'META DATA RELATED ERROR:'
            print e
            sys.exit(-1)
        except PreprocessingError as e:
            print 'PREPROCESSING ERROR:'
            print e
            sys.exit(-1)

    print '[DONE]'


if __name__ == '__main__':
    import sys
    main(sys.argv[1:])
--------------------------------------------------------------------------------
/src/evaluate_manage.py:
--------------------------------------------------------------------------------
'''
Script for generating evaluation results
'''
import os
import logging

import argparse

import settings
from txtexeval.extractor import extractor_list, get_extractor_cls
from txtexeval.data import LocalDatasetLoader, LocalResultStorage
from txtexeval.data import DataError
from txtexeval.evaluation import TextBasedResults, TextOnlyEvaluator
from txtexeval.evaluation import from_document_factory, dataset_format_map

logger = logging.getLogger()
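
# Example invocation (sketch; extractor slugs depend on your extractor_list):
#
#   $ python evaluate_manage.py cleaneval cleaneval-final -v
#
# or, to refresh the cached results of a single extractor only:
#
#   $ python evaluate_manage.py cleaneval cleaneval-final -u <extractor-slug>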
def single_evaluation(extractor_cls, results, dataset_type, dataset_name):
    logger.info('started evaluating extractor %s', extractor_cls.NAME)
    results.set_extractor(extractor_cls.SLUG)
    storage = LocalResultStorage(dataset_name, extractor_cls)

    loader = LocalDatasetLoader(dataset_name)
    for doc in loader:
        logger.debug('doc: %s', doc.id)
        format_clean = from_document_factory(doc, slug = dataset_type)
        try:
            result_string = storage.fetch_result(doc)
        except DataError:
            logger.info('no stored result for %s at %s extractor',
                doc.id, extractor_cls.NAME)
            continue
        else:
            format_result = extractor_cls.formatted_result(result_string)
            evaluator = TextOnlyEvaluator(
                retrieved = format_result,
                relevant = format_clean,
                id = doc.id)
            results.add_result(evaluator.get_eval_results())

def local_evaluate(dataset_type, dataset_name, update_ext_slug = None):
    results = TextBasedResults()

    if update_ext_slug:
        results.load(dataset_name)
        ex_cls = get_extractor_cls(update_ext_slug)
        single_evaluation(ex_cls, results, dataset_type, dataset_name)
    else:
        for extractor_cls in extractor_list:
            single_evaluation(extractor_cls, results, dataset_type, dataset_name)

    results.dataset_len = len(LocalDatasetLoader(dataset_name))
    results.save(dataset_name)
    results.print_results()

def parse_args(args):
    '''Sys argument parsing through argparse'''
    parser = argparse.ArgumentParser(description = 'Tool for generating evaluation results')
    parser.add_argument('dataset_type', choices = [i[0] for i in dataset_format_map], help = 'dataset type e.g. cleaneval')
    parser.add_argument('dataset_name', help = 'name of the dataset')
    parser.add_argument('-v', '--verbose', action = 'store_true', help = 'print log to console')
    parser.add_argument('-u', '--update', choices = [e.SLUG for e in extractor_list], help = 'update the results for a single extractor')
    return parser.parse_args(args)

def logging_setup(verbose):
    '''Set verbose to True if you want the log to appear on stderr'''
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)
    logd = os.path.join(settings.PATH_LOCAL_DATA, 'results-cache', 'results.log')
    file_handler = logging.FileHandler(filename = logd)
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s'))
    logger.addHandler(file_handler)
    if verbose:
        console = logging.StreamHandler()
        console.setLevel(logging.DEBUG)
        console.setFormatter(logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s'))
        logger.addHandler(console)
    print 'log: %s' % logd

def main(args):
    pargs = parse_args(args)
    logging_setup(pargs.verbose)
    print '[STARTED]'
    local_evaluate(pargs.dataset_type, pargs.dataset_name, pargs.update)
    print '[DONE]'

if __name__ == '__main__':
    import sys
    main(sys.argv[1:])
--------------------------------------------------------------------------------
/src/extract_manage.py:
--------------------------------------------------------------------------------
'''
Script for extracting article text from dataset instances
'''
import time
import logging

import argparse

from txtexeval.extractor import get_extractor_cls, extractor_list
from txtexeval.data import LocalDatasetLoader, LocalResultStorage
from txtexeval.util import get_local_path

logger = logging.getLogger()
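
# Example invocations (sketch; the available extractor slugs depend on
# extractor_list): a throttled first pass over a dataset, then a second
# pass that retries only the instances recorded as failed in summary.yaml:
#
#   $ python extract_manage.py <extractor-slug> cleaneval-final -t 2
#   $ python extract_manage.py <extractor-slug> cleaneval-final -rf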
def local_extract(dataset_name, extractor_slug, timeout, retry_failed, skip_existing):
    # init storage and loader
    ex = get_extractor_cls(extractor_slug)

    failed_slug = extractor_slug if retry_failed else None
    skip_slug = extractor_slug if skip_existing else None

    loader = LocalDatasetLoader(dataset_name,
        load_failed = failed_slug,
        skip_existing = skip_slug)
    storage = LocalResultStorage(dataset_name, ex)

    logger.info('started extracting content from %s dataset using %s', dataset_name, ex.NAME)
    for doc in loader:
        storage.push_result(doc)
        if timeout:
            time.sleep(timeout)

    storage.dump_summary()
    logger.info('finished with %s dataset', dataset_name)

def parse_args(args):
    '''Sys argument parsing through argparse'''
    ex_list = [e.SLUG for e in extractor_list]
    parser = argparse.ArgumentParser(description = 'Tool for extracting article text from dataset instances')
    parser.add_argument('extractor', choices = ex_list, help = 'extractor slug')
    parser.add_argument('dataset_name', help = 'name of the dataset')
    parser.add_argument('-v', '--verbose', action = 'store_true', help = 'print log to console')
    parser.add_argument('-t', '--timeout', type = int, default = 0, help = 'wait x seconds between extraction operations')
    parser.add_argument('-rf', '--retry_failed', action = 'store_true', help = 'retry extraction on instances that failed')
    parser.add_argument('-se', '--skip_existing', action = 'store_true', help = 'skip all documents that already have their result stored in the database/filesystem')
    return parser.parse_args(args)

def logging_setup(verbose, output_path):
    '''Set verbose to True if you want the log to appear on stderr'''
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)
    file_handler = logging.FileHandler(filename = output_path)
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s'))
    logger.addHandler(file_handler)

    if verbose:
        console = logging.StreamHandler()
        console.setLevel(logging.DEBUG)
        console.setFormatter(logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s'))
        logger.addHandler(console)

def main(args):
    pargs = parse_args(args)
    logging_setup(pargs.verbose, get_local_path(pargs.dataset_name, 'result', 'result.log'))

    print '[STARTED]'
    local_extract(pargs.dataset_name, pargs.extractor,
        pargs.timeout, pargs.retry_failed, pargs.skip_existing)
    print '[DONE]'

if __name__ == '__main__':
    import sys
    main(sys.argv[1:])
--------------------------------------------------------------------------------
/src/plot_manage.py:
--------------------------------------------------------------------------------
'''
Script for plotting evaluation results.
'''
import os
import math

import argparse
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

import settings
from txtexeval.evaluation import TextBasedResults
from txtexeval.extractor import extractor_list, get_extractor_cls

def extractor_list_filter(extractor_slugs):
    '''
    Produce a filtered extractor_list based on a list that contains the slugs
    of the desired extractors. We need this because the global extractor_list
    dictates the correct order.
    '''
    return [e for e in extractor_list if e.SLUG in extractor_slugs]


def dataset_stat_latex_print(dataset_name):
    '''
    Print the avg precision, recall and F1 score in LaTeX format
    to the console.
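
    Each emitted row looks roughly like this (illustrative values):

        \texttt{Some extractor} & 0.9123 & 0.8456 & 0.8777 \\ \hline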
    '''
    # get results
    txt_results = TextBasedResults()
    txt_results.load(dataset_name)
    txt_results.print_results()

    # package results
    elist = extractor_list_filter(txt_results.text_eval_results.keys())
    extractor_slugs = tuple([e.SLUG for e in elist])

    result_list = []
    for e in extractor_slugs:
        result_tuple = (
            get_extractor_cls(e).NAME,
            txt_results.precision_statistics(e)[0],
            txt_results.recall_statistics(e)[0],
            txt_results.f1score_statistics(e)[0],
        )
        result_list.append(result_tuple)
    result_list.sort(key = lambda i: i[3])
    result_list.reverse()

    for r in result_list:
        print '\\texttt{%s} & %.4f & %.4f & %.4f \\\\ \\hline' % r


def dataset_stat_plot(dataset_name, img_name):
    '''
    Plot the avg precision, recall and F1 score bar chart for the given
    dataset name.
    '''
    # get results
    txt_results = TextBasedResults()
    txt_results.load(dataset_name)
    txt_results.print_results()

    # package results
    elist = extractor_list_filter(txt_results.text_eval_results.keys())
    extractor_slugs = tuple([e.SLUG for e in elist])
    packaged_data = (
        ('Precision', [(txt_results.precision_statistics(e), e) for e in extractor_slugs]),
        ('Recall', [(txt_results.recall_statistics(e), e) for e in extractor_slugs]),
        ('F1 score', [(txt_results.f1score_statistics(e), e) for e in extractor_slugs]),
    )

    bar_color = ('b', 'c', 'm')
    for i, pdata in enumerate(packaged_data):

        # package plotting values
        num_of_extractors = len(extractor_slugs)
        ind = np.arange(num_of_extractors)  # the x locations for the groups
        width = 0.6  # the width of the bars

        result_list = pdata[1]
        result_list.sort(key = lambda i: i[0][0])
        result_list.reverse()

        avg = [x[0][0] for x in result_list]
        stddev = [x[0][1] for x in result_list]

        # plot
        plt.subplot(3, 1, i + 1)
        plt.grid(True, alpha = 0.5)

        rects_avg = plt.bar(ind, avg, width, color = bar_color[i], ecolor = 'g',
            yerr = stddev, linewidth = 0.5, alpha = 0.8)

        # labels and titles
        extractor_names = [get_extractor_cls(r[1]).NAME for r in result_list]
        plt.title(pdata[0])
        plt.xticks(ind + width/2., extractor_names, size = 'xx-small', rotation = 'vertical')
        plt.legend( (rects_avg[0],),
            ('avg',),
            fancybox = True,
            prop = dict(size='x-small'),
            loc = 4 # lower right
        )
        for rect in rects_avg:
            height = rect.get_height()
            plt.text(rect.get_x() + rect.get_width()/2.25, rect.get_height() + 0.01,
                '%1.2f' % height, ha = 'center', va = 'bottom', size = 'x-small')

    # subplots adjusting
    plt.subplots_adjust(wspace = 0.5, hspace = 0.9)

    # adjust figure height
    fig = plt.gcf()
    w, h = fig.get_size_inches()
    fig.set_size_inches(w, h * 1.6)

    # output
    out_path = os.path.join(settings.PATH_LOCAL_DATA, 'plot-output', img_name)
    plt.savefig(out_path)
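# A quick sanity check for the bucketing helper defined just below
# (hypothetical values; note that the last bucket is closed on both ends):
#
#   >>> equidistant_count(0, 1, 0.25, [0.1, 0.3, 0.99, 1.0])
#   (1, 1, 0, 2)
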
def equidistant_count(start, stop, step, values):
    '''Return a tuple of equidistant distribution buckets (bin counts).'''
    limit_list = np.arange(start, stop, step)
    count = [0] * len(limit_list)

    for value in values:
        value = float(value)
        assert start <= value <= stop
        for i, low in enumerate(limit_list):
            up = low + step
            if i < (len(limit_list) - 1) and low <= value < up:
                count[i] += 1
                break
            elif i == (len(limit_list) - 1) and low <= value <= up:
                count[i] += 1
                break
    return tuple(count)

def resize_axis_tick_labels(axis, size = 'xx-small'):
    for label in axis.get_ticklabels():
        label.set_size(size)

def extractor_stat_plot(dataset_name, img_name):
    '''Plot the distributions of per-document precision, recall and F1 score'''
    fig = plt.figure()

    # get results and repackage the data
    txt_results = TextBasedResults()
    txt_results.load(dataset_name)
    txt_results.print_results()

    elist = extractor_list_filter(txt_results.text_eval_results.keys())
    for ex_index, extractor_cls in enumerate(elist):

        # repackage results
        extractor_results = txt_results.filtered_results(extractor_cls.SLUG)
        results_list_prec = [r.precision for r in extractor_results]
        results_list_rec = [r.recall for r in extractor_results]
        results_list_f1 = [r.f1_score for r in extractor_results]

        width = 0.05  # the width of the bars
        ind = np.arange(0, 1, width)
        n = len(ind)

        print extractor_cls.NAME
        eq_count_prec = equidistant_count(0, 1, width, results_list_prec)
        print len(results_list_prec)
        print sum(eq_count_prec)
        eq_count_rec = equidistant_count(0, 1, width, results_list_rec)
        print len(results_list_rec)
        print sum(eq_count_rec)
        eq_count_f1 = equidistant_count(0, 1, width, results_list_f1)
        print len(results_list_f1)
        print sum(eq_count_f1)

        # plotting
        ax = fig.add_subplot(6, 3, ex_index + 1, projection = '3d')

        ax.bar3d(ind, np.array([0]*n), np.array([0]*n),
            dx = width, dy = width*2, dz = eq_count_prec,
            color = 'b', linewidth = 0.3, alpha = 0.4)
        ax.bar3d(ind, np.array([1]*n), np.array([0]*n),
            dx = width, dy = width*2, dz = eq_count_rec,
            color = 'c', linewidth = 0.3, alpha = 0.5)
        ax.bar3d(ind, np.array([2]*n), np.array([0]*n),
            dx = width, dy = width*2, dz = eq_count_f1,
            color = 'm', linewidth = 0.3, alpha = 0.8)

        ax.set_title(extractor_cls.NAME, size = 'small')
        ax.set_zlabel('\nnum. of instances', size = 'x-small', linespacing = 1)
        ax.yaxis.set_ticks([])
        resize_axis_tick_labels(ax.xaxis)
        resize_axis_tick_labels(ax.zaxis)
        ax.grid(True, alpha = 0.7)

    # legends are not supported with 3d plotting, so we have to use
    # proxy artists instead
    blue = plt.Rectangle((0, 0), 1, 1, fc = 'b')  # proxies
    cyan = plt.Rectangle((0, 0), 1, 1, fc = 'c')
    mag = plt.Rectangle((0, 0), 1, 1, fc = 'm')
    fig.legend( (blue, cyan, mag),
        ('precision', 'recall', 'f1 score'),
        fancybox = True,
        prop = dict(size='x-small')
    )
    w, h = fig.get_size_inches()
    fig.set_size_inches(w * 1.5, h * 2.5)
    fig.subplots_adjust(wspace = 0.025, hspace = 0.15)

    # save plot
    out_path = os.path.join(settings.PATH_LOCAL_DATA, 'plot-output', img_name)
    fig.savefig(out_path, bbox_inches = 'tight')


def dataset_contents_print_latex(dataset_name):
    '''Print the error case analysis in LaTeX'''
    # get results
    txt_results = TextBasedResults()
    txt_results.load(dataset_name)

    # package data
    elist = extractor_list_filter(txt_results.text_eval_results.keys())
    for e in elist:
        print '\\texttt{%s} & %d & %d & %d & %d & %d & %d \\\\ \\hline' % \
            (
                e.NAME,
                txt_results.result_contents(e.SLUG).rel_empty,
                txt_results.result_contents(e.SLUG).rel_ret_empty,
                txt_results.result_contents(e.SLUG).ret_empty,
                txt_results.result_contents(e.SLUG).missmatch,
                txt_results.result_contents(e.SLUG).fail,
                txt_results.result_contents(e.SLUG).succ,
            )

def dataset_contents_plot(dataset_name, img_name):
    '''Plot the error case analysis.'''
    # get results
    txt_results = TextBasedResults()
    txt_results.load(dataset_name)
    txt_results.print_results()

    # package data
    elist = extractor_list_filter(txt_results.text_eval_results.keys())
    extractor_slugs = tuple([e.SLUG for e in elist])
    package = [
        ('|rel| = 0', '#9DFADE', [txt_results.result_contents(ex).rel_empty for ex in extractor_slugs]),
        ('|rel intersect ret| = 0', '#3C70A3', [txt_results.result_contents(ex).rel_ret_empty for ex in extractor_slugs]),
        ('|ret| = 0', '#5CCBED', [txt_results.result_contents(ex).ret_empty for ex in extractor_slugs]),
        ('mismatch', '#A76CF5', [txt_results.result_contents(ex).missmatch for ex in extractor_slugs]),
        ('failed', '#C43156', [txt_results.result_contents(ex).fail for ex in extractor_slugs]),
        ('successful', '#31C460', [txt_results.result_contents(ex).succ for ex in extractor_slugs]),
    ]
    num_of_extractors = len(extractor_slugs)
    ind = np.arange(num_of_extractors)  # the x locations for the groups
    width = 0.6

    fig = plt.gcf()
    fig.legend( [plt.Rectangle((0, 0), 1, 1, fc = p[1]) for p in package],
        [p[0] for p in package],
        fancybox = True,
        prop = dict(size='x-small'),
    )

    # with successful instances
    ax1 = plt.subplot(121)
    bottom_y = np.zeros(num_of_extractors)
    for pdata in package:
        ax1.bar(ind, pdata[2], width, bottom = bottom_y, color = pdata[1],
            ecolor = 'g', linewidth = 0.2, alpha = 0.95)
        bottom_y += pdata[2]

    # without successful instances
    ax2 = plt.subplot(122)
    bottom_y = np.zeros(num_of_extractors)
    del package[-1]
    for pdata in package:
        ax2.bar(ind, pdata[2], width, bottom = bottom_y, color = pdata[1],
            ecolor = 'g', linewidth = 0.2, alpha = 0.95)
        bottom_y += pdata[2]

    # xticks labels
    extractor_names = [get_extractor_cls(e).NAME for e in extractor_slugs]
    ax1.set_xticks(ind + width/2.)
    ax1.set_xticklabels(extractor_names, size = 'xx-small', rotation = 'vertical')
    ax2.set_xticks(ind + width/2.)
    ax2.set_xticklabels(extractor_names, size = 'xx-small', rotation = 'vertical')

    # grid settings
    fig.suptitle('Boundary cases')
    ax1.grid(True, alpha = 0.5)
    ax2.grid(True, alpha = 0.5)

    # adjustment
    w, h = fig.get_size_inches()
    fig.set_size_inches(w * 1.5, h * 1.5)
    fig.subplots_adjust(bottom = 0.2)

    # output
    out_path = os.path.join(settings.PATH_LOCAL_DATA, 'plot-output', img_name)
    fig.savefig(out_path, bbox_inches = 'tight')

def parse_args(args):
    parser = argparse.ArgumentParser(description = 'Plotting tool')
    parser.add_argument('action', choices = ('dataset_stat', 'extr_stat', 'contents', 'contents_latex', 'dataset_latex'))
    parser.add_argument('dataset_name', help = 'name of the dataset')
    parser.add_argument('-f', '--format', type = str, help = 'output format: png, pdf, ps, eps or svg')
    return parser.parse_args(args)

def main(args):
    pargs = parse_args(args)

    output_img_name = '%s-%s' % (pargs.dataset_name, pargs.action)
    if pargs.format:
        output_img_name = '%s.%s' % (output_img_name, pargs.format)
    else:
        output_img_name = '%s.%s' % (output_img_name, 'png')

    if pargs.action == 'dataset_stat':
        dataset_stat_plot(pargs.dataset_name, output_img_name)
    elif pargs.action == 'dataset_latex':
        dataset_stat_latex_print(pargs.dataset_name)
    elif pargs.action == 'extr_stat':
        extractor_stat_plot(pargs.dataset_name, output_img_name)
    elif pargs.action == 'contents':
        dataset_contents_plot(pargs.dataset_name, output_img_name)
    elif pargs.action == 'contents_latex':
        dataset_contents_print_latex(pargs.dataset_name)

    print '[DONE]'

if __name__ == '__main__':
    import sys
    main(sys.argv[1:])
--------------------------------------------------------------------------------
/src/settings.py-template:
--------------------------------------------------------------------------------
# path to the local root data directory
PATH_LOCAL_DATA = '/home/you/data/'

# path to the remote root data directory
PATH_REMOTE_DATA = 'http://example.com/data/'

# api keys (you'll have to obtain these yourself)
ALCHEMY_API_KEY = ''
DIFFBOT_KEY = ''
REPUSTATE_API_KEY = ''
EXTRACTIV_API_KEY = ''

# MSS api endpoint provided by Jeffrey Pasternack
# (I'm not allowed to distribute this url)
MSS_URL = (
    ('text', ''),
    ('offset', ''),
)

# Boilerpipe API url (https://github.com/tomazk/Java-Text-Extractor-API)
BOILERPIPE_API_ENDPOINT = 'http://yourdomain/boilerpipe/extract/'

# Goose API url (https://github.com/tomazk/Java-Text-Extractor-API)
GOOSE_API_ENDPOINT = 'http://yourdomain/goose/extract/'

# TTR API url (https://github.com/tomazk/Java-Text-Extractor-API)
TTR_API_ENDPOINT = 'http://yourdomain/ttr/extract/'

# Readability API (https://github.com/tomazk/Simple-Readability-API)
READABILITY_ENDPOINT = 'http://yourdomain/extract/'

# Trendiction API
TRENDICTION_ENDPOINT = ''

# thrift RPC endpoint provided by Zemanta Ltd
ZEMANTA_THRIFT = (
    ('host', ''),
    ('port', ),
)

# readability bookmarklet location e.g. http://localhost/readability.js
READABILITY_BOOKMARKLET = 'http://yourplace/readability.js'
--------------------------------------------------------------------------------
/src/tee:
--------------------------------------------------------------------------------
#!/usr/bin/python
'''
Common command line tool
'''

import sys

def print_help_exit(msg = ''):
    if msg:
        print msg
    print '''
usage: tee {plot|data|eval|ext|help} [-h] [<arg>,<arg>,...]
    plot    Plotting script
    data    Dataset management script
    eval    Evaluation script
    ext     Extraction management script
    help    Print out help text for the tee command

flags:
    -h      Print out help text for the selected script
'''
    sys.exit(-1)

def main():
    args = sys.argv[1:]
    if len(args) == 0:
        print_help_exit('Not enough arguments')
    if args[0] not in ('plot', 'data', 'eval', 'ext', 'help'):
        print_help_exit('Unknown command')

    if args[0] == 'help':
        print_help_exit()
    elif args[0] == 'plot':
        import plot_manage
        plot_manage.main(args[1:])
    elif args[0] == 'data':
        import dataset_manage
        dataset_manage.main(args[1:])
    elif args[0] == 'eval':
        import evaluate_manage
        evaluate_manage.main(args[1:])
    elif args[0] == 'ext':
        import extract_manage
        extract_manage.main(args[1:])

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/src/txtexeval/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomazk/Text-Extraction-Evaluation/06d6070d895f1bae604dfaf10fa6537700d59e34/src/txtexeval/__init__.py
--------------------------------------------------------------------------------
/src/txtexeval/data.py:
--------------------------------------------------------------------------------
import os
import urlparse
import codecs
import logging

import yaml

import settings
from .util import check_local_path, get_local_path
from .extractor import extractor_list, get_extractor_cls
from .extractor import ExtractorError, ContentExtractorError

logger = logging.getLogger(__name__)

class DataError(Exception):
    pass

def verify_local_dataset(init):
    def wrapper(self, dataset, *args, **kwargs):
        if not check_local_path(dataset):
            raise DataError('local dataset %s does not exist' % dataset)
        init(self, dataset, *args, **kwargs)
    return wrapper

class BaseDatasetLoader(object):
    '''
    If you want a loader with a different backend (e.g. a database), just
    extend this class and implement an __iter__ method that returns an
    iterator over document instances.
    '''

    def __iter__(self):
        raise NotImplementedError

class LocalDatasetLoader(BaseDatasetLoader):
    '''Dataset loader using the local filesystem'''

    @verify_local_dataset
    def __init__(self, dataset_name, load_failed = None, skip_existing = None):
        self.dataset = dataset_name
        self._skip_existing = skip_existing

        # load meta data
        meta_filepath = get_local_path(dataset_name, 'meta.yaml')
        with open(meta_filepath, 'r') as f:
            self.meta_yaml = yaml.load(f.read())
        self._len = len(self.meta_yaml)

        if load_failed:
            self._failed_list = ExtractionSummary(self.dataset) \
                .get_failed_ids(load_failed)
        else:
            self._failed_list = None

    def __iter__(self):
        '''LocalDocument generator'''
        for entry in self.meta_yaml:
            document = LocalDocument(self.dataset, **entry)

            # check if all conditions for yielding a document are met
            yield_ = True
            if self._skip_existing != None and \
                document.check_existing_clean(self._skip_existing):
                yield_ = False
            elif self._failed_list != None and \
                entry['id'] not in self._failed_list:
                yield_ = False

            if yield_:
                yield document
            else:
                logger.debug('skipping document %s', document.id)
                continue

    def __len__(self):
        return self._len


class BaseDocument(object):
    # the same extension pattern applies to document instances

    def get_raw_html(self):
        pass

    def get_url(self):
        pass

    def get_url_local(self):
        pass

    def get_clean(self):
        pass

class LocalDocument(BaseDocument):
    '''Evaluation data representation using the local filesystem'''

    def __init__(self, dataset, **kwargs):
        self.dataset = dataset

        # instance attributes
        self.id = kwargs.pop('id')
        self.raw_filename = kwargs.pop('raw')
        self.clean_filename = kwargs.pop('clean')
        self.url = kwargs.pop('url')
        self.raw_encoding = kwargs.pop('raw_encoding')
        self.clean_encoding = kwargs.pop('clean_encoding')

    def get_raw_html(self):
        file_path = get_local_path(self.dataset, 'raw', self.raw_filename)
        with codecs.open(file_path, 'r', encoding = self.raw_encoding, errors = 'ignore') as f:
            return f.read()

    def get_url(self):
        if self.url:
            return self.url
        else:
            tail = self.dataset + '/' + self.raw_filename
            return urlparse.urljoin(settings.PATH_REMOTE_DATA, tail)

    def get_url_local(self):
        # e.g. file:///home/you/data/datasets/cleaneval-final/raw/100.html
        return 'file://' + settings.PATH_LOCAL_DATA + '/datasets/' \
            + self.dataset + '/raw/' + self.raw_filename

    def get_clean(self):
        file_path = get_local_path(self.dataset, 'clean', self.clean_filename)
        with open(file_path, 'r') as f:
            return f.read()

    def check_existing_clean(self, extractor_slug):
        ex_cls = get_extractor_cls(extractor_slug)
        return check_local_path(self.dataset, 'result', extractor_slug,
            '%s.%s' % (self.id, ex_cls.FORMAT))
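
# A minimal sketch of a loader with a different backend, as suggested by the
# BaseDatasetLoader docstring above. Everything below is hypothetical (this
# project ships no sqlite storage); it assumes the meta data rows live in a
# database while the raw/clean files stay on disk:
#
#   import sqlite3
#
#   class SqliteDatasetLoader(BaseDatasetLoader):
#
#       def __init__(self, db_path, dataset_name):
#           self.dataset = dataset_name
#           self._conn = sqlite3.connect(db_path)
#
#       def __iter__(self):
#           cursor = self._conn.execute(
#               'SELECT id, raw, clean, url, raw_encoding, clean_encoding '
#               'FROM documents WHERE dataset = ?', (self.dataset,))
#           for row in cursor:
#               # LocalDocument expects exactly these keyword arguments
#               yield LocalDocument(self.dataset, id = row[0], raw = row[1],
#                   clean = row[2], url = row[3], raw_encoding = row[4],
#                   clean_encoding = row[5])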

class ExtractionSummary(object):

    @verify_local_dataset
    def __init__(self, dataset_name, extractor_slug = None):
        self._summary_path = get_local_path(dataset_name, 'result', 'summary.yaml')

        if os.path.exists(self._summary_path):
            with open(self._summary_path, 'r') as f:
                self._summary_structure = yaml.load(f.read())
        else:
            self._summary_structure = {}
            for e in extractor_list:
                self._summary_structure[e.SLUG] = []

        self.set_extractor(extractor_slug)

    def set_extractor(self, extractor_slug):
        if extractor_slug:
            self.extractor_slug = extractor_slug
            self._summary_structure[self.extractor_slug] = []
        else:
            self.extractor_slug = None

    def get_failed_ids(self, extractor_slug):
        if self.extractor_slug:
            raise DataError('extractor_slug set - the list of fails was reinitialized')
        return [f['id'] for f in self._summary_structure[extractor_slug]]

    def add_fail(self, id, reason = None):
        if self.extractor_slug == None:
            raise DataError('extractor not set')

        self._summary_structure[self.extractor_slug].append({
            'id': id,
            'reason': reason
        })

    def serialize(self):
        with open(self._summary_path, 'w') as out:
            out.write(yaml.dump(self._summary_structure, default_flow_style = False))

    def short_summary(self, extractor_slug = None):
        if extractor_slug:
            return 'extraction summary: %i failed' \
                % len(self._summary_structure[extractor_slug])
        elif self.extractor_slug:
            return 'extraction summary: %i failed' \
                % len(self._summary_structure[self.extractor_slug])
        else:
            raise DataError('extractor not set')

class BaseResultStorage(object):

    def __init__(self, dataset_name, extractor_class):
        self.dataset = dataset_name
        self.extractor_cls = extractor_class

    def push_result(self, document):
        pass

    def fetch_result(self, document):
        pass

class LocalResultStorage(BaseResultStorage):

    @verify_local_dataset
    def __init__(self, dataset_name, extractor_class):
        super(LocalResultStorage, self).__init__(dataset_name, extractor_class)

        # with the dataset name out of the way, we must now check the
        # existence of the result folder for the given extractor
        self._result_dir = get_local_path(self.dataset, 'result')

        self._extractor_result_dir = os.path.join(
            self._result_dir,
            self.extractor_cls.SLUG)

        if not os.path.exists(self._extractor_result_dir):
            os.mkdir(self._extractor_result_dir)

        # create an object to be serialized into a .yaml file
        # we need this to store a summary of the extraction process for the
        # whole dataset
        self._summary = ExtractionSummary(self.dataset, self.extractor_cls.SLUG)

    def push_result(self, document):
        extractor = self.extractor_cls(document)
        try:
            result = extractor.extract()
        except DataError as e:
            err_msg = 'Data related error: %r' % e
            logger.warning(err_msg)
            self._summary.add_fail(document.id, err_msg)
        except ContentExtractorError as e:
            err_msg = 'Content extractor related error: %r' % e
            logger.warning(err_msg)
            self._summary.add_fail(document.id, err_msg)
        except ExtractorError as e:
            err_msg = 'Extractor related error: %r' % e
            logger.warning(err_msg)
            self._summary.add_fail(document.id, err_msg)
        except NotImplementedError:
            logger.debug('extraction method is not implemented - do nothing')
        except Exception as e:
            err_msg = 'Unknown error: %r' % e
            logger.warning(err_msg)
            self._summary.add_fail(document.id, err_msg)
        else:
            logger.debug('extracted content from %s', document.id)
            output_file = '%s.%s' % (document.id, self.extractor_cls.FORMAT)
            with open(os.path.join(self._extractor_result_dir, output_file), 'w') as out:
                out.write(result)

    def fetch_result(self, document):
        result_file = '%s.%s' % (document.id, self.extractor_cls.FORMAT)
        result_file_path = os.path.join(self._extractor_result_dir, result_file)
        if not os.path.exists(result_file_path):
            raise DataError('result file %s does not exist' % result_file)
        with open(result_file_path, 'r') as f:
            return f.read()

    def dump_summary(self):
        logger.info(self._summary.short_summary())
        self._summary.serialize()
--------------------------------------------------------------------------------
/src/txtexeval/evaluation.py:
--------------------------------------------------------------------------------
import os
import re
import pickle
import string
import difflib
import math
import logging

from BeautifulSoup import BeautifulSoup

import settings

logger = logging.getLogger(__name__)

# module utils

re_CONTROL = re.compile("[\x00-\x1F]+")
re_WS = re.compile("\s+")
re_NONASCII = re.compile("[\x80-\xFF]+")

def _tokenize_text(dirty_text):
    '''Tokenize dirty text into a normalized list of words'''
    # replace punctuation with whitespace
    table = string.maketrans(string.punctuation, ' ' * len(string.punctuation))
    dirty_text = dirty_text.translate(table)
    # remove any control chars
    dirty_text = re_CONTROL.sub(' ', dirty_text)
    # remove any non ascii chars to mitigate the troubles of broken encodings
    dirty_text = re_NONASCII.sub('', dirty_text)
    # normalize to lowercase
    dirty_text = dirty_text.lower()
    # remove empty tokens
    return filter(lambda w: w != '', re_WS.split(dirty_text))

def _bow(word_tokens):
    '''Return a bag-of-words dictionary built from a list of word tokens'''
    bow = {}
    for i in word_tokens:
        if i not in bow:
            bow[i] = 1
        else:
            bow[i] += 1
    return bow

# results

class Result(object):

    def __init__(self, precision, recall, f1_score, id = None):
        # validate the result
        if math.isinf(precision) and not math.isinf(recall):
            assert recall == 0
            assert math.isnan(f1_score)
        elif not math.isinf(precision) and math.isinf(recall):
            assert precision == 0
            assert math.isnan(f1_score)
        elif math.isinf(precision) and math.isinf(recall):
            assert math.isnan(f1_score)
        elif precision == recall == 0:
            assert math.isinf(f1_score)
        elif not math.isinf(precision) and not math.isinf(recall):
            assert 0 < precision <= 1
            assert 0 < recall <= 1
            assert 0 < f1_score <= 1

        self.precision = precision
        self.recall = recall
        self.f1_score = f1_score
        self.id = id

    @property
    def retrieved_empty(self):
        return math.isinf(self.precision) and self.recall == 0

    @property
    def relevant_empty(self):
        return math.isinf(self.recall) and self.precision == 0

    @property
    def relevant_retrieved_empty(self):
        return math.isinf(self.precision) and math.isinf(self.recall)

    @property
    def missmatch(self):
        return self.precision == self.recall == 0

    @property
    def succ(self):
        return 0 < self.f1_score <= 1

class ResultContents(object):

    def __init__(self, succ, rel_empty, rel_ret_empty, ret_empty, missmatch, dataset_len):
        assert dataset_len >= succ + rel_empty + rel_ret_empty + ret_empty + missmatch

        self.succ = succ
        self.rel_empty = rel_empty
        self.rel_ret_empty = rel_ret_empty
        self.ret_empty = ret_empty
        self.missmatch = missmatch

        self.fail = dataset_len - (succ + rel_empty + rel_ret_empty + ret_empty + missmatch)

class TextBasedResults(object):

    __pickle_path = os.path.join(settings.PATH_LOCAL_DATA, 'results-cache')

    def __init__(self, extractor = None):
        self.text_eval_results = {}
        self.dataset_len = 0
        self._extractor = extractor

        # optional
        if extractor != None:
            self.text_eval_results[extractor] = []

    def save(self, dataset_name):
        '''Pickle the internal state'''
        pickle_path = os.path.join(self.__pickle_path, '%s.pickle' % dataset_name)
        logger.info('saving text based results to: %s', pickle_path)

        with open(pickle_path, 'wb') as f:
            pickle.dump(self.__dict__, f)

    def load(self, dataset_name):
        '''Unpickle the internal state'''
        pickle_path = os.path.join(self.__pickle_path, '%s.pickle' % dataset_name)
        logger.info('loading text based results from: %s', pickle_path)

        try:
            f = open(pickle_path, 'rb')
        except IOError as e:
            logger.warning('no pickle found: %s', repr(e))
        else:
            self.__dict__.update(pickle.load(f))
            f.close()

    def set_extractor(self, extractor):
        self._extractor = extractor
        self.text_eval_results[extractor] = []

    def add_result(self, result):
        if self._extractor == None:
            raise TypeError('extractor not set')
        self.text_eval_results[self._extractor].append(result)

    def filtered_results(self, extractor):
        result_filter = lambda r: r.succ
        return filter(result_filter, self.text_eval_results[extractor])

    def result_contents(self, extractor):
        results = self.text_eval_results[extractor]

        succ = len(self.filtered_results(extractor))
        rel_empty = len(filter(lambda r: r.relevant_empty, results))
        ret_empty = len(filter(lambda r: r.retrieved_empty, results))
        rel_ret_empty = len(filter(lambda r: r.relevant_retrieved_empty, results))
        missmatch = len(filter(lambda r: r.missmatch, results))

        return ResultContents(succ, rel_empty, rel_ret_empty, ret_empty,
            missmatch, self.dataset_len)
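
    # How the boundary cases are encoded (sketch): an extractor that returns
    # no text at all for a document reaches us, via TextOnlyEvaluator, as
    #
    #   r = Result(float('inf'), 0.0, float('nan'))
    #   r.retrieved_empty  # True -> counted in ResultContents.ret_empty
    #
    # while a document whose tokens share nothing with the gold standard
    # yields Result(0, 0, float('inf')), i.e. r.missmatch == True.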

    def _statistics(self, extractor, stat_typ):  # DRY helper
        results_list = [getattr(r, stat_typ) for r in self.filtered_results(extractor)]
        # average
        avg = sum(results_list) / float(len(results_list))
        # standard deviation
        stddev = sum([(r - avg)**2. for r in results_list]) / float(len(results_list))
        stddev = math.sqrt(stddev)
        return avg, stddev

    def precision_statistics(self, extractor):
        '''Return a tuple containing (avg, stddev)'''
        return self._statistics(extractor, 'precision')

    def recall_statistics(self, extractor):
        '''Return a tuple containing (avg, stddev)'''
        return self._statistics(extractor, 'recall')

    def f1score_statistics(self, extractor):
        '''Return a tuple containing (avg, stddev)'''
        return self._statistics(extractor, 'f1_score')

    def print_results(self):
        print 'results based on text based evaluation'
        for extractor in self.text_eval_results.iterkeys():
            print '----------------'
            print 'Ex. name: %s' % extractor
            print 'avg. precision: %f stddev: %f' \
                % self.precision_statistics(extractor)
            print 'avg. recall: %f stddev: %f' \
                % self.recall_statistics(extractor)
            print 'avg. F1 score: %f stddev: %f' \
                % self.f1score_statistics(extractor)

            rcontents = self.result_contents(extractor)
            print 'relevant empty: %d' % rcontents.rel_empty
            print 'retrieved empty: %d' % rcontents.ret_empty
            print 'rel intersect ret empty: %d' % rcontents.rel_ret_empty
            print 'success: %d' % rcontents.succ
            print 'missmatch: %d' % rcontents.missmatch
            print 'fail: %d' % rcontents.fail
            print 'dataset_len=%d' % self.dataset_len

# evaluators

class BaseEvaluator(object):
    '''Outline for evaluators'''

    def __init__(self, retrieved, relevant, id = None):
        self.retrieved = retrieved
        self.relevant = relevant
        self.id = id

    def get_eval_results(self):
        # return an instance of Result
        pass

class TextOnlyEvaluator(BaseEvaluator):

    def get_eval_results(self):

        s = difflib.SequenceMatcher()
        rel = self.relevant.get_word_seq()
        ret = self.retrieved.get_word_seq()

        s.set_seqs(rel, ret)
        matches = s.get_matching_blocks()[:-1]

        rel_intersect_ret = sum(i.size for i in matches) if len(matches) > 0 else 0

        precision = float(rel_intersect_ret) / float(len(ret)) \
            if len(ret) > 0 else float('inf')
        recall = float(rel_intersect_ret) / float(len(rel)) \
            if len(rel) > 0 else float('inf')

        # nan when precision or recall is inf
        f1_score = (2. * precision * recall) / (precision + recall) \
            if precision + recall > 0 else float('inf')

        return Result(precision, recall, f1_score, self.id)
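
# A worked example of the metric (sketch, values computed by hand):
# rel = ['the', 'quick', 'brown', 'fox'] and
# ret = ['the', 'brown', 'fox', 'jumps'] share the matching blocks
# ['the'] and ['brown', 'fox'], so the overlap is 3 tokens and
#
#   precision = 3/4.  recall = 3/4.  f1 = 2*(3/4.)*(3/4.)/(3/2.) = 0.75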
244 | # formats
245 |
246 | class BaseResultFormat(object):
247 |
248 | def get_word_seq(self): # sequence of words
249 | pass
250 |
251 | def get_bow(self): # bag of words
252 | pass
253 |
254 | class TextResultFormat(BaseResultFormat):
255 | '''Basic format for dirty text'''
256 |
257 | def __init__(self, dirty_text):
258 | self._text = dirty_text
259 |
260 | def get_word_seq(self):
261 | return _tokenize_text(self._text)
262 |
263 | def get_bow(self):
264 | return _bow(_tokenize_text(self._text))
265 |
266 | class CleanEvalFormat(BaseResultFormat):
267 | '''Format specific for cleaneval dataset'''
268 |
269 | re_URL = re.compile(r'^(\s*)URL:(.*)$', re.IGNORECASE | re.MULTILINE)
270 | re_TAG = re.compile(r'^(\s*)<(p|h|l)>', re.IGNORECASE | re.MULTILINE)
271 |
272 | @staticmethod
273 | def from_document(document):
274 | return CleanEvalFormat(document.get_clean())
275 |
276 | def __init__(self, cleaneval_string):
277 | # remove URL meta data
278 | self._text = self.re_URL.sub('', cleaneval_string)
279 | # remove tag guidelines
280 | self._text = self.re_TAG.sub('', self._text)
281 |
282 | def get_word_seq(self):
283 | return _tokenize_text(self._text)
284 |
285 | def get_bow(self):
286 | return _bow(_tokenize_text(self._text))
287 |
288 | class GoogleNewsFormat(BaseResultFormat):
289 | '''
290 | Format specific for google news dataset
291 |
292 | From README.txt distributed with google news dataset:
293 | The human-assessed documents contain annotations in the form of tags
294 | with specific CSS classes that indicate the type of content:
295 | x-nc-sel0 Not content
296 | x-nc-sel1 Headline
297 | x-nc-sel2 Full text
298 | x-nc-sel3 Supplemental
299 | x-nc-sel4 Related content
300 | x-nc-sel5 Comments
301 | '''
302 |
303 | re_CLASS = re.compile('x-nc-sel[12]')
304 |
305 | @staticmethod
306 | def from_document(document):
307 | return GoogleNewsFormat(document.get_clean(), document.clean_encoding)
308 |
309 | def __init__(self, gnews_string, encoding):
310 | soup = BeautifulSoup(gnews_string, fromEncoding = encoding)
311 |
312 | # The trouble with the google news dataset is that it sometimes nests
313 | # the annotated span tags, so we first have to find any
314 | # annotated children and remove them from the content_tags list.
315 | redundant_tags = [] 316 | content_tags = soup.findAll('span',attrs = {'class' : self.re_CLASS }) 317 | for ct in content_tags: 318 | red = ct.findAll('span',attrs = {'class' : self.re_CLASS }) 319 | redundant_tags.extend(red) 320 | self._content_tags = filter(lambda tag: tag not in redundant_tags, content_tags) 321 | # Next we find all the text and concatenate it into one single string 322 | content_strings = [] 323 | for ct in self._content_tags: 324 | content_strings.extend(ct.findAll(text=True)) 325 | self._content_string = ' '.join(map(lambda e: e.encode(encoding,'ignore'), content_strings)) 326 | 327 | def get_word_seq(self): 328 | return _tokenize_text(self._content_string) 329 | 330 | def get_bow(self): 331 | return _bow(_tokenize_text(self._content_string)) 332 | 333 | # formats in this mapping should have a from_document static method implemented 334 | dataset_format_map = ( 335 | ('cleaneval', CleanEvalFormat), 336 | ('gnews', GoogleNewsFormat), 337 | ) 338 | 339 | def from_document_factory(document, slug): 340 | ''' 341 | Factory function that returns an instance of a format class listed in the 342 | dataset format map. 343 | ''' 344 | map_ = dict(dataset_format_map) 345 | cls = map_[slug] 346 | return cls.from_document(document) -------------------------------------------------------------------------------- /src/txtexeval/extractor.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | import json 3 | import logging 4 | import time 5 | 6 | import readability 7 | import justext 8 | from selenium import webdriver 9 | from selenium.webdriver import FirefoxProfile 10 | from selenium.common.exceptions import NoSuchElementException 11 | 12 | import settings 13 | from .util import Request, html_to_text 14 | from .util.zemanta.client import ClientManager 15 | from .evaluation import TextResultFormat, CleanEvalFormat 16 | 17 | logging.getLogger('selenium').setLevel(logging.WARN) 18 | 19 | class ExtractorError(Exception): 20 | '''Extractor failed on the network layer''' 21 | pass 22 | 23 | class ContentExtractorError(ExtractorError): 24 | ''' 25 | Raised when the error is included in the content (e.g. json formatted 26 | response has a status field) fetched by the extractor 27 | ''' 28 | pass 29 | 30 | def return_content(extract): 31 | ''' 32 | DRY decorator that wraps the extract method. We check for response 33 | success and raise the appropriate error or return the content. 34 | ''' 35 | def wrapper(self): 36 | # fetch the response 37 | response = extract(self) 38 | # check for any network related errors 39 | if not response.success(): 40 | raise ExtractorError(response.err_msg) 41 | return response.content 42 | return wrapper 43 | 44 | def check_content_status(extract): 45 | ''' 46 | DRY decorator that mitigates the trouble of inserting boilerplate code 47 | inside the extract method for invoking the private method _content_status. 48 | WhateverExtractor._content_status is used to check for errors returned in 49 | the response content itself. 50 | ''' 51 | def wrapper(self): 52 | self._content = extract(self) 53 | self._content_status() 54 | return self._content 55 | return wrapper 56 | 57 | class BaseExtractor(object): 58 | '''Extractor base class 59 | 60 | Using a base class to ensure a common representation. 61 | If an extractor returns only e.g. 
text based results it
62 | should raise a NotImplementedError for the respective
63 | method'''
64 |
65 | NAME = '' # unique name
66 | SLUG = '' # unique slug name ([a-z_]+)
67 | FORMAT = '' # txt|html|json|xml
68 |
69 | def __init__(self, data_instance):
70 | self.data_instance = data_instance
71 |
72 | def extract(self):
73 | '''Returns unformatted extractor response'''
74 | pass
75 |
76 | @classmethod
77 | def formatted_result(cls, result_string):
78 | pass
79 |
80 |
81 | class _ContentCheckMin(object):
82 |
83 | def _content_status(self):
84 | js = json.loads(self._content)
85 | if js['status'] == "ERROR":
86 | raise ContentExtractorError(js['errorMsg'].encode('utf-8','ignore'))
87 |
88 | class _FormattedResultMin(object):
89 |
90 | @classmethod
91 | def formatted_result(cls, result_string):
92 | js = json.loads(result_string, encoding = 'utf8')
93 | return TextResultFormat(js['result'].encode('utf8','ignore'))
94 |
95 |
96 | class TTRDefaultExtractor(_ContentCheckMin,BaseExtractor):
97 | '''TTR default extractor'''
98 |
99 | NAME = 'TTR'
100 | SLUG = 'ttr_def'
101 | FORMAT = 'json'
102 |
103 | _extractor_type = 'default'
104 |
105 | @check_content_status
106 | @return_content
107 | def extract(self):
108 | html = self.data_instance.get_raw_html()
109 | req = Request(
110 | settings.TTR_API_ENDPOINT,
111 | data = {
112 | "extractorType":self._extractor_type,
113 | "rawHtml": html.encode(self.data_instance.raw_encoding,'ignore')
114 | },
115 | headers = {'Content-Type':'application/x-www-form-urlencoded'}
116 | )
117 | return req.post()
118 |
119 | @classmethod
120 | def formatted_result(cls, result_string):
121 | js = json.loads(result_string, encoding = 'utf8')
122 | result_html = js['result'].encode('utf8','ignore')
123 | return TextResultFormat(html_to_text(result_html,'utf8'))
124 |
125 |
126 |
127 | class BoilerpipeDefaultExtractor(_FormattedResultMin,_ContentCheckMin,BaseExtractor):
128 | '''Boilerpipe default extractor'''
129 |
130 | NAME = 'Boilerpipe DEF'
131 | SLUG = 'boilerpipe_def'
132 | FORMAT = 'json'
133 |
134 | _extractor_type = 'default'
135 |
136 | @check_content_status
137 | @return_content
138 | def extract(self):
139 | html = self.data_instance.get_raw_html()
140 | req = Request(
141 | settings.BOILERPIPE_API_ENDPOINT,
142 | data = {
143 | "extractorType":self._extractor_type,
144 | "rawHtml": html.encode(self.data_instance.raw_encoding,'ignore')
145 | },
146 | headers = {'Content-Type':'application/x-www-form-urlencoded'}
147 | )
148 | return req.post()
149 |
150 |
151 | class BoilerpipeArticleExtractor(BoilerpipeDefaultExtractor):
152 | '''Boilerpipe article extractor'''
153 |
154 | NAME = 'Boilerpipe ART'
155 | SLUG = 'boilerpipe_art'
156 | FORMAT = 'json'
157 |
158 | _extractor_type = 'article'
159 |
160 | class BoilerpipeArticleSentencesExtractor(BoilerpipeDefaultExtractor):
161 | '''Boilerpipe extractor tuned for extracting article sentences'''
162 |
163 | NAME = 'Boilerpipe SENT'
164 | SLUG = 'boilerpipe_sent'
165 | FORMAT = 'json'
166 |
167 | _extractor_type = 'sentence'
168 |
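# Shared JSON contract of the remote boilerpipe-style services above, as
# inferred from the code in this module (not from any service docs), so
# treat the exact field values as an assumption:
#
#   POST extractorType=<default|article|sentence>&rawHtml=<encoded html>
#   success -> {"status": "...", "result": "<extracted text or html>"}
#   failure -> {"status": "ERROR", "errorMsg": "<reason>"}
#
# _ContentCheckMin treats only status == "ERROR" as fatal, and
# _FormattedResultMin decodes the 'result' field as utf8.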
169 | class GooseExtractor(_FormattedResultMin,_ContentCheckMin,BaseExtractor):
170 | '''Goose project extractor'''
171 |
172 | NAME = 'Goose'
173 | SLUG = 'goose'
174 | FORMAT = 'json'
175 |
176 | @return_content
177 | def extract(self):
178 | html = self.data_instance.get_raw_html()
179 | req = Request(
180 | settings.GOOSE_API_ENDPOINT,
181 | data = dict(rawHtml = html.encode(self.data_instance.raw_encoding,'ignore')),
182 | headers = {'Content-Type':'application/x-www-form-urlencoded'}
183 | )
184 | return req.post()
185 |
186 | class MSSExtractor(BaseExtractor):
187 | '''MSS implementation by Jeffrey Pasternack'''
188 |
189 | NAME = 'MSS'
190 | SLUG = 'mss'
191 | FORMAT = 'html'
192 |
193 | @return_content
194 | def extract(self):
195 | html = self.data_instance.get_raw_html()
196 | req = Request(
197 | dict(settings.MSS_URL)['text'],
198 | # this implementation requires utf-8 encoded input
199 | data = html.encode('utf-8','ignore'),
200 | headers= {'Content-Type': 'text/plain;charset=UTF-8'}
201 | )
202 | return req.post()
203 |
204 | @classmethod
205 | def formatted_result(cls, result_string):
206 | return TextResultFormat(html_to_text(result_string, 'utf8'))
207 |
208 |
209 | class PythonReadabilityExtractor(BaseExtractor):
210 | '''Extractor based on python-readability
211 | (https://github.com/gfxmonk/python-readability)'''
212 |
213 | NAME = 'Python Readability'
214 | SLUG = 'python_read'
215 | FORMAT = 'html'
216 |
217 | def extract(self):
218 | html = self.data_instance.get_raw_html()
219 | doc = readability.Document(html)
220 | # FIXME
221 | return doc.summary().encode('ascii','ignore')
222 |
223 | @classmethod
224 | def formatted_result(cls, result_string):
225 | return TextResultFormat(html_to_text(result_string, 'utf8'))
226 |
227 | class NodeReadabilityExtractor(_FormattedResultMin,BaseExtractor):
228 | '''Extractor based on node-readability'''
229 |
230 | NAME = 'Node Readability'
231 | SLUG = 'node_read'
232 | FORMAT = 'json'
233 |
234 | @check_content_status
235 | @return_content
236 | def extract(self):
237 | html = self.data_instance.get_raw_html()
238 |
239 | req = Request(
240 | settings.READABILITY_ENDPOINT,
241 | # this implementation requires utf-8 encoded input
242 | data = html.encode('utf-8','ignore'),
243 | headers= {'Content-Type': 'text/plain;charset=UTF-8'}
244 | )
245 | return req.post()
246 |
247 | def _content_status(self):
248 | js = json.loads(self._content, encoding = 'utf8')
249 | if js['status'] == 'ERROR':
250 | raise ContentExtractorError('failed')
251 |
252 | class SeleniumReadabilityExtractor(BaseExtractor):
253 | '''
254 | Using the selenium webdriver API to harvest the results of the original
255 | readability bookmarklet
256 | '''
257 |
258 | NAME = 'Readability'
259 | SLUG = 'orig_read'
260 | FORMAT = 'txt'
261 |
262 | _driver = None # lazy webdriver.Firefox()
263 | # TODO: share the modified code
264 | _bookmarklet_source = "(function(){readConvertLinksToFootnotes=false;readStyle='style-newspaper';readSize='size-medium';readMargin='margin-wide';_bookm=document.createElement('script');_bookm.type='text/javascript';_bookm.src='" + \
265 | settings.READABILITY_BOOKMARKLET + "?x='+Math.random();document.getElementsByTagName('head')[0].appendChild(_bookm);})();"
266 |
267 | def _check_content_presence(self):
268 | cls = self.__class__
269 | try:
270 | # this was a modification to the readability.js script:
271 | # if it failed to extract any meaningful content
272 | # we renamed the id of the content block to
273 | # explicitly indicate this special case
274 | cls._driver.find_element_by_id('readability-content-failed')
275 | except NoSuchElementException:
276 | pass
277 | else:
278 | raise ContentExtractorError('readability failed to extract any content')
279 |
280 | def extract(self):
281 | # lazy init
282 | cls = self.__class__
283 | if cls._driver is None:
284 | # init firefox web driver
285 | cls._driver = webdriver.Firefox()
286 |
287 | url = self.data_instance.get_url_local()
288 |
cls._driver.get(url) 289 | time.sleep(2) 290 | cls._driver.execute_script(self._bookmarklet_source) 291 | 292 | try: 293 | # find the node that contains content 294 | # and check if readability managed to extract anything meaningful 295 | element = cls._driver.find_element_by_id('readInner') 296 | self._check_content_presence() 297 | except NoSuchElementException: 298 | raise ContentExtractorError('readability failed to produce the #readInner DOM node') 299 | else: 300 | return element.text.encode(self.data_instance.raw_encoding, 'ignore') 301 | 302 | @classmethod 303 | def formatted_result(cls, result_string): 304 | return TextResultFormat(result_string) 305 | 306 | class AlchemyExtractor(BaseExtractor): 307 | '''Alchemy API extractor''' 308 | 309 | NAME = 'Alchemy API' 310 | SLUG = 'alchemy' 311 | FORMAT = 'json' 312 | 313 | @check_content_status 314 | @return_content 315 | def extract(self): 316 | html = self.data_instance.get_raw_html() 317 | req = Request( 318 | 'http://access.alchemyapi.com/calls/html/HTMLGetText', 319 | data = {'apikey':settings.ALCHEMY_API_KEY, 320 | 'html': html.encode(self.data_instance.raw_encoding,'ignore'), 321 | 'outputMode':'json' 322 | } 323 | 324 | ) 325 | return req.post() 326 | 327 | def _content_status(self): 328 | js = json.loads(self._content, encoding = 'utf8') 329 | if js['status'] == 'ERROR': 330 | raise ContentExtractorError(js['statusInfo'].encode('utf8','ignore')) 331 | 332 | @classmethod 333 | def formatted_result(cls, result_string): 334 | js = json.loads(result_string, encoding = 'utf8') 335 | return TextResultFormat(js['text'].encode('utf8','ignore')) 336 | 337 | class DiffbotExtractor(BaseExtractor): 338 | '''Diffbot extractor''' 339 | 340 | NAME = 'Diffbot' 341 | SLUG = 'diffbot' 342 | FORMAT = 'json' 343 | 344 | @return_content 345 | def extract(self): 346 | data = urllib.urlencode(dict( 347 | token = settings.DIFFBOT_API_KEY, 348 | url = self.data_instance.get_url(), 349 | format = 'json' 350 | )) 351 | data += '&stats' # use '&html' for html formatted result 352 | req = Request( 353 | 'http://www.diffbot.com/api/article', 354 | data = data 355 | ) 356 | return req.get() 357 | 358 | @classmethod 359 | def formatted_result(cls, result_string): 360 | js = json.loads(result_string, encoding = 'utf8') 361 | return TextResultFormat( 362 | js.get('title','').encode('utf8','ignore') + ' ' +\ 363 | js['text'].encode('utf8','ignore') 364 | ) 365 | 366 | class ExtractivExtractor(BaseExtractor): 367 | '''Extractiv extractor''' 368 | 369 | NAME = 'Extractiv' 370 | SLUG = 'extractiv' 371 | FORMAT = 'json' 372 | 373 | @return_content 374 | def extract(self): 375 | html = self.data_instance.get_raw_html() 376 | req = Request( 377 | 'http://rest.extractiv.com/extractiv/', 378 | data = {'api_key':settings.EXTRACTIV_API_KEY, 379 | 'content': html.encode(self.data_instance.raw_encoding,'ignore'), 380 | 'output_format':'json' 381 | } 382 | 383 | ) 384 | return req.post() 385 | 386 | @classmethod 387 | def formatted_result(cls, result_string): 388 | js = json.loads(result_string, encoding = 'utf8') 389 | 390 | text = js['Document']['text'] 391 | content_sentences = [] 392 | for se in js['sentences']: 393 | zone = se.get('zone','regular') 394 | if zone == 'regular': 395 | content_sentences.append(text[se['offset']:se['offset']+se['len']] ) 396 | 397 | return TextResultFormat( 398 | js['Document'].get('title','').encode('utf8','ignore') + ' ' +\ 399 | (' '.join(content_sentences)).encode('utf8','ignore') 400 | ) 401 | 402 | class 
RepustateExtractor(BaseExtractor): 403 | '''Repustate extractor''' 404 | 405 | NAME = 'Repustate' 406 | SLUG = 'repustate' 407 | FORMAT = 'json' 408 | 409 | @check_content_status 410 | @return_content 411 | def extract(self): 412 | req = Request( 413 | 'http://api.repustate.com/v1/%s/clean-html.json' \ 414 | % settings.REPUSTATE_API_KEY, 415 | data = 'url=%s' % self.data_instance.get_url() 416 | ) 417 | return req.get() 418 | 419 | def _content_status(self): 420 | js = json.loads(self._content, encoding = 'utf8') 421 | if js['status'] != 'OK': 422 | raise ContentExtractorError(js['status'].encode('utf8','ignore')) 423 | 424 | @classmethod 425 | def formatted_result(cls, result_string): 426 | js = json.loads(result_string, encoding = 'utf8') 427 | return TextResultFormat(js['text'].encode('utf8','ignore')) 428 | 429 | class ZemantaExtractor(BaseExtractor): 430 | '''Extractor used internally by Zemanta Ltd''' 431 | 432 | NAME = 'Zextractor' 433 | SLUG = 'zemanta' 434 | FORMAT = 'txt' 435 | 436 | def extract(self): 437 | html = self.data_instance.get_raw_html() 438 | html = html.encode(self.data_instance.raw_encoding,'ignore') 439 | cm = ClientManager() 440 | 441 | response = cm.extract(html, self.data_instance.raw_encoding) 442 | if response.error: 443 | raise ExtractorError(response.error) 444 | return response.text 445 | 446 | @classmethod 447 | def formatted_result(cls, result_string): 448 | return TextResultFormat(result_string) 449 | 450 | class NCleanerStdEnExtractor(BaseExtractor): 451 | '''NCleaner extractor using the standard english n-gram model''' 452 | 453 | NAME = 'NCleaner En' 454 | SLUG = 'ncleaner_en' 455 | FORMAT = 'txt' 456 | 457 | def extract(self): 458 | ''' 459 | This method is not implemented (for now), because ncleaner 460 | comes with a handy command line tool that trivially executes 461 | the extraction task for us. 
462 | '''
463 | raise NotImplementedError
464 |
465 | @classmethod
466 | def formatted_result(cls, result_string):
467 | # ncleaner uses the cleaneval style format for its output
468 | return CleanEvalFormat(result_string)
469 |
470 | class NCleanerNonLexExtractor(NCleanerStdEnExtractor):
471 | '''NCleaner extractor using the non lexical n-gram model'''
472 |
473 | NAME = 'NCleaner NonLex'
474 | SLUG = 'ncleaner_nonlex'
475 | FORMAT = 'txt'
476 |
477 | class TrendictionExtractor(BaseExtractor):
478 | '''Trendiction API'''
479 |
480 | NAME = 'Trendiction'
481 | SLUG = 'trendiction'
482 | FORMAT = 'json'
483 |
484 | @check_content_status
485 | @return_content
486 | def extract(self):
487 | req = Request(
488 | settings.TRENDICTION_ENDPOINT,
489 | data = {
490 | 'ckey':'',
491 | 'url':self.data_instance.get_url(),
492 | 'onlycontent':'false',
493 | 'outf':'json',
494 | }
495 | )
496 | return req.get()
497 |
498 | def _content_status(self):
499 | js = json.loads(self._content, encoding = 'utf8')
500 | try:
501 | js['result_content']['data'][0]['content']['content_text']
502 | js['result_content']['data'][0]['content']['title_text']
503 | except (IndexError, KeyError) as e:
504 | raise ContentExtractorError('content not present in the response: ' + repr(e))
505 |
506 | @classmethod
507 | def formatted_result(cls, result_string):
508 | js = json.loads(result_string, encoding = 'utf8')
509 | content = js['result_content']['data'][0]['content']['content_text']
510 | title = js['result_content']['data'][0]['content']['title_text']
511 | return TextResultFormat((title +' '+ content).encode('utf8','ignore'))
512 |
513 | class JustextExtractor(BaseExtractor):
514 | '''Justext extractor'''
515 |
516 | NAME = 'JusText'
517 | SLUG = 'justext'
518 | FORMAT = 'txt'
519 |
520 | def extract(self):
521 | html = self.data_instance.get_raw_html()
522 | html = html.encode(self.data_instance.raw_encoding,'ignore')
523 | paragraphs = justext.justext(html, justext.get_stoplist('English'),
524 | encoding = self.data_instance.raw_encoding)
525 | good_paragraphs = []
526 | for para in paragraphs:
527 | if para['class'] == 'good':
528 | paragraph_text = para['text']
529 | # this assertion makes sure we only deal with str and unicode
530 | assert isinstance(paragraph_text, basestring)
531 | if isinstance(paragraph_text, unicode):
532 | good_paragraphs.append(paragraph_text.encode('utf8', 'ignore'))
533 | else:
534 | good_paragraphs.append(paragraph_text)
535 |
536 | return '\n\n'.join(good_paragraphs)
537 |
538 | @classmethod
539 | def formatted_result(cls, result_string):
540 | return TextResultFormat(result_string)
541 |
542 | # list of all extractor classes
543 | extractor_list = (
544 | BoilerpipeDefaultExtractor,
545 | BoilerpipeArticleExtractor,
546 | BoilerpipeArticleSentencesExtractor,
547 | GooseExtractor,
548 | MSSExtractor,
549 | PythonReadabilityExtractor,
550 | NodeReadabilityExtractor,
551 | SeleniumReadabilityExtractor,
552 | AlchemyExtractor,
553 | DiffbotExtractor,
554 | ExtractivExtractor,
555 | RepustateExtractor,
556 | ZemantaExtractor,
557 | NCleanerStdEnExtractor,
558 | NCleanerNonLexExtractor,
559 | #TrendictionExtractor,
560 | JustextExtractor,
561 | TTRDefaultExtractor,
562 | )
563 |
564 | def get_extractor_cls(extractor_slug):
565 | '''Return the extractor class given a slug'''
566 | for e in extractor_list:
567 | if e.SLUG == extractor_slug:
568 | return e
569 |
--------------------------------------------------------------------------------
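A minimal end-to-end sketch of how the extractor classes above are meant to be driven. The stub data instance is illustrative (the real ones presumably live in src/txtexeval/data.py), and the call assumes settings.BOILERPIPE_API_ENDPOINT points at a running service:

    from txtexeval.extractor import get_extractor_cls, ExtractorError

    class StubDataInstance(object):
        # minimal stand-in for a real data instance
        raw_encoding = 'utf8'
        def get_raw_html(self):
            return u'<html><body><p>Hello extraction world.</p></body></html>'

    cls = get_extractor_cls('boilerpipe_def')   # look up an extractor by slug
    extractor = cls(StubDataInstance())
    try:
        raw = extractor.extract()               # network call to the service
    except ExtractorError as e:
        print 'extraction failed: %s' % e
    else:
        fmt = cls.formatted_result(raw)         # json -> TextResultFormat
        print fmt.get_word_seq()                # token sequence for evaluation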
/src/txtexeval/util/__init__.py:
--------------------------------------------------------------------------------
1 | from .common import Request
2 | from .common import get_local_path
3 | from .common import check_local_path
4 | from .common import html_to_text
--------------------------------------------------------------------------------
/src/txtexeval/util/common.py:
--------------------------------------------------------------------------------
1 | import os
2 | import urllib
3 | import urllib2
4 |
5 | from BeautifulSoup import BeautifulSoup
6 |
7 | import settings
8 |
9 | # urllib wrappers
10 |
11 | class _Response(object):
12 |
13 | def __init__(self, status_code = None, headers = None,
14 | content = None, err_msg = None):
15 | self.status_code = status_code
16 | self.headers = headers
17 | self.content = content
18 | self._err_msg = err_msg
19 |
20 | def success(self):
21 | if self._err_msg:
22 | return False
23 | elif self.status_code and str(self.status_code).startswith('2'): # see RFC 2616
24 | return True
25 | else:
26 | return False
27 |
28 | @property
29 | def err_msg(self):
30 | if self._err_msg:
31 | return self._err_msg
32 | elif self.status_code and not str(self.status_code).startswith('2'):
33 | return 'Status code: %i' % self.status_code
34 | else:
35 | return ''
36 |
37 |
38 | class Request(object):
39 |
40 | def __init__(self, url, data, **kwargs):
41 | self.url = url
42 | self.kwargs = kwargs
43 | if isinstance(data, dict):
44 | self.data = urllib.urlencode(data)
45 | else:
46 | self.data = data
47 |
48 | def post(self):
49 | request = urllib2.Request(self.url, self.data, **self.kwargs)
50 | try:
51 | r = urllib2.urlopen(request)
52 | except urllib2.URLError as e:
53 | return _Response(err_msg = str(e))
54 | else:
55 | return _Response(r.code, r.headers, r.read())
56 |
57 | def get(self):
58 | request = urllib2.Request('%s?%s' % (self.url, self.data), **self.kwargs)
59 | try:
60 | r = urllib2.urlopen(request)
61 | except urllib2.URLError as e:
62 | return _Response(err_msg = str(e))
63 | else:
64 | return _Response(r.code, r.headers, r.read())
65 |
66 | # dataset helpers
67 |
68 | def check_local_path(*args):
69 | return os.path.exists(
70 | os.path.join(settings.PATH_LOCAL_DATA, 'datasets', *args)
71 | )
72 |
73 | def get_local_path(*args):
74 | return os.path.join(settings.PATH_LOCAL_DATA, 'datasets', *args)
75 |
76 | # others
77 |
78 | def execute_only_once(method):
79 | '''A decorator that runs a method only once.'''
80 | attrname = "_%s_once_result" % id(method)
81 | def wrap(self, *args, **kwargs):
82 | try:
83 | return getattr(self, attrname)
84 | except AttributeError:
85 | setattr(self, attrname, method(self, *args, **kwargs))
86 | return getattr(self, attrname)
87 | return wrap
88 |
89 | def html_to_text(html, encoding):
90 | '''Get all the text from a given html string'''
91 | soup = BeautifulSoup(html, fromEncoding = encoding)
92 | tags = soup.findAll(text = True)
93 | useful = lambda e: e.parent.name not in ('style', 'script', 'head', 'title')
94 | tags = filter(useful, tags)
95 | return ' '.join(map(lambda e: e.encode(encoding), tags))
--------------------------------------------------------------------------------
/src/txtexeval/util/zemanta/__init__.py:
--------------------------------------------------------------------------------
1 | # this package was generated automatically by the thrift 0.5.0 python compiler
2 | # see: ../thrift/generate_thrift.sh
--------------------------------------------------------------------------------
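The client module below keeps its Thrift connection in a Borg-style class: instances are distinct objects that share a single __dict__. A tiny self-contained illustration of the pattern (illustrative code, not part of the project):

    class Borg(object):
        _shared_state = {}  # one dict shared by every instance

        def __init__(self):
            self.__dict__ = self._shared_state

    a = Borg()
    b = Borg()
    a.x = 42
    assert b.x == 42   # state is shared ...
    assert a is not b  # ... but the instances are still distinct objects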
/src/txtexeval/util/zemanta/client.py:
--------------------------------------------------------------------------------
1 | from collections import namedtuple
2 |
3 | from thrift import Thrift
4 | from thrift.transport import TSocket, TTransport
5 | from thrift.protocol import TBinaryProtocol
6 |
7 | # this is the code thrift generates for us
8 | # gen-py directory was renamed to thriftgen
9 | from .thriftgen.ceservice import ExtractorService
10 | from .thriftgen.ceservice import ttypes
11 |
12 | import settings
13 |
14 | credentials = dict(settings.ZEMANTA_THRIFT)
15 |
16 | Response = namedtuple('Response', 'text error')
17 |
18 | class ClientManager(object):
19 |
20 | __internal_state = {} # Borg pattern: every instance shares this state
21 |
22 | def __init__(self):
23 | self.__dict__ = self.__internal_state
24 | self.set_client()
25 |
26 | def set_client(self):
27 | self._transport = TTransport.TBufferedTransport(
28 | TSocket.TSocket(credentials['host'], credentials['port'])
29 | )
30 | self._protocol = TBinaryProtocol.TBinaryProtocol(self._transport)
31 | self._client = ExtractorService.Client(self._protocol)
32 | self._transport.open()
33 |
34 | def extract(self, encoded_htmldata, encoding):
35 | error = None
36 | text = ''
37 | try:
38 | response = self._client.extract('', '', encoded_htmldata, encoding)
39 | except ttypes.TAppException as e:
40 | error = '%r' % e
41 | except Thrift.TException as e:
42 | error = '%r' % e
43 | except Exception as e:
44 | error = '%r' % e
45 | else:
46 | if response.success:
47 | text = response.body.encode('utf8')
48 | else:
49 | error = 'ExtractorService.extract returned a response but the success flag was set to False'
50 | finally:
51 | self._transport.close()
52 | return Response(text, error)
--------------------------------------------------------------------------------
/src/txtexeval/util/zemanta/thrift/ceservice.thrift:
--------------------------------------------------------------------------------
1 |
2 | // Command line to produce the gen-py directory:
3 | // ~/prefix/bin/thrift -r --gen py:utf8strings ceservice.thrift
4 |
5 | enum ExceptionCode {
6 | FORCED_FAILED = 1,
7 | PARSING_FAILED = 2,
8 | FLATTENING_FAILED = 3,
9 | CLASSIFICATION_FAILED = 4,
10 | }
11 |
12 |
13 | exception TAppException {
14 | 1: ExceptionCode code,
15 | 2: string msg,
16 | 3: string backtrace,
17 | }
18 |
19 | struct extract_RET
20 | {
21 | 1:bool success,
22 | 2:string body,
23 | }
24 |
25 | service ExtractorService
26 | {
27 | string ping(1: string param)
28 | throws (1:TAppException e),
29 |
30 |
31 | extract_RET extract(
32 | 1:string url,
33 | 2:string title,
34 | 3:binary htmldata,
35 | 4:string encoding,
36 | )
37 | throws (1:TAppException e),
38 | }
39 |
--------------------------------------------------------------------------------
/src/txtexeval/util/zemanta/thrift/generate_thrift.sh:
--------------------------------------------------------------------------------
1 | echo "generating python source ..."
2 | thrift -r -v -o ..
--gen py:utf8strings ceservice.thrift 3 | echo "renaming gen-py into thriftgen" 4 | mv ../gen-py ../thriftgen 5 | echo "done" -------------------------------------------------------------------------------- /src/txtexeval/util/zemanta/thriftgen/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomazk/Text-Extraction-Evaluation/06d6070d895f1bae604dfaf10fa6537700d59e34/src/txtexeval/util/zemanta/thriftgen/__init__.py -------------------------------------------------------------------------------- /src/txtexeval/util/zemanta/thriftgen/ceservice/ExtractorService-remote: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Autogenerated by Thrift 4 | # 5 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING 6 | # 7 | 8 | import sys 9 | import pprint 10 | from urlparse import urlparse 11 | from thrift.transport import TTransport 12 | from thrift.transport import TSocket 13 | from thrift.transport import THttpClient 14 | from thrift.protocol import TBinaryProtocol 15 | 16 | import ExtractorService 17 | from ttypes import * 18 | 19 | if len(sys.argv) <= 1 or sys.argv[1] == '--help': 20 | print '' 21 | print 'Usage: ' + sys.argv[0] + ' [-h host:port] [-u url] [-f[ramed]] function [arg1 [arg2...]]' 22 | print '' 23 | print 'Functions:' 24 | print ' string ping(string param)' 25 | print ' extract_RET extract(string url, string title, string htmldata, string encoding)' 26 | print '' 27 | sys.exit(0) 28 | 29 | pp = pprint.PrettyPrinter(indent = 2) 30 | host = 'localhost' 31 | port = 9090 32 | uri = '' 33 | framed = False 34 | http = False 35 | argi = 1 36 | 37 | if sys.argv[argi] == '-h': 38 | parts = sys.argv[argi+1].split(':') 39 | host = parts[0] 40 | port = int(parts[1]) 41 | argi += 2 42 | 43 | if sys.argv[argi] == '-u': 44 | url = urlparse(sys.argv[argi+1]) 45 | parts = url[1].split(':') 46 | host = parts[0] 47 | if len(parts) > 1: 48 | port = int(parts[1]) 49 | else: 50 | port = 80 51 | uri = url[2] 52 | if url[4]: 53 | uri += '?%s' % url[4] 54 | http = True 55 | argi += 2 56 | 57 | if sys.argv[argi] == '-f' or sys.argv[argi] == '-framed': 58 | framed = True 59 | argi += 1 60 | 61 | cmd = sys.argv[argi] 62 | args = sys.argv[argi+1:] 63 | 64 | if http: 65 | transport = THttpClient.THttpClient(host, port, uri) 66 | else: 67 | socket = TSocket.TSocket(host, port) 68 | if framed: 69 | transport = TTransport.TFramedTransport(socket) 70 | else: 71 | transport = TTransport.TBufferedTransport(socket) 72 | protocol = TBinaryProtocol.TBinaryProtocol(transport) 73 | client = ExtractorService.Client(protocol) 74 | transport.open() 75 | 76 | if cmd == 'ping': 77 | if len(args) != 1: 78 | print 'ping requires 1 args' 79 | sys.exit(1) 80 | pp.pprint(client.ping(args[0],)) 81 | 82 | elif cmd == 'extract': 83 | if len(args) != 4: 84 | print 'extract requires 4 args' 85 | sys.exit(1) 86 | pp.pprint(client.extract(args[0],args[1],args[2],args[3],)) 87 | 88 | else: 89 | print 'Unrecognized method %s' % cmd 90 | sys.exit(1) 91 | 92 | transport.close() 93 | -------------------------------------------------------------------------------- /src/txtexeval/util/zemanta/thriftgen/ceservice/ExtractorService.py: -------------------------------------------------------------------------------- 1 | # 2 | # Autogenerated by Thrift 3 | # 4 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING 5 | # 6 | 7 | from thrift.Thrift import * 8 | from ttypes import * 
9 | from thrift.Thrift import TProcessor 10 | from thrift.transport import TTransport 11 | from thrift.protocol import TBinaryProtocol, TProtocol 12 | try: 13 | from thrift.protocol import fastbinary 14 | except: 15 | fastbinary = None 16 | 17 | 18 | class Iface: 19 | def ping(self, param): 20 | """ 21 | Parameters: 22 | - param 23 | """ 24 | pass 25 | 26 | def extract(self, url, title, htmldata, encoding): 27 | """ 28 | Parameters: 29 | - url 30 | - title 31 | - htmldata 32 | - encoding 33 | """ 34 | pass 35 | 36 | 37 | class Client(Iface): 38 | def __init__(self, iprot, oprot=None): 39 | self._iprot = self._oprot = iprot 40 | if oprot != None: 41 | self._oprot = oprot 42 | self._seqid = 0 43 | 44 | def ping(self, param): 45 | """ 46 | Parameters: 47 | - param 48 | """ 49 | self.send_ping(param) 50 | return self.recv_ping() 51 | 52 | def send_ping(self, param): 53 | self._oprot.writeMessageBegin('ping', TMessageType.CALL, self._seqid) 54 | args = ping_args() 55 | args.param = param 56 | args.write(self._oprot) 57 | self._oprot.writeMessageEnd() 58 | self._oprot.trans.flush() 59 | 60 | def recv_ping(self, ): 61 | (fname, mtype, rseqid) = self._iprot.readMessageBegin() 62 | if mtype == TMessageType.EXCEPTION: 63 | x = TApplicationException() 64 | x.read(self._iprot) 65 | self._iprot.readMessageEnd() 66 | raise x 67 | result = ping_result() 68 | result.read(self._iprot) 69 | self._iprot.readMessageEnd() 70 | if result.success != None: 71 | return result.success 72 | if result.e != None: 73 | raise result.e 74 | raise TApplicationException(TApplicationException.MISSING_RESULT, "ping failed: unknown result"); 75 | 76 | def extract(self, url, title, htmldata, encoding): 77 | """ 78 | Parameters: 79 | - url 80 | - title 81 | - htmldata 82 | - encoding 83 | """ 84 | self.send_extract(url, title, htmldata, encoding) 85 | return self.recv_extract() 86 | 87 | def send_extract(self, url, title, htmldata, encoding): 88 | self._oprot.writeMessageBegin('extract', TMessageType.CALL, self._seqid) 89 | args = extract_args() 90 | args.url = url 91 | args.title = title 92 | args.htmldata = htmldata 93 | args.encoding = encoding 94 | args.write(self._oprot) 95 | self._oprot.writeMessageEnd() 96 | self._oprot.trans.flush() 97 | 98 | def recv_extract(self, ): 99 | (fname, mtype, rseqid) = self._iprot.readMessageBegin() 100 | if mtype == TMessageType.EXCEPTION: 101 | x = TApplicationException() 102 | x.read(self._iprot) 103 | self._iprot.readMessageEnd() 104 | raise x 105 | result = extract_result() 106 | result.read(self._iprot) 107 | self._iprot.readMessageEnd() 108 | if result.success != None: 109 | return result.success 110 | if result.e != None: 111 | raise result.e 112 | raise TApplicationException(TApplicationException.MISSING_RESULT, "extract failed: unknown result"); 113 | 114 | 115 | class Processor(Iface, TProcessor): 116 | def __init__(self, handler): 117 | self._handler = handler 118 | self._processMap = {} 119 | self._processMap["ping"] = Processor.process_ping 120 | self._processMap["extract"] = Processor.process_extract 121 | 122 | def process(self, iprot, oprot): 123 | (name, type, seqid) = iprot.readMessageBegin() 124 | if name not in self._processMap: 125 | iprot.skip(TType.STRUCT) 126 | iprot.readMessageEnd() 127 | x = TApplicationException(TApplicationException.UNKNOWN_METHOD, 'Unknown function %s' % (name)) 128 | oprot.writeMessageBegin(name, TMessageType.EXCEPTION, seqid) 129 | x.write(oprot) 130 | oprot.writeMessageEnd() 131 | oprot.trans.flush() 132 | return 133 | else: 134 | 
self._processMap[name](self, seqid, iprot, oprot) 135 | return True 136 | 137 | def process_ping(self, seqid, iprot, oprot): 138 | args = ping_args() 139 | args.read(iprot) 140 | iprot.readMessageEnd() 141 | result = ping_result() 142 | try: 143 | result.success = self._handler.ping(args.param) 144 | except TAppException, e: 145 | result.e = e 146 | oprot.writeMessageBegin("ping", TMessageType.REPLY, seqid) 147 | result.write(oprot) 148 | oprot.writeMessageEnd() 149 | oprot.trans.flush() 150 | 151 | def process_extract(self, seqid, iprot, oprot): 152 | args = extract_args() 153 | args.read(iprot) 154 | iprot.readMessageEnd() 155 | result = extract_result() 156 | try: 157 | result.success = self._handler.extract(args.url, args.title, args.htmldata, args.encoding) 158 | except TAppException, e: 159 | result.e = e 160 | oprot.writeMessageBegin("extract", TMessageType.REPLY, seqid) 161 | result.write(oprot) 162 | oprot.writeMessageEnd() 163 | oprot.trans.flush() 164 | 165 | 166 | # HELPER FUNCTIONS AND STRUCTURES 167 | 168 | class ping_args: 169 | """ 170 | Attributes: 171 | - param 172 | """ 173 | 174 | thrift_spec = ( 175 | None, # 0 176 | (1, TType.STRING, 'param', None, None, ), # 1 177 | ) 178 | 179 | def __init__(self, param=None,): 180 | self.param = param 181 | 182 | def read(self, iprot): 183 | if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not None: 184 | fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec)) 185 | return 186 | iprot.readStructBegin() 187 | while True: 188 | (fname, ftype, fid) = iprot.readFieldBegin() 189 | if ftype == TType.STOP: 190 | break 191 | if fid == 1: 192 | if ftype == TType.STRING: 193 | self.param = iprot.readString().decode('utf-8') 194 | else: 195 | iprot.skip(ftype) 196 | else: 197 | iprot.skip(ftype) 198 | iprot.readFieldEnd() 199 | iprot.readStructEnd() 200 | 201 | def write(self, oprot): 202 | if oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and self.thrift_spec is not None and fastbinary is not None: 203 | oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec))) 204 | return 205 | oprot.writeStructBegin('ping_args') 206 | if self.param != None: 207 | oprot.writeFieldBegin('param', TType.STRING, 1) 208 | oprot.writeString(self.param.encode('utf-8')) 209 | oprot.writeFieldEnd() 210 | oprot.writeFieldStop() 211 | oprot.writeStructEnd() 212 | def validate(self): 213 | return 214 | 215 | 216 | def __repr__(self): 217 | L = ['%s=%r' % (key, value) 218 | for key, value in self.__dict__.iteritems()] 219 | return '%s(%s)' % (self.__class__.__name__, ', '.join(L)) 220 | 221 | def __eq__(self, other): 222 | return isinstance(other, self.__class__) and self.__dict__ == other.__dict__ 223 | 224 | def __ne__(self, other): 225 | return not (self == other) 226 | 227 | class ping_result: 228 | """ 229 | Attributes: 230 | - success 231 | - e 232 | """ 233 | 234 | thrift_spec = ( 235 | (0, TType.STRING, 'success', None, None, ), # 0 236 | (1, TType.STRUCT, 'e', (TAppException, TAppException.thrift_spec), None, ), # 1 237 | ) 238 | 239 | def __init__(self, success=None, e=None,): 240 | self.success = success 241 | self.e = e 242 | 243 | def read(self, iprot): 244 | if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not None: 245 | 
fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec)) 246 | return 247 | iprot.readStructBegin() 248 | while True: 249 | (fname, ftype, fid) = iprot.readFieldBegin() 250 | if ftype == TType.STOP: 251 | break 252 | if fid == 0: 253 | if ftype == TType.STRING: 254 | self.success = iprot.readString().decode('utf-8') 255 | else: 256 | iprot.skip(ftype) 257 | elif fid == 1: 258 | if ftype == TType.STRUCT: 259 | self.e = TAppException() 260 | self.e.read(iprot) 261 | else: 262 | iprot.skip(ftype) 263 | else: 264 | iprot.skip(ftype) 265 | iprot.readFieldEnd() 266 | iprot.readStructEnd() 267 | 268 | def write(self, oprot): 269 | if oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and self.thrift_spec is not None and fastbinary is not None: 270 | oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec))) 271 | return 272 | oprot.writeStructBegin('ping_result') 273 | if self.success != None: 274 | oprot.writeFieldBegin('success', TType.STRING, 0) 275 | oprot.writeString(self.success.encode('utf-8')) 276 | oprot.writeFieldEnd() 277 | if self.e != None: 278 | oprot.writeFieldBegin('e', TType.STRUCT, 1) 279 | self.e.write(oprot) 280 | oprot.writeFieldEnd() 281 | oprot.writeFieldStop() 282 | oprot.writeStructEnd() 283 | def validate(self): 284 | return 285 | 286 | 287 | def __repr__(self): 288 | L = ['%s=%r' % (key, value) 289 | for key, value in self.__dict__.iteritems()] 290 | return '%s(%s)' % (self.__class__.__name__, ', '.join(L)) 291 | 292 | def __eq__(self, other): 293 | return isinstance(other, self.__class__) and self.__dict__ == other.__dict__ 294 | 295 | def __ne__(self, other): 296 | return not (self == other) 297 | 298 | class extract_args: 299 | """ 300 | Attributes: 301 | - url 302 | - title 303 | - htmldata 304 | - encoding 305 | """ 306 | 307 | thrift_spec = ( 308 | None, # 0 309 | (1, TType.STRING, 'url', None, None, ), # 1 310 | (2, TType.STRING, 'title', None, None, ), # 2 311 | (3, TType.STRING, 'htmldata', None, None, ), # 3 312 | (4, TType.STRING, 'encoding', None, None, ), # 4 313 | ) 314 | 315 | def __init__(self, url=None, title=None, htmldata=None, encoding=None,): 316 | self.url = url 317 | self.title = title 318 | self.htmldata = htmldata 319 | self.encoding = encoding 320 | 321 | def read(self, iprot): 322 | if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not None: 323 | fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec)) 324 | return 325 | iprot.readStructBegin() 326 | while True: 327 | (fname, ftype, fid) = iprot.readFieldBegin() 328 | if ftype == TType.STOP: 329 | break 330 | if fid == 1: 331 | if ftype == TType.STRING: 332 | self.url = iprot.readString().decode('utf-8') 333 | else: 334 | iprot.skip(ftype) 335 | elif fid == 2: 336 | if ftype == TType.STRING: 337 | self.title = iprot.readString().decode('utf-8') 338 | else: 339 | iprot.skip(ftype) 340 | elif fid == 3: 341 | if ftype == TType.STRING: 342 | self.htmldata = iprot.readString(); 343 | else: 344 | iprot.skip(ftype) 345 | elif fid == 4: 346 | if ftype == TType.STRING: 347 | self.encoding = iprot.readString().decode('utf-8') 348 | else: 349 | iprot.skip(ftype) 350 | else: 351 | iprot.skip(ftype) 352 | iprot.readFieldEnd() 353 | iprot.readStructEnd() 354 | 355 | def write(self, oprot): 356 | if oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and self.thrift_spec is not None and 
fastbinary is not None: 357 | oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec))) 358 | return 359 | oprot.writeStructBegin('extract_args') 360 | if self.url != None: 361 | oprot.writeFieldBegin('url', TType.STRING, 1) 362 | oprot.writeString(self.url.encode('utf-8')) 363 | oprot.writeFieldEnd() 364 | if self.title != None: 365 | oprot.writeFieldBegin('title', TType.STRING, 2) 366 | oprot.writeString(self.title.encode('utf-8')) 367 | oprot.writeFieldEnd() 368 | if self.htmldata != None: 369 | oprot.writeFieldBegin('htmldata', TType.STRING, 3) 370 | oprot.writeString(self.htmldata) 371 | oprot.writeFieldEnd() 372 | if self.encoding != None: 373 | oprot.writeFieldBegin('encoding', TType.STRING, 4) 374 | oprot.writeString(self.encoding.encode('utf-8')) 375 | oprot.writeFieldEnd() 376 | oprot.writeFieldStop() 377 | oprot.writeStructEnd() 378 | def validate(self): 379 | return 380 | 381 | 382 | def __repr__(self): 383 | L = ['%s=%r' % (key, value) 384 | for key, value in self.__dict__.iteritems()] 385 | return '%s(%s)' % (self.__class__.__name__, ', '.join(L)) 386 | 387 | def __eq__(self, other): 388 | return isinstance(other, self.__class__) and self.__dict__ == other.__dict__ 389 | 390 | def __ne__(self, other): 391 | return not (self == other) 392 | 393 | class extract_result: 394 | """ 395 | Attributes: 396 | - success 397 | - e 398 | """ 399 | 400 | thrift_spec = ( 401 | (0, TType.STRUCT, 'success', (extract_RET, extract_RET.thrift_spec), None, ), # 0 402 | (1, TType.STRUCT, 'e', (TAppException, TAppException.thrift_spec), None, ), # 1 403 | ) 404 | 405 | def __init__(self, success=None, e=None,): 406 | self.success = success 407 | self.e = e 408 | 409 | def read(self, iprot): 410 | if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not None: 411 | fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec)) 412 | return 413 | iprot.readStructBegin() 414 | while True: 415 | (fname, ftype, fid) = iprot.readFieldBegin() 416 | if ftype == TType.STOP: 417 | break 418 | if fid == 0: 419 | if ftype == TType.STRUCT: 420 | self.success = extract_RET() 421 | self.success.read(iprot) 422 | else: 423 | iprot.skip(ftype) 424 | elif fid == 1: 425 | if ftype == TType.STRUCT: 426 | self.e = TAppException() 427 | self.e.read(iprot) 428 | else: 429 | iprot.skip(ftype) 430 | else: 431 | iprot.skip(ftype) 432 | iprot.readFieldEnd() 433 | iprot.readStructEnd() 434 | 435 | def write(self, oprot): 436 | if oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and self.thrift_spec is not None and fastbinary is not None: 437 | oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec))) 438 | return 439 | oprot.writeStructBegin('extract_result') 440 | if self.success != None: 441 | oprot.writeFieldBegin('success', TType.STRUCT, 0) 442 | self.success.write(oprot) 443 | oprot.writeFieldEnd() 444 | if self.e != None: 445 | oprot.writeFieldBegin('e', TType.STRUCT, 1) 446 | self.e.write(oprot) 447 | oprot.writeFieldEnd() 448 | oprot.writeFieldStop() 449 | oprot.writeStructEnd() 450 | def validate(self): 451 | return 452 | 453 | 454 | def __repr__(self): 455 | L = ['%s=%r' % (key, value) 456 | for key, value in self.__dict__.iteritems()] 457 | return '%s(%s)' % (self.__class__.__name__, ', '.join(L)) 458 | 459 | def __eq__(self, other): 460 | return isinstance(other, self.__class__) and self.__dict__ == 
other.__dict__ 461 | 462 | def __ne__(self, other): 463 | return not (self == other) 464 | -------------------------------------------------------------------------------- /src/txtexeval/util/zemanta/thriftgen/ceservice/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ['ttypes', 'constants', 'ExtractorService'] 2 | -------------------------------------------------------------------------------- /src/txtexeval/util/zemanta/thriftgen/ceservice/constants.py: -------------------------------------------------------------------------------- 1 | # 2 | # Autogenerated by Thrift 3 | # 4 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING 5 | # 6 | 7 | from thrift.Thrift import * 8 | from ttypes import * 9 | 10 | -------------------------------------------------------------------------------- /src/txtexeval/util/zemanta/thriftgen/ceservice/ttypes.py: -------------------------------------------------------------------------------- 1 | # 2 | # Autogenerated by Thrift 3 | # 4 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING 5 | # 6 | 7 | from thrift.Thrift import * 8 | 9 | from thrift.transport import TTransport 10 | from thrift.protocol import TBinaryProtocol, TProtocol 11 | try: 12 | from thrift.protocol import fastbinary 13 | except: 14 | fastbinary = None 15 | 16 | 17 | class ExceptionCode: 18 | FORCED_FAILED = 1 19 | PARSING_FAILED = 2 20 | FLATTENING_FAILED = 3 21 | CLASSIFICATION_FAILED = 4 22 | 23 | _VALUES_TO_NAMES = { 24 | 1: "FORCED_FAILED", 25 | 2: "PARSING_FAILED", 26 | 3: "FLATTENING_FAILED", 27 | 4: "CLASSIFICATION_FAILED", 28 | } 29 | 30 | _NAMES_TO_VALUES = { 31 | "FORCED_FAILED": 1, 32 | "PARSING_FAILED": 2, 33 | "FLATTENING_FAILED": 3, 34 | "CLASSIFICATION_FAILED": 4, 35 | } 36 | 37 | 38 | class TAppException(Exception): 39 | """ 40 | Attributes: 41 | - code 42 | - msg 43 | - backtrace 44 | """ 45 | 46 | thrift_spec = ( 47 | None, # 0 48 | (1, TType.I32, 'code', None, None, ), # 1 49 | (2, TType.STRING, 'msg', None, None, ), # 2 50 | (3, TType.STRING, 'backtrace', None, None, ), # 3 51 | ) 52 | 53 | def __init__(self, code=None, msg=None, backtrace=None,): 54 | self.code = code 55 | self.msg = msg 56 | self.backtrace = backtrace 57 | 58 | def read(self, iprot): 59 | if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not None: 60 | fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec)) 61 | return 62 | iprot.readStructBegin() 63 | while True: 64 | (fname, ftype, fid) = iprot.readFieldBegin() 65 | if ftype == TType.STOP: 66 | break 67 | if fid == 1: 68 | if ftype == TType.I32: 69 | self.code = iprot.readI32(); 70 | else: 71 | iprot.skip(ftype) 72 | elif fid == 2: 73 | if ftype == TType.STRING: 74 | self.msg = iprot.readString().decode('utf-8') 75 | else: 76 | iprot.skip(ftype) 77 | elif fid == 3: 78 | if ftype == TType.STRING: 79 | self.backtrace = iprot.readString().decode('utf-8') 80 | else: 81 | iprot.skip(ftype) 82 | else: 83 | iprot.skip(ftype) 84 | iprot.readFieldEnd() 85 | iprot.readStructEnd() 86 | 87 | def write(self, oprot): 88 | if oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and self.thrift_spec is not None and fastbinary is not None: 89 | oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec))) 90 | return 91 | oprot.writeStructBegin('TAppException') 92 | if self.code != None: 93 | 
oprot.writeFieldBegin('code', TType.I32, 1) 94 | oprot.writeI32(self.code) 95 | oprot.writeFieldEnd() 96 | if self.msg != None: 97 | oprot.writeFieldBegin('msg', TType.STRING, 2) 98 | oprot.writeString(self.msg.encode('utf-8')) 99 | oprot.writeFieldEnd() 100 | if self.backtrace != None: 101 | oprot.writeFieldBegin('backtrace', TType.STRING, 3) 102 | oprot.writeString(self.backtrace.encode('utf-8')) 103 | oprot.writeFieldEnd() 104 | oprot.writeFieldStop() 105 | oprot.writeStructEnd() 106 | def validate(self): 107 | return 108 | 109 | 110 | def __str__(self): 111 | return repr(self) 112 | 113 | def __repr__(self): 114 | L = ['%s=%r' % (key, value) 115 | for key, value in self.__dict__.iteritems()] 116 | return '%s(%s)' % (self.__class__.__name__, ', '.join(L)) 117 | 118 | def __eq__(self, other): 119 | return isinstance(other, self.__class__) and self.__dict__ == other.__dict__ 120 | 121 | def __ne__(self, other): 122 | return not (self == other) 123 | 124 | class extract_RET: 125 | """ 126 | Attributes: 127 | - success 128 | - body 129 | """ 130 | 131 | thrift_spec = ( 132 | None, # 0 133 | (1, TType.BOOL, 'success', None, None, ), # 1 134 | (2, TType.STRING, 'body', None, None, ), # 2 135 | ) 136 | 137 | def __init__(self, success=None, body=None,): 138 | self.success = success 139 | self.body = body 140 | 141 | def read(self, iprot): 142 | if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not None: 143 | fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec)) 144 | return 145 | iprot.readStructBegin() 146 | while True: 147 | (fname, ftype, fid) = iprot.readFieldBegin() 148 | if ftype == TType.STOP: 149 | break 150 | if fid == 1: 151 | if ftype == TType.BOOL: 152 | self.success = iprot.readBool(); 153 | else: 154 | iprot.skip(ftype) 155 | elif fid == 2: 156 | if ftype == TType.STRING: 157 | self.body = iprot.readString().decode('utf-8') 158 | else: 159 | iprot.skip(ftype) 160 | else: 161 | iprot.skip(ftype) 162 | iprot.readFieldEnd() 163 | iprot.readStructEnd() 164 | 165 | def write(self, oprot): 166 | if oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and self.thrift_spec is not None and fastbinary is not None: 167 | oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec))) 168 | return 169 | oprot.writeStructBegin('extract_RET') 170 | if self.success != None: 171 | oprot.writeFieldBegin('success', TType.BOOL, 1) 172 | oprot.writeBool(self.success) 173 | oprot.writeFieldEnd() 174 | if self.body != None: 175 | oprot.writeFieldBegin('body', TType.STRING, 2) 176 | oprot.writeString(self.body.encode('utf-8')) 177 | oprot.writeFieldEnd() 178 | oprot.writeFieldStop() 179 | oprot.writeStructEnd() 180 | def validate(self): 181 | return 182 | 183 | 184 | def __repr__(self): 185 | L = ['%s=%r' % (key, value) 186 | for key, value in self.__dict__.iteritems()] 187 | return '%s(%s)' % (self.__class__.__name__, ', '.join(L)) 188 | 189 | def __eq__(self, other): 190 | return isinstance(other, self.__class__) and self.__dict__ == other.__dict__ 191 | 192 | def __ne__(self, other): 193 | return not (self == other) 194 | -------------------------------------------------------------------------------- /tests/test_evaluation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import math 4 | 5 | import unittest2 6 | 7 | from txtexeval.util 
import html_to_text
8 | from txtexeval.evaluation import _tokenize_text, _bow
9 | from txtexeval.evaluation import TextOnlyEvaluator
10 | from txtexeval.evaluation import TextBasedResults, Result
11 | from txtexeval.evaluation import BaseResultFormat, TextResultFormat, \
12 | CleanEvalFormat, GoogleNewsFormat
13 |
14 |
15 | class TestHelpers(unittest2.TestCase):
16 |
17 | def test_tokenize_text(self):
18 | s = '''
19 | This is (some text). AAAA!!"#.{}
20 | special charčćšđž.
21 | '''
22 | r = _tokenize_text(s)
23 | self.assertEqual(r, ['this','is','some','text','aaaa','special','char'])
24 |
25 | def test_tokenize_text_empty(self):
26 | s = ''
27 | r = _tokenize_text(s)
28 | self.assertEqual(r, [])
29 |
30 | def test_html_to_text(self):
31 | s = '''
32 | <html>
33 | <head>
34 | <title>Title</title>
35 | <style type="text/css">
36 | body {
37 | background: white;
38 | }
39 | p {
40 | margin: 1em;
41 | }
42 | </style>
43 | </head>
44 | <body>
45 | <script type="text/javascript">
46 | var dummy = 1;
47 | dummy += 1;
48 | </script>
49 | Body
50 | <p> Paragraph here </p>
51 | More text
52 | </body>
53 | </html>
54 | '''
55 | t = html_to_text(s, encoding = 'ascii')
56 | t = t.strip()
57 | self.assertTrue(t.startswith('Body'))
58 | self.assertTrue(t.endswith('text'))
59 |
60 | def test_html_to_text_empty(self):
61 | s = ''
62 | t = html_to_text(s, encoding = 'ascii')
63 | self.assertEqual(t.strip(), '')
64 |
65 | class TestFormats(unittest2.TestCase):
66 |
67 | def test_textresultformat(self):
68 | s = '''
69 | This is (some text). AAAA!!"#.{}
70 | special charčćšđž char.
71 | '''
72 | t = TextResultFormat(s)
73 | self.assertEqual(t.get_word_seq(), ['this','is','some','text','aaaa','special','char','char'])
74 | self.assertEqual(t.get_bow(), {'this':1,'is':1,'some':1,'text':1,'aaaa':1,'special':1,'char':2})
75 |
76 | def test_textresultformat_empty(self):
77 | t = TextResultFormat('''
78 |
79 |
80 | ''')
81 | self.assertEqual(t.get_word_seq(), [])
82 | self.assertEqual(t.get_bow(), {})
83 |
84 | def test_cleanevalformat(self):
85 | s = '''
86 | URL: http://childparenting.about.com/b/archives.htm
87 | <p> this is
88 | cleaneval
89 | format
90 |
91 | <h> this is
92 | cleaneval
93 | format
94 | '''
95 | ce = CleanEvalFormat(s)
96 | self.assertEqual(ce.get_word_seq(), ['this','is','cleaneval','format','this','is','cleaneval','format'])
97 | self.assertEqual(ce.get_bow(), {'this':2,'is':2,'cleaneval':2,'format':2})
98 |
99 | def test_cleanevalformat_empty(self):
100 | s = '''URL: http://childparenting.about.com/b/archives.htm
101 | '''
102 | ce = CleanEvalFormat(s)
103 | self.assertEqual(ce.get_word_seq(), [])
104 | self.assertEqual(ce.get_bow(), {})
105 |
106 | def test_googlenewsformat(self):
107 | s = '''
108 | <html><body>
109 | <span class="x-nc-sel1">
110 | Headline here
111 | </span>
112 |
113 | <span class="x-nc-sel2">
114 | Double content
115 | <span class="x-nc-sel2">
116 | Text content here€
117 | </span>
118 | content
119 | </span>
120 | <span class="x-nc-sel0">
121 | Not content
122 | </span>
123 | '''
124 | gn = GoogleNewsFormat(s, 'utf8')
125 | self.assertEqual(gn.get_word_seq(), ['headline','here','double','content','text','content','here','content',])
126 | self.assertEqual(gn.get_bow(), {'headline':1,'here':2,'double':1,'content':3,'text':1})
127 |
128 | def test_googlenewsformat_empty1(self):
129 | s = '''
130 | <html><body>
131 | <span class="x-nc-sel0">
132 | Headline here (not content)
133 | </span>
134 |
135 | <span class="x-nc-sel5">
136 | not content
137 | <span class="x-nc-sel4">
138 | no content here€
139 | </span>
140 | not content
141 | </span>
142 | <span class="x-nc-sel0">
143 | Not content
144 | </span>
145 | '''
146 | gn = GoogleNewsFormat(s, 'utf8')
147 | self.assertEqual(gn.get_word_seq(), [])
148 | self.assertEqual(gn.get_bow(), {})
149 |
150 | def test_googlenewsformat_empty2(self):
151 | gn = GoogleNewsFormat('','ascii')
152 | self.assertEqual(gn.get_word_seq(), [])
153 | self.assertEqual(gn.get_bow(), {})
154 |
155 | def dummy_format_factory(word_seq):
156 | class DummyFormat(BaseResultFormat):
157 | def get_bow(self):
158 | return _bow(word_seq)
159 |
160 | def get_word_seq(self):
161 | return word_seq
162 | return DummyFormat()
163 |
164 | class TestTextEvaluator(unittest2.TestCase):
165 |
166 | def test_empty_relevant(self):
167 | ret = dummy_format_factory(['one','two'])
168 | rel = dummy_format_factory([])
169 | # args: TextOnlyEvaluator(retrieved, relevant)
170 | e = TextOnlyEvaluator(ret, rel)
171 | r = e.get_eval_results()
172 | self.assertEqual(r.precision, 0)
173 | self.assertTrue(math.isinf(r.recall))
174 | self.assertTrue(math.isnan(r.f1_score))
175 |
176 | def test_empty_retrieved(self):
177 | ret = dummy_format_factory([])
178 | rel = dummy_format_factory(['one','two'])
179 | # args: TextOnlyEvaluator(retrieved, relevant)
180 | e = TextOnlyEvaluator(ret, rel)
181 | r = e.get_eval_results()
182 | self.assertEqual(r.recall, 0)
183 | self.assertTrue(math.isinf(r.precision))
184 | self.assertTrue(math.isnan(r.f1_score))
185 |
186 | def test_both_empty(self):
187 | ret = dummy_format_factory([])
188 | rel = dummy_format_factory([])
189 | # args: TextOnlyEvaluator(retrieved, relevant)
190 | e = TextOnlyEvaluator(ret, rel)
191 | r = e.get_eval_results()
192 | self.assertTrue(math.isinf(r.precision))
193 | self.assertTrue(math.isinf(r.recall))
194 | self.assertTrue(math.isnan(r.f1_score))
195 |
196 | def test_missmatch(self):
197 | ret = dummy_format_factory(['one','four'])
198 | rel = dummy_format_factory(['two','three'])
199 | # args: TextOnlyEvaluator(retrieved, relevant)
200 | e = TextOnlyEvaluator(ret, rel)
201 | r = e.get_eval_results()
202 | self.assertEqual(r.precision, 0)
203 | self.assertEqual(r.recall, 0)
204 | self.assertTrue(math.isinf(r.f1_score))
205 |
206 | def test_match(self):
207 | ret = dummy_format_factory(['zero','one','two','four'])
208 | rel = dummy_format_factory(['one','two','three'])
209 | # args: TextOnlyEvaluator(retrieved, relevant)
210 | e = TextOnlyEvaluator(ret, rel)
211 | r = e.get_eval_results()
212 | self.assertAlmostEqual(r.precision, 0.5)
213 | self.assertAlmostEqual(r.recall, 0.6666, delta = 0.0001)
214 | self.assertAlmostEqual(r.f1_score, 0.5714, delta = 0.001)
215 |
216 | def test_perfect_match(self):
217 | ret = dummy_format_factory(['zero'])
218 | rel = dummy_format_factory(['zero'])
219 | # args: TextOnlyEvaluator(retrieved, relevant)
220 | e = TextOnlyEvaluator(ret, rel)
221 | r = e.get_eval_results()
222 | self.assertAlmostEqual(r.precision, 1)
223 | self.assertAlmostEqual(r.recall, 1)
224 | self.assertAlmostEqual(r.f1_score, 1)
225 |
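# How setUp's synthetic results below map onto the ResultContents categories
# (precision is inf when the retrieved text is empty, recall is inf when the
# relevant text is empty; see TextOnlyEvaluator in evaluation.py):
#   Result(0, 0, inf)        -> mismatch (no overlap at all)
#   Result(inf, 0, nan)  x2  -> retrieved empty
#   Result(0, inf, nan)  x2  -> relevant empty
#   Result(inf, inf, nan)    -> both empty
#   Result(0.2, 0.2, 0.2) x4 -> success (0 < f1 <= 1)
# plus dataset_len = 12, so fail = 12 - 10 = 2.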
        self.results.add_result(Result(float('inf'),float('inf'),float('nan'),None))
240 | 
241 |         self.results.add_result(Result(0.2,0.2,0.2,None))
242 |         self.results.add_result(Result(0.2,0.2,0.2,None))
243 |         self.results.add_result(Result(0.2,0.2,0.2,None))
244 |         self.results.add_result(Result(0.2,0.2,0.2,None))
245 | 
246 |         self.results.dataset_len = 12
247 | 
248 |     def tearDown(self):
249 |         self.results.text_eval_results['e1'] = []
250 | 
251 |     def test_results_contents(self):
252 |         contents = self.results.result_contents('e1')
253 |         self.assertEqual(contents.fail, 2)
254 |         self.assertEqual(contents.succ, 4)
255 |         self.assertEqual(contents.rel_empty, 2)
256 |         self.assertEqual(contents.ret_empty, 2)
257 |         self.assertEqual(contents.rel_ret_empty, 1)
258 |         self.assertEqual(contents.missmatch, 1)
259 | 
260 |     def test_result_filter(self):
261 |         fr = self.results.filtered_results('e1')
262 |         self.assertEqual(len(fr), 4)
263 | 
264 |     def test_precision_statistics(self):
265 |         avg, std = self.results.precision_statistics('e1')
266 |         self.assertEqual(avg, 0.2)
267 |         self.assertEqual(std, 0.)
268 | 
269 |     def test_recall_statistics(self):
270 |         avg, std = self.results.recall_statistics('e1')
271 |         self.assertEqual(avg, 0.2)
272 |         self.assertEqual(std, 0.)
273 | 
274 |     def test_f1score_statistics(self):
275 |         avg, std = self.results.f1score_statistics('e1')
276 |         self.assertEqual(avg, 0.2)
277 |         self.assertEqual(std, 0.)
278 | 
279 |     def test_add_bad_result(self):
280 |         r = TextBasedResults('e2')
281 |         with self.assertRaises(AssertionError):
282 |             r.add_result(Result(2,1,1,None))
283 |         with self.assertRaises(AssertionError):
284 |             r.add_result(Result(float('inf'),float('inf'),1,None))
285 |         with self.assertRaises(AssertionError):
286 |             r.add_result(Result(float('inf'),0,1,None))
287 |         with self.assertRaises(AssertionError):
288 |             r.add_result(Result(0,0,1,None))
289 | 
290 |     def test_add_good_result(self):
291 |         r = TextBasedResults('e3')
292 |         try:
293 |             r.add_result(Result(0.2,0.2,0.2,None))
294 |         except AssertionError:
295 |             self.fail()
296 | 
297 | def main():
298 |     unittest2.main(exit = False, verbosity = 2)
299 | 
300 | if __name__ == '__main__':
301 |     main()
302 | 
--------------------------------------------------------------------------------
/tests/test_plot.py:
--------------------------------------------------------------------------------
1 | import unittest2
2 | 
3 | from plot_manage import equidistant_count
4 | 
5 | class TestPlot(unittest2.TestCase):
6 | 
7 |     def test_equidistant_count(self):
8 |         r = equidistant_count(0, 1, 0.2, [0.11,0.22,0.32])  # bin width 0.2 over [0,1] -> 5 bins
9 |         self.assertEqual(r, (1,2,0,0,0))
10 | 
11 |         r = equidistant_count(0, 1, 0.5, [0.,0.22,0.32,0.5])  # bin width 0.5 -> 2 bins
12 |         self.assertEqual(r, (3,1))
13 | 
14 | def main():
15 |     unittest2.main(exit = False, verbosity = 2)
16 | 
17 | if __name__ == '__main__':
18 |     main()
--------------------------------------------------------------------------------
/tests/testsrunner.py:
--------------------------------------------------------------------------------
1 | '''
2 | Run all test cases residing in all modules that follow the test_[name].py
3 | naming template.
4 | 
5 | We could also use the nose test autodiscovery tool instead.
6 | '''
7 | import os
8 | import unittest2
9 | 
10 | def test_modules():
11 |     '''Get all test modules'''
12 |     modlist = []
13 |     for mod in os.listdir('.'):
14 |         if mod.startswith('test_') and mod.endswith('.py'):
15 |             modlist.append(mod[:-3])
16 |     return modlist
17 | 
18 | if __name__ == '__main__':
19 |     suite = unittest2.TestSuite()
20 |     for mod in test_modules():
21 |         suite.addTests(unittest2.TestLoader().loadTestsFromName(mod))
22 |     unittest2.TextTestRunner(verbosity=2).run(suite)
--------------------------------------------------------------------------------
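
Usage sketch (illustrative, not part of the repository): the tests above pin down the surface the framework exposes for text-only evaluation, namely TextOnlyEvaluator(retrieved, relevant) producing a Result(precision, recall, f1_score, id), and TextBasedResults accumulating those results per extractor. The snippet below composes those pieces for a single document. It is a minimal sketch under stated assumptions: the import path txtexeval.evaluation, the hypothetical StubFormat helper (standing in for a real BaseResultFormat implementation such as GoogleNewsFormat), and that get_eval_results() returns a Result accepted by add_result().

# Minimal end-to-end sketch (Python 2, matching the codebase).
# Assumed: these names are importable from txtexeval.evaluation.
from txtexeval.evaluation import (TextOnlyEvaluator, TextBasedResults,
                                  BaseResultFormat)

class StubFormat(BaseResultFormat):
    # Hypothetical stand-in for a real format such as GoogleNewsFormat:
    # it just wraps a fixed word sequence.
    def __init__(self, words):
        self._words = words

    def get_word_seq(self):
        return self._words

    def get_bow(self):
        # naive bag of words: word -> occurrence count
        bow = {}
        for w in self._words:
            bow[w] = bow.get(w, 0) + 1
        return bow

retrieved = StubFormat(['zero', 'one', 'two', 'four'])  # extractor output
relevant = StubFormat(['one', 'two', 'three'])          # gold standard

# Evaluate a single document: precision = 2/4, recall = 2/3.
result = TextOnlyEvaluator(retrieved, relevant).get_eval_results()

# File the result under an extractor name and read back the statistics.
results = TextBasedResults('my-extractor')
results.add_result(result)
results.dataset_len = 1
print results.precision_statistics('my-extractor')  # -> (average, std deviation)
print results.recall_statistics('my-extractor')

As test_result_filter and the *_statistics tests suggest, results with infinite or NaN components are filtered out before averages and standard deviations are computed, so only internally consistent Result tuples contribute to the reported statistics.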