├── .gitignore
├── README.rst
├── pip-req.txt
├── src
│   ├── dataset_manage.py
│   ├── evaluate_manage.py
│   ├── extract_manage.py
│   ├── plot_manage.py
│   ├── settings.py-template
│   ├── tee
│   └── txtexeval
│       ├── __init__.py
│       ├── data.py
│       ├── evaluation.py
│       ├── extractor.py
│       └── util
│           ├── __init__.py
│           ├── common.py
│           └── zemanta
│               ├── __init__.py
│               ├── client.py
│               ├── thrift
│               │   ├── ceservice.thrift
│               │   └── generate_thrift.sh
│               └── thriftgen
│                   ├── __init__.py
│                   └── ceservice
│                       ├── ExtractorService-remote
│                       ├── ExtractorService.py
│                       ├── __init__.py
│                       ├── constants.py
│                       └── ttypes.py
└── tests
    ├── test_evaluation.py
    ├── test_plot.py
    └── testsrunner.py

/.gitignore:
--------------------------------------------------------------------------------
*.py[co]

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
develop-eggs
.installed.cfg

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox

# Project specific
src/settings.py
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
Text extraction evaluation framework
------------------------------------

Framework for evaluating text extraction algorithms implemented as web services.


Author
------

Tomaž Kovačič


Licence
-------

Copyright (C) 2011 Tomaž Kovačič

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
--------------------------------------------------------------------------------
/pip-req.txt:
--------------------------------------------------------------------------------
BeautifulSoup==3.2.0
PyYAML==3.09
argparse==1.1
chardet==2.0.1
matplotlib==1.1.0svn
numpy==1.5.1
-e git+https://github.com/gfxmonk/python-readability.git@b5639a08225a9a6cc3ccd43f0b5c07b82958ebda#egg=python_readability-0.0.0-py2.6-dev
unittest2==0.5.1
Thrift==0.1
justext==1.1
lxml==2.3
selenium==2.2.0
--------------------------------------------------------------------------------
/src/dataset_manage.py:
--------------------------------------------------------------------------------
'''
Script for generating meta data files and preprocessing datasets.

Throughout the script we're assuming the following structure
of the directory that settings.PATH_LOCAL_DATA points to.

|-- datasets
|   |-- testdataset
|   |   |-- clean
|   |   |   `-- example.txt
|   |   |-- meta.yaml  ----> this is where the output will reside
|   |   `-- raw
|   |       `-- example.html
|-- plot-output
|   `-- ...
`-- results-cache
    `-- ...
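
An entry in meta.yaml (written by the processors below) looks roughly
like this -- an illustrative cleaneval-style sketch, all values made up:

- clean: 1-cleaned.txt
  clean_encoding: utf-8
  id: '1'
  meta:
    encoding: utf-8
    id: '1'
    title: null
  raw: 1.html
  raw_encoding: utf-8
  url: null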
'''
import os
import sys
import re
import codecs
import logging

import yaml
import argparse
import chardet
from BeautifulSoup import BeautifulSoup

from txtexeval.util import check_local_path, get_local_path

# module logger
logger = logging.getLogger()

# exceptions

class MetaGeneratorError(Exception):
    pass

class PreprocessingError(Exception):
    pass

class SkipTrigger(ValueError):
    pass

# private helpers

def _verify_args(args):
    # verify arguments provided by argparse and
    # return the path to the output directory

    # print the arguments
    print 'dataset type: %s' % args.dataset_type
    print 'dataset name: %s' % args.dataset_name

    # validate dataset name
    if not check_local_path(args.dataset_name):
        print 'error: this dataset does not exist'
        sys.exit(-1)

    # validate path argument
    if args.path and not os.path.exists(args.path):
        print 'error: path does not exist'
        sys.exit(-1)

    output_dir = args.path or get_local_path(args.dataset_name)
    print 'output directory: %s' % output_dir
    return output_dir


def _get_attribute(tag, name):
    # params: BS tag and attribute name
    # return None or the attribute value
    # takes care of encoding
    try:
        return tag[name].encode('ascii', 'ignore')
    except KeyError:
        return None
regex_BEG = re.compile(r'(?P<text_tag>^(\s*)<(\s*)text((\s*)(id|title|encoding)(\s*)=(\s*)"(.*)")*(\s*)>)')
regex_END = re.compile(r'(?P<closing_text_tag><(\s*)/(\s*)text(\s*)>(.*)$)')
def _remove_text_tag(html_string, filename):
    # Cleaneval has a <text> tag that wraps the whole html structure. This
    # function removes it with a pessimistic regular expression because we
    # don't want to mess with the rest of the structure with a parser

    # remove at the beginning
    match_start = regex_BEG.match(html_string)
    if match_start:
        logger.debug('removing text tag in %s: %s', filename, match_start.group('text_tag'))
        html_string = regex_BEG.sub('', html_string)
    else:
        raise PreprocessingError('no starting text tag in %s' % filename)

    # remove at the end
    match_end = regex_END.search(html_string)
    if match_end:
        logger.debug('removing closing text tag in %s: %s', filename, match_end.group('closing_text_tag'))
        html_string = regex_END.sub('', html_string)
    else:
        raise PreprocessingError('no closing text tag in %s' % filename)

    return html_string

def _get_charset(html_string, raw_filename):
    # based on a string that represents the html document
    # get the charset from the meta http-equiv tag, e.g.
    # <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
    # or the html5 style <meta charset="utf-8">
    # return None if no such tag was found
    # raw_filename is used only for logging
    charset = None

    soup = BeautifulSoup(html_string)
    r_ct = re.compile('[Cc]ontent-[Tt]ype|CONTENT-TYPE')
    r_cont = re.compile('\s*text\s*/\s*html\s*;\s*charset\s*=\s*(?P<charset>[a-zA-Z0-9_-]+)')

    for tag in soup.findAll('meta'):

        if tag.has_key('http-equiv') and tag.has_key('content') and r_ct.match(tag['http-equiv']):
            content = tag['content'].lower()
            match = r_cont.match(content)
            if match:
                charset = match.group('charset')
                logger.debug('charset %s found via meta http-equiv in %s', charset, raw_filename)
            else:
                logger.warn('meta http-equiv exists but it does not match the content regex in %s: %s', raw_filename, str(tag))

        elif tag.has_key('http-equiv') and not tag.has_key('content'):
            logger.warn('no content attribute in meta http-equiv tag in %s: %s', raw_filename, str(tag))

        elif tag.has_key('charset'):
            charset = tag['charset']
            logger.info('charset %s found via meta charset (html5 style) in %s', charset, raw_filename)

    if not charset:
        logger.debug('no meta tag with charset definition in %s', raw_filename)

    return charset

def _get_safe_encoding_name(encoding):
    if encoding == None:
        raise MetaGeneratorError('no encoding given')
    try:
        codec = codecs.lookup(encoding)
    except LookupError:
        raise MetaGeneratorError('no safe encoding name is found for %s' % encoding)
    else:
        return codec.name
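
# A quick illustration of what the codecs based normalization above buys us
# (interactive sketch; the exact aliases depend on your Python build):
#
#   >>> import codecs
#   >>> codecs.lookup('UTF8').name
#   'utf-8'
#   >>> codecs.lookup('latin1').name
#   'iso8859-1'
#   >>> codecs.lookup('no-such-codec')
#   Traceback (most recent call last):
#       ...
#   LookupError: unknown encoding: no-such-codec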
def _skip_file(regex, raw_filename):
    # if the filename does not match the given regular expression
    # then raise the skip trigger
    if not regex.match(raw_filename):
        logger.debug('skipping file %s', raw_filename)
        raise SkipTrigger

# decorators

def iterate_raw_filename(method):
    def wrap(self):
        for raw_filename in self._raw_filenames():
            try:
                method(self, raw_filename)
            except SkipTrigger:
                continue
    return wrap

def dump_meta_data(method):
    def wrap(self, *args, **kwargs):
        method(self, *args, **kwargs)
        self._serialize_meta_data()
    return wrap

# dataset specific processor classes

class BaseProcessor(object):

    def __init__(self, output_dir, dataset_name):
        self.dataset_name = dataset_name
        self._dataset_dir = get_local_path(dataset_name)
        self._output_dir = output_dir
        self.meta_data_list = []  # list to be serialized

    def _raw_filenames(self):
        return os.listdir(os.path.join(self._dataset_dir, 'raw'))

    def _clean_filenames(self):
        return os.listdir(os.path.join(self._dataset_dir, 'clean'))

    def _serialize_meta_data(self):
        with open(os.path.join(self._output_dir, 'meta.yaml'), 'w') as meta_file:
            meta_string = yaml.dump(self.meta_data_list, default_flow_style=False)
            meta_file.write(meta_string)


class GooglenewsProcessor(BaseProcessor):

    re_TAIL = re.compile(r'(?P<id>.+)\.html$')

    @dump_meta_data
    @iterate_raw_filename
    def generate_meta_data(self, raw_filename):
        _skip_file(self.re_TAIL, raw_filename)

        with open(os.path.join(self._dataset_dir, 'raw', raw_filename), 'r') as f:
            # check for a cleaned file counterpart
            if not os.path.exists(os.path.join(self._dataset_dir, 'clean', raw_filename)):
                raise MetaGeneratorError('No existing clean file counterpart for %s' % raw_filename)

            html_string = f.read()

        charset = _get_charset(html_string, raw_filename)
        confidence = None
        # if no charset is retrieved with document parsing
        # use the chardet library to detect the encoding
        if charset:
            raw_encoding = charset
        else:
            det = chardet.detect(html_string)
            raw_encoding = det['encoding']
            confidence = det['confidence']
            logger.debug('detected encoding %s in %s with confidence %f', raw_encoding, raw_filename, confidence)

        safe_raw_encoding = _get_safe_encoding_name(raw_encoding)

        self.meta_data_list.append(dict(
            id = self.re_TAIL.match(raw_filename).group('id'),
            url = None,
            raw_encoding = safe_raw_encoding,
            clean_encoding = safe_raw_encoding, # TODO: must verify if this is always true
            raw = raw_filename,
            clean = raw_filename,
            meta = {'encoding_confidence': confidence}
        ))


class CleanevalProcessor(BaseProcessor):

    re_BACK = re.compile(r'^(?P<id>\d+)\.html\.backup$')
    re_NEW = re.compile(r'^\d+\.html$')

    @iterate_raw_filename
    def create_backups(self, raw_filename):
        # rename every unprocessed [number].html to [number].html.backup

        raw_filename_path = os.path.join(self._dataset_dir, 'raw', raw_filename)
        backup_path = raw_filename_path + '.backup'
        logger.debug('renaming %s to %s', raw_filename, raw_filename + '.backup')
        os.rename(raw_filename_path, backup_path)
    @dump_meta_data
    @iterate_raw_filename
    def generate_meta_data(self, raw_filename):
        _skip_file(self.re_BACK, raw_filename)
        with open(os.path.join(self._dataset_dir, 'raw', raw_filename), 'r') as f:
            html_string = f.read()

        # check for an existing clean file counterpart
        # FIXME: this is a hack, because cleaneval-final uses only [number].txt
        # while cleaneval-dev uses [number]-cleaned.txt
        if self.dataset_name == 'cleaneval-final':
            clean_filename = self.re_BACK.match(raw_filename).group('id') + '.txt'
        else:
            clean_filename = self.re_BACK.match(raw_filename).group('id') + '-cleaned.txt'
        if not os.path.exists(os.path.join(self._dataset_dir, 'clean', clean_filename)):
            msg = 'No existing clean file counterpart for %s' % raw_filename
            logger.warning(msg)
            raise SkipTrigger(msg)

        # get meta data from the <text> tag
        soup = BeautifulSoup(html_string)
        text_tag = soup.find('text')
        if text_tag == None:
            raise MetaGeneratorError('No <text> tag in %s' % raw_filename)
        encoding = text_tag.get('encoding', None)

        # extract dataset specific meta data and store it in a dict with
        # the keys id, title and encoding
        # since we'll be removing the <text> tag from every document
        # we better store these attributes in their original form in meta.yaml
        cleaneval_specific = {
            'id': _get_attribute(text_tag, 'id'),
            'title': _get_attribute(text_tag, 'title'),
            'encoding': _get_attribute(text_tag, 'encoding'),
        }

        # get a safe encoding name
        try:
            safe_encoding = _get_safe_encoding_name(encoding)
        except MetaGeneratorError:
            det = chardet.detect(html_string)
            safe_encoding = _get_safe_encoding_name(det['encoding'])
            logger.info('detected encoding %s in %s with confidence %f', safe_encoding, raw_filename, det['confidence'])

        logger.debug('generating meta data for %s', raw_filename)
        self.meta_data_list.append(dict(
            id = self.re_BACK.match(raw_filename).group('id'),
            url = None,
            raw_encoding = safe_encoding,
            # according to the annotation guidelines of cleaneval
            # all cleaned text files are utf-8 encoded
            clean_encoding = 'utf-8',
            # we'll be generating [number].html in the preprocessing phase
            raw = raw_filename.replace('.backup', ''),
            clean = clean_filename,
            meta = cleaneval_specific
        ))

    @iterate_raw_filename
    def preprocess(self, raw_filename):
        # remove all <text> tags
        # add missing <html> and <body> tags where needed

        _skip_file(self.re_BACK, raw_filename)
        with open(os.path.join(self._dataset_dir, 'raw', raw_filename), 'r') as f:
            html_string = _remove_text_tag(f.read(), raw_filename)

        soup = BeautifulSoup(html_string)
        if (not soup.find('html')) and (not soup.find('body')):
            # neither an html nor a body tag
            logger.warn('appending body and html tags to %s', raw_filename)
            html_string = '<html><body>%s</body></html>' % html_string

        elif (not soup.find('html')) or (not soup.find('body')):
            # really weird case
            logger.warning('%s has html tag or body tag but not both', raw_filename)
        else:
            logger.info('no tag appending on %s', raw_filename)

        output_filename = raw_filename.replace('.backup', '')
        logger.debug('preprocessing complete: %s ---> %s', raw_filename, output_filename)
        with open(os.path.join(self._dataset_dir, 'raw', output_filename), 'w') as output:
            output.write(html_string)
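
# Example invocation (sketch; assumes a local dataset named 'cleaneval-final'
# already exists under settings.PATH_LOCAL_DATA):
#
#   $ python dataset_manage.py cleaneval cleaneval-final -v
#
# which creates the .backup copies, writes meta.yaml into the dataset
# directory and rewrites every [number].html without its <text> wrapper.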
def parse_args(args):
    # sys argument parsing using argparse
    parser = argparse.ArgumentParser(description = 'Tool for generating meta data files and preprocessing datasets')
    parser.add_argument('dataset_type', choices = ('cleaneval', 'gnews'), help = 'dataset type e.g. cleaneval')
    parser.add_argument('dataset_name', help = 'name of the dataset')
    parser.add_argument('-p', '--path', help = 'path to the meta data output file and .log file (uses the default path if not provided)')
    parser.add_argument('-v', '--verbose', action = 'store_true', help = 'print log to console')
    return parser.parse_args(args)

def main(args):
    pargs = parse_args(args)
    # get the output directory - this is where the .yaml and .log file will reside
    output_dir = _verify_args(pargs)

    # now we can initialize logging
    print 'log: %s' % os.path.join(output_dir, 'preproc.log')
    logging.basicConfig(filename = os.path.join(output_dir, 'preproc.log'), level = logging.DEBUG)

    # add a console handler to the root logger if the user provides a --verbose flag
    if pargs.verbose:
        console = logging.StreamHandler()
        formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s')
        console.setFormatter(formatter)
        console.setLevel(logging.DEBUG)
        logging.getLogger().addHandler(console)

    if pargs.dataset_type == 'cleaneval':
        processor = CleanevalProcessor(output_dir, pargs.dataset_name)
        try:
            print '[CREATE BACKUPS]'
            processor.create_backups()
            print '[GENERATING META DATA]'
            processor.generate_meta_data()
            print '[PREPROCESSING]'
            processor.preprocess()
        except MetaGeneratorError as e:
            print 'META DATA RELATED ERROR:'
            print e
            sys.exit(-1)
        except PreprocessingError as e:
            print 'PREPROCESSING ERROR:'
            print e
            sys.exit(-1)

    elif pargs.dataset_type == 'gnews':
        processor = GooglenewsProcessor(output_dir, pargs.dataset_name)
        try:
            print '[GENERATING META DATA]'
            processor.generate_meta_data()

        except MetaGeneratorError as e:
            print 'META DATA RELATED ERROR:'
            print e
            sys.exit(-1)
        except PreprocessingError as e:
            print 'PREPROCESSING ERROR:'
            print e
            sys.exit(-1)

    print '[DONE]'


if __name__ == '__main__':
    import sys
    main(sys.argv[1:])
--------------------------------------------------------------------------------
/src/evaluate_manage.py:
--------------------------------------------------------------------------------
'''
Script for generating evaluation results
'''
import os
import logging

import argparse

import settings
from txtexeval.extractor import extractor_list, get_extractor_cls
from txtexeval.data import LocalDatasetLoader, LocalResultStorage
from txtexeval.data import DataError
from txtexeval.evaluation import TextBasedResults, TextOnlyEvaluator
from txtexeval.evaluation import from_document_factory, dataset_format_map

logger = logging.getLogger()
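
# Example invocation (sketch; extractor slugs depend on your extractor_list):
#
#   $ python evaluate_manage.py cleaneval cleaneval-final -v
#
# or, to refresh the cached results of a single extractor only:
#
#   $ python evaluate_manage.py cleaneval cleaneval-final -u <extractor-slug>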
def single_evaluation(extractor_cls, results, dataset_type, dataset_name):
    logger.info('started evaluating extractor %s', extractor_cls.NAME)
    results.set_extractor(extractor_cls.SLUG)
    storage = LocalResultStorage(dataset_name, extractor_cls)

    loader = LocalDatasetLoader(dataset_name)
    for doc in loader:
        logger.debug('doc: %s', doc.id)
        format_clean = from_document_factory(doc, slug = dataset_type)
        try:
            result_string = storage.fetch_result(doc)
        except DataError:
            logger.info('no stored result for %s at %s extractor',
                doc.id, extractor_cls.NAME)
            continue
        else:
            format_result = extractor_cls.formatted_result(result_string)
            evaluator = TextOnlyEvaluator(
                retrieved = format_result,
                relevant = format_clean,
                id = doc.id)
            results.add_result(evaluator.get_eval_results())

def local_evaluate(dataset_type, dataset_name, update_ext_slug = None):
    results = TextBasedResults()

    if update_ext_slug:
        results.load(dataset_name)
        ex_cls = get_extractor_cls(update_ext_slug)
        single_evaluation(ex_cls, results, dataset_type, dataset_name)
    else:
        for extractor_cls in extractor_list:
            single_evaluation(extractor_cls, results, dataset_type, dataset_name)

    results.dataset_len = len(LocalDatasetLoader(dataset_name))
    results.save(dataset_name)
    results.print_results()

def parse_args(args):
    '''Sys argument parsing through argparse'''
    parser = argparse.ArgumentParser(description = 'Tool for generating evaluation results')
    parser.add_argument('dataset_type', choices = [i[0] for i in dataset_format_map], help = 'dataset type e.g. cleaneval')
    parser.add_argument('dataset_name', help = 'name of the dataset')
    parser.add_argument('-v', '--verbose', action = 'store_true', help = 'print log to console')
    parser.add_argument('-u', '--update', choices = [e.SLUG for e in extractor_list], help = 'update the results for a single extractor')
    return parser.parse_args(args)

def logging_setup(verbose):
    '''Set verbose to True if you want the log to appear on stderr'''
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)
    logd = os.path.join(settings.PATH_LOCAL_DATA, 'results-cache', 'results.log')
    file_handler = logging.FileHandler(filename = logd)
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s'))
    logger.addHandler(file_handler)
    if verbose:
        console = logging.StreamHandler()
        console.setLevel(logging.DEBUG)
        console.setFormatter(logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s'))
        logger.addHandler(console)
    print 'log: %s' % logd

def main(args):
    pargs = parse_args(args)
    logging_setup(pargs.verbose)
    print '[STARTED]'
    local_evaluate(pargs.dataset_type, pargs.dataset_name, pargs.update)
    print '[DONE]'

if __name__ == '__main__':
    import sys
    main(sys.argv[1:])
--------------------------------------------------------------------------------
/src/extract_manage.py:
--------------------------------------------------------------------------------
'''
Script for extracting article text from dataset instances
'''
import time
import logging

import argparse

from txtexeval.extractor import get_extractor_cls, extractor_list
from txtexeval.data import LocalDatasetLoader, LocalResultStorage
from txtexeval.util import get_local_path

logger = logging.getLogger()
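
# Example invocations (sketch; the available extractor slugs depend on
# extractor_list): a throttled first pass over a dataset, then a second
# pass that retries only the instances recorded as failed in summary.yaml:
#
#   $ python extract_manage.py <extractor-slug> cleaneval-final -t 2
#   $ python extract_manage.py <extractor-slug> cleaneval-final -rf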
def local_extract(dataset_name, extractor_slug, timeout, retry_failed, skip_existing):
    # init storage and loader
    ex = get_extractor_cls(extractor_slug)

    failed_slug = extractor_slug if retry_failed else None
    skip_slug = extractor_slug if skip_existing else None

    loader = LocalDatasetLoader(dataset_name,
        load_failed = failed_slug,
        skip_existing = skip_slug)
    storage = LocalResultStorage(dataset_name, ex)

    logger.info('started extracting content from %s dataset using %s', dataset_name, ex.NAME)
    for doc in loader:
        storage.push_result(doc)
        if timeout:
            time.sleep(timeout)

    storage.dump_summary()
    logger.info('finished with %s dataset', dataset_name)

def parse_args(args):
    '''Sys argument parsing through argparse'''
    ex_list = [e.SLUG for e in extractor_list]
    parser = argparse.ArgumentParser(description = 'Tool for extracting article text from dataset instances')
    parser.add_argument('extractor', choices = ex_list, help = 'extractor slug')
    parser.add_argument('dataset_name', help = 'name of the dataset')
    parser.add_argument('-v', '--verbose', action = 'store_true', help = 'print log to console')
    parser.add_argument('-t', '--timeout', type = int, default = 0, help = 'wait x seconds between extraction operations')
    parser.add_argument('-rf', '--retry_failed', action = 'store_true', help = 'retry extraction on instances that failed')
    parser.add_argument('-se', '--skip_existing', action = 'store_true', help = 'skip all documents that already have their result stored in the database/filesystem')
    return parser.parse_args(args)

def logging_setup(verbose, output_path):
    '''Set verbose to True if you want the log to appear on stderr'''
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)
    file_handler = logging.FileHandler(filename = output_path)
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s'))
    logger.addHandler(file_handler)

    if verbose:
        console = logging.StreamHandler()
        console.setLevel(logging.DEBUG)
        console.setFormatter(logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s'))
        logger.addHandler(console)

def main(args):
    pargs = parse_args(args)
    logging_setup(pargs.verbose, get_local_path(pargs.dataset_name, 'result', 'result.log'))

    print '[STARTED]'
    local_extract(pargs.dataset_name, pargs.extractor,
        pargs.timeout, pargs.retry_failed, pargs.skip_existing)
    print '[DONE]'

if __name__ == '__main__':
    import sys
    main(sys.argv[1:])
--------------------------------------------------------------------------------
/src/plot_manage.py:
--------------------------------------------------------------------------------
'''
Script for plotting evaluation results.
'''
import os
import math

import argparse
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

import settings
from txtexeval.evaluation import TextBasedResults
from txtexeval.extractor import extractor_list, get_extractor_cls

def extractor_list_filter(extractor_slugs):
    '''
    Produce a filtered extractor_list based on a list that contains the slugs
    of the desired extractors. We need this because the global extractor_list
    dictates the correct order.
    '''
    return [e for e in extractor_list if e.SLUG in extractor_slugs]


def dataset_stat_latex_print(dataset_name):
    '''
    Print the avg precision, recall and F1 score in LaTeX format
    to the console.
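
    Each emitted row looks roughly like this (illustrative values):

        \texttt{Some extractor} & 0.9123 & 0.8456 & 0.8777 \\ \hline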
    '''
    # get results
    txt_results = TextBasedResults()
    txt_results.load(dataset_name)
    txt_results.print_results()

    # package results
    elist = extractor_list_filter(txt_results.text_eval_results.keys())
    extractor_slugs = tuple([e.SLUG for e in elist])

    result_list = []
    for e in extractor_slugs:
        result_tuple = (
            get_extractor_cls(e).NAME,
            txt_results.precision_statistics(e)[0],
            txt_results.recall_statistics(e)[0],
            txt_results.f1score_statistics(e)[0],
        )
        result_list.append(result_tuple)
    result_list.sort(key = lambda i: i[3])
    result_list.reverse()

    for r in result_list:
        print '\\texttt{%s} & %.4f & %.4f & %.4f \\\\ \\hline' % r


def dataset_stat_plot(dataset_name, img_name):
    '''
    Plot the avg precision, recall and F1 score bar chart for the given
    dataset name.
    '''
    # get results
    txt_results = TextBasedResults()
    txt_results.load(dataset_name)
    txt_results.print_results()

    # package results
    elist = extractor_list_filter(txt_results.text_eval_results.keys())
    extractor_slugs = tuple([e.SLUG for e in elist])
    packaged_data = (
        ('Precision', [(txt_results.precision_statistics(e), e) for e in extractor_slugs]),
        ('Recall', [(txt_results.recall_statistics(e), e) for e in extractor_slugs]),
        ('F1 score', [(txt_results.f1score_statistics(e), e) for e in extractor_slugs]),
    )

    bar_color = ('b', 'c', 'm')
    for i, pdata in enumerate(packaged_data):

        # package plotting values
        num_of_extractors = len(extractor_slugs)
        ind = np.arange(num_of_extractors)  # the x locations for the groups
        width = 0.6  # the width of the bars

        result_list = pdata[1]
        result_list.sort(key = lambda i: i[0][0])
        result_list.reverse()

        avg = [x[0][0] for x in result_list]
        stddev = [x[0][1] for x in result_list]

        # plot
        plt.subplot(3, 1, i + 1)
        plt.grid(True, alpha = 0.5)

        rects_avg = plt.bar(ind, avg, width, color = bar_color[i], ecolor = 'g',
            yerr = stddev, linewidth = 0.5, alpha = 0.8)

        # labels and titles
        extractor_names = [get_extractor_cls(r[1]).NAME for r in result_list]
        plt.title(pdata[0])
        plt.xticks(ind + width/2., extractor_names, size = 'xx-small', rotation = 'vertical')
        plt.legend( (rects_avg[0],),
            ('avg',),
            fancybox = True,
            prop = dict(size='x-small'),
            loc = 4 # lower right
        )
        for rect in rects_avg:
            height = rect.get_height()
            plt.text(rect.get_x() + rect.get_width()/2.25, rect.get_height() + 0.01,
                '%1.2f' % height, ha = 'center', va = 'bottom', size = 'x-small')

    # subplots adjusting
    plt.subplots_adjust(wspace = 0.5, hspace = 0.9)

    # adjust figure height
    fig = plt.gcf()
    w, h = fig.get_size_inches()
    fig.set_size_inches(w, h * 1.6)

    # output
    out_path = os.path.join(settings.PATH_LOCAL_DATA, 'plot-output', img_name)
    plt.savefig(out_path)
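# A quick sanity check for the bucketing helper defined just below
# (hypothetical values; note that the last bucket is closed on both ends):
#
#   >>> equidistant_count(0, 1, 0.25, [0.1, 0.3, 0.99, 1.0])
#   (1, 1, 0, 2)
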
def equidistant_count(start, stop, step, values):
    '''Return a tuple of equidistant distribution buckets (bin counts).'''
    limit_list = np.arange(start, stop, step)
    count = [0] * len(limit_list)

    for value in values:
        value = float(value)
        assert start <= value <= stop
        for i, low in enumerate(limit_list):
            up = low + step
            if i < (len(limit_list) - 1) and low <= value < up:
                count[i] += 1
                break
            elif i == (len(limit_list) - 1) and low <= value <= up:
                count[i] += 1
                break
    return tuple(count)

def resize_axis_tick_labels(axis, size = 'xx-small'):
    for label in axis.get_ticklabels():
        label.set_size(size)

def extractor_stat_plot(dataset_name, img_name):
    '''Plot the distributions of per-document precision, recall and F1 score'''
    fig = plt.figure()

    # get results and repackage the data
    txt_results = TextBasedResults()
    txt_results.load(dataset_name)
    txt_results.print_results()

    elist = extractor_list_filter(txt_results.text_eval_results.keys())
    for ex_index, extractor_cls in enumerate(elist):

        # repackage results
        extractor_results = txt_results.filtered_results(extractor_cls.SLUG)
        results_list_prec = [r.precision for r in extractor_results]
        results_list_rec = [r.recall for r in extractor_results]
        results_list_f1 = [r.f1_score for r in extractor_results]

        width = 0.05  # the width of the bars
        ind = np.arange(0, 1, width)
        n = len(ind)

        print extractor_cls.NAME
        eq_count_prec = equidistant_count(0, 1, width, results_list_prec)
        print len(results_list_prec)
        print sum(eq_count_prec)
        eq_count_rec = equidistant_count(0, 1, width, results_list_rec)
        print len(results_list_rec)
        print sum(eq_count_rec)
        eq_count_f1 = equidistant_count(0, 1, width, results_list_f1)
        print len(results_list_f1)
        print sum(eq_count_f1)

        # plotting
        ax = fig.add_subplot(6, 3, ex_index + 1, projection = '3d')

        ax.bar3d(ind, np.array([0]*n), np.array([0]*n),
            dx = width, dy = width*2, dz = eq_count_prec,
            color = 'b', linewidth = 0.3, alpha = 0.4)
        ax.bar3d(ind, np.array([1]*n), np.array([0]*n),
            dx = width, dy = width*2, dz = eq_count_rec,
            color = 'c', linewidth = 0.3, alpha = 0.5)
        ax.bar3d(ind, np.array([2]*n), np.array([0]*n),
            dx = width, dy = width*2, dz = eq_count_f1,
            color = 'm', linewidth = 0.3, alpha = 0.8)

        ax.set_title(extractor_cls.NAME, size = 'small')
        ax.set_zlabel('\nnum. of instances', size = 'x-small', linespacing = 1)
        ax.yaxis.set_ticks([])
        resize_axis_tick_labels(ax.xaxis)
        resize_axis_tick_labels(ax.zaxis)
        ax.grid(True, alpha = 0.7)

    # legends are not supported with 3d plotting, so we have to use
    # proxy artists instead
    blue = plt.Rectangle((0, 0), 1, 1, fc = 'b')  # proxies
    cyan = plt.Rectangle((0, 0), 1, 1, fc = 'c')
    mag = plt.Rectangle((0, 0), 1, 1, fc = 'm')
    fig.legend( (blue, cyan, mag),
        ('precision', 'recall', 'f1 score'),
        fancybox = True,
        prop = dict(size='x-small')
    )
    w, h = fig.get_size_inches()
    fig.set_size_inches(w * 1.5, h * 2.5)
    fig.subplots_adjust(wspace = 0.025, hspace = 0.15)

    # save plot
    out_path = os.path.join(settings.PATH_LOCAL_DATA, 'plot-output', img_name)
    fig.savefig(out_path, bbox_inches = 'tight')


def dataset_contents_print_latex(dataset_name):
    '''Print the error case analysis in LaTeX'''
    # get results
    txt_results = TextBasedResults()
    txt_results.load(dataset_name)

    # package data
    elist = extractor_list_filter(txt_results.text_eval_results.keys())
    for e in elist:
        print '\\texttt{%s} & %d & %d & %d & %d & %d & %d \\\\ \\hline' % \
            (
                e.NAME,
                txt_results.result_contents(e.SLUG).rel_empty,
                txt_results.result_contents(e.SLUG).rel_ret_empty,
                txt_results.result_contents(e.SLUG).ret_empty,
                txt_results.result_contents(e.SLUG).missmatch,
                txt_results.result_contents(e.SLUG).fail,
                txt_results.result_contents(e.SLUG).succ,
            )

def dataset_contents_plot(dataset_name, img_name):
    '''Plot the error case analysis.'''
    # get results
    txt_results = TextBasedResults()
    txt_results.load(dataset_name)
    txt_results.print_results()

    # package data
    elist = extractor_list_filter(txt_results.text_eval_results.keys())
    extractor_slugs = tuple([e.SLUG for e in elist])
    package = [
        ('|rel| = 0', '#9DFADE', [txt_results.result_contents(ex).rel_empty for ex in extractor_slugs]),
        ('|rel intersect ret| = 0', '#3C70A3', [txt_results.result_contents(ex).rel_ret_empty for ex in extractor_slugs]),
        ('|ret| = 0', '#5CCBED', [txt_results.result_contents(ex).ret_empty for ex in extractor_slugs]),
        ('mismatch', '#A76CF5', [txt_results.result_contents(ex).missmatch for ex in extractor_slugs]),
        ('failed', '#C43156', [txt_results.result_contents(ex).fail for ex in extractor_slugs]),
        ('successful', '#31C460', [txt_results.result_contents(ex).succ for ex in extractor_slugs]),
    ]
    num_of_extractors = len(extractor_slugs)
    ind = np.arange(num_of_extractors)  # the x locations for the groups
    width = 0.6

    fig = plt.gcf()
    fig.legend( [plt.Rectangle((0, 0), 1, 1, fc = p[1]) for p in package],
        [p[0] for p in package],
        fancybox = True,
        prop = dict(size='x-small'),
    )

    # with successful instances
    ax1 = plt.subplot(121)
    bottom_y = np.zeros(num_of_extractors)
    for pdata in package:
        ax1.bar(ind, pdata[2], width, bottom = bottom_y, color = pdata[1],
            ecolor = 'g', linewidth = 0.2, alpha = 0.95)
        bottom_y += pdata[2]

    # without successful instances
    ax2 = plt.subplot(122)
    bottom_y = np.zeros(num_of_extractors)
    del package[-1]
    for pdata in package:
        ax2.bar(ind, pdata[2], width, bottom = bottom_y, color = pdata[1],
            ecolor = 'g', linewidth = 0.2, alpha = 0.95)
        bottom_y += pdata[2]

    # xticks labels
    extractor_names = [get_extractor_cls(e).NAME for e in extractor_slugs]
    ax1.set_xticks(ind + width/2.)
    ax1.set_xticklabels(extractor_names, size = 'xx-small', rotation = 'vertical')
    ax2.set_xticks(ind + width/2.)
    ax2.set_xticklabels(extractor_names, size = 'xx-small', rotation = 'vertical')

    # grid settings
    fig.suptitle('Boundary cases')
    ax1.grid(True, alpha = 0.5)
    ax2.grid(True, alpha = 0.5)

    # adjustment
    w, h = fig.get_size_inches()
    fig.set_size_inches(w * 1.5, h * 1.5)
    fig.subplots_adjust(bottom = 0.2)

    # output
    out_path = os.path.join(settings.PATH_LOCAL_DATA, 'plot-output', img_name)
    fig.savefig(out_path, bbox_inches = 'tight')

def parse_args(args):
    parser = argparse.ArgumentParser(description = 'Plotting tool')
    parser.add_argument('action', choices = ('dataset_stat', 'extr_stat', 'contents', 'contents_latex', 'dataset_latex'))
    parser.add_argument('dataset_name', help = 'name of the dataset')
    parser.add_argument('-f', '--format', type = str, help = 'output format: png, pdf, ps, eps or svg')
    return parser.parse_args(args)

def main(args):
    pargs = parse_args(args)

    output_img_name = '%s-%s' % (pargs.dataset_name, pargs.action)
    if pargs.format:
        output_img_name = '%s.%s' % (output_img_name, pargs.format)
    else:
        output_img_name = '%s.%s' % (output_img_name, 'png')

    if pargs.action == 'dataset_stat':
        dataset_stat_plot(pargs.dataset_name, output_img_name)
    elif pargs.action == 'dataset_latex':
        dataset_stat_latex_print(pargs.dataset_name)
    elif pargs.action == 'extr_stat':
        extractor_stat_plot(pargs.dataset_name, output_img_name)
    elif pargs.action == 'contents':
        dataset_contents_plot(pargs.dataset_name, output_img_name)
    elif pargs.action == 'contents_latex':
        dataset_contents_print_latex(pargs.dataset_name)

    print '[DONE]'

if __name__ == '__main__':
    import sys
    main(sys.argv[1:])
--------------------------------------------------------------------------------
/src/settings.py-template:
--------------------------------------------------------------------------------
# path to the local root data directory
PATH_LOCAL_DATA = '/home/you/data/'

# path to the remote root data directory
PATH_REMOTE_DATA = 'http://example.com/data/'

# api keys (you'll have to obtain these yourself)
ALCHEMY_API_KEY = ''
DIFFBOT_KEY = ''
REPUSTATE_API_KEY = ''
EXTRACTIV_API_KEY = ''

# MSS api endpoint provided by Jeffrey Pasternack
# (I'm not allowed to distribute this url)
MSS_URL = (
    ('text', ''),
    ('offset', ''),
)

# Boilerpipe API url (https://github.com/tomazk/Java-Text-Extractor-API)
BOILERPIPE_API_ENDPOINT = 'http://yourdomain/boilerpipe/extract/'

# Goose API url (https://github.com/tomazk/Java-Text-Extractor-API)
GOOSE_API_ENDPOINT = 'http://yourdomain/goose/extract/'

# TTR API url (https://github.com/tomazk/Java-Text-Extractor-API)
TTR_API_ENDPOINT = 'http://yourdomain/ttr/extract/'

# Readability API (https://github.com/tomazk/Simple-Readability-API)
READABILITY_ENDPOINT = 'http://yourdomain/extract/'

# Trendiction API
TRENDICTION_ENDPOINT = ''

# thrift RPC endpoint provided by Zemanta Ltd
ZEMANTA_THRIFT = (
    ('host', ''),
    ('port', ),
)

# readability bookmarklet location e.g. http://localhost/readability.js
READABILITY_BOOKMARKLET = 'http://yourplace/readability.js'
--------------------------------------------------------------------------------
/src/tee:
--------------------------------------------------------------------------------
#!/usr/bin/python
'''
Common command line tool
'''

import sys

def print_help_exit(msg = ''):
    if msg:
        print msg
    print '''
usage: tee {plot|data|eval|ext|help} [-h] [<arg>,<arg>,...]
    plot    Plotting script
    data    Dataset management script
    eval    Evaluation script
    ext     Extraction management script
    help    Print out help text for the tee command

flags:
    -h      Print out help text for the selected script
'''
    sys.exit(-1)

def main():
    args = sys.argv[1:]
    if len(args) == 0:
        print_help_exit('Not enough arguments')
    if args[0] not in ('plot', 'data', 'eval', 'ext', 'help'):
        print_help_exit('Unknown command')

    if args[0] == 'help':
        print_help_exit()
    elif args[0] == 'plot':
        import plot_manage
        plot_manage.main(args[1:])
    elif args[0] == 'data':
        import dataset_manage
        dataset_manage.main(args[1:])
    elif args[0] == 'eval':
        import evaluate_manage
        evaluate_manage.main(args[1:])
    elif args[0] == 'ext':
        import extract_manage
        extract_manage.main(args[1:])

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/src/txtexeval/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomazk/Text-Extraction-Evaluation/06d6070d895f1bae604dfaf10fa6537700d59e34/src/txtexeval/__init__.py
--------------------------------------------------------------------------------
/src/txtexeval/data.py:
--------------------------------------------------------------------------------
import os
import urlparse
import codecs
import logging

import yaml

import settings
from .util import check_local_path, get_local_path
from .extractor import extractor_list, get_extractor_cls
from .extractor import ExtractorError, ContentExtractorError

logger = logging.getLogger(__name__)

class DataError(Exception):
    pass

def verify_local_dataset(init):
    def wrapper(self, dataset, *args, **kwargs):
        if not check_local_path(dataset):
            raise DataError('local dataset %s does not exist' % dataset)
        init(self, dataset, *args, **kwargs)
    return wrapper

class BaseDatasetLoader(object):
    '''
    If you want a loader with a different backend (e.g. a database), just
    extend this class and implement an __iter__ method that returns an
    iterator over document instances.
    '''

    def __iter__(self):
        raise NotImplementedError

class LocalDatasetLoader(BaseDatasetLoader):
    '''Dataset loader using the local filesystem'''

    @verify_local_dataset
    def __init__(self, dataset_name, load_failed = None, skip_existing = None):
        self.dataset = dataset_name
        self._skip_existing = skip_existing

        # load meta data
        meta_filepath = get_local_path(dataset_name, 'meta.yaml')
        with open(meta_filepath, 'r') as f:
            self.meta_yaml = yaml.load(f.read())
        self._len = len(self.meta_yaml)

        if load_failed:
            self._failed_list = ExtractionSummary(self.dataset) \
                .get_failed_ids(load_failed)
        else:
            self._failed_list = None

    def __iter__(self):
        '''LocalDocument generator'''
        for entry in self.meta_yaml:
            document = LocalDocument(self.dataset, **entry)

            # check if all conditions for yielding a document are met
            yield_ = True
            if self._skip_existing != None and \
                document.check_existing_clean(self._skip_existing):
                yield_ = False
            elif self._failed_list != None and \
                entry['id'] not in self._failed_list:
                yield_ = False

            if yield_:
                yield document
            else:
                logger.debug('skipping document %s', document.id)
                continue

    def __len__(self):
        return self._len


class BaseDocument(object):
    # the same extension pattern applies to document instances

    def get_raw_html(self):
        pass

    def get_url(self):
        pass

    def get_url_local(self):
        pass

    def get_clean(self):
        pass

class LocalDocument(BaseDocument):
    '''Evaluation data representation using the local filesystem'''

    def __init__(self, dataset, **kwargs):
        self.dataset = dataset

        # instance attributes
        self.id = kwargs.pop('id')
        self.raw_filename = kwargs.pop('raw')
        self.clean_filename = kwargs.pop('clean')
        self.url = kwargs.pop('url')
        self.raw_encoding = kwargs.pop('raw_encoding')
        self.clean_encoding = kwargs.pop('clean_encoding')

    def get_raw_html(self):
        file_path = get_local_path(self.dataset, 'raw', self.raw_filename)
        with codecs.open(file_path, 'r', encoding = self.raw_encoding, errors = 'ignore') as f:
            return f.read()

    def get_url(self):
        if self.url:
            return self.url
        else:
            tail = self.dataset + '/' + self.raw_filename
            return urlparse.urljoin(settings.PATH_REMOTE_DATA, tail)

    def get_url_local(self):
        # e.g. file:///home/you/data/datasets/cleaneval-final/raw/100.html
        return 'file://' + settings.PATH_LOCAL_DATA + '/datasets/' \
            + self.dataset + '/raw/' + self.raw_filename

    def get_clean(self):
        file_path = get_local_path(self.dataset, 'clean', self.clean_filename)
        with open(file_path, 'r') as f:
            return f.read()

    def check_existing_clean(self, extractor_slug):
        ex_cls = get_extractor_cls(extractor_slug)
        return check_local_path(self.dataset, 'result', extractor_slug,
            '%s.%s' % (self.id, ex_cls.FORMAT))
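
# A minimal sketch of a loader with a different backend, as suggested by the
# BaseDatasetLoader docstring above. Everything below is hypothetical (this
# project ships no sqlite storage); it assumes the meta data rows live in a
# database while the raw/clean files stay on disk:
#
#   import sqlite3
#
#   class SqliteDatasetLoader(BaseDatasetLoader):
#
#       def __init__(self, db_path, dataset_name):
#           self.dataset = dataset_name
#           self._conn = sqlite3.connect(db_path)
#
#       def __iter__(self):
#           cursor = self._conn.execute(
#               'SELECT id, raw, clean, url, raw_encoding, clean_encoding '
#               'FROM documents WHERE dataset = ?', (self.dataset,))
#           for row in cursor:
#               # LocalDocument expects exactly these keyword arguments
#               yield LocalDocument(self.dataset, id = row[0], raw = row[1],
#                   clean = row[2], url = row[3], raw_encoding = row[4],
#                   clean_encoding = row[5])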

class ExtractionSummary(object):

    @verify_local_dataset
    def __init__(self, dataset_name, extractor_slug = None):
        self._summary_path = get_local_path(dataset_name, 'result', 'summary.yaml')

        if os.path.exists(self._summary_path):
            with open(self._summary_path, 'r') as f:
                self._summary_structure = yaml.load(f.read())
        else:
            self._summary_structure = {}
            for e in extractor_list:
                self._summary_structure[e.SLUG] = []

        self.set_extractor(extractor_slug)

    def set_extractor(self, extractor_slug):
        if extractor_slug:
            self.extractor_slug = extractor_slug
            self._summary_structure[self.extractor_slug] = []
        else:
            self.extractor_slug = None

    def get_failed_ids(self, extractor_slug):
        if self.extractor_slug:
            raise DataError('extractor_slug set - the list of fails was reinitialized')
        return [f['id'] for f in self._summary_structure[extractor_slug]]

    def add_fail(self, id, reason = None):
        if self.extractor_slug == None:
            raise DataError('extractor not set')

        self._summary_structure[self.extractor_slug].append({
            'id': id,
            'reason': reason
        })

    def serialize(self):
        with open(self._summary_path, 'w') as out:
            out.write(yaml.dump(self._summary_structure, default_flow_style = False))

    def short_summary(self, extractor_slug = None):
        if extractor_slug:
            return 'extraction summary: %i failed' \
                % len(self._summary_structure[extractor_slug])
        elif self.extractor_slug:
            return 'extraction summary: %i failed' \
                % len(self._summary_structure[self.extractor_slug])
        else:
            raise DataError('extractor not set')

class BaseResultStorage(object):

    def __init__(self, dataset_name, extractor_class):
        self.dataset = dataset_name
        self.extractor_cls = extractor_class

    def push_result(self, document):
        pass

    def fetch_result(self, document):
        pass

class LocalResultStorage(BaseResultStorage):

    @verify_local_dataset
    def __init__(self, dataset_name, extractor_class):
        super(LocalResultStorage, self).__init__(dataset_name, extractor_class)

        # with the dataset name out of the way, we must now check the
        # existence of the result folder for the given extractor
        self._result_dir = get_local_path(self.dataset, 'result')

        self._extractor_result_dir = os.path.join(
            self._result_dir,
            self.extractor_cls.SLUG)

        if not os.path.exists(self._extractor_result_dir):
            os.mkdir(self._extractor_result_dir)

        # create an object to be serialized into a .yaml file
        # we need this to store a summary of the extraction process for the
        # whole dataset
        self._summary = ExtractionSummary(self.dataset, self.extractor_cls.SLUG)

    def push_result(self, document):
        extractor = self.extractor_cls(document)
        try:
            result = extractor.extract()
        except DataError as e:
            err_msg = 'Data related error: %r' % e
            logger.warning(err_msg)
            self._summary.add_fail(document.id, err_msg)
        except ContentExtractorError as e:
            err_msg = 'Content extractor related error: %r' % e
            logger.warning(err_msg)
            self._summary.add_fail(document.id, err_msg)
        except ExtractorError as e:
            err_msg = 'Extractor related error: %r' % e
            logger.warning(err_msg)
            self._summary.add_fail(document.id, err_msg)
        except NotImplementedError:
            logger.debug('extraction method is not implemented - do nothing')
        except Exception as e:
            err_msg = 'Unknown error: %r' % e
            logger.warning(err_msg)
            self._summary.add_fail(document.id, err_msg)
        else:
            logger.debug('extracted content from %s', document.id)
            output_file = '%s.%s' % (document.id, self.extractor_cls.FORMAT)
            with open(os.path.join(self._extractor_result_dir, output_file), 'w') as out:
                out.write(result)

    def fetch_result(self, document):
        result_file = '%s.%s' % (document.id, self.extractor_cls.FORMAT)
        result_file_path = os.path.join(self._extractor_result_dir, result_file)
        if not os.path.exists(result_file_path):
            raise DataError('result file %s does not exist' % result_file)
        with open(result_file_path, 'r') as f:
            return f.read()

    def dump_summary(self):
        logger.info(self._summary.short_summary())
        self._summary.serialize()
--------------------------------------------------------------------------------
/src/txtexeval/evaluation.py:
--------------------------------------------------------------------------------
import os
import re
import pickle
import string
import difflib
import math
import logging

from BeautifulSoup import BeautifulSoup

import settings

logger = logging.getLogger(__name__)

# module utils

re_CONTROL = re.compile("[\x00-\x1F]+")
re_WS = re.compile("\s+")
re_NONASCII = re.compile("[\x80-\xFF]+")

def _tokenize_text(dirty_text):
    '''Tokenize dirty text into a normalized list of words'''
    # replace punctuation with whitespace
    table = string.maketrans(string.punctuation, ' ' * len(string.punctuation))
    dirty_text = dirty_text.translate(table)
    # remove any control chars
    dirty_text = re_CONTROL.sub(' ', dirty_text)
    # remove any non ascii chars to mitigate the troubles of broken encodings
    dirty_text = re_NONASCII.sub('', dirty_text)
    # normalize to lowercase
    dirty_text = dirty_text.lower()
    # remove empty tokens
    return filter(lambda w: w != '', re_WS.split(dirty_text))

def _bow(word_tokens):
    '''Return a bag-of-words dictionary built from a list of word tokens'''
    bow = {}
    for i in word_tokens:
        if i not in bow:
            bow[i] = 1
        else:
            bow[i] += 1
    return bow

# results

class Result(object):

    def __init__(self, precision, recall, f1_score, id = None):
        # validate the result
        if math.isinf(precision) and not math.isinf(recall):
            assert recall == 0
            assert math.isnan(f1_score)
        elif not math.isinf(precision) and math.isinf(recall):
            assert precision == 0
            assert math.isnan(f1_score)
        elif math.isinf(precision) and math.isinf(recall):
            assert math.isnan(f1_score)
        elif precision == recall == 0:
            assert math.isinf(f1_score)
        elif not math.isinf(precision) and not math.isinf(recall):
            assert 0 < precision <= 1
            assert 0 < recall <= 1
            assert 0 < f1_score <= 1

        self.precision = precision
        self.recall = recall
        self.f1_score = f1_score
        self.id = id

    @property
    def retrieved_empty(self):
        return math.isinf(self.precision) and self.recall == 0

    @property
    def relevant_empty(self):
        return math.isinf(self.recall) and self.precision == 0

    @property
    def relevant_retrieved_empty(self):
        return math.isinf(self.precision) and math.isinf(self.recall)

    @property
    def missmatch(self):
        return self.precision == self.recall == 0

    @property
    def succ(self):
        return 0 < self.f1_score <= 1

class ResultContents(object):

    def __init__(self, succ, rel_empty, rel_ret_empty, ret_empty, missmatch, dataset_len):
        assert dataset_len >= succ + rel_empty + rel_ret_empty + ret_empty + missmatch

        self.succ = succ
        self.rel_empty = rel_empty
        self.rel_ret_empty = rel_ret_empty
        self.ret_empty = ret_empty
        self.missmatch = missmatch

        self.fail = dataset_len - (succ + rel_empty + rel_ret_empty + ret_empty + missmatch)

class TextBasedResults(object):

    __pickle_path = os.path.join(settings.PATH_LOCAL_DATA, 'results-cache')

    def __init__(self, extractor = None):
        self.text_eval_results = {}
        self.dataset_len = 0
        self._extractor = extractor

        # optional
        if extractor != None:
            self.text_eval_results[extractor] = []

    def save(self, dataset_name):
        '''Pickle the internal state'''
        pickle_path = os.path.join(self.__pickle_path, '%s.pickle' % dataset_name)
        logger.info('saving text based results to: %s', pickle_path)

        with open(pickle_path, 'wb') as f:
            pickle.dump(self.__dict__, f)

    def load(self, dataset_name):
        '''Unpickle the internal state'''
        pickle_path = os.path.join(self.__pickle_path, '%s.pickle' % dataset_name)
        logger.info('loading text based results from: %s', pickle_path)

        try:
            f = open(pickle_path, 'rb')
        except IOError as e:
            logger.warning('no pickle found: %s', repr(e))
        else:
            self.__dict__.update(pickle.load(f))
            f.close()

    def set_extractor(self, extractor):
        self._extractor = extractor
        self.text_eval_results[extractor] = []

    def add_result(self, result):
        if self._extractor == None:
            raise TypeError('extractor not set')
        self.text_eval_results[self._extractor].append(result)

    def filtered_results(self, extractor):
        result_filter = lambda r: r.succ
        return filter(result_filter, self.text_eval_results[extractor])

    def result_contents(self, extractor):
        results = self.text_eval_results[extractor]

        succ = len(self.filtered_results(extractor))
        rel_empty = len(filter(lambda r: r.relevant_empty, results))
        ret_empty = len(filter(lambda r: r.retrieved_empty, results))
        rel_ret_empty = len(filter(lambda r: r.relevant_retrieved_empty, results))
        missmatch = len(filter(lambda r: r.missmatch, results))

        return ResultContents(succ, rel_empty, rel_ret_empty, ret_empty,
            missmatch, self.dataset_len)
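
    # How the boundary cases are encoded (sketch): an extractor that returns
    # no text at all for a document reaches us, via TextOnlyEvaluator, as
    #
    #   r = Result(float('inf'), 0.0, float('nan'))
    #   r.retrieved_empty  # True -> counted in ResultContents.ret_empty
    #
    # while a document whose tokens share nothing with the gold standard
    # yields Result(0, 0, float('inf')), i.e. r.missmatch == True.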

    def _statistics(self, extractor, stat_typ):  # DRY helper
        results_list = [getattr(r, stat_typ) for r in self.filtered_results(extractor)]
        # average
        avg = sum(results_list) / float(len(results_list))
        # standard deviation
        stddev = sum([(r - avg)**2. for r in results_list]) / float(len(results_list))
        stddev = math.sqrt(stddev)
        return avg, stddev

    def precision_statistics(self, extractor):
        '''Return a tuple containing (avg, stddev)'''
        return self._statistics(extractor, 'precision')

    def recall_statistics(self, extractor):
        '''Return a tuple containing (avg, stddev)'''
        return self._statistics(extractor, 'recall')

    def f1score_statistics(self, extractor):
        '''Return a tuple containing (avg, stddev)'''
        return self._statistics(extractor, 'f1_score')

    def print_results(self):
        print 'results based on text based evaluation'
        for extractor in self.text_eval_results.iterkeys():
            print '----------------'
            print 'Ex. name: %s' % extractor
            print 'avg. precision: %f stddev: %f' \
                % self.precision_statistics(extractor)
            print 'avg. recall: %f stddev: %f' \
                % self.recall_statistics(extractor)
            print 'avg. F1 score: %f stddev: %f' \
                % self.f1score_statistics(extractor)

            rcontents = self.result_contents(extractor)
            print 'relevant empty: %d' % rcontents.rel_empty
            print 'retrieved empty: %d' % rcontents.ret_empty
            print 'rel intersect ret empty: %d' % rcontents.rel_ret_empty
            print 'success: %d' % rcontents.succ
            print 'missmatch: %d' % rcontents.missmatch
            print 'fail: %d' % rcontents.fail
            print 'dataset_len=%d' % self.dataset_len

# evaluators

class BaseEvaluator(object):
    '''Outline for evaluators'''

    def __init__(self, retrieved, relevant, id = None):
        self.retrieved = retrieved
        self.relevant = relevant
        self.id = id

    def get_eval_results(self):
        # return an instance of Result
        pass

class TextOnlyEvaluator(BaseEvaluator):

    def get_eval_results(self):

        s = difflib.SequenceMatcher()
        rel = self.relevant.get_word_seq()
        ret = self.retrieved.get_word_seq()

        s.set_seqs(rel, ret)
        matches = s.get_matching_blocks()[:-1]

        rel_intersect_ret = sum(i.size for i in matches) if len(matches) > 0 else 0

        precision = float(rel_intersect_ret) / float(len(ret)) \
            if len(ret) > 0 else float('inf')
        recall = float(rel_intersect_ret) / float(len(rel)) \
            if len(rel) > 0 else float('inf')

        # nan when precision or recall is inf
        f1_score = (2. * precision * recall) / (precision + recall) \
            if precision + recall > 0 else float('inf')

        return Result(precision, recall, f1_score, self.id)
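
# A worked example of the metric (sketch, values computed by hand):
# rel = ['the', 'quick', 'brown', 'fox'] and
# ret = ['the', 'brown', 'fox', 'jumps'] share the matching blocks
# ['the'] and ['brown', 'fox'], so the overlap is 3 tokens and
#
#   precision = 3/4.  recall = 3/4.  f1 = 2*(3/4.)*(3/4.)/(3/2.) = 0.75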
244 | # formats
245 |
246 | class BaseResultFormat(object):
247 |
248 | def get_word_seq(self): # sequence of words
249 | pass
250 |
251 | def get_bow(self): # bag of words
252 | pass
253 |
254 | class TextResultFormat(BaseResultFormat):
255 | '''Basic format for dirty text'''
256 |
257 | def __init__(self, dirty_text):
258 | self._text = dirty_text
259 |
260 | def get_word_seq(self):
261 | return _tokenize_text(self._text)
262 |
263 | def get_bow(self):
264 | return _bow(_tokenize_text(self._text))
265 |
266 | class CleanEvalFormat(BaseResultFormat):
267 | '''Format specific for cleaneval dataset'''
268 |
269 | re_URL = re.compile(r'^(\s*)URL:(.*)$', re.IGNORECASE | re.MULTILINE)
270 | re_TAG = re.compile(r'^(\s*)<(p|h|l)>', re.IGNORECASE | re.MULTILINE)
271 |
272 | @staticmethod
273 | def from_document(document):
274 | return CleanEvalFormat(document.get_clean())
275 |
276 | def __init__(self, cleaneval_string):
277 | # remove URL meta data
278 | self._text = self.re_URL.sub('', cleaneval_string)
279 | # remove tag guidelines
280 | self._text = self.re_TAG.sub('', self._text)
281 |
282 | def get_word_seq(self):
283 | return _tokenize_text(self._text)
284 |
285 | def get_bow(self):
286 | return _bow(_tokenize_text(self._text))
287 |
288 | class GoogleNewsFormat(BaseResultFormat):
289 | '''
290 | Format specific for google news dataset
291 |
292 | From README.txt distributed with google news dataset:
293 | The human-assessed documents contain annotations in the form of tags
294 | with specific CSS classes that indicate the type of content:
295 | x-nc-sel0 Not content
296 | x-nc-sel1 Headline
297 | x-nc-sel2 Full text
298 | x-nc-sel3 Supplemental
299 | x-nc-sel4 Related content
300 | x-nc-sel5 Comments
301 | '''
302 |
303 | re_CLASS = re.compile('x-nc-sel[12]')
304 |
305 | @staticmethod
306 | def from_document(document):
307 | return GoogleNewsFormat(document.get_clean(), document.clean_encoding)
308 |
309 | def __init__(self, gnews_string, encoding):
310 | soup = BeautifulSoup(gnews_string, fromEncoding = encoding)
311 |
312 | # The trouble with the google news dataset is that it sometimes nests
313 | # the annotated span tags, so we first have to find any
314 | # annotated children and remove them from the content_tags list.
315 | redundant_tags = [] 316 | content_tags = soup.findAll('span',attrs = {'class' : self.re_CLASS }) 317 | for ct in content_tags: 318 | red = ct.findAll('span',attrs = {'class' : self.re_CLASS }) 319 | redundant_tags.extend(red) 320 | self._content_tags = filter(lambda tag: tag not in redundant_tags, content_tags) 321 | # Next we find all the text and concatenate it into one single string 322 | content_strings = [] 323 | for ct in self._content_tags: 324 | content_strings.extend(ct.findAll(text=True)) 325 | self._content_string = ' '.join(map(lambda e: e.encode(encoding,'ignore'), content_strings)) 326 | 327 | def get_word_seq(self): 328 | return _tokenize_text(self._content_string) 329 | 330 | def get_bow(self): 331 | return _bow(_tokenize_text(self._content_string)) 332 | 333 | # formats in this mapping should have a from_document static method implemented 334 | dataset_format_map = ( 335 | ('cleaneval', CleanEvalFormat), 336 | ('gnews', GoogleNewsFormat), 337 | ) 338 | 339 | def from_document_factory(document, slug): 340 | ''' 341 | Factory function that returns an instance of a format class listed in the 342 | dataset format map. 343 | ''' 344 | map_ = dict(dataset_format_map) 345 | cls = map_[slug] 346 | return cls.from_document(document) -------------------------------------------------------------------------------- /src/txtexeval/extractor.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | import json 3 | import logging 4 | import time 5 | 6 | import readability 7 | import justext 8 | from selenium import webdriver 9 | from selenium.webdriver import FirefoxProfile 10 | from selenium.common.exceptions import NoSuchElementException 11 | 12 | import settings 13 | from .util import Request, html_to_text 14 | from .util.zemanta.client import ClientManager 15 | from .evaluation import TextResultFormat, CleanEvalFormat 16 | 17 | logging.getLogger('selenium').setLevel(logging.WARN) 18 | 19 | class ExtractorError(Exception): 20 | '''Extractor failed on the network layer''' 21 | pass 22 | 23 | class ContentExtractorError(ExtractorError): 24 | ''' 25 | Raised when the error is included in the content (e.g. json formatted 26 | response has a status field) fetched by the extractor 27 | ''' 28 | pass 29 | 30 | def return_content(extract): 31 | ''' 32 | DRY decorator that wraps the extract method. We check for response 33 | success and raise the appropriate error or return the content. 34 | ''' 35 | def wrapper(self): 36 | # fetch the response 37 | response = extract(self) 38 | # check for any network related errors 39 | if not response.success(): 40 | raise ExtractorError(response.err_msg) 41 | return response.content 42 | return wrapper 43 | 44 | def check_content_status(extract): 45 | ''' 46 | DRY decorator that mitigates the trouble of inserting boilerplate code 47 | inside the extract method for invoking the private method _content_status. 48 | WhateverExtractor._content_status is used to check for errors returned in 49 | the response content itself. 50 | ''' 51 | def wrapper(self): 52 | self._content = extract(self) 53 | self._content_status() 54 | return self._content 55 | return wrapper 56 | 57 | class BaseExtractor(object): 58 | '''Extractor base class 59 | 60 | Using a base class to ensure a common representation. 61 | If an extractor returns only e.g. 
text based results it
62 | should raise a NotImplementedError for the respective
63 | method'''
64 |
65 | NAME = '' # unique name
66 | SLUG = '' # unique slug name ([a-z_]+)
67 | FORMAT = '' # txt|html|json|xml
68 |
69 | def __init__(self, data_instance):
70 | self.data_instance = data_instance
71 |
72 | def extract(self):
73 | '''Returns unformatted extractor response'''
74 | pass
75 |
76 | @classmethod
77 | def formatted_result(cls, result_string):
78 | pass
79 |
80 |
81 | class _ContentCheckMin(object):
82 |
83 | def _content_status(self):
84 | js = json.loads(self._content)
85 | if js['status'] == "ERROR":
86 | raise ContentExtractorError(js['errorMsg'].encode('utf-8','ignore'))
87 |
88 | class _FormattedResultMin(object):
89 |
90 | @classmethod
91 | def formatted_result(cls, result_string):
92 | js = json.loads(result_string, encoding = 'utf8')
93 | return TextResultFormat(js['result'].encode('utf8','ignore'))
94 |
95 |
96 | class TTRDefaultExtractor(_ContentCheckMin,BaseExtractor):
97 | '''TTR default extractor'''
98 |
99 | NAME = 'TTR'
100 | SLUG = 'ttr_def'
101 | FORMAT = 'json'
102 |
103 | _extractor_type = 'default'
104 |
105 | @check_content_status
106 | @return_content
107 | def extract(self):
108 | html = self.data_instance.get_raw_html()
109 | req = Request(
110 | settings.TTR_API_ENDPOINT,
111 | data = {
112 | "extractorType":self._extractor_type,
113 | "rawHtml": html.encode(self.data_instance.raw_encoding,'ignore')
114 | },
115 | headers = {'Content-Type':'application/x-www-form-urlencoded'}
116 | )
117 | return req.post()
118 |
119 | @classmethod
120 | def formatted_result(cls, result_string):
121 | js = json.loads(result_string, encoding = 'utf8')
122 | result_html = js['result'].encode('utf8','ignore')
123 | return TextResultFormat(html_to_text(result_html,'utf8'))
124 |
125 |
126 |
127 | class BoilerpipeDefaultExtractor(_FormattedResultMin,_ContentCheckMin,BaseExtractor):
128 | '''Boilerpipe default extractor'''
129 |
130 | NAME = 'Boilerpipe DEF'
131 | SLUG = 'boilerpipe_def'
132 | FORMAT = 'json'
133 |
134 | _extractor_type = 'default'
135 |
136 | @check_content_status
137 | @return_content
138 | def extract(self):
139 | html = self.data_instance.get_raw_html()
140 | req = Request(
141 | settings.BOILERPIPE_API_ENDPOINT,
142 | data = {
143 | "extractorType":self._extractor_type,
144 | "rawHtml": html.encode(self.data_instance.raw_encoding,'ignore')
145 | },
146 | headers = {'Content-Type':'application/x-www-form-urlencoded'}
147 | )
148 | return req.post()
149 |
150 |
151 | class BoilerpipeArticleExtractor(BoilerpipeDefaultExtractor):
152 | '''Boilerpipe article extractor'''
153 |
154 | NAME = 'Boilerpipe ART'
155 | SLUG = 'boilerpipe_art'
156 | FORMAT = 'json'
157 |
158 | _extractor_type = 'article'
159 |
160 | class BoilerpipeArticleSentencesExtractor(BoilerpipeDefaultExtractor):
161 | '''Boilerpipe extractor tuned for extracting article sentences'''
162 |
163 | NAME = 'Boilerpipe SENT'
164 | SLUG = 'boilerpipe_sent'
165 | FORMAT = 'json'
166 |
167 | _extractor_type = 'sentence'
168 |
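# Shared JSON contract of the remote boilerpipe-style services above, as
# inferred from the code in this module (not from any service docs), so
# treat the exact field values as an assumption:
#
#   POST extractorType=<default|article|sentence>&rawHtml=<encoded html>
#   success -> {"status": "...", "result": "<extracted text or html>"}
#   failure -> {"status": "ERROR", "errorMsg": "<reason>"}
#
# _ContentCheckMin treats only status == "ERROR" as fatal, and
# _FormattedResultMin decodes the 'result' field as utf8.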
169 | class GooseExtractor(_FormattedResultMin,_ContentCheckMin,BaseExtractor):
170 | '''Goose project extractor'''
171 |
172 | NAME = 'Goose'
173 | SLUG = 'goose'
174 | FORMAT = 'json'
175 |
176 | @return_content
177 | def extract(self):
178 | html = self.data_instance.get_raw_html()
179 | req = Request(
180 | settings.GOOSE_API_ENDPOINT,
181 | data = dict(rawHtml = html.encode(self.data_instance.raw_encoding,'ignore')),
182 | headers = {'Content-Type':'application/x-www-form-urlencoded'}
183 | )
184 | return req.post()
185 |
186 | class MSSExtractor(BaseExtractor):
187 | '''MSS implementation by Jeffrey Pasternack'''
188 |
189 | NAME = 'MSS'
190 | SLUG = 'mss'
191 | FORMAT = 'html'
192 |
193 | @return_content
194 | def extract(self):
195 | html = self.data_instance.get_raw_html()
196 | req = Request(
197 | dict(settings.MSS_URL)['text'],
198 | # this implementation requires utf-8 encoded input
199 | data = html.encode('utf-8','ignore'),
200 | headers= {'Content-Type': 'text/plain;charset=UTF-8'}
201 | )
202 | return req.post()
203 |
204 | @classmethod
205 | def formatted_result(cls, result_string):
206 | return TextResultFormat(html_to_text(result_string, 'utf8'))
207 |
208 |
209 | class PythonReadabilityExtractor(BaseExtractor):
210 | '''Extractor based on python-readability
211 | (https://github.com/gfxmonk/python-readability)'''
212 |
213 | NAME = 'Python Readability'
214 | SLUG = 'python_read'
215 | FORMAT = 'html'
216 |
217 | def extract(self):
218 | html = self.data_instance.get_raw_html()
219 | doc = readability.Document(html)
220 | # FIXME
221 | return doc.summary().encode('ascii','ignore')
222 |
223 | @classmethod
224 | def formatted_result(cls, result_string):
225 | return TextResultFormat(html_to_text(result_string, 'utf8'))
226 |
227 | class NodeReadabilityExtractor(_FormattedResultMin,BaseExtractor):
228 | '''Extractor based on node-readability'''
229 |
230 | NAME = 'Node Readability'
231 | SLUG = 'node_read'
232 | FORMAT = 'json'
233 |
234 | @check_content_status
235 | @return_content
236 | def extract(self):
237 | html = self.data_instance.get_raw_html()
238 |
239 | req = Request(
240 | settings.READABILITY_ENDPOINT,
241 | # this implementation requires utf-8 encoded input
242 | data = html.encode('utf-8','ignore'),
243 | headers= {'Content-Type': 'text/plain;charset=UTF-8'}
244 | )
245 | return req.post()
246 |
247 | def _content_status(self):
248 | js = json.loads(self._content, encoding = 'utf8')
249 | if js['status'] == 'ERROR':
250 | raise ContentExtractorError('failed')
251 |
252 | class SeleniumReadabilityExtractor(BaseExtractor):
253 | '''
254 | Using the selenium webdriver API to harvest the results of the original
255 | readability bookmarklet
256 | '''
257 |
258 | NAME = 'Readability'
259 | SLUG = 'orig_read'
260 | FORMAT = 'txt'
261 |
262 | _driver = None # lazy webdriver.Firefox()
263 | # TODO: share the modified code
264 | _bookmarklet_source = "(function(){readConvertLinksToFootnotes=false;readStyle='style-newspaper';readSize='size-medium';readMargin='margin-wide';_bookm=document.createElement('script');_bookm.type='text/javascript';_bookm.src='" + \
265 | settings.READABILITY_BOOKMARKLET + "?x='+Math.random();document.getElementsByTagName('head')[0].appendChild(_bookm);})();"
266 |
267 | def _check_content_presence(self):
268 | cls = self.__class__
269 | try:
270 | # this was a modification to the readability.js script:
271 | # if it failed to extract any meaningful content
272 | # we renamed the id of the content block to
273 | # explicitly indicate this special case
274 | cls._driver.find_element_by_id('readability-content-failed')
275 | except NoSuchElementException:
276 | pass
277 | else:
278 | raise ContentExtractorError('readability failed to extract any content')
279 |
280 | def extract(self):
281 | # lazy init
282 | cls = self.__class__
283 | if cls._driver is None:
284 | # init firefox web driver
285 | cls._driver = webdriver.Firefox()
286 |
287 | url = self.data_instance.get_url_local()
288 |
cls._driver.get(url) 289 | time.sleep(2) 290 | cls._driver.execute_script(self._bookmarklet_source) 291 | 292 | try: 293 | # find the node that contains content 294 | # and check if readability managed to extract anything meaningful 295 | element = cls._driver.find_element_by_id('readInner') 296 | self._check_content_presence() 297 | except NoSuchElementException: 298 | raise ContentExtractorError('readability failed to produce the #readInner DOM node') 299 | else: 300 | return element.text.encode(self.data_instance.raw_encoding, 'ignore') 301 | 302 | @classmethod 303 | def formatted_result(cls, result_string): 304 | return TextResultFormat(result_string) 305 | 306 | class AlchemyExtractor(BaseExtractor): 307 | '''Alchemy API extractor''' 308 | 309 | NAME = 'Alchemy API' 310 | SLUG = 'alchemy' 311 | FORMAT = 'json' 312 | 313 | @check_content_status 314 | @return_content 315 | def extract(self): 316 | html = self.data_instance.get_raw_html() 317 | req = Request( 318 | 'http://access.alchemyapi.com/calls/html/HTMLGetText', 319 | data = {'apikey':settings.ALCHEMY_API_KEY, 320 | 'html': html.encode(self.data_instance.raw_encoding,'ignore'), 321 | 'outputMode':'json' 322 | } 323 | 324 | ) 325 | return req.post() 326 | 327 | def _content_status(self): 328 | js = json.loads(self._content, encoding = 'utf8') 329 | if js['status'] == 'ERROR': 330 | raise ContentExtractorError(js['statusInfo'].encode('utf8','ignore')) 331 | 332 | @classmethod 333 | def formatted_result(cls, result_string): 334 | js = json.loads(result_string, encoding = 'utf8') 335 | return TextResultFormat(js['text'].encode('utf8','ignore')) 336 | 337 | class DiffbotExtractor(BaseExtractor): 338 | '''Diffbot extractor''' 339 | 340 | NAME = 'Diffbot' 341 | SLUG = 'diffbot' 342 | FORMAT = 'json' 343 | 344 | @return_content 345 | def extract(self): 346 | data = urllib.urlencode(dict( 347 | token = settings.DIFFBOT_API_KEY, 348 | url = self.data_instance.get_url(), 349 | format = 'json' 350 | )) 351 | data += '&stats' # use '&html' for html formatted result 352 | req = Request( 353 | 'http://www.diffbot.com/api/article', 354 | data = data 355 | ) 356 | return req.get() 357 | 358 | @classmethod 359 | def formatted_result(cls, result_string): 360 | js = json.loads(result_string, encoding = 'utf8') 361 | return TextResultFormat( 362 | js.get('title','').encode('utf8','ignore') + ' ' +\ 363 | js['text'].encode('utf8','ignore') 364 | ) 365 | 366 | class ExtractivExtractor(BaseExtractor): 367 | '''Extractiv extractor''' 368 | 369 | NAME = 'Extractiv' 370 | SLUG = 'extractiv' 371 | FORMAT = 'json' 372 | 373 | @return_content 374 | def extract(self): 375 | html = self.data_instance.get_raw_html() 376 | req = Request( 377 | 'http://rest.extractiv.com/extractiv/', 378 | data = {'api_key':settings.EXTRACTIV_API_KEY, 379 | 'content': html.encode(self.data_instance.raw_encoding,'ignore'), 380 | 'output_format':'json' 381 | } 382 | 383 | ) 384 | return req.post() 385 | 386 | @classmethod 387 | def formatted_result(cls, result_string): 388 | js = json.loads(result_string, encoding = 'utf8') 389 | 390 | text = js['Document']['text'] 391 | content_sentences = [] 392 | for se in js['sentences']: 393 | zone = se.get('zone','regular') 394 | if zone == 'regular': 395 | content_sentences.append(text[se['offset']:se['offset']+se['len']] ) 396 | 397 | return TextResultFormat( 398 | js['Document'].get('title','').encode('utf8','ignore') + ' ' +\ 399 | (' '.join(content_sentences)).encode('utf8','ignore') 400 | ) 401 | 402 | class 
RepustateExtractor(BaseExtractor): 403 | '''Repustate extractor''' 404 | 405 | NAME = 'Repustate' 406 | SLUG = 'repustate' 407 | FORMAT = 'json' 408 | 409 | @check_content_status 410 | @return_content 411 | def extract(self): 412 | req = Request( 413 | 'http://api.repustate.com/v1/%s/clean-html.json' \ 414 | % settings.REPUSTATE_API_KEY, 415 | data = 'url=%s' % self.data_instance.get_url() 416 | ) 417 | return req.get() 418 | 419 | def _content_status(self): 420 | js = json.loads(self._content, encoding = 'utf8') 421 | if js['status'] != 'OK': 422 | raise ContentExtractorError(js['status'].encode('utf8','ignore')) 423 | 424 | @classmethod 425 | def formatted_result(cls, result_string): 426 | js = json.loads(result_string, encoding = 'utf8') 427 | return TextResultFormat(js['text'].encode('utf8','ignore')) 428 | 429 | class ZemantaExtractor(BaseExtractor): 430 | '''Extractor used internally by Zemanta Ltd''' 431 | 432 | NAME = 'Zextractor' 433 | SLUG = 'zemanta' 434 | FORMAT = 'txt' 435 | 436 | def extract(self): 437 | html = self.data_instance.get_raw_html() 438 | html = html.encode(self.data_instance.raw_encoding,'ignore') 439 | cm = ClientManager() 440 | 441 | response = cm.extract(html, self.data_instance.raw_encoding) 442 | if response.error: 443 | raise ExtractorError(response.error) 444 | return response.text 445 | 446 | @classmethod 447 | def formatted_result(cls, result_string): 448 | return TextResultFormat(result_string) 449 | 450 | class NCleanerStdEnExtractor(BaseExtractor): 451 | '''NCleaner extractor using the standard english n-gram model''' 452 | 453 | NAME = 'NCleaner En' 454 | SLUG = 'ncleaner_en' 455 | FORMAT = 'txt' 456 | 457 | def extract(self): 458 | ''' 459 | This method is not implemented (for now), because ncleaner 460 | comes with a handy command line tool that trivially executes 461 | the extraction task for us. 
462 | '''
463 | raise NotImplementedError
464 |
465 | @classmethod
466 | def formatted_result(cls, result_string):
467 | # ncleaner uses the cleaneval style format for its output
468 | return CleanEvalFormat(result_string)
469 |
470 | class NCleanerNonLexExtractor(NCleanerStdEnExtractor):
471 | '''NCleaner extractor using the non lexical n-gram model'''
472 |
473 | NAME = 'NCleaner NonLex'
474 | SLUG = 'ncleaner_nonlex'
475 | FORMAT = 'txt'
476 |
477 | class TrendictionExtractor(BaseExtractor):
478 | '''Trendiction API'''
479 |
480 | NAME = 'Trendiction'
481 | SLUG = 'trendiction'
482 | FORMAT = 'json'
483 |
484 | @check_content_status
485 | @return_content
486 | def extract(self):
487 | req = Request(
488 | settings.TRENDICTION_ENDPOINT,
489 | data = {
490 | 'ckey':'',
491 | 'url':self.data_instance.get_url(),
492 | 'onlycontent':'false',
493 | 'outf':'json',
494 | }
495 | )
496 | return req.get()
497 |
498 | def _content_status(self):
499 | js = json.loads(self._content, encoding = 'utf8')
500 | try:
501 | js['result_content']['data'][0]['content']['content_text']
502 | js['result_content']['data'][0]['content']['title_text']
503 | except (IndexError, KeyError) as e:
504 | raise ContentExtractorError('content not present in the response: ' + repr(e))
505 |
506 | @classmethod
507 | def formatted_result(cls, result_string):
508 | js = json.loads(result_string, encoding = 'utf8')
509 | content = js['result_content']['data'][0]['content']['content_text']
510 | title = js['result_content']['data'][0]['content']['title_text']
511 | return TextResultFormat((title +' '+ content).encode('utf8','ignore'))
512 |
513 | class JustextExtractor(BaseExtractor):
514 | '''Justext extractor'''
515 |
516 | NAME = 'JusText'
517 | SLUG = 'justext'
518 | FORMAT = 'txt'
519 |
520 | def extract(self):
521 | html = self.data_instance.get_raw_html()
522 | html = html.encode(self.data_instance.raw_encoding,'ignore')
523 | paragraphs = justext.justext(html, justext.get_stoplist('English'),
524 | encoding = self.data_instance.raw_encoding)
525 | good_paragraphs = []
526 | for para in paragraphs:
527 | if para['class'] == 'good':
528 | paragraph_text = para['text']
529 | # this assertion makes sure we only deal with str and unicode
530 | assert isinstance(paragraph_text, basestring)
531 | if isinstance(paragraph_text, unicode):
532 | good_paragraphs.append(paragraph_text.encode('utf8', 'ignore'))
533 | else:
534 | good_paragraphs.append(paragraph_text)
535 |
536 | return '\n\n'.join(good_paragraphs)
537 |
538 | @classmethod
539 | def formatted_result(cls, result_string):
540 | return TextResultFormat(result_string)
541 |
542 | # list of all extractor classes
543 | extractor_list = (
544 | BoilerpipeDefaultExtractor,
545 | BoilerpipeArticleExtractor,
546 | BoilerpipeArticleSentencesExtractor,
547 | GooseExtractor,
548 | MSSExtractor,
549 | PythonReadabilityExtractor,
550 | NodeReadabilityExtractor,
551 | SeleniumReadabilityExtractor,
552 | AlchemyExtractor,
553 | DiffbotExtractor,
554 | ExtractivExtractor,
555 | RepustateExtractor,
556 | ZemantaExtractor,
557 | NCleanerStdEnExtractor,
558 | NCleanerNonLexExtractor,
559 | #TrendictionExtractor,
560 | JustextExtractor,
561 | TTRDefaultExtractor,
562 | )
563 |
564 | def get_extractor_cls(extractor_slug):
565 | '''Return the extractor class given a slug'''
566 | for e in extractor_list:
567 | if e.SLUG == extractor_slug:
568 | return e
569 |
--------------------------------------------------------------------------------
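A minimal end-to-end sketch of how the extractor classes above are meant to be driven. The stub data instance is illustrative (the real ones presumably live in src/txtexeval/data.py), and the call assumes settings.BOILERPIPE_API_ENDPOINT points at a running service:

    from txtexeval.extractor import get_extractor_cls, ExtractorError

    class StubDataInstance(object):
        # minimal stand-in for a real data instance
        raw_encoding = 'utf8'
        def get_raw_html(self):
            return u'<html><body><p>Hello extraction world.</p></body></html>'

    cls = get_extractor_cls('boilerpipe_def')   # look up an extractor by slug
    extractor = cls(StubDataInstance())
    try:
        raw = extractor.extract()               # network call to the service
    except ExtractorError as e:
        print 'extraction failed: %s' % e
    else:
        fmt = cls.formatted_result(raw)         # json -> TextResultFormat
        print fmt.get_word_seq()                # token sequence for evaluation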
/src/txtexeval/util/__init__.py:
--------------------------------------------------------------------------------
1 | from .common import Request
2 | from .common import get_local_path
3 | from .common import check_local_path
4 | from .common import html_to_text
--------------------------------------------------------------------------------
/src/txtexeval/util/common.py:
--------------------------------------------------------------------------------
1 | import os
2 | import urllib
3 | import urllib2
4 |
5 | from BeautifulSoup import BeautifulSoup
6 |
7 | import settings
8 |
9 | # urllib wrappers
10 |
11 | class _Response(object):
12 |
13 | def __init__(self, status_code = None, headers = None,
14 | content = None, err_msg = None):
15 | self.status_code = status_code
16 | self.headers = headers
17 | self.content = content
18 | self._err_msg = err_msg
19 |
20 | def success(self):
21 | if self._err_msg:
22 | return False
23 | elif self.status_code and str(self.status_code).startswith('2'): # see RFC 2616
24 | return True
25 | else:
26 | return False
27 |
28 | @property
29 | def err_msg(self):
30 | if self._err_msg:
31 | return self._err_msg
32 | elif self.status_code and not str(self.status_code).startswith('2'):
33 | return 'Status code: %i' % self.status_code
34 | else:
35 | return ''
36 |
37 |
38 | class Request(object):
39 |
40 | def __init__(self, url, data, **kwargs):
41 | self.url = url
42 | self.kwargs = kwargs
43 | if isinstance(data, dict):
44 | self.data = urllib.urlencode(data)
45 | else:
46 | self.data = data
47 |
48 | def post(self):
49 | request = urllib2.Request(self.url, self.data, **self.kwargs)
50 | try:
51 | r = urllib2.urlopen(request)
52 | except urllib2.URLError as e:
53 | return _Response(err_msg = str(e))
54 | else:
55 | return _Response(r.code, r.headers, r.read())
56 |
57 | def get(self):
58 | request = urllib2.Request('%s?%s' % (self.url, self.data), **self.kwargs)
59 | try:
60 | r = urllib2.urlopen(request)
61 | except urllib2.URLError as e:
62 | return _Response(err_msg = str(e))
63 | else:
64 | return _Response(r.code, r.headers, r.read())
65 |
66 | # dataset helpers
67 |
68 | def check_local_path(*args):
69 | return os.path.exists(
70 | os.path.join(settings.PATH_LOCAL_DATA, 'datasets', *args)
71 | )
72 |
73 | def get_local_path(*args):
74 | return os.path.join(settings.PATH_LOCAL_DATA, 'datasets', *args)
75 |
76 | # others
77 |
78 | def execute_only_once(method):
79 | '''A decorator that runs a method only once.'''
80 | attrname = "_%s_once_result" % id(method)
81 | def wrap(self, *args, **kwargs):
82 | try:
83 | return getattr(self, attrname)
84 | except AttributeError:
85 | setattr(self, attrname, method(self, *args, **kwargs))
86 | return getattr(self, attrname)
87 | return wrap
88 |
89 | def html_to_text(html, encoding):
90 | '''Get all the text from a given html string'''
91 | soup = BeautifulSoup(html, fromEncoding = encoding)
92 | tags = soup.findAll(text = True)
93 | useful = lambda e: e.parent.name not in ('style', 'script', 'head', 'title')
94 | tags = filter(useful, tags)
95 | return ' '.join(map(lambda e: e.encode(encoding), tags))
--------------------------------------------------------------------------------
/src/txtexeval/util/zemanta/__init__.py:
--------------------------------------------------------------------------------
1 | # this package was generated automatically by the thrift 0.5.0 python compiler
2 | # see: ../thrift/generate_thrift.sh
--------------------------------------------------------------------------------
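The client module below keeps its Thrift connection in a Borg-style class: instances are distinct objects that share a single __dict__. A tiny self-contained illustration of the pattern (illustrative code, not part of the project):

    class Borg(object):
        _shared_state = {}  # one dict shared by every instance

        def __init__(self):
            self.__dict__ = self._shared_state

    a = Borg()
    b = Borg()
    a.x = 42
    assert b.x == 42   # state is shared ...
    assert a is not b  # ... but the instances are still distinct objects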
/src/txtexeval/util/zemanta/client.py:
--------------------------------------------------------------------------------
1 | from collections import namedtuple
2 |
3 | from thrift import Thrift
4 | from thrift.transport import TSocket, TTransport
5 | from thrift.protocol import TBinaryProtocol
6 |
7 | # this is the code thrift generates for us
8 | # gen-py directory was renamed to thriftgen
9 | from .thriftgen.ceservice import ExtractorService
10 | from .thriftgen.ceservice import ttypes
11 |
12 | import settings
13 |
14 | credentials = dict(settings.ZEMANTA_THRIFT)
15 |
16 | Response = namedtuple('Response', 'text error')
17 |
18 | class ClientManager(object):
19 |
20 | __internal_state = {} # Borg pattern: every instance shares this state
21 |
22 | def __init__(self):
23 | self.__dict__ = self.__internal_state
24 | self.set_client()
25 |
26 | def set_client(self):
27 | self._transport = TTransport.TBufferedTransport(
28 | TSocket.TSocket(credentials['host'], credentials['port'])
29 | )
30 | self._protocol = TBinaryProtocol.TBinaryProtocol(self._transport)
31 | self._client = ExtractorService.Client(self._protocol)
32 | self._transport.open()
33 |
34 | def extract(self, encoded_htmldata, encoding):
35 | error = None
36 | text = ''
37 | try:
38 | response = self._client.extract('', '', encoded_htmldata, encoding)
39 | except ttypes.TAppException as e:
40 | error = '%r' % e
41 | except Thrift.TException as e:
42 | error = '%r' % e
43 | except Exception as e:
44 | error = '%r' % e
45 | else:
46 | if response.success:
47 | text = response.body.encode('utf8')
48 | else:
49 | error = 'ExtractorService.extract returned a response but the success flag was set to False'
50 | finally:
51 | self._transport.close()
52 | return Response(text, error)
--------------------------------------------------------------------------------
/src/txtexeval/util/zemanta/thrift/ceservice.thrift:
--------------------------------------------------------------------------------
1 |
2 | // Command line to produce the gen-py directory:
3 | // ~/prefix/bin/thrift -r --gen py:utf8strings ceservice.thrift
4 |
5 | enum ExceptionCode {
6 | FORCED_FAILED = 1,
7 | PARSING_FAILED = 2,
8 | FLATTENING_FAILED = 3,
9 | CLASSIFICATION_FAILED = 4,
10 | }
11 |
12 |
13 | exception TAppException {
14 | 1: ExceptionCode code,
15 | 2: string msg,
16 | 3: string backtrace,
17 | }
18 |
19 | struct extract_RET
20 | {
21 | 1:bool success,
22 | 2:string body,
23 | }
24 |
25 | service ExtractorService
26 | {
27 | string ping(1: string param)
28 | throws (1:TAppException e),
29 |
30 |
31 | extract_RET extract(
32 | 1:string url,
33 | 2:string title,
34 | 3:binary htmldata,
35 | 4:string encoding,
36 | )
37 | throws (1:TAppException e),
38 | }
39 |
--------------------------------------------------------------------------------
/src/txtexeval/util/zemanta/thrift/generate_thrift.sh:
--------------------------------------------------------------------------------
1 | echo "generating python source ..."
2 | thrift -r -v -o ..
--gen py:utf8strings ceservice.thrift 3 | echo "renaming gen-py into thriftgen" 4 | mv ../gen-py ../thriftgen 5 | echo "done" -------------------------------------------------------------------------------- /src/txtexeval/util/zemanta/thriftgen/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomazk/Text-Extraction-Evaluation/06d6070d895f1bae604dfaf10fa6537700d59e34/src/txtexeval/util/zemanta/thriftgen/__init__.py -------------------------------------------------------------------------------- /src/txtexeval/util/zemanta/thriftgen/ceservice/ExtractorService-remote: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Autogenerated by Thrift 4 | # 5 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING 6 | # 7 | 8 | import sys 9 | import pprint 10 | from urlparse import urlparse 11 | from thrift.transport import TTransport 12 | from thrift.transport import TSocket 13 | from thrift.transport import THttpClient 14 | from thrift.protocol import TBinaryProtocol 15 | 16 | import ExtractorService 17 | from ttypes import * 18 | 19 | if len(sys.argv) <= 1 or sys.argv[1] == '--help': 20 | print '' 21 | print 'Usage: ' + sys.argv[0] + ' [-h host:port] [-u url] [-f[ramed]] function [arg1 [arg2...]]' 22 | print '' 23 | print 'Functions:' 24 | print ' string ping(string param)' 25 | print ' extract_RET extract(string url, string title, string htmldata, string encoding)' 26 | print '' 27 | sys.exit(0) 28 | 29 | pp = pprint.PrettyPrinter(indent = 2) 30 | host = 'localhost' 31 | port = 9090 32 | uri = '' 33 | framed = False 34 | http = False 35 | argi = 1 36 | 37 | if sys.argv[argi] == '-h': 38 | parts = sys.argv[argi+1].split(':') 39 | host = parts[0] 40 | port = int(parts[1]) 41 | argi += 2 42 | 43 | if sys.argv[argi] == '-u': 44 | url = urlparse(sys.argv[argi+1]) 45 | parts = url[1].split(':') 46 | host = parts[0] 47 | if len(parts) > 1: 48 | port = int(parts[1]) 49 | else: 50 | port = 80 51 | uri = url[2] 52 | if url[4]: 53 | uri += '?%s' % url[4] 54 | http = True 55 | argi += 2 56 | 57 | if sys.argv[argi] == '-f' or sys.argv[argi] == '-framed': 58 | framed = True 59 | argi += 1 60 | 61 | cmd = sys.argv[argi] 62 | args = sys.argv[argi+1:] 63 | 64 | if http: 65 | transport = THttpClient.THttpClient(host, port, uri) 66 | else: 67 | socket = TSocket.TSocket(host, port) 68 | if framed: 69 | transport = TTransport.TFramedTransport(socket) 70 | else: 71 | transport = TTransport.TBufferedTransport(socket) 72 | protocol = TBinaryProtocol.TBinaryProtocol(transport) 73 | client = ExtractorService.Client(protocol) 74 | transport.open() 75 | 76 | if cmd == 'ping': 77 | if len(args) != 1: 78 | print 'ping requires 1 args' 79 | sys.exit(1) 80 | pp.pprint(client.ping(args[0],)) 81 | 82 | elif cmd == 'extract': 83 | if len(args) != 4: 84 | print 'extract requires 4 args' 85 | sys.exit(1) 86 | pp.pprint(client.extract(args[0],args[1],args[2],args[3],)) 87 | 88 | else: 89 | print 'Unrecognized method %s' % cmd 90 | sys.exit(1) 91 | 92 | transport.close() 93 | -------------------------------------------------------------------------------- /src/txtexeval/util/zemanta/thriftgen/ceservice/ExtractorService.py: -------------------------------------------------------------------------------- 1 | # 2 | # Autogenerated by Thrift 3 | # 4 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING 5 | # 6 | 7 | from thrift.Thrift import * 8 | from ttypes import * 
9 | from thrift.Thrift import TProcessor 10 | from thrift.transport import TTransport 11 | from thrift.protocol import TBinaryProtocol, TProtocol 12 | try: 13 | from thrift.protocol import fastbinary 14 | except: 15 | fastbinary = None 16 | 17 | 18 | class Iface: 19 | def ping(self, param): 20 | """ 21 | Parameters: 22 | - param 23 | """ 24 | pass 25 | 26 | def extract(self, url, title, htmldata, encoding): 27 | """ 28 | Parameters: 29 | - url 30 | - title 31 | - htmldata 32 | - encoding 33 | """ 34 | pass 35 | 36 | 37 | class Client(Iface): 38 | def __init__(self, iprot, oprot=None): 39 | self._iprot = self._oprot = iprot 40 | if oprot != None: 41 | self._oprot = oprot 42 | self._seqid = 0 43 | 44 | def ping(self, param): 45 | """ 46 | Parameters: 47 | - param 48 | """ 49 | self.send_ping(param) 50 | return self.recv_ping() 51 | 52 | def send_ping(self, param): 53 | self._oprot.writeMessageBegin('ping', TMessageType.CALL, self._seqid) 54 | args = ping_args() 55 | args.param = param 56 | args.write(self._oprot) 57 | self._oprot.writeMessageEnd() 58 | self._oprot.trans.flush() 59 | 60 | def recv_ping(self, ): 61 | (fname, mtype, rseqid) = self._iprot.readMessageBegin() 62 | if mtype == TMessageType.EXCEPTION: 63 | x = TApplicationException() 64 | x.read(self._iprot) 65 | self._iprot.readMessageEnd() 66 | raise x 67 | result = ping_result() 68 | result.read(self._iprot) 69 | self._iprot.readMessageEnd() 70 | if result.success != None: 71 | return result.success 72 | if result.e != None: 73 | raise result.e 74 | raise TApplicationException(TApplicationException.MISSING_RESULT, "ping failed: unknown result"); 75 | 76 | def extract(self, url, title, htmldata, encoding): 77 | """ 78 | Parameters: 79 | - url 80 | - title 81 | - htmldata 82 | - encoding 83 | """ 84 | self.send_extract(url, title, htmldata, encoding) 85 | return self.recv_extract() 86 | 87 | def send_extract(self, url, title, htmldata, encoding): 88 | self._oprot.writeMessageBegin('extract', TMessageType.CALL, self._seqid) 89 | args = extract_args() 90 | args.url = url 91 | args.title = title 92 | args.htmldata = htmldata 93 | args.encoding = encoding 94 | args.write(self._oprot) 95 | self._oprot.writeMessageEnd() 96 | self._oprot.trans.flush() 97 | 98 | def recv_extract(self, ): 99 | (fname, mtype, rseqid) = self._iprot.readMessageBegin() 100 | if mtype == TMessageType.EXCEPTION: 101 | x = TApplicationException() 102 | x.read(self._iprot) 103 | self._iprot.readMessageEnd() 104 | raise x 105 | result = extract_result() 106 | result.read(self._iprot) 107 | self._iprot.readMessageEnd() 108 | if result.success != None: 109 | return result.success 110 | if result.e != None: 111 | raise result.e 112 | raise TApplicationException(TApplicationException.MISSING_RESULT, "extract failed: unknown result"); 113 | 114 | 115 | class Processor(Iface, TProcessor): 116 | def __init__(self, handler): 117 | self._handler = handler 118 | self._processMap = {} 119 | self._processMap["ping"] = Processor.process_ping 120 | self._processMap["extract"] = Processor.process_extract 121 | 122 | def process(self, iprot, oprot): 123 | (name, type, seqid) = iprot.readMessageBegin() 124 | if name not in self._processMap: 125 | iprot.skip(TType.STRUCT) 126 | iprot.readMessageEnd() 127 | x = TApplicationException(TApplicationException.UNKNOWN_METHOD, 'Unknown function %s' % (name)) 128 | oprot.writeMessageBegin(name, TMessageType.EXCEPTION, seqid) 129 | x.write(oprot) 130 | oprot.writeMessageEnd() 131 | oprot.trans.flush() 132 | return 133 | else: 134 | 
self._processMap[name](self, seqid, iprot, oprot) 135 | return True 136 | 137 | def process_ping(self, seqid, iprot, oprot): 138 | args = ping_args() 139 | args.read(iprot) 140 | iprot.readMessageEnd() 141 | result = ping_result() 142 | try: 143 | result.success = self._handler.ping(args.param) 144 | except TAppException, e: 145 | result.e = e 146 | oprot.writeMessageBegin("ping", TMessageType.REPLY, seqid) 147 | result.write(oprot) 148 | oprot.writeMessageEnd() 149 | oprot.trans.flush() 150 | 151 | def process_extract(self, seqid, iprot, oprot): 152 | args = extract_args() 153 | args.read(iprot) 154 | iprot.readMessageEnd() 155 | result = extract_result() 156 | try: 157 | result.success = self._handler.extract(args.url, args.title, args.htmldata, args.encoding) 158 | except TAppException, e: 159 | result.e = e 160 | oprot.writeMessageBegin("extract", TMessageType.REPLY, seqid) 161 | result.write(oprot) 162 | oprot.writeMessageEnd() 163 | oprot.trans.flush() 164 | 165 | 166 | # HELPER FUNCTIONS AND STRUCTURES 167 | 168 | class ping_args: 169 | """ 170 | Attributes: 171 | - param 172 | """ 173 | 174 | thrift_spec = ( 175 | None, # 0 176 | (1, TType.STRING, 'param', None, None, ), # 1 177 | ) 178 | 179 | def __init__(self, param=None,): 180 | self.param = param 181 | 182 | def read(self, iprot): 183 | if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not None: 184 | fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec)) 185 | return 186 | iprot.readStructBegin() 187 | while True: 188 | (fname, ftype, fid) = iprot.readFieldBegin() 189 | if ftype == TType.STOP: 190 | break 191 | if fid == 1: 192 | if ftype == TType.STRING: 193 | self.param = iprot.readString().decode('utf-8') 194 | else: 195 | iprot.skip(ftype) 196 | else: 197 | iprot.skip(ftype) 198 | iprot.readFieldEnd() 199 | iprot.readStructEnd() 200 | 201 | def write(self, oprot): 202 | if oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and self.thrift_spec is not None and fastbinary is not None: 203 | oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec))) 204 | return 205 | oprot.writeStructBegin('ping_args') 206 | if self.param != None: 207 | oprot.writeFieldBegin('param', TType.STRING, 1) 208 | oprot.writeString(self.param.encode('utf-8')) 209 | oprot.writeFieldEnd() 210 | oprot.writeFieldStop() 211 | oprot.writeStructEnd() 212 | def validate(self): 213 | return 214 | 215 | 216 | def __repr__(self): 217 | L = ['%s=%r' % (key, value) 218 | for key, value in self.__dict__.iteritems()] 219 | return '%s(%s)' % (self.__class__.__name__, ', '.join(L)) 220 | 221 | def __eq__(self, other): 222 | return isinstance(other, self.__class__) and self.__dict__ == other.__dict__ 223 | 224 | def __ne__(self, other): 225 | return not (self == other) 226 | 227 | class ping_result: 228 | """ 229 | Attributes: 230 | - success 231 | - e 232 | """ 233 | 234 | thrift_spec = ( 235 | (0, TType.STRING, 'success', None, None, ), # 0 236 | (1, TType.STRUCT, 'e', (TAppException, TAppException.thrift_spec), None, ), # 1 237 | ) 238 | 239 | def __init__(self, success=None, e=None,): 240 | self.success = success 241 | self.e = e 242 | 243 | def read(self, iprot): 244 | if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not None: 245 | 
fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec)) 246 | return 247 | iprot.readStructBegin() 248 | while True: 249 | (fname, ftype, fid) = iprot.readFieldBegin() 250 | if ftype == TType.STOP: 251 | break 252 | if fid == 0: 253 | if ftype == TType.STRING: 254 | self.success = iprot.readString().decode('utf-8') 255 | else: 256 | iprot.skip(ftype) 257 | elif fid == 1: 258 | if ftype == TType.STRUCT: 259 | self.e = TAppException() 260 | self.e.read(iprot) 261 | else: 262 | iprot.skip(ftype) 263 | else: 264 | iprot.skip(ftype) 265 | iprot.readFieldEnd() 266 | iprot.readStructEnd() 267 | 268 | def write(self, oprot): 269 | if oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and self.thrift_spec is not None and fastbinary is not None: 270 | oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec))) 271 | return 272 | oprot.writeStructBegin('ping_result') 273 | if self.success != None: 274 | oprot.writeFieldBegin('success', TType.STRING, 0) 275 | oprot.writeString(self.success.encode('utf-8')) 276 | oprot.writeFieldEnd() 277 | if self.e != None: 278 | oprot.writeFieldBegin('e', TType.STRUCT, 1) 279 | self.e.write(oprot) 280 | oprot.writeFieldEnd() 281 | oprot.writeFieldStop() 282 | oprot.writeStructEnd() 283 | def validate(self): 284 | return 285 | 286 | 287 | def __repr__(self): 288 | L = ['%s=%r' % (key, value) 289 | for key, value in self.__dict__.iteritems()] 290 | return '%s(%s)' % (self.__class__.__name__, ', '.join(L)) 291 | 292 | def __eq__(self, other): 293 | return isinstance(other, self.__class__) and self.__dict__ == other.__dict__ 294 | 295 | def __ne__(self, other): 296 | return not (self == other) 297 | 298 | class extract_args: 299 | """ 300 | Attributes: 301 | - url 302 | - title 303 | - htmldata 304 | - encoding 305 | """ 306 | 307 | thrift_spec = ( 308 | None, # 0 309 | (1, TType.STRING, 'url', None, None, ), # 1 310 | (2, TType.STRING, 'title', None, None, ), # 2 311 | (3, TType.STRING, 'htmldata', None, None, ), # 3 312 | (4, TType.STRING, 'encoding', None, None, ), # 4 313 | ) 314 | 315 | def __init__(self, url=None, title=None, htmldata=None, encoding=None,): 316 | self.url = url 317 | self.title = title 318 | self.htmldata = htmldata 319 | self.encoding = encoding 320 | 321 | def read(self, iprot): 322 | if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not None: 323 | fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec)) 324 | return 325 | iprot.readStructBegin() 326 | while True: 327 | (fname, ftype, fid) = iprot.readFieldBegin() 328 | if ftype == TType.STOP: 329 | break 330 | if fid == 1: 331 | if ftype == TType.STRING: 332 | self.url = iprot.readString().decode('utf-8') 333 | else: 334 | iprot.skip(ftype) 335 | elif fid == 2: 336 | if ftype == TType.STRING: 337 | self.title = iprot.readString().decode('utf-8') 338 | else: 339 | iprot.skip(ftype) 340 | elif fid == 3: 341 | if ftype == TType.STRING: 342 | self.htmldata = iprot.readString(); 343 | else: 344 | iprot.skip(ftype) 345 | elif fid == 4: 346 | if ftype == TType.STRING: 347 | self.encoding = iprot.readString().decode('utf-8') 348 | else: 349 | iprot.skip(ftype) 350 | else: 351 | iprot.skip(ftype) 352 | iprot.readFieldEnd() 353 | iprot.readStructEnd() 354 | 355 | def write(self, oprot): 356 | if oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and self.thrift_spec is not None and 
fastbinary is not None: 357 | oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec))) 358 | return 359 | oprot.writeStructBegin('extract_args') 360 | if self.url != None: 361 | oprot.writeFieldBegin('url', TType.STRING, 1) 362 | oprot.writeString(self.url.encode('utf-8')) 363 | oprot.writeFieldEnd() 364 | if self.title != None: 365 | oprot.writeFieldBegin('title', TType.STRING, 2) 366 | oprot.writeString(self.title.encode('utf-8')) 367 | oprot.writeFieldEnd() 368 | if self.htmldata != None: 369 | oprot.writeFieldBegin('htmldata', TType.STRING, 3) 370 | oprot.writeString(self.htmldata) 371 | oprot.writeFieldEnd() 372 | if self.encoding != None: 373 | oprot.writeFieldBegin('encoding', TType.STRING, 4) 374 | oprot.writeString(self.encoding.encode('utf-8')) 375 | oprot.writeFieldEnd() 376 | oprot.writeFieldStop() 377 | oprot.writeStructEnd() 378 | def validate(self): 379 | return 380 | 381 | 382 | def __repr__(self): 383 | L = ['%s=%r' % (key, value) 384 | for key, value in self.__dict__.iteritems()] 385 | return '%s(%s)' % (self.__class__.__name__, ', '.join(L)) 386 | 387 | def __eq__(self, other): 388 | return isinstance(other, self.__class__) and self.__dict__ == other.__dict__ 389 | 390 | def __ne__(self, other): 391 | return not (self == other) 392 | 393 | class extract_result: 394 | """ 395 | Attributes: 396 | - success 397 | - e 398 | """ 399 | 400 | thrift_spec = ( 401 | (0, TType.STRUCT, 'success', (extract_RET, extract_RET.thrift_spec), None, ), # 0 402 | (1, TType.STRUCT, 'e', (TAppException, TAppException.thrift_spec), None, ), # 1 403 | ) 404 | 405 | def __init__(self, success=None, e=None,): 406 | self.success = success 407 | self.e = e 408 | 409 | def read(self, iprot): 410 | if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not None: 411 | fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec)) 412 | return 413 | iprot.readStructBegin() 414 | while True: 415 | (fname, ftype, fid) = iprot.readFieldBegin() 416 | if ftype == TType.STOP: 417 | break 418 | if fid == 0: 419 | if ftype == TType.STRUCT: 420 | self.success = extract_RET() 421 | self.success.read(iprot) 422 | else: 423 | iprot.skip(ftype) 424 | elif fid == 1: 425 | if ftype == TType.STRUCT: 426 | self.e = TAppException() 427 | self.e.read(iprot) 428 | else: 429 | iprot.skip(ftype) 430 | else: 431 | iprot.skip(ftype) 432 | iprot.readFieldEnd() 433 | iprot.readStructEnd() 434 | 435 | def write(self, oprot): 436 | if oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and self.thrift_spec is not None and fastbinary is not None: 437 | oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec))) 438 | return 439 | oprot.writeStructBegin('extract_result') 440 | if self.success != None: 441 | oprot.writeFieldBegin('success', TType.STRUCT, 0) 442 | self.success.write(oprot) 443 | oprot.writeFieldEnd() 444 | if self.e != None: 445 | oprot.writeFieldBegin('e', TType.STRUCT, 1) 446 | self.e.write(oprot) 447 | oprot.writeFieldEnd() 448 | oprot.writeFieldStop() 449 | oprot.writeStructEnd() 450 | def validate(self): 451 | return 452 | 453 | 454 | def __repr__(self): 455 | L = ['%s=%r' % (key, value) 456 | for key, value in self.__dict__.iteritems()] 457 | return '%s(%s)' % (self.__class__.__name__, ', '.join(L)) 458 | 459 | def __eq__(self, other): 460 | return isinstance(other, self.__class__) and self.__dict__ == 
other.__dict__ 461 | 462 | def __ne__(self, other): 463 | return not (self == other) 464 | -------------------------------------------------------------------------------- /src/txtexeval/util/zemanta/thriftgen/ceservice/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ['ttypes', 'constants', 'ExtractorService'] 2 | -------------------------------------------------------------------------------- /src/txtexeval/util/zemanta/thriftgen/ceservice/constants.py: -------------------------------------------------------------------------------- 1 | # 2 | # Autogenerated by Thrift 3 | # 4 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING 5 | # 6 | 7 | from thrift.Thrift import * 8 | from ttypes import * 9 | 10 | -------------------------------------------------------------------------------- /src/txtexeval/util/zemanta/thriftgen/ceservice/ttypes.py: -------------------------------------------------------------------------------- 1 | # 2 | # Autogenerated by Thrift 3 | # 4 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING 5 | # 6 | 7 | from thrift.Thrift import * 8 | 9 | from thrift.transport import TTransport 10 | from thrift.protocol import TBinaryProtocol, TProtocol 11 | try: 12 | from thrift.protocol import fastbinary 13 | except: 14 | fastbinary = None 15 | 16 | 17 | class ExceptionCode: 18 | FORCED_FAILED = 1 19 | PARSING_FAILED = 2 20 | FLATTENING_FAILED = 3 21 | CLASSIFICATION_FAILED = 4 22 | 23 | _VALUES_TO_NAMES = { 24 | 1: "FORCED_FAILED", 25 | 2: "PARSING_FAILED", 26 | 3: "FLATTENING_FAILED", 27 | 4: "CLASSIFICATION_FAILED", 28 | } 29 | 30 | _NAMES_TO_VALUES = { 31 | "FORCED_FAILED": 1, 32 | "PARSING_FAILED": 2, 33 | "FLATTENING_FAILED": 3, 34 | "CLASSIFICATION_FAILED": 4, 35 | } 36 | 37 | 38 | class TAppException(Exception): 39 | """ 40 | Attributes: 41 | - code 42 | - msg 43 | - backtrace 44 | """ 45 | 46 | thrift_spec = ( 47 | None, # 0 48 | (1, TType.I32, 'code', None, None, ), # 1 49 | (2, TType.STRING, 'msg', None, None, ), # 2 50 | (3, TType.STRING, 'backtrace', None, None, ), # 3 51 | ) 52 | 53 | def __init__(self, code=None, msg=None, backtrace=None,): 54 | self.code = code 55 | self.msg = msg 56 | self.backtrace = backtrace 57 | 58 | def read(self, iprot): 59 | if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not None: 60 | fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec)) 61 | return 62 | iprot.readStructBegin() 63 | while True: 64 | (fname, ftype, fid) = iprot.readFieldBegin() 65 | if ftype == TType.STOP: 66 | break 67 | if fid == 1: 68 | if ftype == TType.I32: 69 | self.code = iprot.readI32(); 70 | else: 71 | iprot.skip(ftype) 72 | elif fid == 2: 73 | if ftype == TType.STRING: 74 | self.msg = iprot.readString().decode('utf-8') 75 | else: 76 | iprot.skip(ftype) 77 | elif fid == 3: 78 | if ftype == TType.STRING: 79 | self.backtrace = iprot.readString().decode('utf-8') 80 | else: 81 | iprot.skip(ftype) 82 | else: 83 | iprot.skip(ftype) 84 | iprot.readFieldEnd() 85 | iprot.readStructEnd() 86 | 87 | def write(self, oprot): 88 | if oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and self.thrift_spec is not None and fastbinary is not None: 89 | oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec))) 90 | return 91 | oprot.writeStructBegin('TAppException') 92 | if self.code != None: 93 | 
oprot.writeFieldBegin('code', TType.I32, 1) 94 | oprot.writeI32(self.code) 95 | oprot.writeFieldEnd() 96 | if self.msg != None: 97 | oprot.writeFieldBegin('msg', TType.STRING, 2) 98 | oprot.writeString(self.msg.encode('utf-8')) 99 | oprot.writeFieldEnd() 100 | if self.backtrace != None: 101 | oprot.writeFieldBegin('backtrace', TType.STRING, 3) 102 | oprot.writeString(self.backtrace.encode('utf-8')) 103 | oprot.writeFieldEnd() 104 | oprot.writeFieldStop() 105 | oprot.writeStructEnd() 106 | def validate(self): 107 | return 108 | 109 | 110 | def __str__(self): 111 | return repr(self) 112 | 113 | def __repr__(self): 114 | L = ['%s=%r' % (key, value) 115 | for key, value in self.__dict__.iteritems()] 116 | return '%s(%s)' % (self.__class__.__name__, ', '.join(L)) 117 | 118 | def __eq__(self, other): 119 | return isinstance(other, self.__class__) and self.__dict__ == other.__dict__ 120 | 121 | def __ne__(self, other): 122 | return not (self == other) 123 | 124 | class extract_RET: 125 | """ 126 | Attributes: 127 | - success 128 | - body 129 | """ 130 | 131 | thrift_spec = ( 132 | None, # 0 133 | (1, TType.BOOL, 'success', None, None, ), # 1 134 | (2, TType.STRING, 'body', None, None, ), # 2 135 | ) 136 | 137 | def __init__(self, success=None, body=None,): 138 | self.success = success 139 | self.body = body 140 | 141 | def read(self, iprot): 142 | if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not None: 143 | fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec)) 144 | return 145 | iprot.readStructBegin() 146 | while True: 147 | (fname, ftype, fid) = iprot.readFieldBegin() 148 | if ftype == TType.STOP: 149 | break 150 | if fid == 1: 151 | if ftype == TType.BOOL: 152 | self.success = iprot.readBool(); 153 | else: 154 | iprot.skip(ftype) 155 | elif fid == 2: 156 | if ftype == TType.STRING: 157 | self.body = iprot.readString().decode('utf-8') 158 | else: 159 | iprot.skip(ftype) 160 | else: 161 | iprot.skip(ftype) 162 | iprot.readFieldEnd() 163 | iprot.readStructEnd() 164 | 165 | def write(self, oprot): 166 | if oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and self.thrift_spec is not None and fastbinary is not None: 167 | oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec))) 168 | return 169 | oprot.writeStructBegin('extract_RET') 170 | if self.success != None: 171 | oprot.writeFieldBegin('success', TType.BOOL, 1) 172 | oprot.writeBool(self.success) 173 | oprot.writeFieldEnd() 174 | if self.body != None: 175 | oprot.writeFieldBegin('body', TType.STRING, 2) 176 | oprot.writeString(self.body.encode('utf-8')) 177 | oprot.writeFieldEnd() 178 | oprot.writeFieldStop() 179 | oprot.writeStructEnd() 180 | def validate(self): 181 | return 182 | 183 | 184 | def __repr__(self): 185 | L = ['%s=%r' % (key, value) 186 | for key, value in self.__dict__.iteritems()] 187 | return '%s(%s)' % (self.__class__.__name__, ', '.join(L)) 188 | 189 | def __eq__(self, other): 190 | return isinstance(other, self.__class__) and self.__dict__ == other.__dict__ 191 | 192 | def __ne__(self, other): 193 | return not (self == other) 194 | -------------------------------------------------------------------------------- /tests/test_evaluation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import math 4 | 5 | import unittest2 6 | 7 | from txtexeval.util 
import html_to_text
8 | from txtexeval.evaluation import _tokenize_text, _bow
9 | from txtexeval.evaluation import TextOnlyEvaluator
10 | from txtexeval.evaluation import TextBasedResults, Result
11 | from txtexeval.evaluation import BaseResultFormat, TextResultFormat, \
12 | CleanEvalFormat, GoogleNewsFormat
13 |
14 |
15 | class TestHelpers(unittest2.TestCase):
16 |
17 | def test_tokenize_text(self):
18 | s = '''
19 | This is (some text). AAAA!!"#.{}
20 | special charčćšđž.
21 | '''
22 | r = _tokenize_text(s)
23 | self.assertEqual(r, ['this','is','some','text','aaaa','special','char'])
24 |
25 | def test_tokenize_text_empty(self):
26 | s = ''
27 | r = _tokenize_text(s)
28 | self.assertEqual(r, [])
29 |
30 | def test_html_to_text(self):
31 | s = '''
32 | <html>
33 | <head>
34 | <title>Title</title>
35 | <style type="text/css">
36 | body {
37 | background: white;
38 | }
39 | p {
40 | margin: 1em;
41 | }
42 | </style>
43 | </head>
44 | <body>
45 | <script type="text/javascript">
46 | var dummy = 1;
47 | dummy += 1;
48 | </script>
49 | Body
50 | <p> Paragraph here </p>
51 | More text
52 | </body>
53 | </html>
54 | '''
55 | t = html_to_text(s, encoding = 'ascii')
56 | t = t.strip()
57 | self.assertTrue(t.startswith('Body'))
58 | self.assertTrue(t.endswith('text'))
59 |
60 | def test_html_to_text_empty(self):
61 | s = ''
62 | t = html_to_text(s, encoding = 'ascii')
63 | self.assertEqual(t.strip(), '')
64 |
65 | class TestFormats(unittest2.TestCase):
66 |
67 | def test_textresultformat(self):
68 | s = '''
69 | This is (some text). AAAA!!"#.{}
70 | special charčćšđž char.
71 | '''
72 | t = TextResultFormat(s)
73 | self.assertEqual(t.get_word_seq(), ['this','is','some','text','aaaa','special','char','char'])
74 | self.assertEqual(t.get_bow(), {'this':1,'is':1,'some':1,'text':1,'aaaa':1,'special':1,'char':2})
75 |
76 | def test_textresultformat_empty(self):
77 | t = TextResultFormat('''
78 |
79 |
80 | ''')
81 | self.assertEqual(t.get_word_seq(), [])
82 | self.assertEqual(t.get_bow(), {})
83 |
84 | def test_cleanevalformat(self):
85 | s = '''
86 | URL: http://childparenting.about.com/b/archives.htm
87 | <p> this is
88 | cleaneval
89 | format
90 |
91 | <h> this is
92 | cleaneval
93 | format
94 | '''
95 | ce = CleanEvalFormat(s)
96 | self.assertEqual(ce.get_word_seq(), ['this','is','cleaneval','format','this','is','cleaneval','format'])
97 | self.assertEqual(ce.get_bow(), {'this':2,'is':2,'cleaneval':2,'format':2})
98 |
99 | def test_cleanevalformat_empty(self):
100 | s = '''URL: http://childparenting.about.com/b/archives.htm
101 | '''
102 | ce = CleanEvalFormat(s)
103 | self.assertEqual(ce.get_word_seq(), [])
104 | self.assertEqual(ce.get_bow(), {})
105 |
106 | def test_googlenewsformat(self):
107 | s = '''
108 | <html><body>
109 | <span class="x-nc-sel1">
110 | Headline here
111 | </span>
112 |
113 | <span class="x-nc-sel2">
114 | Double content
115 | <span class="x-nc-sel2">
116 | Text content here€
117 | </span>
118 | content
119 | </span>
120 | <span class="x-nc-sel0">
121 | Not content
122 | </span>
123 | '''
124 | gn = GoogleNewsFormat(s, 'utf8')
125 | self.assertEqual(gn.get_word_seq(), ['headline','here','double','content','text','content','here','content',])
126 | self.assertEqual(gn.get_bow(), {'headline':1,'here':2,'double':1,'content':3,'text':1})
127 |
128 | def test_googlenewsformat_empty1(self):
129 | s = '''
130 | <html><body>
131 | <span class="x-nc-sel0">
132 | Headline here (not content)
133 | </span>
134 |
135 | <span class="x-nc-sel5">
136 | not content
137 | <span class="x-nc-sel4">
138 | no content here€
139 | </span>
140 | not content
141 | </span>
142 | <span class="x-nc-sel0">
143 | Not content
144 | </span>
145 | '''
146 | gn = GoogleNewsFormat(s, 'utf8')
147 | self.assertEqual(gn.get_word_seq(), [])
148 | self.assertEqual(gn.get_bow(), {})
149 |
150 | def test_googlenewsformat_empty2(self):
151 | gn = GoogleNewsFormat('','ascii')
152 | self.assertEqual(gn.get_word_seq(), [])
153 | self.assertEqual(gn.get_bow(), {})
154 |
155 | def dummy_format_factory(word_seq):
156 | class DummyFormat(BaseResultFormat):
157 | def get_bow(self):
158 | return _bow(word_seq)
159 |
160 | def get_word_seq(self):
161 | return word_seq
162 | return DummyFormat()
163 |
164 | class TestTextEvaluator(unittest2.TestCase):
165 |
166 | def test_empty_relevant(self):
167 | ret = dummy_format_factory(['one','two'])
168 | rel = dummy_format_factory([])
169 | # args: TextOnlyEvaluator(retrieved, relevant)
170 | e = TextOnlyEvaluator(ret, rel)
171 | r = e.get_eval_results()
172 | self.assertEqual(r.precision, 0)
173 | self.assertTrue(math.isinf(r.recall))
174 | self.assertTrue(math.isnan(r.f1_score))
175 |
176 | def test_empty_retrieved(self):
177 | ret = dummy_format_factory([])
178 | rel = dummy_format_factory(['one','two'])
179 | # args: TextOnlyEvaluator(retrieved, relevant)
180 | e = TextOnlyEvaluator(ret, rel)
181 | r = e.get_eval_results()
182 | self.assertEqual(r.recall, 0)
183 | self.assertTrue(math.isinf(r.precision))
184 | self.assertTrue(math.isnan(r.f1_score))
185 |
186 | def test_both_empty(self):
187 | ret = dummy_format_factory([])
188 | rel = dummy_format_factory([])
189 | # args: TextOnlyEvaluator(retrieved, relevant)
190 | e = TextOnlyEvaluator(ret, rel)
191 | r = e.get_eval_results()
192 | self.assertTrue(math.isinf(r.precision))
193 | self.assertTrue(math.isinf(r.recall))
194 | self.assertTrue(math.isnan(r.f1_score))
195 |
196 | def test_missmatch(self):
197 | ret = dummy_format_factory(['one','four'])
198 | rel = dummy_format_factory(['two','three'])
199 | # args: TextOnlyEvaluator(retrieved, relevant)
200 | e = TextOnlyEvaluator(ret, rel)
201 | r = e.get_eval_results()
202 | self.assertEqual(r.precision, 0)
203 | self.assertEqual(r.recall, 0)
204 | self.assertTrue(math.isinf(r.f1_score))
205 |
206 | def test_match(self):
207 | ret = dummy_format_factory(['zero','one','two','four'])
208 | rel = dummy_format_factory(['one','two','three'])
209 | # args: TextOnlyEvaluator(retrieved, relevant)
210 | e = TextOnlyEvaluator(ret, rel)
211 | r = e.get_eval_results()
212 | self.assertAlmostEqual(r.precision, 0.5)
213 | self.assertAlmostEqual(r.recall, 0.6666, delta = 0.0001)
214 | self.assertAlmostEqual(r.f1_score, 0.5714, delta = 0.001)
215 |
216 | def test_perfect_match(self):
217 | ret = dummy_format_factory(['zero'])
218 | rel = dummy_format_factory(['zero'])
219 | # args: TextOnlyEvaluator(retrieved, relevant)
220 | e = TextOnlyEvaluator(ret, rel)
221 | r = e.get_eval_results()
222 | self.assertAlmostEqual(r.precision, 1)
223 | self.assertAlmostEqual(r.recall, 1)
224 | self.assertAlmostEqual(r.f1_score, 1)
225 |
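# How setUp's synthetic results below map onto the ResultContents categories
# (precision is inf when the retrieved text is empty, recall is inf when the
# relevant text is empty; see TextOnlyEvaluator in evaluation.py):
#   Result(0, 0, inf)        -> mismatch (no overlap at all)
#   Result(inf, 0, nan)  x2  -> retrieved empty
#   Result(0, inf, nan)  x2  -> relevant empty
#   Result(inf, inf, nan)    -> both empty
#   Result(0.2, 0.2, 0.2) x4 -> success (0 < f1 <= 1)
# plus dataset_len = 12, so fail = 12 - 10 = 2.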
        self.results.add_result(Result(float('inf'),float('inf'),float('nan'),None))
240 | 
241 |         self.results.add_result(Result(0.2,0.2,0.2,None))
242 |         self.results.add_result(Result(0.2,0.2,0.2,None))
243 |         self.results.add_result(Result(0.2,0.2,0.2,None))
244 |         self.results.add_result(Result(0.2,0.2,0.2,None))
245 | 
246 |         self.results.dataset_len = 12
247 | 
248 |     def tearDown(self):
249 |         self.results.text_eval_results['e1'] = []
250 | 
251 |     def test_results_contents(self):
252 |         contents = self.results.result_contents('e1')
253 |         self.assertEqual(contents.fail, 2)
254 |         self.assertEqual(contents.succ, 4)
255 |         self.assertEqual(contents.rel_empty, 2)
256 |         self.assertEqual(contents.ret_empty, 2)
257 |         self.assertEqual(contents.rel_ret_empty, 1)
258 |         self.assertEqual(contents.missmatch, 1)
259 | 
260 |     def test_result_filter(self):
261 |         fr = self.results.filtered_results('e1')
262 |         self.assertEqual(len(fr), 4)
263 | 
264 |     def test_precision_statistics(self):
265 |         avg, std = self.results.precision_statistics('e1')
266 |         self.assertEqual(avg, 0.2)
267 |         self.assertEqual(std, 0.)
268 | 
269 |     def test_recall_statistics(self):
270 |         avg, std = self.results.recall_statistics('e1')
271 |         self.assertEqual(avg, 0.2)
272 |         self.assertEqual(std, 0.)
273 | 
274 |     def test_f1score_statistics(self):
275 |         avg, std = self.results.f1score_statistics('e1')
276 |         self.assertEqual(avg, 0.2)
277 |         self.assertEqual(std, 0.)
278 | 
279 |     def test_add_bad_result(self):
280 |         r = TextBasedResults('e2')
281 |         with self.assertRaises(AssertionError):
282 |             r.add_result(Result(2,1,1,None))
283 |         with self.assertRaises(AssertionError):
284 |             r.add_result(Result(float('inf'),float('inf'),1,None))
285 |         with self.assertRaises(AssertionError):
286 |             r.add_result(Result(float('inf'),0,1,None))
287 |         with self.assertRaises(AssertionError):
288 |             r.add_result(Result(0,0,1,None))
289 | 
290 |     def test_add_good_result(self):
291 |         r = TextBasedResults('e3')
292 |         try:
293 |             r.add_result(Result(0.2,0.2,0.2,None))
294 |         except AssertionError:
295 |             self.fail()
296 | 
297 | def main():
298 |     unittest2.main(exit = False, verbosity = 2)
299 | 
300 | if __name__ == '__main__':
301 |     main()
302 | 
--------------------------------------------------------------------------------
/tests/test_plot.py:
--------------------------------------------------------------------------------
1 | import unittest2
2 | 
3 | from plot_manage import equidistant_count
4 | 
5 | class TestPlot(unittest2.TestCase):
6 | 
7 |     def test_equidistant_count(self):
8 |         r = equidistant_count(0, 1, 0.2, [0.11,0.22,0.32])  # bin width 0.2 over [0,1] -> 5 bins
9 |         self.assertEqual(r, (1,2,0,0,0))
10 | 
11 |         r = equidistant_count(0, 1, 0.5, [0.,0.22,0.32,0.5])  # bin width 0.5 -> 2 bins
12 |         self.assertEqual(r, (3,1))
13 | 
14 | def main():
15 |     unittest2.main(exit = False, verbosity = 2)
16 | 
17 | if __name__ == '__main__':
18 |     main()
--------------------------------------------------------------------------------
/tests/testsrunner.py:
--------------------------------------------------------------------------------
1 | '''
2 | Run all test cases residing in all modules that follow the test_[name].py
3 | naming template.
4 | 
5 | We could also use the nose test autodiscovery tool instead.
6 | '''
7 | import os
8 | import unittest2
9 | 
10 | def test_modules():
11 |     '''Get all test modules'''
12 |     modlist = []
13 |     for mod in os.listdir('.'):
14 |         if mod.startswith('test_') and mod.endswith('.py'):
15 |             modlist.append(mod[:-3])
16 |     return modlist
17 | 
18 | if __name__ == '__main__':
19 |     suite = unittest2.TestSuite()
20 |     for mod in test_modules():
21 |         suite.addTests(unittest2.TestLoader().loadTestsFromName(mod))
22 |     unittest2.TextTestRunner(verbosity=2).run(suite)
--------------------------------------------------------------------------------
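
Usage sketch (illustrative, not part of the repository): the tests above pin down the surface the framework exposes for text-only evaluation, namely TextOnlyEvaluator(retrieved, relevant) producing a Result(precision, recall, f1_score, id), and TextBasedResults accumulating those results per extractor. The snippet below composes those pieces for a single document. It is a minimal sketch under stated assumptions: the import path txtexeval.evaluation, the hypothetical StubFormat helper (standing in for a real BaseResultFormat implementation such as GoogleNewsFormat), and that get_eval_results() returns a Result accepted by add_result().

# Minimal end-to-end sketch (Python 2, matching the codebase).
# Assumed: these names are importable from txtexeval.evaluation.
from txtexeval.evaluation import (TextOnlyEvaluator, TextBasedResults,
                                  BaseResultFormat)

class StubFormat(BaseResultFormat):
    # Hypothetical stand-in for a real format such as GoogleNewsFormat:
    # it just wraps a fixed word sequence.
    def __init__(self, words):
        self._words = words

    def get_word_seq(self):
        return self._words

    def get_bow(self):
        # naive bag of words: word -> occurrence count
        bow = {}
        for w in self._words:
            bow[w] = bow.get(w, 0) + 1
        return bow

retrieved = StubFormat(['zero', 'one', 'two', 'four'])  # extractor output
relevant = StubFormat(['one', 'two', 'three'])          # gold standard

# Evaluate a single document: precision = 2/4, recall = 2/3.
result = TextOnlyEvaluator(retrieved, relevant).get_eval_results()

# File the result under an extractor name and read back the statistics.
results = TextBasedResults('my-extractor')
results.add_result(result)
results.dataset_len = 1
print results.precision_statistics('my-extractor')  # -> (average, std deviation)
print results.recall_statistics('my-extractor')

As test_result_filter and the *_statistics tests suggest, results with infinite or NaN components are filtered out before averages and standard deviations are computed, so only internally consistent Result tuples contribute to the reported statistics.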