├── .gitignore ├── .travis.yml ├── Makefile ├── README.md ├── libshorttext ├── __init__.py ├── analyzer │ ├── __init__.py │ ├── analyzer_impl.py │ └── selector.py ├── classifier │ ├── __init__.py │ ├── classifier_impl.py │ ├── grid.py │ └── learner │ │ ├── Makefile │ │ ├── __init__.py │ │ ├── learner_impl.py │ │ ├── liblinear │ │ ├── COPYRIGHT │ │ ├── Makefile │ │ ├── README │ │ ├── blas │ │ │ ├── Makefile │ │ │ ├── blas.h │ │ │ ├── blasp.h │ │ │ ├── daxpy.c │ │ │ ├── ddot.c │ │ │ ├── dnrm2.c │ │ │ └── dscal.c │ │ ├── heart_scale │ │ ├── linear.cpp │ │ ├── linear.def │ │ ├── linear.h │ │ ├── predict │ │ ├── predict.c │ │ ├── python │ │ │ ├── Makefile │ │ │ ├── README │ │ │ ├── liblinear.py │ │ │ └── liblinearutil.py │ │ ├── train │ │ ├── train.c │ │ ├── tron.cpp │ │ └── tron.h │ │ ├── test │ │ ├── test.cpp │ │ └── util.c └── converter │ ├── __init__.py │ ├── converter_impl.py │ ├── stemmer │ ├── Makefile │ ├── __init__.py │ ├── porter.c │ └── porter.py │ └── stop-words │ ├── stoplist-nsp.regex │ └── stoplist-nsp.regex.pickle └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.o 3 | *.so 4 | *.pyd 5 | *~ 6 | .#* 7 | *.lprof 8 | *.swp 9 | *.swo 10 | .DS_Store 11 | build 12 | .idea -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | python: 4 | - "2.6" 5 | - "2.7" 6 | 7 | branches: 8 | only: 9 | - master 10 | 11 | before_script: 12 | - python setup.py install 13 | 14 | script: 15 | - python -c "from libshorttext.analyzer import *; from libshorttext.classifier import *; from libshorttext.converter import *" -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | all: code 3 | 4 | code: stemmer learner 5 | 6 | stemmer: 7 | make 
-C libshorttext/converter/stemmer 8 | 9 | learner: 10 | make -C libshorttext/classifier/learner 11 | 12 | clean: 13 | 14 | cleanclean: 15 | rm -rf *.svm *.converter *.model *.config *.out *.pyc 16 | make -C doc clean 17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | python-libshorttext 2 | =================== 3 | 4 | [![Build Status](https://travis-ci.org/2shou/python-libshorttext.svg?branch=master)](https://travis-ci.org/2shou/python-libshorttext) 5 | 6 | An easy-install script for LibShortText 7 | 8 | I recommend [TextGrocery](https://github.com/2shou/TextGrocery) for beginners, which provides a more elegant API for LibShortText. 9 | 10 | [LibShortText](http://www.csie.ntu.edu.tw/~cjlin/libshorttext/) is a high-performance classifier for short texts such as titles, questions, sentences, and short messages. 11 | 12 | This script provides an easy way to install LibShortText. 13 | 14 | Notice 15 | ------ 16 | It only works on Unix-based systems such as Linux or Mac OS, and it requires Python 2.6 or newer. 17 | 18 | Install 19 | ------- 20 | 21 | $ python setup.py install 22 | -------------------------------------------------------------------------------- /libshorttext/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | LibShort is a package for short text classification. It supports training, test, 3 | and analysis tools. 4 | """ 5 | -------------------------------------------------------------------------------- /libshorttext/analyzer/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | :mod:`analyzer` is used for micro (for a single text instance) or macro (e.g., 3 | accuracy) analysis. Users can use :class:`InstanceSet` to specify the scope 4 | to analyze by :class:`Analyzer`.
5 | 6 | :: 7 | 8 | >>> from libshorttext.analyzer import * 9 | >>> 10 | >>> # load instances from an analyzable predict result file 11 | >>> insts = InstanceSet('prediction_result_path') 12 | >>> # find instances labels whose true and predicted labels are as specified 13 | >>> insts = insts.select(with_labels(['Books', 'Music', 'Art'])) 14 | >>> 15 | >>> # create an analyzer 16 | >>> analyzer = Analyzer('model_path') 17 | >>> analyzer.gen_confusion_table(insts) 18 | Books Music Art 19 | Books 169 1 0 20 | Music 2 214 0 21 | Art 6 0 162 22 | 23 | To use the analysis tools, an analyzable result and a model are required. Refer to 24 | :class:`libshorttext.classifier.PredictionResult` and 25 | :class:`libshorttext.classifier.TextModel`. 26 | 27 | """ 28 | 29 | from .analyzer_impl import * 30 | del analyzer_impl 31 | 32 | from .selector import * 33 | del selector 34 | -------------------------------------------------------------------------------- /libshorttext/analyzer/analyzer_impl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys, os 3 | from collections import defaultdict 4 | from ..classifier import * 5 | __all__ = ['TextInstance', 'InstanceSet', 'Analyzer'] 6 | 7 | if sys.version_info[0] >= 3: 8 | xrange = range 9 | izip = zip 10 | else: 11 | from itertools import izip 12 | 13 | class TextInstance: 14 | ''' 15 | :class:`TextInstance` represents a text instance. It includes the index, 16 | the true label, the predicted label, the text, and the decision values 17 | of the text instance. Normally you do not directly create an instance. 18 | Instead, it is usually manipulated by :class:`InstanceSet`. For 19 | more information, please see the usage in :class:`InstanceSet`. 20 | ''' 21 | 22 | def __init__(self, idx, true_y = '', predicted_y = '', text = '', extra_svm_feats = [], decvals = None): 23 | self.idx = idx #: Instance index in the text source. 
class InstanceSet:
    '''
    :class:`InstanceSet` is a group of :class:`TextInstance` instances. It is
    used to get a subset of interesting data. It should be initialized with a
    prediction result file (and, optionally, the testing data). By default,
    the path to the testing data is stored in the prediction result file, so
    giving only the path to the prediction result file is enough.

    >>> from libshorttext.analyzer import *
    >>> insts = InstanceSet('prediction_result_path')

    If you have moved the testing data, then you must re-assign the path to
    the testing data.

    >>> from libshorttext.analyzer import *
    >>> insts = InstanceSet('prediction_result_path', 'testing_data_path')
    '''

    def __init__(self, rst_src = None, text_src = None):
        self.insts = None
        self.correct = None
        self.filepath = None
        self.extra_svm_files = []
        self.true_labels = None
        self.predict_labels = None
        self.quantity = None
        self.selectors = []
        if rst_src is not None:
            self._load(rst_src, text_src)

    def __iter__(self):
        return iter(self.insts)

    def __getitem__(self, idx):
        return self.insts[idx]

    def select(self, *sel_funcs):
        '''
        This function helps users find interesting data. The arguments
        are `selector functions`, where both the argument and the returned
        value are lists of instances. There are several built-in selector
        functions. Refer to :ref:`selectorfunctions`.

        The selectors are applied in the given order and a new
        :class:`InstanceSet` is returned; this object is left untouched.

        >>> from libshorttext.analyzer import *
        >>> insts = InstanceSet('prediction_result_path')
        >>> insts1 = insts.select(wrong, with_labels(['Books', 'Music']))
        '''
        insts = self.insts
        selectors = self.selectors[:]
        for sel_func in sel_funcs:
            insts = sel_func(insts)
            # A plain callable that was not built with ``selectorize`` has no
            # description attribute; label it generically instead of failing
            # with AttributeError.
            message = getattr(sel_func, '_libshorttext_msg',
                              'user-defined selector function')
            selectors.append(message or '')

        sel_insts = InstanceSet()
        sel_insts.filepath = self.filepath
        sel_insts.extra_svm_files = self.extra_svm_files
        sel_insts.selectors = selectors
        sel_insts.insts = insts
        return sel_insts

    def load_text(self):
        '''
        The text of instances are not stored in the prediction result file,
        so you need to call this function to load texts from testing data.

        >>> from libshorttext.analyzer import *
        >>> insts = InstanceSet('prediction_result_path')
        >>> insts.load_text()

        This method also loads the extra svm features if extra svm files
        are used when training. A line whose text or features cannot be
        parsed yields the placeholder ``'**None**'`` instead of raising.
        Nothing is read when the set holds no instances.
        '''
        EMPTY_MESSAGE = '**None**'
        if not self.insts:
            return

        sorted_insts = sorted(self.insts, key = lambda inst: inst.idx)
        last_idx = sorted_insts[-1].idx

        handles = []
        try:
            # Open one by one so that a failing open() still lets the
            # ``finally`` clause close the handles opened before it.
            for path in [self.filepath] + list(self.extra_svm_files):
                handles.append(open(path, 'r'))

            i = 0
            for idx, lines in enumerate(izip(*handles)):
                if idx > last_idx:
                    break
                if idx != sorted_insts[i].idx:
                    continue
                inst = sorted_insts[i]
                i += 1

                try:
                    inst.text = lines[0].split('\t', 1)[1].strip()
                except IndexError:
                    # No tab on the line, i.e. a label without any text.
                    inst.text = EMPTY_MESSAGE

                feats = []
                for raw in lines[1:]:
                    try:
                        pairs = [tok.split(':') for tok in raw.split(None, 1)[1].split()]
                        feats.append(dict((int(pair[0]), float(pair[1])) for pair in pairs))
                    except (IndexError, ValueError):
                        feats.append(EMPTY_MESSAGE)
                inst.extra_svm_feats = feats
        finally:
            for handle in handles:
                handle.close()

    def _load(self, src, text_src):
        # *src* is either a PredictionResult or a path to a saved one;
        # *text_src* optionally overrides the text path stored in the result.
        if isinstance(src, PredictionResult):
            # The original left ``result`` unbound on this path.
            result = src
        elif isinstance(src, str):
            result = PredictionResult()
            result.load(src)
        else:
            raise Exception('"result" should be PredictionResult or string.')

        if not result.analyzable():
            raise ValueError('The given result is not analyzable.')

        # NOTE: the model ID stored in the result is not compared against any
        # model here; InstanceSet does not hold a model.

        if text_src is None:
            self.filepath = result.text_src
        else:
            self.filepath = text_src
        self.extra_svm_files = result.extra_svm_files
        predicted_y = result.predicted_y
        self.acc = result.get_accuracy()
        decvals = result.decvals
        true_y = result.true_y

        self.insts, self.true_labels, self.predict_labels = [], set(), set()
        for idx in range(len(true_y)):
            self.insts.append(TextInstance(idx, true_y = true_y[idx],
                                           predicted_y = predicted_y[idx],
                                           decvals = list(decvals[idx])))
            self.true_labels.add(true_y[idx])
            self.predict_labels.add(predicted_y[idx])
def analyze_single(self, target, amount = 5, output = None, extra_svm_feats = None):
    '''
    :func:`analyze_single` is used to analyze a single instance. It prints
    weights of all features in some classes (default 5). The output is
    sorted according to decision values in descending order. *target* can be an
    instance or a string that you want to analyze. *amount* is how many classes
    you want to print (``0`` prints all of them). If *output* is specified by a
    path to a file, the result will be written to the file instead of the screen.

    >>> from libshorttext.analyzer import *
    >>> analyzer = Analyzer('model_path')
    >>> insts = InstanceSet('prediction_result_path')
    >>> insts.load_text()
    >>> analyzer.analyze_single(insts[61], 3)
                    Jewelry & Watches  Cameras & Photo  Coins & Paper Money
    pb                      7.589e-19        2.041e-01            0.000e+00
    green                  -8.897e-02        1.227e-02           -1.507e-01
    mm                      5.922e-01        6.731e-01            1.256e-03
    onyx silver             1.382e-01       -6.198e-02           -4.743e-19
    48                     -1.792e-02        2.188e-02           -1.346e-04
    pendant                 1.107e+00       -1.039e-01           -1.409e-01
    silver pendant          2.455e-01       -7.826e-02           -8.379e-02
    silver                  8.533e-01       -2.205e-02            8.076e-01
    onyx                    1.520e-01       -6.198e-02           -4.743e-19
    **decval**              9.937e-01        1.944e-01            1.444e-01
    >>> analyzer.analyze_single('MICKEY MOUSE POT STAKE', 3)
                  Home & Garden  Video Games & Consoles  Computers/Tablets & Networking
    mickey            9.477e-02              -3.168e-02                       6.722e-02
    mouse             2.119e-01               2.039e-01                      -2.212e-02
    pot               8.897e-01              -5.167e-02                      -2.466e-02
    stake             4.057e-01              -2.147e-02                      -3.699e-02
    mickey mouse      1.146e-01              -3.168e-02                       6.784e-02
    mouse pot         4.041e-01              -2.147e-02                      -1.588e-02
    pot stake         5.363e-01              -2.147e-02                      -1.588e-02
    **decval**        1.004e+00               9.255e-03                       7.385e-03

    If *target* is a :class:`str` and extra svm files are used in
    training, the same number of extra svm features can be
    specified in *extra_svm_feats*. Extra svm features should be
    a list of dictionaries. If *target* is a :class:`TextInstance`,
    the extra features in the :class:`TextInstance` will be used.

    Raises :class:`TypeError` when *target* is neither a string nor a
    :class:`TextInstance`, and :class:`Exception` when no model is loaded
    or the model yields no features/weights for the instance.
    '''
    if self.model is None:
        raise Exception('Model not loaded.')
    # ``None`` replaces the former shared mutable ``[]`` default.
    if extra_svm_feats is None:
        extra_svm_feats = []

    if isinstance(target, str):
        text = target
        true_y = None
        result = predict_single_text(text, self.model, extra_svm_feats = extra_svm_feats)
        decvals = result.decvals
    elif isinstance(target, TextInstance):
        if target.text is None:
            raise Exception('Please load texts first.')
        text, extra_svm_feats, true_y = target.text, target.extra_svm_feats, target.true_y
        decvals = target.decvals
    else:
        # Previously fell through and failed later on an unbound local.
        raise TypeError('"target" should be TextInstance or string.')

    features, weights, labels = self.model.get_weight(text, extra_svm_feats = extra_svm_feats)
    if not features or not weights:
        raise Exception('Invalid instance.')
    nr_labels = len(labels)
    nr_feats = len(features)

    # Columns are ordered by decision value, largest first.
    sorted_idx = sorted(range(nr_labels), key = lambda i: decvals[i], reverse = True)
    labels = [labels[idx] for idx in sorted_idx]

    row_names = [' '.join(feature) for feature in features] + ['**decval**']
    weights_table = [['{0:.3e}'.format(weights[feat][idx]) for idx in sorted_idx]
                     for feat in range(nr_feats)]
    weights_table.append(['{0:.3e}'.format(decvals[idx]) for idx in sorted_idx])

    if amount != 0:
        labels = labels[:amount]

    # Open the file only after the instance has been validated, and close it
    # again if (and only if) it was opened here.
    opened_here = isinstance(output, str)
    if opened_here:
        output = open(output, 'w')
    try:
        draw_table(row_names, labels, weights_table, output)
    finally:
        if opened_here:
            output.close()

    if true_y is not None:
        print('True label: {0}'.format(true_y))
def gen_confusion_table(self, pred_insts, output = None):
    '''
    :func:`gen_confusion_table` generates a confusion table of a group of
    predicted instances *pred_insts*. Rows are true labels and columns are
    predicted labels; both are listed in sorted order so that the output is
    reproducible. If *output* is specified by a path to a file, the result
    will be written to the file instead of the screen.

    As before, a non-empty *output* is closed when the table has been
    written; this now also happens when drawing the table fails.

    >>> from libshorttext.analyzer import *
    >>> analyzer = Analyzer('model_path')
    >>> insts = InstanceSet('prediction_result_path')
    >>> insts = insts.select(with_labels(['Books', 'Music', 'Art']))
    >>> analyzer.gen_confusion_table(insts)
             Art  Books  Music
    Art      162      6      0
    Books      0    169      1
    Music      0      2    214
    '''
    if isinstance(output, str):
        output = open(output, 'w')
    try:
        if pred_insts.quantity is None:
            self._calculate_info(pred_insts)

        # The union contains exactly the labels that occur as a true or a
        # predicted label, so no validity check is needed.
        labels = sorted(pred_insts.true_labels.union(pred_insts.predict_labels))
        position = dict((label, pos) for pos, label in enumerate(labels))

        counts = [[0] * len(labels) for _ in labels]
        for inst in pred_insts.insts:
            if inst.true_y in position and inst.predicted_y in position:
                counts[position[inst.true_y]][position[inst.predicted_y]] += 1

        # draw_table measures and pads cells as strings.
        confusion_table = [[str(count) for count in row] for row in counts]
        draw_table(labels, labels, confusion_table, output)
    finally:
        if output:
            output.close()
def selectorize(option = 'general', comment = None):
    """
    A function decorator which returns a function wrapper to generate a
    selector function.

    *option* can be ``'select'``, ``'sort'``, or ``'general'``. See the
    following table.

    +---------------+-----------------------------------------------------+
    | *option*      | What should the defined function do?                |
    +===============+=====================================================+
    | ``'select'``  | The defined function should decide whether an       |
    |               | instance should be selected or not. Therefore, the  |
    |               | input is a :class:`TextInstance`, and the output    |
    |               | should be ``True`` or ``False``. ``True`` means that|
    |               | this instance should be selected.                   |
    +---------------+-----------------------------------------------------+
    | ``'sort'``    | The defined function should return the key of an    |
    |               | :class:`TextInstance` for sorting. The input is a   |
    |               | :class:`TextInstance`, and the output should be a   |
    |               | value or an object that is comparable.              |
    +---------------+-----------------------------------------------------+
    | ``'general'`` | Equivalent to the original function without applying|
    |               | the function wrapper. Therefore, the defined        |
    |               | function's input and output are a list of           |
    |               | :class:`TextInstance`.                              |
    +---------------+-----------------------------------------------------+

    For example, :func:`wrong` is equivalent to the following function::

        @selectorize('select', 'Select wrongly predicted instances')
        def wrong(inst):
            return inst.true_y != inst.predicted_y

    And, :func:`sort_by_dec` is equivalent to the following function::

        @selectorize('sort', 'Sort by maximum decision values.')
        def sort_by_dec(inst):
            return max(inst.decvals)

    *comment* is the argument of the comment on the function, which will
    be shown by the :meth:`libshorttext.analyzer.Analyzer.info`. See the
    following example.

    ::

        >>> from libshorttext.analyzer import *
        >>>
        >>> @selectorize(comment = 'foo function')
        >>> def foo(x):
        >>>     return x
        >>>
        >>> insts = InstanceSet('predict_result_path').select(foo)
        >>> Analyzer('model_path').info(insts)
        [output skipped]
        Selectors :
        -> foo function

    The returned selector keeps the docstring and the name of the decorated
    function. An unknown *option* raises :class:`Exception` when the
    decorator is applied.
    """

    def inner_func(input_func):
        if option == "select":
            def inner_func2(insts):
                return list(filter(input_func, insts))
        elif option == "sort":
            def inner_func2(insts):
                return sorted(insts, key = input_func)
        elif option == "general":
            # No wrapper: the function itself is annotated and returned.
            inner_func2 = input_func
        else:
            raise Exception("No such setting.")

        if input_func is None or comment is None:
            inner_func2._libshorttext_msg = "user-defined selector function"
        else:
            inner_func2._libshorttext_msg = comment

        inner_func2.__doc__ = input_func.__doc__
        # Keep the selector's own name so that reprs and tracebacks do not
        # all show "inner_func2".
        inner_func2.__name__ = getattr(input_func, '__name__', inner_func2.__name__)

        return inner_func2
    return inner_func
def with_labels(labels, target = 'both'):
    '''
    Select instances with specified labels. *labels* is an iterable object
    of :class:`str` instances, which represent the label names. It is read
    exactly once, so a generator may be passed as well.

    *target* can be ``'true'``, ``'predict'``, ``'both'``, ``'or'``. If
    *target* is ``'true'``, then this function finds instances based on the
    true label specified in the test data. If *target* is
    ``'predict'``, it finds instances based on the predicted labels.
    ``'both'`` and ``'or'`` find the intersection and the union of
    ``'true'`` and ``'predict'``, respectively. The default value of
    ``'target'`` is ``'both'``. Any other value makes the selector raise
    :class:`Exception` when it is applied.

    The following example selects instances where both the true and the
    predicted labels are ``'Music'`` or ``'Books'``.

    >>> insts = InstanceSet('prediction_result_path').select(with_labels(['Books', 'Music']))
    '''
    # Materialize once: the names are needed for the comment (in the given
    # order) and again for every membership test.
    label_names = list(labels)
    wanted = frozenset(label_names)

    @selectorize('select', 'labels: "{0}"'.format('", "'.join(label_names)))
    def inner_func(inst):
        if target == 'both':
            return inst.true_y in wanted and inst.predicted_y in wanted
        elif target == 'or':
            return inst.true_y in wanted or inst.predicted_y in wanted
        elif target == 'true':
            return inst.true_y in wanted
        elif target == 'predict':
            return inst.predicted_y in wanted
        else:
            raise Exception("No such setting.")
    return inner_func
def subset(amount, method = 'top'):
    '''
    Build a selector that keeps only part of an :class:`InstanceSet`.

    *amount* is the number of instances to keep. *method* is ``'top'``
    (keep the first *amount* instances) or ``'random'`` (draw *amount*
    instances at random). When *amount* exceeds the number of available
    instances, all of them are returned.

    ``'top'`` is most useful right after :func:`sort_by_dec`. The following
    example selects the ten instances with the smallest decision values of
    the predicted label.

    >>> insts = InstanceSet('prediction_result_path').select(sort_by_dec, subset(10))
    '''
    description = 'Select {0} instances in {1}.'.format(amount, method)

    def inner_func(insts):
        # Fewer instances than requested: hand the list back as it is.
        if amount > len(insts):
            return insts
        if method == 'random':
            return sample(insts, amount)
        if method == 'top':
            return insts[0:amount]
        raise Exception("No such setting.")

    # Same as decorating ``inner_func`` with ``@selectorize(comment = ...)``.
    return selectorize(comment = description)(inner_func)
181 | 182 | >>> insts = InstanceSet('prediction_result_path').select(reverse) 183 | """ 184 | return list(reversed(insts)) 185 | -------------------------------------------------------------------------------- /libshorttext/classifier/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The :mod:`classifier` is a high-level interface to train a short-text data. 3 | Members of :mod:`classifier` include :class:`TextModel` and its utility 4 | functions. :class:`TextModel` is obtained in training and then used in prediction. 5 | 6 | The standard method to get a :class:`TextModel` instance is via function 7 | :func:`train_text` or :func:`train_converted_text`, which trains 8 | text data (refer to :ref:`dataset`) or LIBSVM-format data, respectively. 9 | 10 | >>> from libshorttext.classifier import * 11 | >>> # train a model and save it to a file 12 | >>> m, svm_file = train_text('train_file') 13 | >>> # save the model to a file 14 | >>> m.save('model_path') 15 | 16 | After obtaining a :class:`TextModel`, users can use :func:`predict_text` or 17 | :func:`predict_single_text` to predict the label of a new short text. 18 | 19 | >>> from libshorttext.classifier import * 20 | >>> # load a model from a file 21 | >>> m = TextModel('model_path') 22 | >>> # predict a sentence 23 | >>> result = predict_single_text('This is a sentence.', m) 24 | 25 | Another class in module :mod:`classifier` is :class:`PredictionResult`, which is a 26 | wrapper of prediction results. Both :func:`predict_text` and 27 | :func:`predict_single_text` return a :class:`PredictionResult` object. 28 | 29 | :mod:`classifier` does not access the low-level LIBLINEAR's train and predict 30 | utilities directly. All jobs are passed to a submodule called :mod:`learner`, 31 | which is a middle-level classifier and communicates between :mod:`classifier` 32 | and LIBLINEAR. 
class GridOption:
    """Settings for a cross-validation grid search over log2(C) and log2(gamma).

    The constructor fills in platform-dependent defaults and then lets
    :meth:`parse_options` override them from a command-line style option
    list.

    Attributes set here:

    * ``svmtrain_pathname`` / ``gnuplot_pathname`` -- executables to run.
    * ``fold`` -- value passed to ``-v``; the default is the int ``5``, a
      user-supplied value is kept as the string that was given.
    * ``c_begin, c_end, c_step`` and ``g_begin, g_end, g_step`` -- grid
      ranges in log2 scale.
    * ``grid_with_c`` / ``grid_with_g`` -- whether each dimension is searched.
    * ``dataset_pathname``, ``dataset_title``, ``out_pathname``,
      ``png_pathname``, ``resume_pathname`` -- input and output files.
    * ``pass_through_string`` -- unrecognised options, joined by spaces,
      to be forwarded to the training command.
    """

    def __init__(self, dataset_pathname, options):
        """Initialise defaults for *dataset_pathname* and apply *options*.

        *options* is either a whitespace-separated string or a list of
        option tokens. See :meth:`parse_options` for the errors raised.
        """
        dirname = os.path.dirname(__file__)
        if sys.platform != 'win32':
            self.svmtrain_pathname = os.path.join(dirname, '../svm-train')
            self.gnuplot_pathname = '/usr/bin/gnuplot'
        else:
            # example for windows
            self.svmtrain_pathname = os.path.join(dirname, r'..\windows\svm-train.exe')
            self.gnuplot_pathname = r'c:\tmp\gnuplot\binary\pgnuplot.exe'
        self.fold = 5
        self.c_begin, self.c_end, self.c_step = -5, 15, 2
        self.g_begin, self.g_end, self.g_step = 3, -15, -2
        self.grid_with_c, self.grid_with_g = True, True
        self.dataset_pathname = dataset_pathname
        self.dataset_title = os.path.split(dataset_pathname)[1]
        self.out_pathname = '{0}.out'.format(self.dataset_title)
        self.png_pathname = '{0}.png'.format(self.dataset_title)
        self.pass_through_string = ' '
        self.resume_pathname = None
        self.parse_options(options)

    def parse_options(self, options):
        """Apply *options* (a string or a list of tokens) to this object.

        Recognised flags are ``-log2c``, ``-log2g``, ``-v``, ``-svmtrain``,
        ``-gnuplot``, ``-out``, ``-png`` and ``-resume``; everything else is
        collected into ``pass_through_string``.

        Raises :class:`ValueError` if ``-c``/``-g`` is used, if a flag is
        missing its value, or if both grid dimensions are disabled.
        Raises :class:`IOError` if the training executable, the dataset or
        the resume file does not exist. A missing gnuplot executable only
        produces a warning on stderr and disables plotting.
        """
        if isinstance(options, str):
            options = options.split()

        def argument(pos):
            # Value belonging to the flag at options[pos - 1]; fail with a
            # readable message instead of a bare IndexError.
            if pos >= len(options):
                raise ValueError(
                    'Option {0} requires an argument.'.format(options[pos - 1]))
            return options[pos]

        i = 0
        pass_through_options = []

        while i < len(options):
            flag = options[i]
            if flag == '-log2c':
                i = i + 1
                value = argument(i)
                if value == 'null':
                    self.grid_with_c = False
                else:
                    self.c_begin, self.c_end, self.c_step = map(float, value.split(','))
            elif flag == '-log2g':
                i = i + 1
                value = argument(i)
                if value == 'null':
                    self.grid_with_g = False
                else:
                    self.g_begin, self.g_end, self.g_step = map(float, value.split(','))
            elif flag == '-v':
                i = i + 1
                # Kept as given (a string); it is only formatted into the
                # command line later.
                self.fold = argument(i)
            elif flag in ('-c', '-g'):
                raise ValueError('Use -log2c and -log2g.')
            elif flag == '-svmtrain':
                i = i + 1
                self.svmtrain_pathname = argument(i)
            elif flag == '-gnuplot':
                i = i + 1
                value = argument(i)
                self.gnuplot_pathname = None if value == 'null' else value
            elif flag == '-out':
                i = i + 1
                value = argument(i)
                self.out_pathname = None if value == 'null' else value
            elif flag == '-png':
                i = i + 1
                self.png_pathname = argument(i)
            elif flag == '-resume':
                # The file name is optional: without one, fall back to the
                # default output file of this dataset.
                if i == (len(options) - 1) or options[i + 1].startswith('-'):
                    self.resume_pathname = self.dataset_title + '.out'
                else:
                    i = i + 1
                    self.resume_pathname = options[i]
            else:
                pass_through_options.append(flag)
            i = i + 1

        self.pass_through_string = ' '.join(pass_through_options)
        if not os.path.exists(self.svmtrain_pathname):
            raise IOError('svm-train executable not found')
        if not os.path.exists(self.dataset_pathname):
            raise IOError('dataset not found')
        if self.resume_pathname and not os.path.exists(self.resume_pathname):
            raise IOError('file for resumption not found')
        if not self.grid_with_c and not self.grid_with_g:
            raise ValueError('-log2c and -log2g should not be null simultaneously')
        if self.gnuplot_pathname and not os.path.exists(self.gnuplot_pathname):
            sys.stderr.write('gnuplot executable not found\n')
            self.gnuplot_pathname = None
def calculate_jobs(options):
    """Build the ordered list of grid-search jobs and load resumed results.

    *options* must provide ``c_begin/c_end/c_step``, ``g_begin/g_end/g_step``,
    ``grid_with_c``, ``grid_with_g`` and ``resume_pathname``.

    Returns ``(jobs, resumed_jobs)``:

    * ``jobs`` is a list of lines, each line a list of ``(log2c, log2g)``
      tuples. Values are ordered so that the middle of each range comes
      first and the grid is refined progressively. A disabled dimension
      contributes ``None``.
    * ``resumed_jobs`` maps ``(log2c, log2g)`` to the rate read from the
      resume file, or is empty when ``resume_pathname`` is ``None``.

    Raises :class:`ValueError` if the step of an enabled dimension is zero,
    which would otherwise never terminate.
    """

    def range_f(begin, end, step):
        # like range, but works on non-integer too
        if step == 0:
            raise ValueError('grid step must not be zero')
        seq = []
        while True:
            if step > 0 and begin > end: break
            if step < 0 and begin < end: break
            seq.append(begin)
            begin = begin + step
        return seq

    def permute_sequence(seq):
        # Put the middle element first, then interleave the (recursively
        # permuted) left and right halves.
        n = len(seq)
        if n <= 1: return seq

        mid = int(n/2)
        left = permute_sequence(seq[:mid])
        right = permute_sequence(seq[mid+1:])

        ret = [seq[mid]]
        for k in range(max(len(left), len(right))):
            if k < len(left): ret.append(left[k])
            if k < len(right): ret.append(right[k])

        return ret

    # A disabled dimension is represented by the single value None; its
    # range is not expanded at all.
    if options.grid_with_c:
        c_seq = permute_sequence(range_f(options.c_begin, options.c_end, options.c_step))
    else:
        c_seq = [None]
    if options.grid_with_g:
        g_seq = permute_sequence(range_f(options.g_begin, options.g_end, options.g_step))
    else:
        g_seq = [None]

    nr_c = float(len(c_seq))
    nr_g = float(len(g_seq))
    i, j = 0, 0
    jobs = []

    while i < nr_c or j < nr_g:
        if i/nr_c < j/nr_g:
            # increase C resolution
            line = []
            for k in range(0, j):
                line.append((c_seq[i], g_seq[k]))
            i = i + 1
            jobs.append(line)
        else:
            # increase g resolution
            line = []
            for k in range(0, i):
                line.append((c_seq[k], g_seq[j]))
            j = j + 1
            jobs.append(line)

    resumed_jobs = {}

    if options.resume_pathname is None:
        return jobs, resumed_jobs

    # Close the resume file deterministically instead of relying on the
    # garbage collector.
    with open(options.resume_pathname, 'r') as resume_file:
        for line in resume_file:
            line = line.strip()
            rst = re.findall(r'rate=([0-9.]+)', line)
            if not rst:
                continue
            rate = float(rst[0])

            c, g = None, None
            rst = re.findall(r'log2c=([0-9.-]+)', line)
            if rst:
                c = float(rst[0])
            rst = re.findall(r'log2g=([0-9.-]+)', line)
            if rst:
                g = float(rst[0])

            resumed_jobs[(c, g)] = rate

    return jobs, resumed_jobs
class LocalWorker(Worker):
    """Worker that runs the training command as a subprocess on this machine."""

    def run_one(self, c, g):
        """Run one cross-validation job and return its reported rate.

        The command line comes from ``self.get_cmd(c, g)``. The first output
        line containing ``Cross`` is parsed: its last whitespace-separated
        token, minus the final character, is returned as a float.
        Returns ``None`` when no such line is printed.
        """
        cmdline = self.get_cmd(c, g)
        process = Popen(cmdline, shell=True, stdout=PIPE, stderr=PIPE, stdin=PIPE)
        # communicate() drains stdout and stderr together, closes the pipes
        # and waits for the child. Reading only stdout could block the child
        # on a full stderr pipe and left the process unreaped.
        output, _ = process.communicate()
        for line in output.splitlines():
            if str(line).find('Cross') != -1:
                return float(line.split()[-1][0:-1])
        return None
def genFields(names, types):
    """Pair each name in *names* with the type at the same position in *types*.

    Returns a list of ``(name, type)`` tuples. If the two sequences differ
    in length, the result stops at the shorter one.
    """
    fields = []
    for field_name, field_type in zip(names, types):
        fields.append((field_name, field_type))
    return fields
def read_SVMProblem(src):
    """Read the data file *src* through ``util.read_problem``.

    Returns the object produced by ``util.read_problem`` when the reported
    status is 0. Other status values become exceptions:

    * ``-1`` -- :class:`IOError`, the file could not be opened;
    * ``-2`` -- :class:`MemoryError`;
    * anything else -- :class:`ValueError`, with the status reported as the
      offending line number.
    """
    status_out = c_int64()
    # bias = 0 is required
    svmprob = util.read_problem(src.encode(), 0, pointer(status_out))
    code = status_out.value

    if code == -1:
        raise IOError("Can not open file " + src + ".")
    elif code == -2:
        raise MemoryError("Memory Exhausted. Try to restart python.")
    elif code != 0:
        raise ValueError("Wrong file format in line " + str(code) + ".")

    print_debug('SVMProblem construct:%s'% id(svmprob))
    return svmprob
@staticmethod
def normalize_one(xi, learner_param, idf):
    """
    Normalize the single instance *xi* in place.

    *xi* is a sequence of nodes with ``index`` and ``value`` attributes,
    terminated by a node whose ``index`` is ``-1``. *learner_param* supplies
    the flags ``binary_feature``, ``term_frequency``,
    ``inverse_document_frequency`` and ``inst_normalization``. *idf* is
    indexed by ``index - 1``; features beyond its length are left unscaled.

    The steps are applied in the documented order: binary representation,
    term frequency, IDF, and instance normalization. The norm is therefore
    taken after the earlier steps, so the result is a unit vector whenever
    instance normalization is enabled. Instances whose feature sum or norm
    is zero are left unscaled instead of raising
    :class:`ZeroDivisionError`.

    The maximum index of xi should be less
    or equal to the weight vector size.
    """
    # Pass 1: binary representation and the feature sum used by tf.
    word_count = 0
    i = 0
    while xi[i].index != -1:
        if learner_param.binary_feature:
            xi[i].value = xi[i].value != 0
        word_count += abs(xi[i].value)
        i += 1

    # Pass 2: term frequency, x_i <- x_i / sum_j |x_j|.
    if learner_param.term_frequency and word_count != 0:
        i = 0
        while xi[i].index != -1:
            xi[i].value /= word_count
            i += 1

    # Pass 3: inverse document frequency.
    if learner_param.inverse_document_frequency:
        i = 0
        while xi[i].index != -1:
            idx = xi[i].index - 1
            if idx < len(idf):
                xi[i].value *= idf[idx]
            i += 1

    # Pass 4: scale to unit length, using the norm of the values as they
    # are after the previous steps.
    if learner_param.inst_normalization:
        norm = 0
        i = 0
        while xi[i].index != -1:
            norm += xi[i].value * xi[i].value
            i += 1
        norm **= .5

        if norm != 0:
            i = 0
            while xi[i].index != -1:
                xi[i].value /= norm
                i += 1
For example, you can write either 168 | 169 | >>> param = LearnerParameter('-N 1 -T 1', '-c 2 -e 1e-2') 170 | 171 | or 172 | 173 | >>> param = LearnerParameter(['-N', '1', '-T', '1'], ['-c', '2', '-e', '1e-2']) 174 | 175 | *liblinear_opts* is LIBLINEAR's parameters. Refer to LIBLINEAR's 176 | document for more details. *learner_opts* includes options for feature 177 | representation and instance-wise normalization. The preprocessor of 178 | LibShortText converts text files to LIBSVM-format data, where the 179 | features are word counts. All *value* in the options should be either 180 | ``1`` or ``0``, where ``1`` enables the option. 181 | 182 | ========== ==================================================== 183 | options explanation when *value* is ``1`` 184 | ========== ==================================================== 185 | -D *value* Binary representation. All non-zero values are 186 | treated as 1. Default is enabled. 187 | -T *value* Term frequency. The data are divided by the feature 188 | sum. That is, 189 | :math:`x_i \leftarrow (x_i)/\sum_j |x_j|`, 190 | where :math:`x` is the training instance and 191 | :math:`x_i` is the :math:`i`-th feature of :math:`x`. 192 | Default is disabled. 193 | -I *value* Inverse document frequency (idf). Default is 194 | disabled. 195 | -N *value* Instance normalization. The training instances are 196 | normalized to unit vectors before training. Default 197 | is enabled. 198 | ========== ==================================================== 199 | 200 | Note that if more than one option is enabled, then they are done in the 201 | order: binary representation, term frequency, IDF, and instance 202 | normalization. The following example is tf-idf representation without 203 | instance normalization. 
def parse_options(self, learner_opts, liblinear_opts):
    """
    Set the options to the specific values.

    *learner_opts* is a :class:`str` (split on whitespace) or a
    :class:`list` of tokens made of the flags ``-D``, ``-N``, ``-I`` and
    ``-T``, each followed by an integer value. *liblinear_opts* is handed
    to :meth:`liblinear.parameter.parse_options` unchanged. Both are kept
    in ``self.raw_options``.

    Raises :class:`TypeError` if *learner_opts* is neither a list nor a
    string, and :class:`ValueError` for an unknown flag.
    """
    self.raw_options = (learner_opts, liblinear_opts)

    if isinstance(learner_opts, list):
        tokens = learner_opts
    elif isinstance(learner_opts, str):
        tokens = learner_opts.split()
    else:
        raise TypeError("Wrong types")

    # Start from the defaults, let LIBLINEAR read its own options, then
    # apply the learner-specific flags on top.
    self.set_to_default_values()
    liblinear.parameter.parse_options(self, liblinear_opts)

    flag_to_attr = {
        "-D": "binary_feature",
        "-N": "inst_normalization",
        "-I": "inverse_document_frequency",
        "-T": "term_frequency",
    }

    pos = 0
    while pos < len(tokens):
        flag = tokens[pos]
        if flag not in flag_to_attr:
            raise ValueError('No option ' + flag)
        # Each flag consumes exactly one following token as its value.
        setattr(self, flag_to_attr[flag], int(tokens[pos + 1]))
        pos += 2
def get_weight(self, j, k):
    """
    Return the weight of feature *j* and label *k*.

    *j* is a 1-based feature index and *k* is a label that must be a key
    of ``self.labelidx``. The value is read from the flat weight array at
    ``(j-1) * nr_class + labelidx[k]``.

    NOTE(review): this assumes the weight array holds ``nr_class`` entries
    per feature -- confirm that this holds for two-class models.
    """
    model = self.c_model
    weights = model.w
    row_start = (j - 1) * model.nr_class
    column = self.labelidx[k]
    return weights[row_start + column]
def save(self, model_dir, force=False):
    """
    Save the model to a directory. If *force* is set to ``True``,
    the existing directory will be overwritten; otherwise,
    :class:`OSError` will be raised.

    Three files are written into *model_dir*: ``liblinear_model`` (via
    LIBLINEAR's ``save_model``), ``options.pickle`` (``self.param_options``)
    and ``idf.pickle`` (``self.idf``).
    """

    if path.exists(model_dir):
        if force:
            shutil.rmtree(model_dir)
        else:
            raise OSError('Please use force option to overwrite the existing files.')
    os.mkdir(model_dir)

    liblinear_save_model(path.join(model_dir, 'liblinear_model'), self.c_model)

    # Use context managers so both pickle files are flushed and closed
    # before save() returns, instead of leaving that to garbage collection.
    options_file = path.join(model_dir, 'options.pickle')
    with open(options_file, 'wb') as options_out:
        cPickle.dump(self.param_options, options_out, -1)

    idf_file = path.join(model_dir, 'idf.pickle')
    with open(idf_file, 'wb') as idf_out:
        cPickle.dump(self.idf, idf_out, -1)
375 | """ 376 | 377 | learner_prob = LearnerProblem(data_file_name) 378 | learner_param = LearnerParameter(learner_opts, liblinear_opts) 379 | 380 | idf = None 381 | if learner_param.inverse_document_frequency: 382 | idf = learner_prob.compute_idf() 383 | 384 | learner_prob.normalize(learner_param, idf) 385 | 386 | m = liblinear_train(learner_prob, learner_param) 387 | if not learner_param.cross_validation: 388 | m.x_space = None # This is required to reduce the memory usage... 389 | m = LearnerModel(m, learner_param, idf) 390 | return m 391 | 392 | def predict_one(xi, m): 393 | """ 394 | Return the label and a :class:`c_double` array of decision values of 395 | the test instance *xi* using :class:`LearnerModel` *m*. 396 | 397 | *xi* can be a :class:`list` or a :class:`dict` as in LIBLINEAR python 398 | interface. It can also be a LIBLINEAR feature_node array. 399 | 400 | .. note:: 401 | 402 | This function is designed to analyze the result of one instance. 403 | It has a severe efficiency issue and should be used only by 404 | :func:`libshorttext.classifier.predict_single_text`. If many 405 | instances need to be predicted, they should be stored in a file 406 | and predicted by :func:`predict`. 407 | 408 | .. warning:: 409 | 410 | The content of *xi* may be **changed** after the function call. 411 | """ 412 | 413 | if isinstance(xi, (list, dict)): 414 | xi = liblinear.gen_feature_nodearray(xi)[0] 415 | elif not isinstance(xi, POINTER(liblinear.feature_node)): 416 | raise TypeError("xi should be a test instance") 417 | 418 | learner_param = LearnerParameter(m.param_options[0], m.param_options[1]) 419 | 420 | if m.bias >= 0: 421 | i = 0 422 | while xi[i].index != -1: i += 1 423 | 424 | # Already has bias, or bias reserved. 425 | # Actually this statement should be true if 426 | # the data is read by read_SVMProblem. 
def predict(data_file_name, m, liblinear_opts=""):
    """
    Return a quadruple: the predicted labels, the accuracy, the decision values, and the
    true labels in the test data file (obtained through the :class:`LearnerModel` *m*).

    The predicted labels and true labels in the file are :class:`list`. The accuracy is
    evaluated by assuming that the labels in the file are the true label. If the file
    contains no instance, the accuracy is reported as ``0.0``.

    The decision values are in a :class:`list`, where the length is the same as the number
    of test instances. Each element in the list holds an instance's decision values in
    different classes. For example, the decision value of instance i and class k can be
    obtained by

    >>> predicted_label, accuracy, all_dec_values, label = predict('svm_file', model)
    >>> print(all_dec_values[i][k])

    *liblinear_opts* is accepted for interface compatibility; this function does not
    read it.
    """

    learner_prob = LearnerProblem(data_file_name)
    learner_param = LearnerParameter(m.param_options[0], m.param_options[1])

    # Copy the model's idf values into a C double array for util.normalize.
    idf = None
    if m.idf:
        idf = (c_double * len(m.idf))()
        for i in range(len(m.idf)): idf[i] = m.idf[i]
    learner_prob.normalize(learner_param, idf)

    all_dec_values = []
    correct = 0
    py = [] # predicted y
    ty = [] # true y

    # The buffer is reused for every instance, so each result is copied out
    # with a slice before the next call overwrites it.
    dec_values = (c_double * m.nr_class)()

    for i in range(learner_prob.l):
        label = liblinear.liblinear.predict_values(m, learner_prob.x[i], dec_values)
        all_dec_values.append(dec_values[:m.nr_class])
        py.append(label)
        ty.append(learner_prob.y[i])

        if label == learner_prob.y[i]:
            correct += 1

    # An empty test file used to raise ZeroDivisionError here.
    if learner_prob.l:
        acc = correct / float(learner_prob.l)
    else:
        acc = 0.0

    return py, acc, all_dec_values, ty
4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions 7 | are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright 13 | notice, this list of conditions and the following disclaimer in the 14 | documentation and/or other materials provided with the distribution. 15 | 16 | 3. Neither name of copyright holders nor the names of its contributors 17 | may be used to endorse or promote products derived from this software 18 | without specific prior written permission. 19 | 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR 25 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 26 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 27 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 28 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 29 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 30 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 31 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
32 | -------------------------------------------------------------------------------- /libshorttext/classifier/learner/liblinear/Makefile: -------------------------------------------------------------------------------- 1 | CXX ?= g++ 2 | CC ?= gcc 3 | CFLAGS = -Wall -Wconversion -O3 -fPIC 4 | LIBS = blas/blas.a 5 | SHVER = 1 6 | OS = $(shell uname) 7 | #LIBS = -lblas 8 | 9 | all: train predict 10 | 11 | lib: linear.o tron.o blas/blas.a 12 | if [ "$(OS)" = "Darwin" ]; then \ 13 | SHARED_LIB_FLAG="-dynamiclib -Wl,-install_name,liblinear.so.$(SHVER)"; \ 14 | else \ 15 | SHARED_LIB_FLAG="-shared -Wl,-soname,liblinear.so.$(SHVER)"; \ 16 | fi; \ 17 | $(CXX) $${SHARED_LIB_FLAG} linear.o tron.o blas/blas.a -o liblinear.so.$(SHVER) 18 | 19 | train: tron.o linear.o train.c blas/blas.a 20 | $(CXX) $(CFLAGS) -o train train.c tron.o linear.o $(LIBS) 21 | 22 | predict: tron.o linear.o predict.c blas/blas.a 23 | $(CXX) $(CFLAGS) -o predict predict.c tron.o linear.o $(LIBS) 24 | 25 | tron.o: tron.cpp tron.h 26 | $(CXX) $(CFLAGS) -c -o tron.o tron.cpp 27 | 28 | linear.o: linear.cpp linear.h 29 | $(CXX) $(CFLAGS) -c -o linear.o linear.cpp 30 | 31 | blas/blas.a: blas/*.c blas/*.h 32 | make -C blas OPTFLAGS='$(CFLAGS)' CC='$(CC)'; 33 | 34 | clean: 35 | make -C blas clean 36 | make -C matlab clean 37 | rm -f *~ tron.o linear.o train predict liblinear.so.$(SHVER) 38 | -------------------------------------------------------------------------------- /libshorttext/classifier/learner/liblinear/README: -------------------------------------------------------------------------------- 1 | LIBLINEAR is a simple package for solving large-scale regularized linear 2 | classification and regression. 
It currently supports 3 | - L2-regularized logistic regression/L2-loss support vector classification/L1-loss support vector classification 4 | - L1-regularized L2-loss support vector classification/L1-regularized logistic regression 5 | - L2-regularized L2-loss support vector regression/L1-loss support vector regression. 6 | This document explains the usage of LIBLINEAR. 7 | 8 | To get started, please read the ``Quick Start'' section first. 9 | For developers, please check the ``Library Usage'' section to learn 10 | how to integrate LIBLINEAR in your software. 11 | 12 | Table of Contents 13 | ================= 14 | 15 | - When to use LIBLINEAR but not LIBSVM 16 | - Quick Start 17 | - Installation 18 | - `train' Usage 19 | - `predict' Usage 20 | - Examples 21 | - Library Usage 22 | - Additional Information 23 | - MATLAB/OCTAVE interface 24 | - PYTHON interface 25 | 26 | When to use LIBLINEAR but not LIBSVM 27 | ==================================== 28 | 29 | There are some large data for which with/without nonlinear mappings 30 | gives similar performances. Without using kernels, one can 31 | efficiently train a much larger set via linear classification/regression. 32 | These data usually have a large number of features. Document classification 33 | is an example. 34 | 35 | Warning: While generally liblinear is very fast, its default solver 36 | may be slow under certain situations (e.g., data not scaled or C is 37 | large). See Appendix B of our SVM guide about how to handle such 38 | cases. 39 | http://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf 40 | 41 | Warning: If you are a beginner and your data sets are not large, you 42 | should consider LIBSVM first. 43 | 44 | LIBSVM page: 45 | http://www.csie.ntu.edu.tw/~cjlin/libsvm 46 | 47 | 48 | Quick Start 49 | =========== 50 | 51 | See the section ``Installation'' for installing LIBLINEAR. 52 | 53 | After installation, there are programs `train' and `predict' for 54 | training and testing, respectively. 
55 | 56 | About the data format, please check the README file of LIBSVM. Note 57 | that feature index must start from 1 (but not 0). 58 | 59 | A sample classification data included in this package is `heart_scale'. 60 | 61 | Type `train heart_scale', and the program will read the training 62 | data and output the model file `heart_scale.model'. If you have a test 63 | set called heart_scale.t, then type `predict heart_scale.t 64 | heart_scale.model output' to see the prediction accuracy. The `output' 65 | file contains the predicted class labels. 66 | 67 | For more information about `train' and `predict', see the sections 68 | `train' Usage and `predict' Usage. 69 | 70 | To obtain good performances, sometimes one needs to scale the 71 | data. Please check the program `svm-scale' of LIBSVM. For large and 72 | sparse data, use `-l 0' to keep the sparsity. 73 | 74 | Installation 75 | ============ 76 | 77 | On Unix systems, type `make' to build the `train' and `predict' 78 | programs. Run them without arguments to show the usages. 79 | 80 | This software uses some level-1 BLAS subroutines. The needed functions are 81 | included in this package. 
If a BLAS library is available on your 82 | machine, you may use it by modifying the Makefile: uncomment the following line 83 | 84 | #LIBS = -lblas 85 | 86 | and comment out 87 | 88 | LIBS = blas/blas.a 89 | 90 | `train' Usage 91 | ============= 92 | 93 | Usage: train [options] training_set_file [model_file] 94 | options: 95 | -s type : set type of solver (default 1) 96 | for multi-class classification 97 | 0 -- L2-regularized logistic regression (primal) 98 | 1 -- L2-regularized L2-loss support vector classification (dual) 99 | 2 -- L2-regularized L2-loss support vector classification (primal) 100 | 3 -- L2-regularized L1-loss support vector classification (dual) 101 | 4 -- support vector classification by Crammer and Singer 102 | 5 -- L1-regularized L2-loss support vector classification 103 | 6 -- L1-regularized logistic regression 104 | 7 -- L2-regularized logistic regression (dual) 105 | for regression 106 | 11 -- L2-regularized L2-loss support vector regression (primal) 107 | 12 -- L2-regularized L2-loss support vector regression (dual) 108 | 13 -- L2-regularized L1-loss support vector regression (dual) 109 | -c cost : set the parameter C (default 1) 110 | -p epsilon : set the epsilon in loss function of epsilon-SVR (default 0.1) 111 | -e epsilon : set tolerance of termination criterion 112 | -s 0 and 2 113 | |f'(w)|_2 <= eps*min(pos,neg)/l*|f'(w0)|_2, 114 | where f is the primal function and pos/neg are # of 115 | positive/negative data (default 0.01) 116 | -s 11 117 | |f'(w)|_2 <= eps*|f'(w0)|_2 (default 0.001) 118 | -s 1, 3, 4 and 7 119 | Dual maximal violation <= eps; similar to libsvm (default 0.1) 120 | -s 5 and 6 121 | |f'(w)|_inf <= eps*min(pos,neg)/l*|f'(w0)|_inf, 122 | where f is the primal function (default 0.01) 123 | -s 12 and 13 124 | |f'(alpha)|_1 <= eps |f'(alpha0)|, 125 | where f is the dual function (default 0.1) 126 | -B bias : if bias >= 0, instance x becomes [x; bias]; if < 0, no bias term added (default -1) 127 | -wi weight: weights adjust
the parameter C of different classes (see README for details) 128 | -v n: n-fold cross validation mode 129 | -q : quiet mode (no outputs) 130 | 131 | Option -v randomly splits the data into n parts and calculates cross 132 | validation accuracy on them. 133 | 134 | Formulations: 135 | 136 | For L2-regularized logistic regression (-s 0), we solve 137 | 138 | min_w w^Tw/2 + C \sum log(1 + exp(-y_i w^Tx_i)) 139 | 140 | For L2-regularized L2-loss SVC dual (-s 1), we solve 141 | 142 | min_alpha 0.5(alpha^T (Q + I/2/C) alpha) - e^T alpha 143 | s.t. 0 <= alpha_i, 144 | 145 | For L2-regularized L2-loss SVC (-s 2), we solve 146 | 147 | min_w w^Tw/2 + C \sum max(0, 1- y_i w^Tx_i)^2 148 | 149 | For L2-regularized L1-loss SVC dual (-s 3), we solve 150 | 151 | min_alpha 0.5(alpha^T Q alpha) - e^T alpha 152 | s.t. 0 <= alpha_i <= C, 153 | 154 | For L1-regularized L2-loss SVC (-s 5), we solve 155 | 156 | min_w \sum |w_j| + C \sum max(0, 1- y_i w^Tx_i)^2 157 | 158 | For L1-regularized logistic regression (-s 6), we solve 159 | 160 | min_w \sum |w_j| + C \sum log(1 + exp(-y_i w^Tx_i)) 161 | 162 | For L2-regularized logistic regression (-s 7), we solve 163 | 164 | min_alpha 0.5(alpha^T Q alpha) + \sum alpha_i*log(alpha_i) + \sum (C-alpha_i)*log(C-alpha_i) - a constant 165 | s.t. 0 <= alpha_i <= C, 166 | 167 | where 168 | 169 | Q is a matrix with Q_ij = y_i y_j x_i^T x_j. 170 | 171 | For L2-regularized L2-loss SVR (-s 11), we solve 172 | 173 | min_w w^Tw/2 + C \sum max(0, |y_i-w^Tx_i|-epsilon)^2 174 | 175 | For L2-regularized L2-loss SVR dual (-s 12), we solve 176 | 177 | min_beta 0.5(beta^T (Q + lambda I/2/C) beta) - y^T beta + \sum |beta_i| 178 | 179 | For L2-regularized L1-loss SVR dual (-s 13), we solve 180 | 181 | min_beta 0.5(beta^T Q beta) - y^T beta + \sum |beta_i| 182 | s.t. -C <= beta_i <= C, 183 | 184 | where 185 | 186 | Q is a matrix with Q_ij = x_i^T x_j. 187 | 188 | If bias >= 0, w becomes [w; w_{n+1}] and x becomes [x; bias]. 
189 | 190 | The primal-dual relationship implies that -s 1 and -s 2 give the same 191 | model, -s 0 and -s 7 give the same, and -s 11 and -s 12 give the same. 192 | 193 | We implement 1-vs-the rest multi-class strategy for classification. 194 | In training i vs. non_i, their C parameters are (weight from -wi)*C 195 | and C, respectively. If there are only two classes, we train only one 196 | model. Thus weight1*C vs. weight2*C is used. See examples below. 197 | 198 | We also implement multi-class SVM by Crammer and Singer (-s 4): 199 | 200 | min_{w_m, \xi_i} 0.5 \sum_m ||w_m||^2 + C \sum_i \xi_i 201 | s.t. w^T_{y_i} x_i - w^T_m x_i >= e^m_i - \xi_i \forall m,i 202 | 203 | where e^m_i = 0 if y_i = m, 204 | e^m_i = 1 if y_i != m, 205 | 206 | Here we solve the dual problem: 207 | 208 | min_{\alpha} 0.5 \sum_m ||w_m(\alpha)||^2 + \sum_i \sum_m e^m_i \alpha^m_i 209 | s.t. \alpha^m_i <= C^m_i \forall m,i , \sum_m \alpha^m_i=0 \forall i 210 | 211 | where w_m(\alpha) = \sum_i \alpha^m_i x_i, 212 | and C^m_i = C if m = y_i, 213 | C^m_i = 0 if m != y_i. 214 | 215 | `predict' Usage 216 | =============== 217 | 218 | Usage: predict [options] test_file model_file output_file 219 | options: 220 | -b probability_estimates: whether to output probability estimates, 0 or 1 (default 0); currently for logistic regression only 221 | -q : quiet mode (no outputs) 222 | 223 | Note that -b is only needed in the prediction phase. This is different 224 | from the setting of LIBSVM. 225 | 226 | Examples 227 | ======== 228 | 229 | > train data_file 230 | 231 | Train linear SVM with L2-loss function. 232 | 233 | > train -s 0 data_file 234 | 235 | Train a logistic regression model. 236 | 237 | > train -v 5 -e 0.001 data_file 238 | 239 | Do five-fold cross-validation using L2-loss svm. 240 | Use a smaller stopping tolerance 0.001 than the default 241 | 0.1 if you want more accurate solutions.
242 | 243 | > train -c 10 -w1 2 -w2 5 -w3 2 four_class_data_file 244 | 245 | Train four classifiers: 246 | positive negative Cp Cn 247 | class 1 class 2,3,4. 20 10 248 | class 2 class 1,3,4. 50 10 249 | class 3 class 1,2,4. 20 10 250 | class 4 class 1,2,3. 10 10 251 | 252 | > train -c 10 -w3 1 -w2 5 two_class_data_file 253 | 254 | If there are only two classes, we train ONE model. 255 | The C values for the two classes are 10 and 50. 256 | 257 | > predict -b 1 test_file data_file.model output_file 258 | 259 | Output probability estimates (for logistic regression only). 260 | 261 | Library Usage 262 | ============= 263 | 264 | - Function: model* train(const struct problem *prob, 265 | const struct parameter *param); 266 | 267 | This function constructs and returns a linear classification 268 | or regression model according to the given training data and 269 | parameters. 270 | 271 | struct problem describes the problem: 272 | 273 | struct problem 274 | { 275 | INT64 l, n; 276 | INT64 *y; 277 | struct feature_node **x; 278 | double bias; 279 | }; 280 | 281 | where `l' is the number of training data. If bias >= 0, we assume 282 | that one additional feature is added to the end of each data 283 | instance. `n' is the number of feature (including the bias feature 284 | if bias >= 0). `y' is an array containing the target values. (integers 285 | in classification, real numbers in regression) And `x' is an array 286 | of pointers, each of which points to a sparse representation (array 287 | of feature_node) of one training vector. 288 | 289 | For example, if we have the following training data: 290 | 291 | LABEL ATTR1 ATTR2 ATTR3 ATTR4 ATTR5 292 | ----- ----- ----- ----- ----- ----- 293 | 1 0 0.1 0.2 0 0 294 | 2 0 0.1 0.3 -1.2 0 295 | 1 0.4 0 0 0 0 296 | 2 0 0.1 0 1.4 0.5 297 | 3 -0.1 -0.2 0.1 1.1 0.1 298 | 299 | and bias = 1, then the components of problem are: 300 | 301 | l = 5 302 | n = 6 303 | 304 | y -> 1 2 1 2 3 305 | 306 | x -> [ ] -> (2,0.1) (3,0.2) (6,1) (-1,?) 
307 | [ ] -> (2,0.1) (3,0.3) (4,-1.2) (6,1) (-1,?) 308 | [ ] -> (1,0.4) (6,1) (-1,?) 309 | [ ] -> (2,0.1) (4,1.4) (5,0.5) (6,1) (-1,?) 310 | [ ] -> (1,-0.1) (2,-0.2) (3,0.1) (4,1.1) (5,0.1) (6,1) (-1,?) 311 | 312 | struct parameter describes the parameters of a linear classification 313 | or regression model: 314 | 315 | struct parameter 316 | { 317 | INT64 solver_type; 318 | 319 | /* these are for training only */ 320 | double eps; /* stopping criteria */ 321 | double C; 322 | INT64 nr_weight; 323 | INT64 *weight_label; 324 | double* weight; 325 | double p; 326 | }; 327 | 328 | solver_type can be one of L2R_LR, L2R_L2LOSS_SVC_DUAL, L2R_L2LOSS_SVC, L2R_L1LOSS_SVC_DUAL, MCSVM_CS, L1R_L2LOSS_SVC, L1R_LR, L2R_LR_DUAL, L2R_L2LOSS_SVR, L2R_L2LOSS_SVR_DUAL, L2R_L1LOSS_SVR_DUAL. 329 | for classification 330 | L2R_LR L2-regularized logistic regression (primal) 331 | L2R_L2LOSS_SVC_DUAL L2-regularized L2-loss support vector classification (dual) 332 | L2R_L2LOSS_SVC L2-regularized L2-loss support vector classification (primal) 333 | L2R_L1LOSS_SVC_DUAL L2-regularized L1-loss support vector classification (dual) 334 | MCSVM_CS support vector classification by Crammer and Singer 335 | L1R_L2LOSS_SVC L1-regularized L2-loss support vector classification 336 | L1R_LR L1-regularized logistic regression 337 | L2R_LR_DUAL L2-regularized logistic regression (dual) 338 | for regression 339 | L2R_L2LOSS_SVR L2-regularized L2-loss support vector regression (primal) 340 | L2R_L2LOSS_SVR_DUAL L2-regularized L2-loss support vector regression (dual) 341 | L2R_L1LOSS_SVR_DUAL L2-regularized L1-loss support vector regression (dual) 342 | 343 | C is the cost of constraints violation. 344 | p is the sensitiveness of loss of support vector regression. 345 | eps is the stopping criterion. 346 | 347 | nr_weight, weight_label, and weight are used to change the penalty 348 | for some classes (If the weight for a class is not changed, it is 349 | set to 1). 
This is useful for training classifier using unbalanced 350 | input data or with asymmetric misclassification cost. 351 | 352 | nr_weight is the number of elements in the array weight_label and 353 | weight. Each weight[i] corresponds to weight_label[i], meaning that 354 | the penalty of class weight_label[i] is scaled by a factor of weight[i]. 355 | 356 | If you do not want to change penalty for any of the classes, 357 | just set nr_weight to 0. 358 | 359 | *NOTE* To avoid wrong parameters, check_parameter() should be 360 | called before train(). 361 | 362 | struct model stores the model obtained from the training procedure: 363 | 364 | struct model 365 | { 366 | struct parameter param; 367 | INT64 nr_class; /* number of classes */ 368 | INT64 nr_feature; 369 | double *w; 370 | INT64 *label; /* label of each class */ 371 | double bias; 372 | }; 373 | 374 | param describes the parameters used to obtain the model. 375 | 376 | nr_class and nr_feature are the number of classes and features, 377 | respectively. nr_class = 2 for regression. 378 | 379 | The nr_feature*nr_class array w gives feature weights. We use one 380 | against the rest for multi-class classification, so each feature 381 | index corresponds to nr_class weight values. Weights are 382 | organized in the following way 383 | 384 | +------------------+------------------+------------+ 385 | | nr_class weights | nr_class weights | ... 386 | | for 1st feature | for 2nd feature | 387 | +------------------+------------------+------------+ 388 | 389 | If bias >= 0, x becomes [x; bias]. The number of features is 390 | increased by one, so w is a (nr_feature+1)*nr_class array. The 391 | value of bias is stored in the variable bias. 392 | 393 | The array label stores class labels. 394 | 395 | - Function: void cross_validation(const problem *prob, const parameter *param, INT64 nr_fold, double *target); 396 | 397 | This function conducts cross validation. Data are separated to 398 | nr_fold folds. 
Under given parameters, sequentially each fold is 399 | validated using the model from training the remaining. Predicted 400 | labels in the validation process are stored in the array called 401 | target. 402 | 403 | The format of prob is same as that for train(). 404 | 405 | - Function: double predict(const model *model_, const feature_node *x); 406 | 407 | For a classification model, the predicted class for x is returned. 408 | For a regression model, the function value of x calculated using 409 | the model is returned. 410 | 411 | - Function: double predict_values(const struct model *model_, 412 | const struct feature_node *x, double* dec_values); 413 | 414 | This function gives nr_w decision values in the array dec_values. 415 | nr_w=1 if regression is applied or the number of classes is two. An exception is 416 | multi-class svm by Crammer and Singer (-s 4), where nr_w = 2 if there are two classes. For all other situations, nr_w is the 417 | number of classes. 418 | 419 | We implement one-vs-the rest multi-class strategy (-s 0,1,2,3,5,6,7) 420 | and multi-class svm by Crammer and Singer (-s 4) for multi-class SVM. 421 | The class with the highest decision value is returned. 422 | 423 | - Function: double predict_probability(const struct model *model_, 424 | const struct feature_node *x, double* prob_estimates); 425 | 426 | This function gives nr_class probability estimates in the array 427 | prob_estimates. nr_class can be obtained from the function 428 | get_nr_class. The class with the highest probability is 429 | returned. Currently, we support only the probability outputs of 430 | logistic regression. 431 | 432 | - Function: INT64 get_nr_feature(const model *model_); 433 | 434 | The function gives the number of attributes of the model. 435 | 436 | - Function: INT64 get_nr_class(const model *model_); 437 | 438 | The function gives the number of classes of the model. 439 | For a regression model, 2 is returned. 
440 | 441 | - Function: void get_labels(const model *model_, INT64* label); 442 | 443 | This function outputs the name of labels into an array called label. 444 | For a regression model, label is unchanged. 445 | 446 | - Function: const char *check_parameter(const struct problem *prob, 447 | const struct parameter *param); 448 | 449 | This function checks whether the parameters are within the feasible 450 | range of the problem. This function should be called before calling 451 | train() and cross_validation(). It returns NULL if the 452 | parameters are feasible, otherwise an error message is returned. 453 | 454 | - Function: INT64 save_model(const char *model_file_name, 455 | const struct model *model_); 456 | 457 | This function saves a model to a file; returns 0 on success, or -1 458 | if an error occurs. 459 | 460 | - Function: struct model *load_model(const char *model_file_name); 461 | 462 | This function returns a pointer to the model read from the file, 463 | or a null pointer if the model could not be loaded. 464 | 465 | - Function: void free_model_content(struct model *model_ptr); 466 | 467 | This function frees the memory used by the entries in a model structure. 468 | 469 | - Function: void free_and_destroy_model(struct model **model_ptr_ptr); 470 | 471 | This function frees the memory used by a model and destroys the model 472 | structure. 473 | 474 | - Function: void destroy_param(struct parameter *param); 475 | 476 | This function frees the memory used by a parameter set. 477 | 478 | - Function: void set_print_string_function(void (*print_func)(const char *)); 479 | 480 | Users can specify their output format by a function. Use 481 | set_print_string_function(NULL); 482 | for default printing to stdout. 483 | 484 | 485 | MATLAB/OCTAVE Interface 486 | ======================= 487 | 488 | Please check the file README in the directory `matlab'. 
489 | 490 | PYTHON Interface 491 | ================ 492 | 493 | Please check the file README in the directory `python'. 494 | 495 | Additional Information 496 | ====================== 497 | 498 | If you find LIBLINEAR helpful, please cite it as 499 | 500 | R.-E. Fan, K.-W. Chang, C.-J. Hsieh, X.-R. Wang, and C.-J. Lin. 501 | LIBLINEAR: A Library for Large Linear Classification, Journal of 502 | Machine Learning Research 9(2008), 1871-1874. Software available at 503 | http://www.csie.ntu.edu.tw/~cjlin/liblinear 504 | 505 | For any questions and comments, please send your email to 506 | cjlin@csie.ntu.edu.tw 507 | 508 | 509 | -------------------------------------------------------------------------------- /libshorttext/classifier/learner/liblinear/blas/Makefile: -------------------------------------------------------------------------------- 1 | AR = ar rcv 2 | RANLIB = ranlib 3 | 4 | HEADERS = blas.h blasp.h 5 | FILES = dnrm2.o daxpy.o ddot.o dscal.o 6 | 7 | CFLAGS = $(OPTFLAGS) 8 | FFLAGS = $(OPTFLAGS) 9 | 10 | blas: $(FILES) $(HEADERS) 11 | $(AR) blas.a $(FILES) 12 | $(RANLIB) blas.a 13 | 14 | clean: 15 | - rm -f *.o 16 | - rm -f *.a 17 | - rm -f *~ 18 | 19 | .c.o: 20 | $(CC) $(CFLAGS) -c $*.c 21 | 22 | 23 | -------------------------------------------------------------------------------- /libshorttext/classifier/learner/liblinear/blas/blas.h: -------------------------------------------------------------------------------- 1 | /* blas.h -- C header file for BLAS Ver 1.0 */ 2 | /* Jesse Bennett March 23, 2000 */ 3 | 4 | /** barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." 
5 | 6 | - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */ 7 | 8 | #ifndef BLAS_INCLUDE 9 | #define BLAS_INCLUDE 10 | 11 | #include "stdint.h" 12 | #ifndef INT64_DEFINED 13 | typedef int64_t INT64; 14 | #define INT64_DEFINED 15 | #endif 16 | /* Data types specific to BLAS implementation */ 17 | typedef struct { float r, i; } fcomplex; 18 | typedef struct { double r, i; } dcomplex; 19 | typedef INT64 blasbool; 20 | 21 | #include "blasp.h" /* Prototypes for all BLAS functions */ 22 | 23 | #define FALSE 0 24 | #define TRUE 1 25 | 26 | /* Macro functions */ 27 | #define MIN(a,b) ((a) <= (b) ? (a) : (b)) 28 | #define MAX(a,b) ((a) >= (b) ? (a) : (b)) 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /libshorttext/classifier/learner/liblinear/blas/blasp.h: -------------------------------------------------------------------------------- 1 | /* blasp.h -- C prototypes for BLAS Ver 1.0 */ 2 | /* Jesse Bennett March 23, 2000 */ 3 | 4 | /* Functions listed in alphabetical order */ 5 | #include <stdint.h> 6 | #ifndef INT64_DEFINED 7 | typedef int64_t INT64; 8 | #define INT64_DEFINED 9 | #endif 10 | 11 | #ifdef F2C_COMPAT 12 | 13 | void cdotc_(fcomplex *dotval, INT64 *n, fcomplex *cx, INT64 *incx, 14 | fcomplex *cy, INT64 *incy); 15 | 16 | void cdotu_(fcomplex *dotval, INT64 *n, fcomplex *cx, INT64 *incx, 17 | fcomplex *cy, INT64 *incy); 18 | 19 | double sasum_(INT64 *n, float *sx, INT64 *incx); 20 | 21 | double scasum_(INT64 *n, fcomplex *cx, INT64 *incx); 22 | 23 | double scnrm2_(INT64 *n, fcomplex *x, INT64 *incx); 24 | 25 | double sdot_(INT64 *n, float *sx, INT64 *incx, float *sy, INT64 *incy); 26 | 27 | double snrm2_(INT64 *n, float *x, INT64 *incx); 28 | 29 | void zdotc_(dcomplex *dotval, INT64 *n, dcomplex *cx, INT64 *incx, 30 | dcomplex *cy, INT64 *incy); 31 | 32 | void zdotu_(dcomplex *dotval, INT64 *n, dcomplex *cx, INT64 *incx, 33 | dcomplex *cy, INT64 *incy); 34 | 35 | #else 36 | 37 | fcomplex cdotc_(INT64
*n, fcomplex *cx, INT64 *incx, fcomplex *cy, INT64 *incy); 38 | 39 | fcomplex cdotu_(INT64 *n, fcomplex *cx, INT64 *incx, fcomplex *cy, INT64 *incy); 40 | 41 | float sasum_(INT64 *n, float *sx, INT64 *incx); 42 | 43 | float scasum_(INT64 *n, fcomplex *cx, INT64 *incx); 44 | 45 | float scnrm2_(INT64 *n, fcomplex *x, INT64 *incx); 46 | 47 | float sdot_(INT64 *n, float *sx, INT64 *incx, float *sy, INT64 *incy); 48 | 49 | float snrm2_(INT64 *n, float *x, INT64 *incx); 50 | 51 | dcomplex zdotc_(INT64 *n, dcomplex *cx, INT64 *incx, dcomplex *cy, INT64 *incy); 52 | 53 | dcomplex zdotu_(INT64 *n, dcomplex *cx, INT64 *incx, dcomplex *cy, INT64 *incy); 54 | 55 | #endif 56 | 57 | /* Remaining functions listed in alphabetical order */ 58 | 59 | INT64 caxpy_(INT64 *n, fcomplex *ca, fcomplex *cx, INT64 *incx, fcomplex *cy, 60 | INT64 *incy); 61 | 62 | INT64 ccopy_(INT64 *n, fcomplex *cx, INT64 *incx, fcomplex *cy, INT64 *incy); 63 | 64 | INT64 cgbmv_(char *trans, INT64 *m, INT64 *n, INT64 *kl, INT64 *ku, 65 | fcomplex *alpha, fcomplex *a, INT64 *lda, fcomplex *x, INT64 *incx, 66 | fcomplex *beta, fcomplex *y, INT64 *incy); 67 | 68 | INT64 cgemm_(char *transa, char *transb, INT64 *m, INT64 *n, INT64 *k, 69 | fcomplex *alpha, fcomplex *a, INT64 *lda, fcomplex *b, INT64 *ldb, 70 | fcomplex *beta, fcomplex *c, INT64 *ldc); 71 | 72 | INT64 cgemv_(char *trans, INT64 *m, INT64 *n, fcomplex *alpha, fcomplex *a, 73 | INT64 *lda, fcomplex *x, INT64 *incx, fcomplex *beta, fcomplex *y, 74 | INT64 *incy); 75 | 76 | INT64 cgerc_(INT64 *m, INT64 *n, fcomplex *alpha, fcomplex *x, INT64 *incx, 77 | fcomplex *y, INT64 *incy, fcomplex *a, INT64 *lda); 78 | 79 | INT64 cgeru_(INT64 *m, INT64 *n, fcomplex *alpha, fcomplex *x, INT64 *incx, 80 | fcomplex *y, INT64 *incy, fcomplex *a, INT64 *lda); 81 | 82 | INT64 chbmv_(char *uplo, INT64 *n, INT64 *k, fcomplex *alpha, fcomplex *a, 83 | INT64 *lda, fcomplex *x, INT64 *incx, fcomplex *beta, fcomplex *y, 84 | INT64 *incy); 85 | 86 | INT64 chemm_(char 
*side, char *uplo, INT64 *m, INT64 *n, fcomplex *alpha, 87 | fcomplex *a, INT64 *lda, fcomplex *b, INT64 *ldb, fcomplex *beta, 88 | fcomplex *c, INT64 *ldc); 89 | 90 | INT64 chemv_(char *uplo, INT64 *n, fcomplex *alpha, fcomplex *a, INT64 *lda, 91 | fcomplex *x, INT64 *incx, fcomplex *beta, fcomplex *y, INT64 *incy); 92 | 93 | INT64 cher_(char *uplo, INT64 *n, float *alpha, fcomplex *x, INT64 *incx, 94 | fcomplex *a, INT64 *lda); 95 | 96 | INT64 cher2_(char *uplo, INT64 *n, fcomplex *alpha, fcomplex *x, INT64 *incx, 97 | fcomplex *y, INT64 *incy, fcomplex *a, INT64 *lda); 98 | 99 | INT64 cher2k_(char *uplo, char *trans, INT64 *n, INT64 *k, fcomplex *alpha, 100 | fcomplex *a, INT64 *lda, fcomplex *b, INT64 *ldb, float *beta, 101 | fcomplex *c, INT64 *ldc); 102 | 103 | INT64 cherk_(char *uplo, char *trans, INT64 *n, INT64 *k, float *alpha, 104 | fcomplex *a, INT64 *lda, float *beta, fcomplex *c, INT64 *ldc); 105 | 106 | INT64 chpmv_(char *uplo, INT64 *n, fcomplex *alpha, fcomplex *ap, fcomplex *x, 107 | INT64 *incx, fcomplex *beta, fcomplex *y, INT64 *incy); 108 | 109 | INT64 chpr_(char *uplo, INT64 *n, float *alpha, fcomplex *x, INT64 *incx, 110 | fcomplex *ap); 111 | 112 | INT64 chpr2_(char *uplo, INT64 *n, fcomplex *alpha, fcomplex *x, INT64 *incx, 113 | fcomplex *y, INT64 *incy, fcomplex *ap); 114 | 115 | INT64 crotg_(fcomplex *ca, fcomplex *cb, float *c, fcomplex *s); 116 | 117 | INT64 cscal_(INT64 *n, fcomplex *ca, fcomplex *cx, INT64 *incx); 118 | 119 | INT64 csscal_(INT64 *n, float *sa, fcomplex *cx, INT64 *incx); 120 | 121 | INT64 cswap_(INT64 *n, fcomplex *cx, INT64 *incx, fcomplex *cy, INT64 *incy); 122 | 123 | INT64 csymm_(char *side, char *uplo, INT64 *m, INT64 *n, fcomplex *alpha, 124 | fcomplex *a, INT64 *lda, fcomplex *b, INT64 *ldb, fcomplex *beta, 125 | fcomplex *c, INT64 *ldc); 126 | 127 | INT64 csyr2k_(char *uplo, char *trans, INT64 *n, INT64 *k, fcomplex *alpha, 128 | fcomplex *a, INT64 *lda, fcomplex *b, INT64 *ldb, fcomplex *beta, 129 | 
fcomplex *c, INT64 *ldc); 130 | 131 | INT64 csyrk_(char *uplo, char *trans, INT64 *n, INT64 *k, fcomplex *alpha, 132 | fcomplex *a, INT64 *lda, fcomplex *beta, fcomplex *c, INT64 *ldc); 133 | 134 | INT64 ctbmv_(char *uplo, char *trans, char *diag, INT64 *n, INT64 *k, 135 | fcomplex *a, INT64 *lda, fcomplex *x, INT64 *incx); 136 | 137 | INT64 ctbsv_(char *uplo, char *trans, char *diag, INT64 *n, INT64 *k, 138 | fcomplex *a, INT64 *lda, fcomplex *x, INT64 *incx); 139 | 140 | INT64 ctpmv_(char *uplo, char *trans, char *diag, INT64 *n, fcomplex *ap, 141 | fcomplex *x, INT64 *incx); 142 | 143 | INT64 ctpsv_(char *uplo, char *trans, char *diag, INT64 *n, fcomplex *ap, 144 | fcomplex *x, INT64 *incx); 145 | 146 | INT64 ctrmm_(char *side, char *uplo, char *transa, char *diag, INT64 *m, 147 | INT64 *n, fcomplex *alpha, fcomplex *a, INT64 *lda, fcomplex *b, 148 | INT64 *ldb); 149 | 150 | INT64 ctrmv_(char *uplo, char *trans, char *diag, INT64 *n, fcomplex *a, 151 | INT64 *lda, fcomplex *x, INT64 *incx); 152 | 153 | INT64 ctrsm_(char *side, char *uplo, char *transa, char *diag, INT64 *m, 154 | INT64 *n, fcomplex *alpha, fcomplex *a, INT64 *lda, fcomplex *b, 155 | INT64 *ldb); 156 | 157 | INT64 ctrsv_(char *uplo, char *trans, char *diag, INT64 *n, fcomplex *a, 158 | INT64 *lda, fcomplex *x, INT64 *incx); 159 | 160 | INT64 daxpy_(INT64 *n, double *sa, double *sx, INT64 *incx, double *sy, 161 | INT64 *incy); 162 | 163 | INT64 dcopy_(INT64 *n, double *sx, INT64 *incx, double *sy, INT64 *incy); 164 | 165 | INT64 dgbmv_(char *trans, INT64 *m, INT64 *n, INT64 *kl, INT64 *ku, 166 | double *alpha, double *a, INT64 *lda, double *x, INT64 *incx, 167 | double *beta, double *y, INT64 *incy); 168 | 169 | INT64 dgemm_(char *transa, char *transb, INT64 *m, INT64 *n, INT64 *k, 170 | double *alpha, double *a, INT64 *lda, double *b, INT64 *ldb, 171 | double *beta, double *c, INT64 *ldc); 172 | 173 | INT64 dgemv_(char *trans, INT64 *m, INT64 *n, double *alpha, double *a, 174 | INT64 *lda, double 
*x, INT64 *incx, double *beta, double *y, 175 | INT64 *incy); 176 | 177 | INT64 dger_(INT64 *m, INT64 *n, double *alpha, double *x, INT64 *incx, 178 | double *y, INT64 *incy, double *a, INT64 *lda); 179 | 180 | INT64 drot_(INT64 *n, double *sx, INT64 *incx, double *sy, INT64 *incy, 181 | double *c, double *s); 182 | 183 | INT64 drotg_(double *sa, double *sb, double *c, double *s); 184 | 185 | INT64 dsbmv_(char *uplo, INT64 *n, INT64 *k, double *alpha, double *a, 186 | INT64 *lda, double *x, INT64 *incx, double *beta, double *y, 187 | INT64 *incy); 188 | 189 | INT64 dscal_(INT64 *n, double *sa, double *sx, INT64 *incx); 190 | 191 | INT64 dspmv_(char *uplo, INT64 *n, double *alpha, double *ap, double *x, 192 | INT64 *incx, double *beta, double *y, INT64 *incy); 193 | 194 | INT64 dspr_(char *uplo, INT64 *n, double *alpha, double *x, INT64 *incx, 195 | double *ap); 196 | 197 | INT64 dspr2_(char *uplo, INT64 *n, double *alpha, double *x, INT64 *incx, 198 | double *y, INT64 *incy, double *ap); 199 | 200 | INT64 dswap_(INT64 *n, double *sx, INT64 *incx, double *sy, INT64 *incy); 201 | 202 | INT64 dsymm_(char *side, char *uplo, INT64 *m, INT64 *n, double *alpha, 203 | double *a, INT64 *lda, double *b, INT64 *ldb, double *beta, 204 | double *c, INT64 *ldc); 205 | 206 | INT64 dsymv_(char *uplo, INT64 *n, double *alpha, double *a, INT64 *lda, 207 | double *x, INT64 *incx, double *beta, double *y, INT64 *incy); 208 | 209 | INT64 dsyr_(char *uplo, INT64 *n, double *alpha, double *x, INT64 *incx, 210 | double *a, INT64 *lda); 211 | 212 | INT64 dsyr2_(char *uplo, INT64 *n, double *alpha, double *x, INT64 *incx, 213 | double *y, INT64 *incy, double *a, INT64 *lda); 214 | 215 | INT64 dsyr2k_(char *uplo, char *trans, INT64 *n, INT64 *k, double *alpha, 216 | double *a, INT64 *lda, double *b, INT64 *ldb, double *beta, 217 | double *c, INT64 *ldc); 218 | 219 | INT64 dsyrk_(char *uplo, char *trans, INT64 *n, INT64 *k, double *alpha, 220 | double *a, INT64 *lda, double *beta, double *c, 
INT64 *ldc); 221 | 222 | INT64 dtbmv_(char *uplo, char *trans, char *diag, INT64 *n, INT64 *k, 223 | double *a, INT64 *lda, double *x, INT64 *incx); 224 | 225 | INT64 dtbsv_(char *uplo, char *trans, char *diag, INT64 *n, INT64 *k, 226 | double *a, INT64 *lda, double *x, INT64 *incx); 227 | 228 | INT64 dtpmv_(char *uplo, char *trans, char *diag, INT64 *n, double *ap, 229 | double *x, INT64 *incx); 230 | 231 | INT64 dtpsv_(char *uplo, char *trans, char *diag, INT64 *n, double *ap, 232 | double *x, INT64 *incx); 233 | 234 | INT64 dtrmm_(char *side, char *uplo, char *transa, char *diag, INT64 *m, 235 | INT64 *n, double *alpha, double *a, INT64 *lda, double *b, 236 | INT64 *ldb); 237 | 238 | INT64 dtrmv_(char *uplo, char *trans, char *diag, INT64 *n, double *a, 239 | INT64 *lda, double *x, INT64 *incx); 240 | 241 | INT64 dtrsm_(char *side, char *uplo, char *transa, char *diag, INT64 *m, 242 | INT64 *n, double *alpha, double *a, INT64 *lda, double *b, 243 | INT64 *ldb); 244 | 245 | INT64 dtrsv_(char *uplo, char *trans, char *diag, INT64 *n, double *a, 246 | INT64 *lda, double *x, INT64 *incx); 247 | 248 | 249 | INT64 saxpy_(INT64 *n, float *sa, float *sx, INT64 *incx, float *sy, INT64 *incy); 250 | 251 | INT64 scopy_(INT64 *n, float *sx, INT64 *incx, float *sy, INT64 *incy); 252 | 253 | INT64 sgbmv_(char *trans, INT64 *m, INT64 *n, INT64 *kl, INT64 *ku, 254 | float *alpha, float *a, INT64 *lda, float *x, INT64 *incx, 255 | float *beta, float *y, INT64 *incy); 256 | 257 | INT64 sgemm_(char *transa, char *transb, INT64 *m, INT64 *n, INT64 *k, 258 | float *alpha, float *a, INT64 *lda, float *b, INT64 *ldb, 259 | float *beta, float *c, INT64 *ldc); 260 | 261 | INT64 sgemv_(char *trans, INT64 *m, INT64 *n, float *alpha, float *a, 262 | INT64 *lda, float *x, INT64 *incx, float *beta, float *y, 263 | INT64 *incy); 264 | 265 | INT64 sger_(INT64 *m, INT64 *n, float *alpha, float *x, INT64 *incx, 266 | float *y, INT64 *incy, float *a, INT64 *lda); 267 | 268 | INT64 srot_(INT64 *n, 
float *sx, INT64 *incx, float *sy, INT64 *incy, 269 | float *c, float *s); 270 | 271 | INT64 srotg_(float *sa, float *sb, float *c, float *s); 272 | 273 | INT64 ssbmv_(char *uplo, INT64 *n, INT64 *k, float *alpha, float *a, 274 | INT64 *lda, float *x, INT64 *incx, float *beta, float *y, 275 | INT64 *incy); 276 | 277 | INT64 sscal_(INT64 *n, float *sa, float *sx, INT64 *incx); 278 | 279 | INT64 sspmv_(char *uplo, INT64 *n, float *alpha, float *ap, float *x, 280 | INT64 *incx, float *beta, float *y, INT64 *incy); 281 | 282 | INT64 sspr_(char *uplo, INT64 *n, float *alpha, float *x, INT64 *incx, 283 | float *ap); 284 | 285 | INT64 sspr2_(char *uplo, INT64 *n, float *alpha, float *x, INT64 *incx, 286 | float *y, INT64 *incy, float *ap); 287 | 288 | INT64 sswap_(INT64 *n, float *sx, INT64 *incx, float *sy, INT64 *incy); 289 | 290 | INT64 ssymm_(char *side, char *uplo, INT64 *m, INT64 *n, float *alpha, 291 | float *a, INT64 *lda, float *b, INT64 *ldb, float *beta, 292 | float *c, INT64 *ldc); 293 | 294 | INT64 ssymv_(char *uplo, INT64 *n, float *alpha, float *a, INT64 *lda, 295 | float *x, INT64 *incx, float *beta, float *y, INT64 *incy); 296 | 297 | INT64 ssyr_(char *uplo, INT64 *n, float *alpha, float *x, INT64 *incx, 298 | float *a, INT64 *lda); 299 | 300 | INT64 ssyr2_(char *uplo, INT64 *n, float *alpha, float *x, INT64 *incx, 301 | float *y, INT64 *incy, float *a, INT64 *lda); 302 | 303 | INT64 ssyr2k_(char *uplo, char *trans, INT64 *n, INT64 *k, float *alpha, 304 | float *a, INT64 *lda, float *b, INT64 *ldb, float *beta, 305 | float *c, INT64 *ldc); 306 | 307 | INT64 ssyrk_(char *uplo, char *trans, INT64 *n, INT64 *k, float *alpha, 308 | float *a, INT64 *lda, float *beta, float *c, INT64 *ldc); 309 | 310 | INT64 stbmv_(char *uplo, char *trans, char *diag, INT64 *n, INT64 *k, 311 | float *a, INT64 *lda, float *x, INT64 *incx); 312 | 313 | INT64 stbsv_(char *uplo, char *trans, char *diag, INT64 *n, INT64 *k, 314 | float *a, INT64 *lda, float *x, INT64 *incx); 315 | 
316 | INT64 stpmv_(char *uplo, char *trans, char *diag, INT64 *n, float *ap, 317 | float *x, INT64 *incx); 318 | 319 | INT64 stpsv_(char *uplo, char *trans, char *diag, INT64 *n, float *ap, 320 | float *x, INT64 *incx); 321 | 322 | INT64 strmm_(char *side, char *uplo, char *transa, char *diag, INT64 *m, 323 | INT64 *n, float *alpha, float *a, INT64 *lda, float *b, 324 | INT64 *ldb); 325 | 326 | INT64 strmv_(char *uplo, char *trans, char *diag, INT64 *n, float *a, 327 | INT64 *lda, float *x, INT64 *incx); 328 | 329 | INT64 strsm_(char *side, char *uplo, char *transa, char *diag, INT64 *m, 330 | INT64 *n, float *alpha, float *a, INT64 *lda, float *b, 331 | INT64 *ldb); 332 | 333 | INT64 strsv_(char *uplo, char *trans, char *diag, INT64 *n, float *a, 334 | INT64 *lda, float *x, INT64 *incx); 335 | 336 | INT64 zaxpy_(INT64 *n, dcomplex *ca, dcomplex *cx, INT64 *incx, dcomplex *cy, 337 | INT64 *incy); 338 | 339 | INT64 zcopy_(INT64 *n, dcomplex *cx, INT64 *incx, dcomplex *cy, INT64 *incy); 340 | 341 | INT64 zdscal_(INT64 *n, double *sa, dcomplex *cx, INT64 *incx); 342 | 343 | INT64 zgbmv_(char *trans, INT64 *m, INT64 *n, INT64 *kl, INT64 *ku, 344 | dcomplex *alpha, dcomplex *a, INT64 *lda, dcomplex *x, INT64 *incx, 345 | dcomplex *beta, dcomplex *y, INT64 *incy); 346 | 347 | INT64 zgemm_(char *transa, char *transb, INT64 *m, INT64 *n, INT64 *k, 348 | dcomplex *alpha, dcomplex *a, INT64 *lda, dcomplex *b, INT64 *ldb, 349 | dcomplex *beta, dcomplex *c, INT64 *ldc); 350 | 351 | INT64 zgemv_(char *trans, INT64 *m, INT64 *n, dcomplex *alpha, dcomplex *a, 352 | INT64 *lda, dcomplex *x, INT64 *incx, dcomplex *beta, dcomplex *y, 353 | INT64 *incy); 354 | 355 | INT64 zgerc_(INT64 *m, INT64 *n, dcomplex *alpha, dcomplex *x, INT64 *incx, 356 | dcomplex *y, INT64 *incy, dcomplex *a, INT64 *lda); 357 | 358 | INT64 zgeru_(INT64 *m, INT64 *n, dcomplex *alpha, dcomplex *x, INT64 *incx, 359 | dcomplex *y, INT64 *incy, dcomplex *a, INT64 *lda); 360 | 361 | INT64 zhbmv_(char *uplo, INT64 
*n, INT64 *k, dcomplex *alpha, dcomplex *a, 362 | INT64 *lda, dcomplex *x, INT64 *incx, dcomplex *beta, dcomplex *y, 363 | INT64 *incy); 364 | 365 | INT64 zhemm_(char *side, char *uplo, INT64 *m, INT64 *n, dcomplex *alpha, 366 | dcomplex *a, INT64 *lda, dcomplex *b, INT64 *ldb, dcomplex *beta, 367 | dcomplex *c, INT64 *ldc); 368 | 369 | INT64 zhemv_(char *uplo, INT64 *n, dcomplex *alpha, dcomplex *a, INT64 *lda, 370 | dcomplex *x, INT64 *incx, dcomplex *beta, dcomplex *y, INT64 *incy); 371 | 372 | INT64 zher_(char *uplo, INT64 *n, double *alpha, dcomplex *x, INT64 *incx, 373 | dcomplex *a, INT64 *lda); 374 | 375 | INT64 zher2_(char *uplo, INT64 *n, dcomplex *alpha, dcomplex *x, INT64 *incx, 376 | dcomplex *y, INT64 *incy, dcomplex *a, INT64 *lda); 377 | 378 | INT64 zher2k_(char *uplo, char *trans, INT64 *n, INT64 *k, dcomplex *alpha, 379 | dcomplex *a, INT64 *lda, dcomplex *b, INT64 *ldb, double *beta, 380 | dcomplex *c, INT64 *ldc); 381 | 382 | INT64 zherk_(char *uplo, char *trans, INT64 *n, INT64 *k, double *alpha, 383 | dcomplex *a, INT64 *lda, double *beta, dcomplex *c, INT64 *ldc); 384 | 385 | INT64 zhpmv_(char *uplo, INT64 *n, dcomplex *alpha, dcomplex *ap, dcomplex *x, 386 | INT64 *incx, dcomplex *beta, dcomplex *y, INT64 *incy); 387 | 388 | INT64 zhpr_(char *uplo, INT64 *n, double *alpha, dcomplex *x, INT64 *incx, 389 | dcomplex *ap); 390 | 391 | INT64 zhpr2_(char *uplo, INT64 *n, dcomplex *alpha, dcomplex *x, INT64 *incx, 392 | dcomplex *y, INT64 *incy, dcomplex *ap); 393 | 394 | INT64 zrotg_(dcomplex *ca, dcomplex *cb, double *c, dcomplex *s); 395 | 396 | INT64 zscal_(INT64 *n, dcomplex *ca, dcomplex *cx, INT64 *incx); 397 | 398 | INT64 zswap_(INT64 *n, dcomplex *cx, INT64 *incx, dcomplex *cy, INT64 *incy); 399 | 400 | INT64 zsymm_(char *side, char *uplo, INT64 *m, INT64 *n, dcomplex *alpha, 401 | dcomplex *a, INT64 *lda, dcomplex *b, INT64 *ldb, dcomplex *beta, 402 | dcomplex *c, INT64 *ldc); 403 | 404 | INT64 zsyr2k_(char *uplo, char *trans, INT64 *n, 
/*
 * daxpy_: constant times a vector plus a vector, sy := (*sa) * sx + sy.
 *
 *   n     - pointer to the number of elements to process
 *   sa    - pointer to the scalar multiplier
 *   sx    - input vector, read with stride *incx
 *   incx  - pointer to the stride of sx
 *   sy    - vector updated in place, accessed with stride *incy
 *   incy  - pointer to the stride of sy
 *
 * Nothing is done when *n <= 0 or *sa == 0.0.  For a negative stride the
 * walk starts at offset (1 - n) * stride so that every index stays
 * non-negative.  Always returns 0.
 *
 * Based on the LINPACK routine by Jack Dongarra (3/11/78).
 */
INT64 daxpy_(INT64 *n, double *sa, double *sx, INT64 *incx, double *sy,
             INT64 *incy)
{
  /* Read every by-reference argument once. */
  const INT64 count = *n;
  const double alpha = *sa;
  const INT64 stride_x = *incx;
  const INT64 stride_y = *incy;
  INT64 k, px, py;

  if (count <= 0 || alpha == 0.0)
    return 0;

  if (stride_x == 1 && stride_y == 1)
  {
    /* Contiguous vectors: four elements per iteration, then the tail. */
    const INT64 unrolled_end = count - 3;

    for (k = 0; k < unrolled_end; k += 4)
    {
      sy[k]   += alpha * sx[k];
      sy[k+1] += alpha * sx[k+1];
      sy[k+2] += alpha * sx[k+2];
      sy[k+3] += alpha * sx[k+3];
    }
    while (k < count)
    {
      sy[k] += alpha * sx[k];
      ++k;
    }
    return 0;
  }

  /* General strides (unequal, or equal but different from 1). */
  px = (stride_x < 0) ? (1 - count) * stride_x : 0;
  py = (stride_y < 0) ? (1 - count) * stride_y : 0;
  for (k = 0; k < count; ++k, px += stride_x, py += stride_y)
    sy[py] += alpha * sx[px];

  return 0;
} /* daxpy_ */
12 | modified 12/3/93, array(1) declarations changed to array(*) */ 13 | 14 | /* Dereference inputs */ 15 | nn = *n; 16 | iincx = *incx; 17 | iincy = *incy; 18 | 19 | stemp = 0.0; 20 | if (nn > 0) 21 | { 22 | if (iincx == 1 && iincy == 1) /* code for both increments equal to 1 */ 23 | { 24 | m = nn-4; 25 | for (i = 0; i < m; i += 5) 26 | stemp += sx[i] * sy[i] + sx[i+1] * sy[i+1] + sx[i+2] * sy[i+2] + 27 | sx[i+3] * sy[i+3] + sx[i+4] * sy[i+4]; 28 | 29 | for ( ; i < nn; i++) /* clean-up loop */ 30 | stemp += sx[i] * sy[i]; 31 | } 32 | else /* code for unequal increments or equal increments not equal to 1 */ 33 | { 34 | ix = 0; 35 | iy = 0; 36 | if (iincx < 0) 37 | ix = (1 - nn) * iincx; 38 | if (iincy < 0) 39 | iy = (1 - nn) * iincy; 40 | for (i = 0; i < nn; i++) 41 | { 42 | stemp += sx[ix] * sy[iy]; 43 | ix += iincx; 44 | iy += iincy; 45 | } 46 | } 47 | } 48 | 49 | return stemp; 50 | } /* ddot_ */ 51 | -------------------------------------------------------------------------------- /libshorttext/classifier/learner/liblinear/blas/dnrm2.c: -------------------------------------------------------------------------------- 1 | #include /* Needed for fabs() and sqrt() */ 2 | #include "blas.h" 3 | 4 | double dnrm2_(INT64 *n, double *x, INT64 *incx) 5 | { 6 | INT64 ix, nn, iincx; 7 | double norm, scale, absxi, ssq, temp; 8 | 9 | /* DNRM2 returns the euclidean norm of a vector via the function 10 | name, so that 11 | 12 | DNRM2 := sqrt( x'*x ) 13 | 14 | -- This version written on 25-October-1982. 15 | Modified on 14-October-1993 to inline the call to SLASSQ. 16 | Sven Hammarling, Nag Ltd. 
*/ 17 | 18 | /* Dereference inputs */ 19 | nn = *n; 20 | iincx = *incx; 21 | 22 | if( nn > 0 && iincx > 0 ) 23 | { 24 | if (nn == 1) 25 | { 26 | norm = fabs(x[0]); 27 | } 28 | else 29 | { 30 | scale = 0.0; 31 | ssq = 1.0; 32 | 33 | /* The following loop is equivalent to this call to the LAPACK 34 | auxiliary routine: CALL SLASSQ( N, X, INCX, SCALE, SSQ ) */ 35 | 36 | for (ix=(nn-1)*iincx; ix>=0; ix-=iincx) 37 | { 38 | if (x[ix] != 0.0) 39 | { 40 | absxi = fabs(x[ix]); 41 | if (scale < absxi) 42 | { 43 | temp = scale / absxi; 44 | ssq = ssq * (temp * temp) + 1.0; 45 | scale = absxi; 46 | } 47 | else 48 | { 49 | temp = absxi / scale; 50 | ssq += temp * temp; 51 | } 52 | } 53 | } 54 | norm = scale * sqrt(ssq); 55 | } 56 | } 57 | else 58 | norm = 0.0; 59 | 60 | return norm; 61 | 62 | } /* dnrm2_ */ 63 | -------------------------------------------------------------------------------- /libshorttext/classifier/learner/liblinear/blas/dscal.c: -------------------------------------------------------------------------------- 1 | #include "blas.h" 2 | 3 | INT64 dscal_(INT64 *n, double *sa, double *sx, INT64 *incx) 4 | { 5 | INT64 i, m, nincx, nn, iincx; 6 | double ssa; 7 | 8 | /* scales a vector by a constant. 9 | uses unrolled loops for increment equal to 1. 10 | jack dongarra, linpack, 3/11/78. 11 | modified 3/93 to return if incx .le. 0. 
12 | modified 12/3/93, array(1) declarations changed to array(*) */ 13 | 14 | /* Dereference inputs */ 15 | nn = *n; 16 | iincx = *incx; 17 | ssa = *sa; 18 | 19 | if (nn > 0 && iincx > 0) 20 | { 21 | if (iincx == 1) /* code for increment equal to 1 */ 22 | { 23 | m = nn-4; 24 | for (i = 0; i < m; i += 5) 25 | { 26 | sx[i] = ssa * sx[i]; 27 | sx[i+1] = ssa * sx[i+1]; 28 | sx[i+2] = ssa * sx[i+2]; 29 | sx[i+3] = ssa * sx[i+3]; 30 | sx[i+4] = ssa * sx[i+4]; 31 | } 32 | for ( ; i < nn; ++i) /* clean-up loop */ 33 | sx[i] = ssa * sx[i]; 34 | } 35 | else /* code for increment not equal to 1 */ 36 | { 37 | nincx = nn * iincx; 38 | for (i = 0; i < nincx; i += iincx) 39 | sx[i] = ssa * sx[i]; 40 | } 41 | } 42 | 43 | return 0; 44 | } /* dscal_ */ 45 | -------------------------------------------------------------------------------- /libshorttext/classifier/learner/liblinear/linear.def: -------------------------------------------------------------------------------- 1 | LIBRARY liblinear 2 | EXPORTS 3 | train @1 4 | cross_validation @2 5 | save_model @3 6 | load_model @4 7 | get_nr_feature @5 8 | get_nr_class @6 9 | get_labels @7 10 | predict_values @8 11 | predict @9 12 | predict_probability @10 13 | free_and_destroy_model @11 14 | free_model_content @12 15 | destroy_param @13 16 | check_parameter @14 17 | check_probability_model @15 18 | set_print_string_function @16 19 | -------------------------------------------------------------------------------- /libshorttext/classifier/learner/liblinear/linear.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #ifndef _LIBLINEAR_H 4 | #define _LIBLINEAR_H 5 | #ifndef INT64_DEFINED 6 | typedef int64_t INT64; 7 | #define INT64_DEFINED 8 | #endif 9 | 10 | #ifdef __cplusplus 11 | extern "C" { 12 | #endif 13 | 14 | struct feature_node 15 | { 16 | INT64 index; 17 | double value; 18 | }; 19 | 20 | struct problem 21 | { 22 | INT64 l, n; 23 | double *y; 24 | struct feature_node **x; 25 | 
double bias; /* < 0 if no bias term */ 26 | }; 27 | 28 | enum { L2R_LR, L2R_L2LOSS_SVC_DUAL, L2R_L2LOSS_SVC, L2R_L1LOSS_SVC_DUAL, MCSVM_CS, L1R_L2LOSS_SVC, L1R_LR, L2R_LR_DUAL, L2R_L2LOSS_SVR = 11, L2R_L2LOSS_SVR_DUAL, L2R_L1LOSS_SVR_DUAL }; /* solver_type */ 29 | 30 | struct parameter 31 | { 32 | INT64 solver_type; 33 | 34 | /* these are for training only */ 35 | double eps; /* stopping criteria */ 36 | double C; 37 | INT64 nr_weight; 38 | INT64 *weight_label; 39 | double* weight; 40 | double p; 41 | }; 42 | 43 | struct model 44 | { 45 | struct parameter param; 46 | INT64 nr_class; /* number of classes */ 47 | INT64 nr_feature; 48 | double *w; 49 | INT64 *label; /* label of each class */ 50 | double bias; 51 | }; 52 | 53 | struct model* train(const struct problem *prob, const struct parameter *param); 54 | void cross_validation(const struct problem *prob, const struct parameter *param, INT64 nr_fold, double *target); 55 | 56 | double predict_values(const struct model *model_, const struct feature_node *x, double* dec_values); 57 | double predict(const struct model *model_, const struct feature_node *x); 58 | double predict_probability(const struct model *model_, const struct feature_node *x, double* prob_estimates); 59 | 60 | INT64 save_model(const char *model_file_name, const struct model *model_); 61 | struct model *load_model(const char *model_file_name); 62 | 63 | INT64 get_nr_feature(const struct model *model_); 64 | INT64 get_nr_class(const struct model *model_); 65 | void get_labels(const struct model *model_, INT64* label); 66 | 67 | void free_model_content(struct model *model_ptr); 68 | void free_and_destroy_model(struct model **model_ptr_ptr); 69 | void destroy_param(struct parameter *param); 70 | 71 | const char *check_parameter(const struct problem *prob, const struct parameter *param); 72 | INT64 check_probability_model(const struct model *model); 73 | void set_print_string_function(void (*print_func) (const char*)); 74 | 75 | #ifdef __cplusplus 76 | 
} 77 | #endif 78 | 79 | #endif /* _LIBLINEAR_H */ 80 | 81 | -------------------------------------------------------------------------------- /libshorttext/classifier/learner/liblinear/predict: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/2shou/python-libshorttext/460773dbbefe7a82a9b544ca419242b68a1a0533/libshorttext/classifier/learner/liblinear/predict -------------------------------------------------------------------------------- /libshorttext/classifier/learner/liblinear/predict.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "linear.h" 7 | 8 | int print_null(const char *s,...) {return 0;} 9 | 10 | static int (*info)(const char *fmt,...) = &printf; 11 | 12 | struct feature_node *x; 13 | INT64 max_nr_attr = 64; 14 | 15 | struct model* model_; 16 | INT64 flag_predict_probability=0; 17 | 18 | void exit_input_error(INT64 line_num) 19 | { 20 | fprintf(stderr,"Wrong input format at line %lld\n", (long long int)line_num); 21 | exit(1); 22 | } 23 | 24 | static char *line = NULL; 25 | static INT64 max_line_len; 26 | 27 | static char* readline(FILE *input) 28 | { 29 | INT64 len; 30 | 31 | if(fgets(line,max_line_len,input) == NULL) 32 | return NULL; 33 | 34 | while(strrchr(line,'\n') == NULL) 35 | { 36 | max_line_len *= 2; 37 | line = (char *) realloc(line,max_line_len); 38 | len = (INT64) strlen(line); 39 | if(fgets(line+len,max_line_len-len,input) == NULL) 40 | break; 41 | } 42 | return line; 43 | } 44 | 45 | void do_predict(FILE *input, FILE *output) 46 | { 47 | INT64 correct = 0; 48 | INT64 total = 0; 49 | double error = 0; 50 | double sump = 0, sumt = 0, sumpp = 0, sumtt = 0, sumpt = 0; 51 | 52 | INT64 nr_class=get_nr_class(model_); 53 | double *prob_estimates=NULL; 54 | INT64 j, n; 55 | INT64 nr_feature=get_nr_feature(model_); 56 | if(model_->bias>=0) 57 | n=nr_feature+1; 58 | else 59 | 
n=nr_feature; 60 | 61 | if(flag_predict_probability) 62 | { 63 | INT64 *labels; 64 | 65 | if(!check_probability_model(model_)) 66 | { 67 | fprintf(stderr, "probability output is only supported for logistic regression\n"); 68 | exit(1); 69 | } 70 | 71 | labels=(INT64 *) malloc(nr_class*sizeof(INT64)); 72 | get_labels(model_,labels); 73 | prob_estimates = (double *) malloc(nr_class*sizeof(double)); 74 | fprintf(output,"labels"); 75 | for(j=0;j=max_nr_attr-2) // need one more for index = -1 101 | { 102 | max_nr_attr *= 2; 103 | x = (struct feature_node *) realloc(x,max_nr_attr*sizeof(struct feature_node)); 104 | } 105 | 106 | idx = strtok(NULL,":"); 107 | val = strtok(NULL," \t"); 108 | 109 | if(val == NULL) 110 | break; 111 | errno = 0; 112 | x[i].index = (INT64) strtoll(idx,&endptr,10); 113 | if(endptr == idx || errno != 0 || *endptr != '\0' || x[i].index <= inst_max_index) 114 | exit_input_error(total+1); 115 | else 116 | inst_max_index = x[i].index; 117 | 118 | errno = 0; 119 | x[i].value = strtod(val,&endptr); 120 | if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr))) 121 | exit_input_error(total+1); 122 | 123 | // feature indices larger than those in training are not used 124 | if(x[i].index <= nr_feature) 125 | ++i; 126 | } 127 | 128 | if(model_->bias>=0) 129 | { 130 | x[i].index = n; 131 | x[i].value = model_->bias; 132 | i++; 133 | } 134 | x[i].index = -1; 135 | 136 | if(flag_predict_probability) 137 | { 138 | INT64 j; 139 | predict_label = predict_probability(model_,x,prob_estimates); 140 | fprintf(output,"%g",predict_label); 141 | for(j=0;jnr_class;j++) 142 | fprintf(output," %g",prob_estimates[j]); 143 | fprintf(output,"\n"); 144 | } 145 | else 146 | { 147 | predict_label = predict(model_,x); 148 | fprintf(output,"%g\n",predict_label); 149 | } 150 | 151 | if(predict_label == target_label) 152 | ++correct; 153 | error += (predict_label-target_label)*(predict_label-target_label); 154 | sump += predict_label; 155 | sumt += target_label; 
/*
 * Print the command-line usage of the predict tool to standard output
 * and terminate the process with exit status 1.
 */
void exit_with_help()
{
	/* The text contains no conversion specifiers, so writing it with
	   fputs() produces exactly what printf() printed. */
	static const char usage[] =
	"Usage: predict [options] test_file model_file output_file\n"
	"options:\n"
	"-b probability_estimates: whether to output probability estimates, 0 or 1 (default 0); currently for logistic regression only\n"
	"-q : quiet mode (no outputs)\n";

	fputs(usage, stdout);
	exit(1);
}
free_and_destroy_model(&model_); 242 | free(line); 243 | free(x); 244 | fclose(input); 245 | fclose(output); 246 | return 0; 247 | } 248 | 249 | -------------------------------------------------------------------------------- /libshorttext/classifier/learner/liblinear/python/Makefile: -------------------------------------------------------------------------------- 1 | all = lib 2 | 3 | lib: 4 | make -C .. lib 5 | -------------------------------------------------------------------------------- /libshorttext/classifier/learner/liblinear/python/README: -------------------------------------------------------------------------------- 1 | ------------------------------------- 2 | --- Python interface of LIBLINEAR --- 3 | ------------------------------------- 4 | 5 | Table of Contents 6 | ================= 7 | 8 | - Introduction 9 | - Installation 10 | - Quick Start 11 | - Design Description 12 | - Data Structures 13 | - Utility Functions 14 | - Additional Information 15 | 16 | Introduction 17 | ============ 18 | 19 | Python (http://www.python.org/) is a programming language suitable for rapid 20 | development. This tool provides a simple Python interface to LIBLINEAR, a library 21 | for support vector machines (http://www.csie.ntu.edu.tw/~cjlin/liblinear). The 22 | interface is very easy to use as the usage is the same as that of LIBLINEAR. The 23 | interface is developed with the built-in Python library "ctypes." 24 | 25 | Installation 26 | ============ 27 | 28 | On Unix systems, type 29 | 30 | > make 31 | 32 | The interface needs only LIBLINEAR shared library, which is generated by 33 | the above command. We assume that the shared library is on the LIBLINEAR 34 | main directory or in the system path. 35 | 36 | Quick Start 37 | =========== 38 | 39 | There are two levels of usage. The high-level one uses utility functions 40 | in liblinearutil.py and the usage is the same as the LIBLINEAR MATLAB interface. 
41 | 42 | >>> from liblinearutil import * 43 | # Read data in LIBSVM format 44 | >>> y, x = svm_read_problem('../heart_scale') 45 | >>> m = train(y[:200], x[:200], '-c 4') 46 | >>> p_label, p_acc, p_val = predict(y[200:], x[200:], m) 47 | 48 | # Construct problem in python format 49 | # Dense data 50 | >>> y, x = [1,-1], [[1,0,1], [-1,0,-1]] 51 | # Sparse data 52 | >>> y, x = [1,-1], [{1:1, 3:1}, {1:-1,3:-1}] 53 | >>> prob = problem(y, x) 54 | >>> param = parameter('-c 4 -B 1') 55 | >>> m = train(prob, param) 56 | 57 | # Other utility functions 58 | >>> save_model('heart_scale.model', m) 59 | >>> m = load_model('heart_scale.model') 60 | >>> p_label, p_acc, p_val = predict(y, x, m, '-b 1') 61 | >>> ACC, MSE, SCC = evaluations(y, p_label) 62 | 63 | # Getting online help 64 | >>> help(train) 65 | 66 | The low-level use directly calls C interfaces imported by liblinear.py. Note that 67 | all arguments and return values are in ctypes format. You need to handle them 68 | carefully. 69 | 70 | >>> from liblinear import * 71 | >>> prob = problem([1,-1], [{1:1, 3:1}, {1:-1,3:-1}]) 72 | >>> param = parameter('-c 4') 73 | >>> m = liblinear.train(prob, param) # m is a ctype pointer to a model 74 | # Convert a Python-format instance to feature_nodearray, a ctypes structure 75 | >>> x0, max_idx = gen_feature_nodearray({1:1, 3:1}) 76 | >>> label = liblinear.predict(m, x0) 77 | 78 | Design Description 79 | ================== 80 | 81 | There are two files liblinear.py and liblinearutil.py, which respectively correspond to 82 | low-level and high-level use of the interface. 83 | 84 | In liblinear.py, we adopt the Python built-in library "ctypes," so that 85 | Python can directly access C structures and interface functions defined 86 | in linear.h. 87 | 88 | While advanced users can use structures/functions in liblinear.py, to 89 | avoid handling ctypes structures, in liblinearutil.py we provide some easy-to-use 90 | functions. The usage is similar to LIBLINEAR MATLAB interface. 
91 | 92 | Data Structures 93 | =============== 94 | 95 | Three data structures derived from linear.h are node, problem, and 96 | parameter. They all contain fields with the same names in 97 | linear.h. Access these fields carefully because you directly use a C structure 98 | instead of a Python object. The following description introduces additional 99 | fields and methods. 100 | 101 | Before using the data structures, execute the following command to load the 102 | LIBLINEAR shared library: 103 | 104 | >>> from liblinear import * 105 | 106 | - class feature_node: 107 | 108 | Construct a feature_node. 109 | 110 | >>> node = feature_node(idx, val) 111 | 112 | idx: an integer indicates the feature index. 113 | 114 | val: a float indicates the feature value. 115 | 116 | Show the index and the value of a node. 117 | 118 | >>> print(node) 119 | 120 | - Function: gen_feature_nodearray(xi [,feature_max=None [,issparse=True]]) 121 | 122 | Generate a feature vector from a Python list/tuple or a dictionary: 123 | 124 | >>> xi, max_idx = gen_feature_nodearray({1:1, 3:1, 5:-2}) 125 | 126 | xi: the returned feature_nodearray (a ctypes structure) 127 | 128 | max_idx: the maximal feature index of xi 129 | 130 | issparse: if issparse == True, zero feature values are removed. The default 131 | value is True for the sparsity. 132 | 133 | feature_max: if feature_max is assigned, features with indices larger than 134 | feature_max are removed. 135 | 136 | - class problem: 137 | 138 | Construct a problem instance 139 | 140 | >>> prob = problem(y, x [,bias=-1]) 141 | 142 | y: a Python list/tuple of l labels (type must be int/double). 143 | 144 | x: a Python list/tuple of l data instances. Each element of x must be 145 | an instance of list/tuple/dictionary type. 
146 | 147 | bias: if bias >= 0, instance x becomes [x; bias]; if < 0, no bias term 148 | added (default -1) 149 | 150 | You can also modify the bias value by 151 | 152 | >>> prob.set_bias(1) 153 | 154 | Note that if your x contains sparse data (i.e., dictionary), the internal 155 | ctypes data format is still sparse. 156 | 157 | - class parameter: 158 | 159 | Construct a parameter instance 160 | 161 | >>> param = parameter('training_options') 162 | 163 | If 'training_options' is empty, LIBLINEAR default values are applied. 164 | 165 | Set param to LIBLINEAR default values. 166 | 167 | >>> param.set_to_default_values() 168 | 169 | Parse a string of options. 170 | 171 | >>> param.parse_options('training_options') 172 | 173 | Show values of parameters. 174 | 175 | >>> print(param) 176 | 177 | - class model: 178 | 179 | There are two ways to obtain an instance of model: 180 | 181 | >>> model_ = train(y, x) 182 | >>> model_ = load_model('model_file_name') 183 | 184 | Note that the returned structure of interface functions 185 | liblinear.train and liblinear.load_model is a ctypes pointer of 186 | model, which is different from the model object returned 187 | by train and load_model in liblinearutil.py. We provide a 188 | function toPyModel for the conversion: 189 | 190 | >>> model_ptr = liblinear.train(prob, param) 191 | >>> model_ = toPyModel(model_ptr) 192 | 193 | If you obtain a model in a way other than the above approaches, 194 | handle it carefully to avoid memory leak or segmentation fault. 
195 | 196 | Some interface functions to access LIBLINEAR models are wrapped as 197 | members of the class model: 198 | 199 | >>> nr_feature = model_.get_nr_feature() 200 | >>> nr_class = model_.get_nr_class() 201 | >>> class_labels = model_.get_labels() 202 | >>> is_prob_model = model_.is_probability_model() 203 | 204 | Utility Functions 205 | ================= 206 | 207 | To use utility functions, type 208 | 209 | >>> from liblinearutil import * 210 | 211 | The above command loads 212 | train() : train a linear model 213 | predict() : predict testing data 214 | svm_read_problem() : read the data from a LIBSVM-format file. 215 | load_model() : load a LIBLINEAR model. 216 | save_model() : save model to a file. 217 | evaluations() : evaluate prediction results. 218 | 219 | - Function: train 220 | 221 | There are three ways to call train() 222 | 223 | >>> model = train(y, x [, 'training_options']) 224 | >>> model = train(prob [, 'training_options']) 225 | >>> model = train(prob, param) 226 | 227 | y: a list/tuple of l training labels (type must be int/double). 228 | 229 | x: a list/tuple of l training instances. The feature vector of 230 | each training instance is an instance of list/tuple or dictionary. 231 | 232 | training_options: a string in the same form as that for LIBLINEAR command 233 | mode. 234 | 235 | prob: a problem instance generated by calling 236 | problem(y, x). 237 | 238 | param: a parameter instance generated by calling 239 | parameter('training_options') 240 | 241 | model: the returned model instance. See linear.h for details of this 242 | structure. If '-v' is specified, cross validation is 243 | conducted and the returned model is just a scalar: cross-validation 244 | accuracy for classification and mean-squared error for regression. 245 | 246 | To train the same data many times with different 247 | parameters, the second and the third ways should be faster. 
248 | 249 | Examples: 250 | 251 | >>> y, x = svm_read_problem('../heart_scale') 252 | >>> prob = problem(y, x) 253 | >>> param = parameter('-s 3 -c 5 -q') 254 | >>> m = train(y, x, '-c 5') 255 | >>> m = train(prob, '-w1 5 -c 5') 256 | >>> m = train(prob, param) 257 | >>> CV_ACC = train(y, x, '-v 3') 258 | 259 | - Function: predict 260 | 261 | To predict testing data with a model, use 262 | 263 | >>> p_labs, p_acc, p_vals = predict(y, x, model [,'predicting_options']) 264 | 265 | y: a list/tuple of l true labels (type must be int/double). It is used 266 | for calculating the accuracy. Use [] if true labels are 267 | unavailable. 268 | 269 | x: a list/tuple of l predicting instances. The feature vector of 270 | each predicting instance is an instance of list/tuple or dictionary. 271 | 272 | predicting_options: a string of predicting options in the same format as 273 | that of LIBLINEAR. 274 | 275 | model: a model instance. 276 | 277 | p_labels: a list of predicted labels 278 | 279 | p_acc: a tuple including accuracy (for classification), mean 280 | squared error, and squared correlation coefficient (for 281 | regression). 282 | 283 | p_vals: a list of decision values or probability estimates (if '-b 1' 284 | is specified). If k is the number of classes, for decision values, 285 | each element includes results of predicting k binary-class 286 | SVMs. If k = 2 and solver is not MCSVM_CS, only one decision value 287 | is returned. For probabilities, each element contains k values 288 | indicating the probability that the testing instance is in each class. 289 | Note that the order of classes here is the same as 'model.label' 290 | field in the model structure. 
291 | 292 | Example: 293 | 294 | >>> m = train(y, x, '-c 5') 295 | >>> p_labels, p_acc, p_vals = predict(y, x, m) 296 | 297 | - Functions: svm_read_problem/load_model/save_model 298 | 299 | See the usage by examples: 300 | 301 | >>> y, x = svm_read_problem('data.txt') 302 | >>> m = load_model('model_file') 303 | >>> save_model('model_file', m) 304 | 305 | - Function: evaluations 306 | 307 | Calculate some evaluations using the true values (ty) and predicted 308 | values (pv): 309 | 310 | >>> (ACC, MSE, SCC) = evaluations(ty, pv) 311 | 312 | ty: a list of true values. 313 | 314 | pv: a list of predicted values. 315 | 316 | ACC: accuracy. 317 | 318 | MSE: mean squared error. 319 | 320 | SCC: squared correlation coefficient. 321 | 322 | 323 | Additional Information 324 | ====================== 325 | 326 | This interface was written by Hsiang-Fu Yu from Department of Computer 327 | Science, National Taiwan University. If you find this tool useful, please 328 | cite LIBLINEAR as follows 329 | 330 | R.-E. Fan, K.-W. Chang, C.-J. Hsieh, X.-R. Wang, and C.-J. Lin. 331 | LIBLINEAR: A Library for Large Linear Classification, Journal of 332 | Machine Learning Research 9(2008), 1871-1874. Software available at 333 | http://www.csie.ntu.edu.tw/~cjlin/liblinear 334 | 335 | For any question, please contact Chih-Jen Lin, 336 | or check the FAQ page: 337 | 338 | http://www.csie.ntu.edu.tw/~cjlin/liblinear/faq.html 339 | -------------------------------------------------------------------------------- /libshorttext/classifier/learner/liblinear/python/liblinear.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from ctypes import * 4 | from ctypes.util import find_library 5 | from os import path 6 | import sys 7 | 8 | # For unix the prefix 'lib' is not considered. 
def genFields(names, types):
    """Pair field names with ctypes types, as ``Structure._fields_`` expects."""
    return list(zip(names, types))


def fillprototype(f, restype, argtypes):
    """Declare the return type and argument types of the foreign function *f*."""
    f.restype = restype
    f.argtypes = argtypes


class feature_node(Structure):
    """One (index, value) entry of a sparse feature vector."""
    _names = ["index", "value"]
    _types = [c_int64, c_double]
    _fields_ = genFields(_names, _types)

    def __str__(self):
        return '%d:%g' % (self.index, self.value)


def gen_feature_nodearray(xi, feature_max=None, issparse=True):
    """Convert one instance into a ctypes array of ``feature_node``.

    xi: a dict mapping feature index -> value, or a list/tuple of values
        whose first element is feature 1.
    feature_max: if given (and non-zero), features with a larger index are
        dropped.
    issparse: if true, features whose value is 0 are dropped.

    Returns ``(nodes, max_idx)``.  ``nodes`` holds the kept features in
    increasing index order followed by two extra slots whose index is -1:
    ``nodes[-2]`` is the slot callers overwrite with the bias feature and
    ``nodes[-1]`` is the terminator.  ``max_idx`` is the largest kept index,
    or 0 when no feature was kept.

    Raises TypeError if xi is not a dict/list/tuple or feature_max is not
    an int.
    """
    if isinstance(xi, dict):
        index_range = xi.keys()
    elif isinstance(xi, (list, tuple)):
        # Shift by one so that position j holds feature j (indices start at 1).
        # list() is required: ``[0] + xi`` raises TypeError when xi is a tuple.
        xi = [0] + list(xi)
        index_range = range(1, len(xi))
    else:
        raise TypeError('xi should be a dictionary, list or tuple')

    if feature_max:
        # Explicit check instead of assert, which is stripped under -O.
        if not isinstance(feature_max, int):
            raise TypeError('feature_max should be an int')
        index_range = [j for j in index_range if j <= feature_max]
    if issparse:
        index_range = [j for j in index_range if xi[j] != 0]

    index_range = sorted(index_range)
    ret = (feature_node * (len(index_range) + 2))()
    ret[-1].index = -1  # terminator
    ret[-2].index = -1  # placeholder for the bias term
    for idx, j in enumerate(index_range):
        ret[idx].index = j
        ret[idx].value = xi[j]
    max_idx = index_range[-1] if index_range else 0
    return ret, max_idx


class problem(Structure):
    """ctypes view of a training problem: labels ``y`` and instances ``x``.

    ``l`` is the number of instances, ``n`` the largest feature index
    (plus one while a non-negative bias is set) and ``bias`` the current
    bias value.  The Python-side list ``x_space`` keeps the per-instance
    node arrays that ``x`` points to.
    """
    _names = ["l", "n", "y", "x", "bias"]
    _types = [c_int64, c_int64, POINTER(c_double), POINTER(POINTER(feature_node)), c_double]
    _fields_ = genFields(_names, _types)

    def __init__(self, y, x, bias=-1):
        """Build the problem from labels *y* and instances *x*.

        Raises ValueError if y and x differ in length.
        """
        if len(y) != len(x):
            raise ValueError("len(y) != len(x)")
        self.l = l = len(y)
        # Start without a bias term; set_bias() below applies the requested one.
        self.bias = -1

        max_idx = 0
        x_space = self.x_space = []
        for xi in x:
            tmp_xi, tmp_idx = gen_feature_nodearray(xi)
            x_space.append(tmp_xi)
            max_idx = max(max_idx, tmp_idx)
        self.n = max_idx

        self.y = (c_double * l)()
        for i, yi in enumerate(y):
            self.y[i] = yi

        self.x = (POINTER(feature_node) * l)()
        for i, xi in enumerate(self.x_space):
            self.x[i] = xi

        self.set_bias(bias)

    def set_bias(self, bias):
        """Set the bias term of every instance to *bias*.

        A non-negative bias occupies feature index ``n`` (``n`` grows by one
        when a bias is first enabled); a negative bias disables the term.
        """
        if self.bias == bias:
            return
        if bias >= 0:
            if self.bias < 0:
                # Enabling the bias adds one feature.
                self.n += 1
            # Also covers changing one non-negative bias to another, which
            # previously left ``node`` unbound.
            node = feature_node(self.n, bias)
        else:
            if self.bias >= 0:
                # Disabling the bias removes the extra feature.
                self.n -= 1
            node = feature_node(-1, bias)

        for xi in self.x_space:
            xi[-2] = node
        self.bias = bias
| self.cross_validation = False 137 | self.nr_fold = 0 138 | self.print_func = None 139 | 140 | def parse_options(self, options): 141 | if isinstance(options, list): 142 | argv = options 143 | elif isinstance(options, str): 144 | argv = options.split() 145 | else: 146 | raise TypeError("arg 1 should be a list or a str.") 147 | self.set_to_default_values() 148 | self.print_func = cast(None, PRINT_STRING_FUN) 149 | weight_label = [] 150 | weight = [] 151 | 152 | i = 0 153 | while i < len(argv) : 154 | if argv[i] == "-s": 155 | i = i + 1 156 | self.solver_type = int(argv[i]) 157 | elif argv[i] == "-c": 158 | i = i + 1 159 | self.C = float(argv[i]) 160 | elif argv[i] == "-p": 161 | i = i + 1 162 | self.p = float(argv[i]) 163 | elif argv[i] == "-e": 164 | i = i + 1 165 | self.eps = float(argv[i]) 166 | elif argv[i] == "-B": 167 | i = i + 1 168 | self.bias = float(argv[i]) 169 | elif argv[i] == "-v": 170 | i = i + 1 171 | self.cross_validation = 1 172 | self.nr_fold = int(argv[i]) 173 | if self.nr_fold < 2 : 174 | raise ValueError("n-fold cross validation: n must >= 2") 175 | elif argv[i].startswith("-w"): 176 | i = i + 1 177 | self.nr_weight += 1 178 | nr_weight = self.nr_weight 179 | weight_label += [int(argv[i-1][2:])] 180 | weight += [float(argv[i])] 181 | elif argv[i] == "-q": 182 | self.print_func = PRINT_STRING_FUN(print_null) 183 | else : 184 | raise ValueError("Wrong options") 185 | i += 1 186 | 187 | liblinear.set_print_string_function(self.print_func) 188 | self.weight_label = (c_int64*self.nr_weight)() 189 | self.weight = (c_double*self.nr_weight)() 190 | for i in range(self.nr_weight): 191 | self.weight[i] = weight[i] 192 | self.weight_label[i] = weight_label[i] 193 | 194 | if self.eps == float('inf'): 195 | if self.solver_type in [L2R_LR, L2R_L2LOSS_SVC]: 196 | self.eps = 0.01 197 | elif self.solver_type in [L2R_L2LOSS_SVR]: 198 | self.eps = 0.001 199 | elif self.solver_type in [L2R_L2LOSS_SVC_DUAL, L2R_L1LOSS_SVC_DUAL, MCSVM_CS, L2R_LR_DUAL]: 200 | 
self.eps = 0.1 201 | elif self.solver_type in [L1R_L2LOSS_SVC, L1R_LR]: 202 | self.eps = 0.01 203 | elif self.solver_type in [L2R_L2LOSS_SVR_DUAL, L2R_L1LOSS_SVR_DUAL]: 204 | self.eps = 0.1 205 | 206 | class model(Structure): 207 | _names = ["param", "nr_class", "nr_feature", "w", "label", "bias"] 208 | _types = [parameter, c_int64, c_int64, POINTER(c_double), POINTER(c_int64), c_double] 209 | _fields_ = genFields(_names, _types) 210 | 211 | def __init__(self): 212 | self.__createfrom__ = 'python' 213 | 214 | def __del__(self): 215 | # free memory created by C to avoid memory leak 216 | if hasattr(self, '__createfrom__') and self.__createfrom__ == 'C': 217 | liblinear.free_and_destroy_model(pointer(self)) 218 | 219 | def get_nr_feature(self): 220 | return liblinear.get_nr_feature(self) 221 | 222 | def get_nr_class(self): 223 | return liblinear.get_nr_class(self) 224 | 225 | def get_labels(self): 226 | nr_class = self.get_nr_class() 227 | labels = (c_int64 * nr_class)() 228 | liblinear.get_labels(self, labels) 229 | return labels[:nr_class] 230 | 231 | def is_probability_model(self): 232 | return (liblinear.check_probability_model(self) == 1) 233 | 234 | def toPyModel(model_ptr): 235 | """ 236 | toPyModel(model_ptr) -> model 237 | 238 | Convert a ctypes POINTER(model) to a Python model 239 | """ 240 | if bool(model_ptr) == False: 241 | raise ValueError("Null pointer") 242 | m = model_ptr.contents 243 | m.__createfrom__ = 'C' 244 | return m 245 | 246 | fillprototype(liblinear.train, POINTER(model), [POINTER(problem), POINTER(parameter)]) 247 | fillprototype(liblinear.cross_validation, None, [POINTER(problem), POINTER(parameter), c_int64, POINTER(c_double)]) 248 | 249 | fillprototype(liblinear.predict_values, c_double, [POINTER(model), POINTER(feature_node), POINTER(c_double)]) 250 | fillprototype(liblinear.predict, c_double, [POINTER(model), POINTER(feature_node)]) 251 | fillprototype(liblinear.predict_probability, c_double, [POINTER(model), POINTER(feature_node), 
def svm_read_problem(data_file_name):
    """
    svm_read_problem(data_file_name) -> [y, x]

    Read LIBSVM-format data from data_file_name and return labels y
    and data instances x.

    y is a list of floats; x is a list of dicts mapping feature index
    (int) to value (float).
    """
    prob_y = []
    prob_x = []
    # The context manager closes the file even if a malformed line raises.
    with open(data_file_name) as data_file:
        for line in data_file:
            line = line.split(None, 1)
            # In case an instance with all zero features
            if len(line) == 1:
                line += ['']
            label, features = line
            xi = {}
            for e in features.split():
                ind, val = e.split(":")
                xi[int(ind)] = float(val)
            prob_y.append(float(label))
            prob_x.append(xi)
    return (prob_y, prob_x)


def load_model(model_file_name):
    """
    load_model(model_file_name) -> model

    Load a LIBLINEAR model from model_file_name and return.
    Prints a message and returns None if the model cannot be loaded.
    """
    model = liblinear.load_model(model_file_name.encode())
    if not model:
        print("can't open model file %s" % model_file_name)
        return None
    return toPyModel(model)


def save_model(model_file_name, model):
    """
    save_model(model_file_name, model) -> None

    Save a LIBLINEAR model to the file model_file_name.
    Prints a message if the C library reports a failure.
    """
    # The C routine returns non-zero on failure (train.c checks it the same
    # way); report it instead of dropping it, mirroring load_model().
    if liblinear.save_model(model_file_name.encode(), model):
        print("can't save model to file %s" % model_file_name)


def evaluations(ty, pv):
    """
    evaluations(ty, pv) -> (ACC, MSE, SCC)

    Calculate accuracy, mean squared error and squared correlation coefficient
    using the true values (ty) and predicted values (pv).

    SCC is nan when it is undefined (zero variance in ty or pv).  For empty
    input all three values are nan.  Raises ValueError if the lengths differ.
    """
    if len(ty) != len(pv):
        raise ValueError("len(ty) must equal to len(pv)")
    l = len(ty)
    if l == 0:
        # Nothing to evaluate; avoid dividing by zero below.
        nan = float('nan')
        return (nan, nan, nan)

    total_correct = total_error = 0
    sumv = sumy = sumvv = sumyy = sumvy = 0
    for v, y in zip(pv, ty):
        if y == v:
            total_correct += 1
        total_error += (v-y)*(v-y)
        sumv += v
        sumy += y
        sumvv += v*v
        sumyy += y*y
        sumvy += v*y
    ACC = 100.0*total_correct/l
    # float() keeps this a true division on Python 2 even for integer labels.
    MSE = total_error/float(l)
    covariance = l*sumvy - sumv*sumy
    try:
        SCC = float(covariance*covariance)/((l*sumvv-sumv*sumv)*(l*sumyy-sumy*sumy))
    except ZeroDivisionError:
        SCC = float('nan')
    return (ACC, MSE, SCC)


def train(arg1, arg2=None, arg3=None):
    """
    train(y, x [, options]) -> model | ACC
    train(prob [, options]) -> model | ACC
    train(prob, param) -> model | ACC

    Train a model from data (y, x) or a problem prob using
    'options' or a parameter param.
    If '-v' is specified in 'options' (i.e., cross validation)
    either accuracy (ACC) or mean-squared error (MSE) is returned.

    Raises TypeError for unsupported argument types and ValueError if the
    C library rejects the parameters.

    options:
        -s type : set type of solver (default 1)
          for multi-class classification
             0 -- L2-regularized logistic regression (primal)
             1 -- L2-regularized L2-loss support vector classification (dual)
             2 -- L2-regularized L2-loss support vector classification (primal)
             3 -- L2-regularized L1-loss support vector classification (dual)
             4 -- support vector classification by Crammer and Singer
             5 -- L1-regularized L2-loss support vector classification
             6 -- L1-regularized logistic regression
             7 -- L2-regularized logistic regression (dual)
          for regression
            11 -- L2-regularized L2-loss support vector regression (primal)
            12 -- L2-regularized L2-loss support vector regression (dual)
            13 -- L2-regularized L1-loss support vector regression (dual)
        -c cost : set the parameter C (default 1)
        -p epsilon : set the epsilon in loss function of SVR (default 0.1)
        -e epsilon : set tolerance of termination criterion
            -s 0 and 2
                |f'(w)|_2 <= eps*min(pos,neg)/l*|f'(w0)|_2,
                where f is the primal function, (default 0.01)
            -s 11
                |f'(w)|_2 <= eps*|f'(w0)|_2 (default 0.001)
            -s 1, 3, 4, and 7
                Dual maximal violation <= eps; similar to libsvm (default 0.1)
            -s 5 and 6
                |f'(w)|_inf <= eps*min(pos,neg)/l*|f'(w0)|_inf,
                where f is the primal function (default 0.01)
            -s 12 and 13
                |f'(alpha)|_1 <= eps |f'(alpha0)|,
                where f is the dual function (default 0.1)
        -B bias : if bias >= 0, instance x becomes [x; bias]; if < 0, no bias term added (default -1)
        -wi weight: weights adjust the parameter C of different classes (see README for details)
        -v n: n-fold cross validation mode
        -q : quiet mode (no outputs)
    """
    prob, param = None, None
    if isinstance(arg1, (list, tuple)):
        # Explicit check instead of assert, which is stripped under -O.
        if not isinstance(arg2, (list, tuple)):
            raise TypeError("Wrong types for the arguments")
        y, x, options = arg1, arg2, arg3
        prob = problem(y, x)
        param = parameter(options)
    elif isinstance(arg1, problem):
        prob = arg1
        if isinstance(arg2, parameter):
            param = arg2
        else:
            param = parameter(arg2)
    if prob is None or param is None:
        raise TypeError("Wrong types for the arguments")

    prob.set_bias(param.bias)
    liblinear.set_print_string_function(param.print_func)
    err_msg = liblinear.check_parameter(prob, param)
    if err_msg:
        # On Python 3 the C string arrives as bytes; decode it for the message.
        if not isinstance(err_msg, str):
            err_msg = err_msg.decode()
        raise ValueError('Error: %s' % err_msg)

    if param.cross_validation:
        l, nr_fold = prob.l, param.nr_fold
        target = (c_double * l)()
        liblinear.cross_validation(prob, param, nr_fold, target)
        ACC, MSE, SCC = evaluations(prob.y[:l], target[:l])
        if param.solver_type in [L2R_L2LOSS_SVR, L2R_L2LOSS_SVR_DUAL, L2R_L1LOSS_SVR_DUAL]:
            print("Cross Validation Mean squared error = %g" % MSE)
            print("Cross Validation Squared correlation coefficient = %g" % SCC)
            return MSE
        else:
            print("Cross Validation Accuracy = %g%%" % ACC)
            return ACC
    else:
        m = liblinear.train(prob, param)
        m = toPyModel(m)

        return m


def predict(y, x, m, options=""):
    """
    predict(y, x, m [, options]) -> (p_labels, p_acc, p_vals)

    Predict data (y, x) with the SVM model m.
    options:
        -b probability_estimates: whether to output probability estimates, 0 or 1 (default 0); currently for logistic regression only
        -q quiet mode (no outputs)

    The return tuple contains
    p_labels: a list of predicted labels
    p_acc: a tuple including accuracy (for classification), mean-squared
           error, and squared correlation coefficient (for regression).
    p_vals: a list of decision values or probability estimates (if '-b 1'
            is specified). If k is the number of classes, for decision values,
            each element includes results of predicting k binary-class
            SVMs. if k = 2 and solver is not MCSVM_CS, only one decision value
            is returned. For probabilities, each element contains k values
            indicating the probability that the testing instance is in each class.
            Note that the order of classes here is the same as 'model.label'
            field in the model structure.

    Raises ValueError for unknown or incomplete options and TypeError when
    probability output is requested from a model that does not support it.
    """

    def info(s):
        print(s)

    predict_probability = 0
    argv = options.split()
    i = 0
    while i < len(argv):
        if argv[i] == '-b':
            i += 1
            if i >= len(argv):
                # Previously surfaced as a bare IndexError.
                raise ValueError("Wrong options")
            predict_probability = int(argv[i])
        elif argv[i] == '-q':
            info = print_null
        else:
            raise ValueError("Wrong options")
        i += 1

    solver_type = m.param.solver_type
    nr_class = m.get_nr_class()
    nr_feature = m.get_nr_feature()
    is_prob_model = m.is_probability_model()
    bias = m.bias
    if bias >= 0:
        biasterm = feature_node(nr_feature+1, bias)
    else:
        biasterm = feature_node(-1, bias)
    pred_labels = []
    pred_values = []

    if predict_probability:
        if not is_prob_model:
            raise TypeError('probability output is only supported for logistic regression')
        prob_estimates = (c_double * nr_class)()
        for xi in x:
            xi, idx = gen_feature_nodearray(xi, feature_max=nr_feature)
            xi[-2] = biasterm
            label = liblinear.predict_probability(m, xi, prob_estimates)
            values = prob_estimates[:nr_class]
            pred_labels.append(label)
            pred_values.append(values)
    else:
        # As documented above, a single decision value is produced only when
        # k <= 2 and the solver is not MCSVM_CS; otherwise there are k values,
        # so the buffer must hold k doubles in that case.
        if nr_class <= 2 and solver_type != MCSVM_CS:
            nr_classifier = 1
        else:
            nr_classifier = nr_class
        dec_values = (c_double * nr_classifier)()
        for xi in x:
            xi, idx = gen_feature_nodearray(xi, feature_max=nr_feature)
            xi[-2] = biasterm
            label = liblinear.predict_values(m, xi, dec_values)
            values = dec_values[:nr_classifier]
            pred_labels.append(label)
            pred_values.append(values)
    if len(y) == 0:
        y = [0] * len(x)
    ACC, MSE, SCC = evaluations(y, pred_labels)
    l = len(y)
    if solver_type in [L2R_L2LOSS_SVR, L2R_L2LOSS_SVR_DUAL, L2R_L1LOSS_SVR_DUAL]:
        info("Mean squared error = %g (regression)" % MSE)
        info("Squared correlation coefficient = %g (regression)" % SCC)
    else:
        # round() recovers the exact count from the percentage; plain int()
        # can truncate one too low because of floating-point error.
        nr_correct = int(round(l*ACC/100)) if l else 0
        info("Accuracy = %g%% (%d/%d) (classification)" % (ACC, nr_correct, l))

    return pred_labels, (ACC, MSE, SCC), pred_values
"-s type : set type of solver (default 1)\n" 19 | " for multi-class classification\n" 20 | " 0 -- L2-regularized logistic regression (primal)\n" 21 | " 1 -- L2-regularized L2-loss support vector classification (dual)\n" 22 | " 2 -- L2-regularized L2-loss support vector classification (primal)\n" 23 | " 3 -- L2-regularized L1-loss support vector classification (dual)\n" 24 | " 4 -- support vector classification by Crammer and Singer\n" 25 | " 5 -- L1-regularized L2-loss support vector classification\n" 26 | " 6 -- L1-regularized logistic regression\n" 27 | " 7 -- L2-regularized logistic regression (dual)\n" 28 | " for regression\n" 29 | " 11 -- L2-regularized L2-loss support vector regression (primal)\n" 30 | " 12 -- L2-regularized L2-loss support vector regression (dual)\n" 31 | " 13 -- L2-regularized L1-loss support vector regression (dual)\n" 32 | "-c cost : set the parameter C (default 1)\n" 33 | "-p epsilon : set the epsilon in loss function of SVR (default 0.1)\n" 34 | "-e epsilon : set tolerance of termination criterion\n" 35 | " -s 0 and 2\n" 36 | " |f'(w)|_2 <= eps*min(pos,neg)/l*|f'(w0)|_2,\n" 37 | " where f is the primal function and pos/neg are # of\n" 38 | " positive/negative data (default 0.01)\n" 39 | " -s 11\n" 40 | " |f'(w)|_2 <= eps*|f'(w0)|_2 (default 0.001)\n" 41 | " -s 1, 3, 4, and 7\n" 42 | " Dual maximal violation <= eps; similar to libsvm (default 0.1)\n" 43 | " -s 5 and 6\n" 44 | " |f'(w)|_1 <= eps*min(pos,neg)/l*|f'(w0)|_1,\n" 45 | " where f is the primal function (default 0.01)\n" 46 | " -s 12 and 13\n" 47 | " |f'(alpha)|_1 <= eps |f'(alpha0)|,\n" 48 | " where f is the dual function (default 0.1)\n" 49 | "-B bias : if bias >= 0, instance x becomes [x; bias]; if < 0, no bias term added (default -1)\n" 50 | "-wi weight: weights adjust the parameter C of different classes (see README for details)\n" 51 | "-v n: n-fold cross validation mode\n" 52 | "-q : quiet mode (no outputs)\n" 53 | ); 54 | exit(1); 55 | } 56 | 57 | void 
exit_input_error(INT64 line_num) 58 | { 59 | fprintf(stderr,"Wrong input format at line %lld\n", (long long int)line_num); 60 | exit(1); 61 | } 62 | 63 | static char *line = NULL; 64 | static INT64 max_line_len; 65 | 66 | static char* readline(FILE *input) 67 | { 68 | INT64 len; 69 | 70 | if(fgets(line,max_line_len,input) == NULL) 71 | return NULL; 72 | 73 | while(strrchr(line,'\n') == NULL) 74 | { 75 | max_line_len *= 2; 76 | line = (char *) realloc(line,max_line_len); 77 | len = (INT64) strlen(line); 78 | if(fgets(line+len,max_line_len-len,input) == NULL) 79 | break; 80 | } 81 | return line; 82 | } 83 | 84 | void parse_command_line(INT64 argc, char **argv, char *input_file_name, char *model_file_name); 85 | void read_problem(const char *filename); 86 | void do_cross_validation(); 87 | 88 | struct feature_node *x_space; 89 | struct parameter param; 90 | struct problem prob; 91 | struct model* model_; 92 | INT64 flag_cross_validation; 93 | INT64 nr_fold; 94 | double bias; 95 | 96 | int main(int argc, char **argv) 97 | { 98 | char input_file_name[1024]; 99 | char model_file_name[1024]; 100 | const char *error_msg; 101 | 102 | parse_command_line((INT64)argc, argv, input_file_name, model_file_name); 103 | read_problem(input_file_name); 104 | error_msg = check_parameter(&prob,¶m); 105 | 106 | if(error_msg) 107 | { 108 | fprintf(stderr,"ERROR: %s\n",error_msg); 109 | exit(1); 110 | } 111 | 112 | if(flag_cross_validation) 113 | { 114 | do_cross_validation(); 115 | } 116 | else 117 | { 118 | model_=train(&prob, ¶m); 119 | if(save_model(model_file_name, model_)) 120 | { 121 | fprintf(stderr,"can't save model to file %s\n",model_file_name); 122 | exit(1); 123 | } 124 | free_and_destroy_model(&model_); 125 | } 126 | destroy_param(¶m); 127 | free(prob.y); 128 | free(prob.x); 129 | free(x_space); 130 | free(line); 131 | 132 | return 0; 133 | } 134 | 135 | void do_cross_validation() 136 | { 137 | INT64 i; 138 | INT64 total_correct = 0; 139 | double total_error = 0; 140 | double 
sumv = 0, sumy = 0, sumvv = 0, sumyy = 0, sumvy = 0; 141 | double *target = Malloc(double, prob.l); 142 | 143 | cross_validation(&prob,¶m,nr_fold,target); 144 | if(param.solver_type == L2R_L2LOSS_SVR || 145 | param.solver_type == L2R_L1LOSS_SVR_DUAL || 146 | param.solver_type == L2R_L2LOSS_SVR_DUAL) 147 | { 148 | for(i=0;i=argc) 197 | exit_with_help(); 198 | switch(argv[i-1][1]) 199 | { 200 | case 's': 201 | param.solver_type = atoi(argv[i]); 202 | break; 203 | 204 | case 'c': 205 | param.C = atof(argv[i]); 206 | break; 207 | 208 | case 'p': 209 | param.p = atof(argv[i]); 210 | break; 211 | 212 | case 'e': 213 | param.eps = atof(argv[i]); 214 | break; 215 | 216 | case 'B': 217 | bias = atof(argv[i]); 218 | break; 219 | 220 | case 'w': 221 | ++param.nr_weight; 222 | param.weight_label = (INT64 *) realloc(param.weight_label,sizeof(INT64)*param.nr_weight); 223 | param.weight = (double *) realloc(param.weight,sizeof(double)*param.nr_weight); 224 | param.weight_label[param.nr_weight-1] = atoi(&argv[i-1][2]); 225 | param.weight[param.nr_weight-1] = atof(argv[i]); 226 | break; 227 | 228 | case 'v': 229 | flag_cross_validation = 1; 230 | nr_fold = atoi(argv[i]); 231 | if(nr_fold < 2) 232 | { 233 | fprintf(stderr,"n-fold cross validation: n must >= 2\n"); 234 | exit_with_help(); 235 | } 236 | break; 237 | 238 | case 'q': 239 | print_func = &print_null; 240 | i--; 241 | break; 242 | 243 | default: 244 | fprintf(stderr,"unknown option: -%c\n", argv[i-1][1]); 245 | exit_with_help(); 246 | break; 247 | } 248 | } 249 | 250 | set_print_string_function(print_func); 251 | 252 | // determine filenames 253 | if(i>=argc) 254 | exit_with_help(); 255 | 256 | strcpy(input_file_name, argv[i]); 257 | 258 | if(i= 0) elements += prob.l; 339 | 340 | prob.y = Malloc(double,prob.l); 341 | prob.x = Malloc(struct feature_node *,prob.l); 342 | x_space = Malloc(struct feature_node,elements+prob.l); 343 | 344 | max_index = 0; 345 | j=0; 346 | for(i=0;i max_index) 383 | max_index = inst_max_index; 
384 | 385 | if(prob.bias >= 0) 386 | x_space[j++].value = prob.bias; 387 | 388 | x_space[j++].index = -1; 389 | } 390 | 391 | if(prob.bias >= 0) 392 | { 393 | prob.n=max_index+1; 394 | for(i=1;iindex = prob.n; 396 | x_space[j-2].index = prob.n; 397 | } 398 | else 399 | prob.n=max_index; 400 | 401 | fclose(fp); 402 | } 403 | -------------------------------------------------------------------------------- /libshorttext/classifier/learner/liblinear/tron.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "tron.h" 6 | #ifndef INT64_DEFINED 7 | typedef int64_t INT64; 8 | #define INT64_DEFINED 9 | #endif 10 | 11 | #ifndef min 12 | template static inline T min(T x,T y) { return (x static inline T max(T x,T y) { return (x>y)?x:y; } 17 | #endif 18 | 19 | #ifdef __cplusplus 20 | extern "C" { 21 | #endif 22 | 23 | extern double dnrm2_(INT64 *, double *, INT64 *); 24 | extern double ddot_(INT64 *, double *, INT64 *, double *, INT64 *); 25 | extern INT64 daxpy_(INT64 *, double *, double *, INT64 *, double *, INT64 *); 26 | extern INT64 dscal_(INT64 *, double *, double *, INT64 *); 27 | 28 | #ifdef __cplusplus 29 | } 30 | #endif 31 | 32 | static void default_print(const char *buf) 33 | { 34 | fputs(buf,stdout); 35 | fflush(stdout); 36 | } 37 | 38 | void TRON::info(const char *fmt,...) 39 | { 40 | char buf[BUFSIZ]; 41 | va_list ap; 42 | va_start(ap,fmt); 43 | vsprintf(buf,fmt,ap); 44 | va_end(ap); 45 | (*tron_print_string)(buf); 46 | } 47 | 48 | TRON::TRON(const function *fun_obj, double eps, INT64 max_iter) 49 | { 50 | this->fun_obj=const_cast(fun_obj); 51 | this->eps=eps; 52 | this->max_iter=max_iter; 53 | tron_print_string = default_print; 54 | } 55 | 56 | TRON::~TRON() 57 | { 58 | } 59 | 60 | void TRON::tron(double *w) 61 | { 62 | // Parameters for updating the iterates. 
63 | double eta0 = 1e-4, eta1 = 0.25, eta2 = 0.75; 64 | 65 | // Parameters for updating the trust region size delta. 66 | double sigma1 = 0.25, sigma2 = 0.5, sigma3 = 4; 67 | 68 | INT64 n = fun_obj->get_nr_variable(); 69 | INT64 i, cg_iter; 70 | double delta, snorm, one=1.0; 71 | double alpha, f, fnew, prered, actred, gs; 72 | INT64 search = 1, iter = 1, inc = 1; 73 | double *s = new double[n]; 74 | double *r = new double[n]; 75 | double *w_new = new double[n]; 76 | double *g = new double[n]; 77 | 78 | for (i=0; ifun(w); 82 | fun_obj->grad(w, g); 83 | delta = dnrm2_(&n, g, &inc); 84 | double gnorm1 = delta; 85 | double gnorm = gnorm1; 86 | 87 | if (gnorm <= eps*gnorm1) 88 | search = 0; 89 | 90 | iter = 1; 91 | 92 | while (iter <= max_iter && search) 93 | { 94 | cg_iter = trcg(delta, g, s, r); 95 | 96 | memcpy(w_new, w, sizeof(double)*n); 97 | daxpy_(&n, &one, s, &inc, w_new, &inc); 98 | 99 | gs = ddot_(&n, g, &inc, s, &inc); 100 | prered = -0.5*(gs-ddot_(&n, s, &inc, r, &inc)); 101 | fnew = fun_obj->fun(w_new); 102 | 103 | // Compute the actual reduction. 104 | actred = f - fnew; 105 | 106 | // On the first iteration, adjust the initial step bound. 107 | snorm = dnrm2_(&n, s, &inc); 108 | if (iter == 1) 109 | delta = min(delta, snorm); 110 | 111 | // Compute prediction alpha*snorm of the step. 112 | if (fnew - f - gs <= 0) 113 | alpha = sigma3; 114 | else 115 | alpha = max(sigma1, -0.5*(gs/(fnew - f - gs))); 116 | 117 | // Update the trust region bound according to the ratio of actual to predicted reduction. 
118 | if (actred < eta0*prered) 119 | delta = min(max(alpha, sigma1)*snorm, sigma2*delta); 120 | else if (actred < eta1*prered) 121 | delta = max(sigma1*delta, min(alpha*snorm, sigma2*delta)); 122 | else if (actred < eta2*prered) 123 | delta = max(sigma1*delta, min(alpha*snorm, sigma3*delta)); 124 | else 125 | delta = max(delta, min(alpha*snorm, sigma3*delta)); 126 | 127 | info("iter %2d act %5.3e pre %5.3e delta %5.3e f %5.3e |g| %5.3e CG %3d\n", iter, actred, prered, delta, f, gnorm, cg_iter); 128 | 129 | if (actred > eta0*prered) 130 | { 131 | iter++; 132 | memcpy(w, w_new, sizeof(double)*n); 133 | f = fnew; 134 | fun_obj->grad(w, g); 135 | 136 | gnorm = dnrm2_(&n, g, &inc); 137 | if (gnorm <= eps*gnorm1) 138 | break; 139 | } 140 | if (f < -1.0e+32) 141 | { 142 | info("WARNING: f < -1.0e+32\n"); 143 | break; 144 | } 145 | if (fabs(actred) <= 0 && prered <= 0) 146 | { 147 | info("WARNING: actred and prered <= 0\n"); 148 | break; 149 | } 150 | if (fabs(actred) <= 1.0e-12*fabs(f) && 151 | fabs(prered) <= 1.0e-12*fabs(f)) 152 | { 153 | info("WARNING: actred and prered too small\n"); 154 | break; 155 | } 156 | } 157 | 158 | delete[] g; 159 | delete[] r; 160 | delete[] w_new; 161 | delete[] s; 162 | } 163 | 164 | INT64 TRON::trcg(double delta, double *g, double *s, double *r) 165 | { 166 | INT64 i, inc = 1; 167 | INT64 n = fun_obj->get_nr_variable(); 168 | double one = 1; 169 | double *d = new double[n]; 170 | double *Hd = new double[n]; 171 | double rTr, rnewTrnew, alpha, beta, cgtol; 172 | 173 | for (i=0; iHv(d, Hd); 189 | 190 | alpha = rTr/ddot_(&n, d, &inc, Hd, &inc); 191 | daxpy_(&n, &alpha, d, &inc, s, &inc); 192 | if (dnrm2_(&n, s, &inc) > delta) 193 | { 194 | info("cg reaches trust region boundary\n"); 195 | alpha = -alpha; 196 | daxpy_(&n, &alpha, d, &inc, s, &inc); 197 | 198 | double std = ddot_(&n, s, &inc, d, &inc); 199 | double sts = ddot_(&n, s, &inc, s, &inc); 200 | double dtd = ddot_(&n, d, &inc, d, &inc); 201 | double dsq = delta*delta; 202 | double 
// Abstract interface of the objective that TRON optimizes.  TRON keeps a
// pointer to an implementation (fun_obj) and only talks to it through the
// members below.
class function
{
public:
	// Value of the objective at w; TRON::tron stores the result as f / fnew.
	virtual double fun(double *w) = 0 ;
	// Fills g for the point w; TRON::tron treats g as the gradient and
	// tracks its norm (gnorm) for the stopping test.
	virtual void grad(double *w, double *g) = 0 ;
	// Fills Hs from s; TRON::trcg calls it as Hv(d, Hd).
	// NOTE(review): presumably the Hessian-vector product H*s -- confirm
	// against the implementations.
	virtual void Hv(double *s, double *Hs) = 0 ;

	// Number of variables; TRON uses it as the length of its work arrays.
	virtual INT64 get_nr_variable(void) = 0 ;
	// Virtual so that implementations can be destroyed through a function*.
	virtual ~function(void){}
};
char *fmt,...); 38 | void (*tron_print_string)(const char *buf); 39 | }; 40 | #endif 41 | -------------------------------------------------------------------------------- /libshorttext/classifier/learner/test: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/2shou/python-libshorttext/460773dbbefe7a82a9b544ca419242b68a1a0533/libshorttext/classifier/learner/test -------------------------------------------------------------------------------- /libshorttext/classifier/learner/test.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "util.c" 3 | 4 | int main(int argc, const char* argv[]){ 5 | INT64 offsets[1000]; 6 | INT64 error_code = 0; 7 | merge_problems(&argv[1], argc-2, &offsets[0], argv[argc-1], 1, &error_code); 8 | 9 | for(int i = 0; i < argc-1; i++) 10 | printf("%ld ", offsets[i]); 11 | puts(""); 12 | return 0; 13 | } 14 | 15 | -------------------------------------------------------------------------------- /libshorttext/classifier/learner/util.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "linear.h" 8 | #define Malloc(type,n) (type *)malloc((n)*sizeof(type)) 9 | 10 | 11 | static char *line = NULL; 12 | static INT64 max_line_len; 13 | 14 | static char* readline(FILE *input) 15 | { 16 | INT64 len; 17 | 18 | if(fgets(line,max_line_len,input) == NULL) 19 | return NULL; 20 | 21 | while(strrchr(line,'\n') == NULL) 22 | { 23 | max_line_len *= 2; 24 | line = (char *) realloc(line,max_line_len); 25 | len = (INT64) strlen(line); 26 | if(fgets(line+len,max_line_len-len,input) == NULL) 27 | break; 28 | } 29 | return line; 30 | } 31 | 32 | typedef struct { 33 | struct problem prob; 34 | struct feature_node* x_space; 35 | INT64 len_x_space; 36 | } SVMProblem; 37 | 38 | void freeSVMProblem(SVMProblem svmprob) { 39 | struct problem 
*prob = &(svmprob.prob); 40 | if (prob->x!=NULL) free(prob->x); 41 | if (prob->y!=NULL) free(prob->y); 42 | if (svmprob.x_space!=NULL) free(svmprob.x_space); 43 | } 44 | 45 | 46 | // read in a problem (in libsvm format) 47 | SVMProblem read_problem(const char *filename, double bias, INT64 *error_code) 48 | { 49 | INT64 max_index, inst_max_index, i; 50 | INT64 elements, j; 51 | FILE *fp = fopen(filename,"r"); 52 | char *endptr; 53 | char *idx, *val, *label; 54 | struct problem prob; 55 | SVMProblem svmprob; 56 | 57 | /** 58 | * error_code: 59 | * 0 no error 60 | * > 0 input format error. The error_code value 61 | * indicates the line number. 62 | * -1 can not open file 63 | * -2 memory exhausted 64 | */ 65 | *error_code = 0; 66 | 67 | if(fp == NULL) 68 | { 69 | *error_code = -1; 70 | return svmprob; 71 | } 72 | 73 | prob.l = 0; 74 | elements = 0; 75 | max_line_len = 1024; 76 | line = Malloc(char,max_line_len); 77 | while(readline(fp)!=NULL) 78 | { 79 | char *p = strtok(line," \t"); // label 80 | 81 | // features 82 | while(1) 83 | { 84 | p = strtok(NULL," \t"); 85 | if(p == NULL || *p == '\n') // check '\n' as ' ' may be after the last feature 86 | break; 87 | elements++; 88 | } 89 | prob.l++; 90 | } 91 | rewind(fp); 92 | 93 | prob.bias=bias; 94 | if(prob.bias >= 0) elements += prob.l; 95 | 96 | errno = 0; 97 | prob.y = Malloc(double,prob.l); 98 | prob.x = Malloc(struct feature_node *,prob.l); 99 | struct feature_node* x_space = Malloc(struct feature_node,elements+prob.l); 100 | 101 | if(errno == ENOMEM) 102 | { 103 | free(line); 104 | fclose(fp); 105 | *error_code = -2; 106 | return svmprob; 107 | } 108 | 109 | max_index = 0; 110 | j=0; 111 | for(i=0;i max_index) 169 | max_index = inst_max_index; 170 | 171 | if(prob.bias >= 0) 172 | x_space[j++].value = prob.bias; 173 | 174 | x_space[j++].index = -1; 175 | } 176 | 177 | if(prob.bias >= 0) 178 | { 179 | prob.n=max_index+1; 180 | for(i=1;iindex = prob.n; 182 | x_space[j-2].index = prob.n; 183 | } 184 | else 185 | 
prob.n=max_index; 186 | 187 | fclose(fp); 188 | free(line); 189 | 190 | svmprob.prob = prob; 191 | svmprob.x_space = x_space; 192 | svmprob.len_x_space = j; 193 | 194 | return svmprob; 195 | } 196 | 197 | 198 | double* compute_idf(const struct problem *prob, double *idf_val) 199 | { 200 | INT64 i, j; 201 | //double* idf_val = Malloc(double, prob.n); 202 | memset(idf_val, 0, sizeof(double) * prob->n); 203 | 204 | for(i = 0; i < prob->l; ++i) 205 | { 206 | struct feature_node* xi = prob->x[i]; 207 | while(xi->index != -1) 208 | { 209 | ++idf_val[xi->index-1]; 210 | ++xi; 211 | } 212 | } 213 | 214 | for(j = 0; j < prob->n; ++j) 215 | { 216 | if(idf_val[j] > 0) 217 | idf_val[j] = log(prob->l / idf_val[j]); 218 | else 219 | idf_val[j] = 0; 220 | } 221 | 222 | return idf_val; 223 | } 224 | 225 | void normalize(struct problem *prob, int binary, int norm, int tf, int idf, double* idf_val) 226 | { 227 | INT64 i; 228 | 229 | for(i = 0; i < prob->l; ++i) 230 | { 231 | struct feature_node* xi; 232 | 233 | if(binary) 234 | { 235 | xi = prob->x[i]; 236 | while(xi->index != -1) 237 | { 238 | xi->value = xi->value != 0; 239 | ++xi; 240 | } 241 | } 242 | 243 | if(tf) 244 | { 245 | double norm = 0; 246 | xi = prob->x[i]; 247 | while(xi->index != -1) 248 | { 249 | norm += xi->value; 250 | ++xi; 251 | } 252 | 253 | xi = prob->x[i]; 254 | if(norm != 0) 255 | while(xi->index != -1) 256 | { 257 | xi->value /= norm; 258 | ++xi; 259 | } 260 | } 261 | 262 | if(idf) 263 | { 264 | xi = prob->x[i]; 265 | while(xi->index != -1) 266 | { 267 | xi->value *= idf_val[xi->index-1]; 268 | ++xi; 269 | } 270 | } 271 | 272 | if(norm) 273 | { 274 | double norm = 0; 275 | xi = prob->x[i]; 276 | while(xi->index != -1) 277 | { 278 | norm += xi->value * xi->value; 279 | ++xi; 280 | } 281 | 282 | norm = sqrt(norm); 283 | 284 | xi = prob->x[i]; 285 | if(norm != 0) 286 | while(xi->index != -1) 287 | { 288 | xi->value /= norm; 289 | ++xi; 290 | } 291 | } 292 | } 293 | } 294 | 295 | 296 | void merge_problems(const 
char *srcs[], const int num_srcs, INT64* offsets, const char *output_filename, char training, INT64 *error_code) { 297 | int i, j; 298 | const double bias = -1; 299 | SVMProblem *svmproblems = Malloc(SVMProblem, num_srcs); 300 | FILE *fp = NULL; 301 | 302 | /** 303 | * error_code: 304 | * 0 no error 305 | * > 0 input format error. The error_code value 306 | * indicates the line number. 307 | * -1 can not open file 308 | * -2 memory exhausted 309 | * -3 input files contain different numbsers of instances 310 | * -4 no file given 311 | */ 312 | 313 | if(num_srcs <= 0) { 314 | *error_code = -4; 315 | return; 316 | } 317 | 318 | for(i=0; i < num_srcs; i++) 319 | { 320 | svmproblems[i] = read_problem(srcs[i], bias, error_code); 321 | if(*error_code != 0) { 322 | switch (*error_code) { 323 | case -1: 324 | fprintf(stderr,"ERROR: Cannot open input file: %s\n", srcs[i]); 325 | break; 326 | case -2: 327 | fprintf(stderr,"ERROR: Memory exhausted when reading %s\n", srcs[i]); 328 | break; 329 | default: /* error_code > 0 input format error*/ 330 | fprintf(stderr,"ERROR: input format error at line %ld in %s\n", (long)*error_code, srcs[i]); 331 | break; 332 | } 333 | return; 334 | } 335 | } 336 | 337 | 338 | // Overwrite offsets 339 | if(training) { 340 | offsets[0] = svmproblems[0].prob.n; 341 | for(i = 1; i < num_srcs; i++) 342 | offsets[i] = offsets[i-1] + svmproblems[i].prob.n; 343 | } 344 | 345 | // Make sure # of instances are all equal. 
346 | for(i = 1; i < num_srcs; i++) 347 | { 348 | if(svmproblems[i].prob.l != svmproblems[i-1].prob.l) 349 | { 350 | *error_code = -3; 351 | fprintf(stderr,"ERROR: #insts in %s = %ld, but #insts in %s = %ld\n", 352 | srcs[i], (long)svmproblems[i].prob.l, srcs[i-1], (long)svmproblems[i-1].prob.l); 353 | return; 354 | } 355 | } 356 | 357 | fp = fopen(output_filename, "w"); 358 | if(fp == NULL) 359 | { 360 | *error_code = -1; 361 | fprintf(stderr,"ERROR: Cannot open output file: %s \n", srcs[i]); 362 | return; 363 | } 364 | 365 | for(j = 0; j < svmproblems[0].prob.l; j++) 366 | { 367 | INT64 base = 0; 368 | 369 | fprintf(fp, "%g", svmproblems[0].prob.y[j]); 370 | for(i = 0; i < num_srcs; i++) 371 | { 372 | struct feature_node* node; 373 | 374 | for(node = svmproblems[i].prob.x[j]; node->index != -1; node++) 375 | { 376 | INT64 index = base+node->index; 377 | if(index <= offsets[i]) 378 | fprintf(fp, " %ld:%.17g", (long)index, node->value); 379 | else 380 | break; 381 | } 382 | base = offsets[i]; 383 | } 384 | fprintf(fp,"\n"); 385 | } 386 | fclose(fp); 387 | 388 | for(i = 0; i < num_srcs; i++) 389 | freeSVMProblem(svmproblems[i]); 390 | } 391 | 392 | 393 | -------------------------------------------------------------------------------- /libshorttext/converter/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | :mod:`converter` module is used convert a text data set to a numerical data set. 3 | More specifically, it converts a text file to a LIBSVM-format data. Refer to 4 | :ref:`dataset` for the format of texts. 5 | 6 | The utilities of :mod:`converter` is wrapped in :class:`Text2svmConverter`. 7 | :class:`Text2svmConverter` consists of three components: 8 | :class:`TextPreprocessor`, :class:`FeatureGenerator`, and :class:`ClassMapping`. 9 | For users who only need the most basic usage, they can use the utility function 10 | :func:`convert_text` without understanding :mod:`converter`. 
# Builds the Porter stemmer as the shared library porter.so.$(SHVER).

# Version suffix used for both the output file name and the embedded
# soname / install_name.
SHVER = 1
OS = $(shell uname)

.PHONY: all lib clean

all: lib

lib: porter.o
	if [ "$(OS)" = "Darwin" ]; then \
		SHARED_LIB_FLAG="-dynamiclib -Wl,-install_name,porter.so.$(SHVER)"; \
	else \
		SHARED_LIB_FLAG="-shared -Wl,-soname,porter.so.$(SHVER)"; \
	fi; \
	gcc $${SHARED_LIB_FLAG} porter.o -o porter.so.$(SHVER)

porter.o: porter.c
	gcc -fPIC -O3 -c -o porter.o porter.c

clean:
	rm -rf porter.o porter.so.$(SHVER) *pyc
11 | 12 | See also http://www.tartarus.org/~martin/PorterStemmer 13 | 14 | The algorithm as described in the paper could be exactly replicated 15 | by adjusting the points of DEPARTURE, but this is barely necessary, 16 | because (a) the points of DEPARTURE are definitely improvements, and 17 | (b) no encoding of the Porter stemmer I have seen is anything like 18 | as exact as this version, even with the points of DEPARTURE! 19 | 20 | You can compile it on Unix with 'gcc -O3 -o stem stem.c' after which 21 | 'stem' takes a list of inputs and sends the stemmed equivalent to 22 | stdout. 23 | 24 | The algorithm as encoded here is particularly fast. 25 | 26 | Release 2 (the more old-fashioned, non-thread-safe version may be 27 | regarded as release 1.) 28 | */ 29 | 30 | #include /* for malloc, free */ 31 | #include /* for memcmp, memmove */ 32 | 33 | /* You will probably want to move the following declarations to a central 34 | header file. 35 | */ 36 | 37 | struct stemmer; 38 | 39 | extern struct stemmer * create_stemmer(void); 40 | extern void free_stemmer(struct stemmer * z); 41 | 42 | extern int stem(struct stemmer * z, char * b, int k); 43 | 44 | 45 | 46 | /* The main part of the stemming algorithm starts here. 47 | */ 48 | 49 | #define TRUE 1 50 | #define FALSE 0 51 | 52 | /* stemmer is a structure for a few local bits of data, 53 | */ 54 | 55 | struct stemmer { 56 | char * b; /* buffer for word to be stemmed */ 57 | int k; /* offset to the end of the string */ 58 | int j; /* a general offset into the string */ 59 | }; 60 | 61 | 62 | /* Member b is a buffer holding a word to be stemmed. The letters are in 63 | b[0], b[1] ... ending at b[z->k]. Member k is readjusted downwards as 64 | the stemming progresses. Zero termination is not in fact used in the 65 | algorithm. 66 | 67 | Note that only lower case sequences are stemmed. Forcing to lower case 68 | should be done before stem(...) is called. 
69 | 70 | 71 | Typical usage is: 72 | 73 | struct stemmer * z = create_stemmer(); 74 | char b[] = "pencils"; 75 | int res = stem(z, b, 6); 76 | /- stem the 7 characters of b[0] to b[6]. The result, res, 77 | will be 5 (the 's' is removed). -/ 78 | free_stemmer(z); 79 | */ 80 | 81 | 82 | extern struct stemmer * create_stemmer(void) 83 | { 84 | return (struct stemmer *) malloc(sizeof(struct stemmer)); 85 | /* assume malloc succeeds */ 86 | } 87 | 88 | extern void free_stemmer(struct stemmer * z) 89 | { 90 | free(z); 91 | } 92 | 93 | 94 | /* cons(z, i) is TRUE <=> b[i] is a consonant. ('b' means 'z->b', but here 95 | and below we drop 'z->' in comments. 96 | */ 97 | 98 | static int cons(struct stemmer * z, int i) 99 | { switch (z->b[i]) 100 | { case 'a': case 'e': case 'i': case 'o': case 'u': return FALSE; 101 | case 'y': return (i == 0) ? TRUE : !cons(z, i - 1); 102 | default: return TRUE; 103 | } 104 | } 105 | 106 | /* m(z) measures the number of consonant sequences between 0 and j. if c is 107 | a consonant sequence and v a vowel sequence, and <..> indicates arbitrary 108 | presence, 109 | 110 | gives 0 111 | vc gives 1 112 | vcvc gives 2 113 | vcvcvc gives 3 114 | .... 115 | */ 116 | 117 | static int m(struct stemmer * z) 118 | { int n = 0; 119 | int i = 0; 120 | int j = z->j; 121 | while(TRUE) 122 | { if (i > j) return n; 123 | if (! cons(z, i)) break; i++; 124 | } 125 | i++; 126 | while(TRUE) 127 | { while(TRUE) 128 | { if (i > j) return n; 129 | if (cons(z, i)) break; 130 | i++; 131 | } 132 | i++; 133 | n++; 134 | while(TRUE) 135 | { if (i > j) return n; 136 | if (! cons(z, i)) break; 137 | i++; 138 | } 139 | i++; 140 | } 141 | } 142 | 143 | /* vowelinstem(z) is TRUE <=> 0,...j contains a vowel */ 144 | 145 | static int vowelinstem(struct stemmer * z) 146 | { 147 | int j = z->j; 148 | int i; for (i = 0; i <= j; i++) if (! cons(z, i)) return TRUE; 149 | return FALSE; 150 | } 151 | 152 | /* doublec(z, j) is TRUE <=> j,(j-1) contain a double consonant. 
*/ 153 | 154 | static int doublec(struct stemmer * z, int j) 155 | { 156 | char * b = z->b; 157 | if (j < 1) return FALSE; 158 | if (b[j] != b[j - 1]) return FALSE; 159 | return cons(z, j); 160 | } 161 | 162 | /* cvc(z, i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant 163 | and also if the second c is not w,x or y. this is used when trying to 164 | restore an e at the end of a short word. e.g. 165 | 166 | cav(e), lov(e), hop(e), crim(e), but 167 | snow, box, tray. 168 | 169 | */ 170 | 171 | static int cvc(struct stemmer * z, int i) 172 | { if (i < 2 || !cons(z, i) || cons(z, i - 1) || !cons(z, i - 2)) return FALSE; 173 | { int ch = z->b[i]; 174 | if (ch == 'w' || ch == 'x' || ch == 'y') return FALSE; 175 | } 176 | return TRUE; 177 | } 178 | 179 | /* ends(z, s) is TRUE <=> 0,...k ends with the string s. */ 180 | 181 | static int ends(struct stemmer * z, char * s) 182 | { int length = s[0]; 183 | char * b = z->b; 184 | int k = z->k; 185 | if (s[length] != b[k]) return FALSE; /* tiny speed-up */ 186 | if (length > k + 1) return FALSE; 187 | if (memcmp(b + k - length + 1, s + 1, length) != 0) return FALSE; 188 | z->j = k-length; 189 | return TRUE; 190 | } 191 | 192 | /* setto(z, s) sets (j+1),...k to the characters in the string s, readjusting 193 | k. */ 194 | 195 | static void setto(struct stemmer * z, char * s) 196 | { int length = s[0]; 197 | int j = z->j; 198 | memmove(z->b + j + 1, s + 1, length); 199 | z->k = j+length; 200 | } 201 | 202 | /* r(z, s) is used further down. */ 203 | 204 | static void r(struct stemmer * z, char * s) { if (m(z) > 0) setto(z, s); } 205 | 206 | /* step1ab(z) gets rid of plurals and -ed or -ing. e.g. 
/* step1ab(z) strips plural endings and then -eed / -ed / -ing from the
   word in z->b, shortening it by moving z->k.

   The suffix arguments are length-prefixed strings: the octal escape is
   the suffix length and the adjacent literal is the suffix itself.
   Whenever ends() matches it also sets z->j to the last character before
   the suffix, which is what setto(), m() and vowelinstem() work from. */

static void step1ab(struct stemmer * z)
{
   char * b = z->b;
   /* plurals: -sses -> -ss, -ies -> -i, otherwise drop one s unless the
      word ends in -ss */
   if (b[z->k] == 's')
   {  if (ends(z, "\04" "sses")) z->k -= 2; else
      if (ends(z, "\03" "ies")) setto(z, "\01" "i"); else
      if (b[z->k - 1] != 's') z->k--;
   }
   /* -eed -> -ee, but only when the part before the suffix has m > 0 */
   if (ends(z, "\03" "eed")) { if (m(z) > 0) z->k--; } else
   /* -ed / -ing are removed only when the remaining stem has a vowel */
   if ((ends(z, "\02" "ed") || ends(z, "\03" "ing")) && vowelinstem(z))
   {  z->k = z->j;   /* cut the suffix off */
      /* tidy up the stem that is left: restore an e after -at/-bl/-iz */
      if (ends(z, "\02" "at")) setto(z, "\03" "ate"); else
      if (ends(z, "\02" "bl")) setto(z, "\03" "ble"); else
      if (ends(z, "\02" "iz")) setto(z, "\03" "ize"); else
      /* undouble a final double consonant, except for l, s and z */
      if (doublec(z, z->k))
      {  z->k--;
         {  int ch = b[z->k];
            if (ch == 'l' || ch == 's' || ch == 'z') z->k++;
         }
      }
      /* short consonant-vowel-consonant stem: append an e */
      else if (m(z) == 1 && cvc(z, z->k)) setto(z, "\01" "e");
   }
}
*/ 263 | 264 | static void step2(struct stemmer * z) { switch (z->b[z->k-1]) 265 | { 266 | case 'a': if (ends(z, "\07" "ational")) { r(z, "\03" "ate"); break; } 267 | if (ends(z, "\06" "tional")) { r(z, "\04" "tion"); break; } 268 | break; 269 | case 'c': if (ends(z, "\04" "enci")) { r(z, "\04" "ence"); break; } 270 | if (ends(z, "\04" "anci")) { r(z, "\04" "ance"); break; } 271 | break; 272 | case 'e': if (ends(z, "\04" "izer")) { r(z, "\03" "ize"); break; } 273 | break; 274 | case 'l': if (ends(z, "\03" "bli")) { r(z, "\03" "ble"); break; } /*-DEPARTURE-*/ 275 | 276 | /* To match the published algorithm, replace this line with 277 | case 'l': if (ends(z, "\04" "abli")) { r(z, "\04" "able"); break; } */ 278 | 279 | if (ends(z, "\04" "alli")) { r(z, "\02" "al"); break; } 280 | if (ends(z, "\05" "entli")) { r(z, "\03" "ent"); break; } 281 | if (ends(z, "\03" "eli")) { r(z, "\01" "e"); break; } 282 | if (ends(z, "\05" "ousli")) { r(z, "\03" "ous"); break; } 283 | break; 284 | case 'o': if (ends(z, "\07" "ization")) { r(z, "\03" "ize"); break; } 285 | if (ends(z, "\05" "ation")) { r(z, "\03" "ate"); break; } 286 | if (ends(z, "\04" "ator")) { r(z, "\03" "ate"); break; } 287 | break; 288 | case 's': if (ends(z, "\05" "alism")) { r(z, "\02" "al"); break; } 289 | if (ends(z, "\07" "iveness")) { r(z, "\03" "ive"); break; } 290 | if (ends(z, "\07" "fulness")) { r(z, "\03" "ful"); break; } 291 | if (ends(z, "\07" "ousness")) { r(z, "\03" "ous"); break; } 292 | break; 293 | case 't': if (ends(z, "\05" "aliti")) { r(z, "\02" "al"); break; } 294 | if (ends(z, "\05" "iviti")) { r(z, "\03" "ive"); break; } 295 | if (ends(z, "\06" "biliti")) { r(z, "\03" "ble"); break; } 296 | break; 297 | case 'g': if (ends(z, "\04" "logi")) { r(z, "\03" "log"); break; } /*-DEPARTURE-*/ 298 | 299 | /* To match the published algorithm, delete this line */ 300 | 301 | } } 302 | 303 | /* step3(z) deals with -ic-, -full, -ness etc. similar strategy to step2. 
*/ 304 | 305 | static void step3(struct stemmer * z) { switch (z->b[z->k]) 306 | { 307 | case 'e': if (ends(z, "\05" "icate")) { r(z, "\02" "ic"); break; } 308 | if (ends(z, "\05" "ative")) { r(z, "\00" ""); break; } 309 | if (ends(z, "\05" "alize")) { r(z, "\02" "al"); break; } 310 | break; 311 | case 'i': if (ends(z, "\05" "iciti")) { r(z, "\02" "ic"); break; } 312 | break; 313 | case 'l': if (ends(z, "\04" "ical")) { r(z, "\02" "ic"); break; } 314 | if (ends(z, "\03" "ful")) { r(z, "\00" ""); break; } 315 | break; 316 | case 's': if (ends(z, "\04" "ness")) { r(z, "\00" ""); break; } 317 | break; 318 | } } 319 | 320 | /* step4(z) takes off -ant, -ence etc., in context vcvc. */ 321 | 322 | static void step4(struct stemmer * z) 323 | { switch (z->b[z->k-1]) 324 | { case 'a': if (ends(z, "\02" "al")) break; return; 325 | case 'c': if (ends(z, "\04" "ance")) break; 326 | if (ends(z, "\04" "ence")) break; return; 327 | case 'e': if (ends(z, "\02" "er")) break; return; 328 | case 'i': if (ends(z, "\02" "ic")) break; return; 329 | case 'l': if (ends(z, "\04" "able")) break; 330 | if (ends(z, "\04" "ible")) break; return; 331 | case 'n': if (ends(z, "\03" "ant")) break; 332 | if (ends(z, "\05" "ement")) break; 333 | if (ends(z, "\04" "ment")) break; 334 | if (ends(z, "\03" "ent")) break; return; 335 | case 'o': if (ends(z, "\03" "ion") && (z->b[z->j] == 's' || z->b[z->j] == 't')) break; 336 | if (ends(z, "\02" "ou")) break; return; 337 | /* takes care of -ous */ 338 | case 's': if (ends(z, "\03" "ism")) break; return; 339 | case 't': if (ends(z, "\03" "ate")) break; 340 | if (ends(z, "\03" "iti")) break; return; 341 | case 'u': if (ends(z, "\03" "ous")) break; return; 342 | case 'v': if (ends(z, "\03" "ive")) break; return; 343 | case 'z': if (ends(z, "\03" "ize")) break; return; 344 | default: return; 345 | } 346 | if (m(z) > 1) z->k = z->j; 347 | } 348 | 349 | /* step5(z) removes a final -e if m(z) > 1, and changes -ll to -l if 350 | m(z) > 1. 
*/ 351 | 352 | static void step5(struct stemmer * z) 353 | { 354 | char * b = z->b; 355 | z->j = z->k; 356 | if (b[z->k] == 'e') 357 | { int a = m(z); 358 | if (a > 1 || a == 1 && !cvc(z, z->k - 1)) z->k--; 359 | } 360 | if (b[z->k] == 'l' && doublec(z, z->k) && m(z) > 1) z->k--; 361 | } 362 | 363 | /* In stem(z, b, k), b is a char pointer, and the string to be stemmed is 364 | from b[0] to b[k] inclusive. Possibly b[k+1] == '\0', but it is not 365 | important. The stemmer adjusts the characters b[0] ... b[k] and returns 366 | the new end-point of the string, k'. Stemming never increases word 367 | length, so 0 <= k' <= k. 368 | */ 369 | 370 | extern int stem(struct stemmer * z, char * b, int k) 371 | { 372 | if (k <= 1) return k; /*-DEPARTURE-*/ 373 | z->b = b; z->k = k; /* copy the parameters into z */ 374 | 375 | /* With this line, strings of length 1 or 2 don't go through the 376 | stemming process, although no mention is made of this in the 377 | published algorithm. Remove the line to match the published 378 | algorithm. 
*/ 379 | 380 | step1ab(z); step1c(z); step2(z); step3(z); step4(z); step5(z); 381 | return z->k; 382 | } 383 | 384 | /*--------------------stemmer definition ends here------------------------*/ 385 | 386 | #include 387 | #include /* for malloc, free */ 388 | #include /* for isupper, islower, tolower */ 389 | 390 | static char * s; /* buffer for words tobe stemmed */ 391 | 392 | #define INC 50 /* size units in which s is increased */ 393 | static int i_max = INC; /* maximum offset in s */ 394 | 395 | #define LETTER(ch) (isupper(ch) || islower(ch)) 396 | 397 | void stemfile(struct stemmer * z, FILE * f) 398 | { while(TRUE) 399 | { int ch = getc(f); 400 | if (ch == EOF) return; 401 | if (LETTER(ch)) 402 | { int i = 0; 403 | while(TRUE) 404 | { if (i == i_max) 405 | { i_max += INC; 406 | s = realloc(s, i_max + 1); 407 | } 408 | ch = tolower(ch); /* forces lower case */ 409 | 410 | s[i] = ch; i++; 411 | ch = getc(f); 412 | if (!LETTER(ch)) { ungetc(ch,f); break; } 413 | } 414 | s[stem(z, s, i - 1) + 1] = 0; 415 | /* the previous line calls the stemmer and uses its result to 416 | zero-terminate the string in s */ 417 | printf("%s",s); 418 | } 419 | else putchar(ch); 420 | } 421 | } 422 | 423 | 424 | int trim(char* src){ 425 | 426 | struct stemmer z; 427 | int len = strlen(src); 428 | return stem(&z,src,len-1) + 1; 429 | } 430 | /* 431 | int main(int argc, char * argv[]) 432 | { int i; 433 | 434 | struct stemmer * z = create_stemmer(); 435 | 436 | s = (char *) malloc(i_max + 1); 437 | for (i = 1; i < argc; i++) 438 | { FILE * f = fopen(argv[i],"r"); 439 | if (f == 0) { fprintf(stderr,"File %s not found\n",argv[i]); exit(1); } 440 | stemfile(z, f); 441 | } 442 | free(s); 443 | 444 | free_stemmer(z); 445 | 446 | return 0; 447 | } 448 | */ 449 | 450 | 451 | -------------------------------------------------------------------------------- /libshorttext/converter/stemmer/porter.py: -------------------------------------------------------------------------------- 1 | 
def stem(word):
    """Return *word* cut back to the length of its Porter stem.

    The native ``trim`` routine returns the length, in bytes, of the stem
    of the UTF-8 encoded word. The result is the leading part of *word* of
    that length, returned with the same type as the argument (byte strings
    stay byte strings, text stays text).
    """
    is_bytes = isinstance(word, bytes)
    raw = word if is_bytes else word.encode('utf-8')

    # trim() edits its argument in place, so hand it a private, mutable,
    # NUL-terminated copy instead of an immutable bytes object.
    buf = create_string_buffer(raw)
    length = stemmer.trim(buf)

    # The length counts bytes, so cut the encoded form rather than the text;
    # the two only coincide for pure-ASCII words.
    # NOTE(review): only the length is used. Suffix rewrites made inside the
    # buffer (e.g. "at" -> "ate") are dropped, as before; switch to
    # buf.raw[:length] if the rewritten stem is wanted.
    prefix = raw[:length]
    if is_bytes:
        return prefix
    # 'ignore' drops a trailing partial character should the cut ever fall
    # inside a multi-byte sequence.
    return prefix.decode('utf-8', 'ignore')
/\b[cC]onsidering\b/ 67 | /\b[cC]ontain\b/ 68 | /\b[cC]ould\b/ 69 | /\b[dD]e\b/ 70 | /\b[dD]espite\b/ 71 | /\b[dD]id\b/ 72 | /\b[dD]o\b/ 73 | /\b[dD]oe\b/ 74 | /\b[dD]oes\b/ 75 | /\b[dD]oing\b/ 76 | /\b[dD]one\b/ 77 | /\b[dD]ont\b/ 78 | /\b[dD]own\b/ 79 | /\b[dD]r\b/ 80 | /\b[dD]uring\b/ 81 | /\b[eE]ach\b/ 82 | /\b[eE]c\b/ 83 | /\b[eE]e\b/ 84 | /\b[eE]ighteen\b/ 85 | /\b[eE]ighth\b/ 86 | /\b[eE]ither\b/ 87 | /\b[eE]leven\b/ 88 | /\b[eE]lse\b/ 89 | /\b[eE]nd\b/ 90 | /\b[eE]nough\b/ 91 | /\b[eE]specially\b/ 92 | /\b[eE]tc\b/ 93 | /\b[eE]ven\b/ 94 | /\b[eE]ver\b/ 95 | /\b[eE]very\b/ 96 | /\b[eE]verybody\b/ 97 | /\b[eE]veryone\b/ 98 | /\b[eE]xcept\b/ 99 | /\b[eE]xcepting\b/ 100 | /\b[eE]xcluding\b/ 101 | /\b[fF]ew\b/ 102 | /\b[fF]ewer\b/ 103 | /\b[fF]ifteen\b/ 104 | /\b[fF]ifth\b/ 105 | /\b[fF]irst\b/ 106 | /\b[fF]ollowing\b/ 107 | /\b[fF]or\b/ 108 | /\b[fF]ourteen\b/ 109 | /\b[fF]ourth\b/ 110 | /\b[fF]rom\b/ 111 | /\b[gG]et\b/ 112 | /\b[gG]ive\b/ 113 | /\b[gG]o\b/ 114 | /\b[gG]oing\b/ 115 | /\b[gG]ood\b/ 116 | /\b[gG]ot\b/ 117 | /\b[hH]a\b/ 118 | /\b[hH]ad\b/ 119 | /\b[hH]ardly\b/ 120 | /\b[hH]as\b/ 121 | /\b[hH]ave\b/ 122 | /\b[hH]e\b/ 123 | /\b[hH]eld\b/ 124 | /\b[hH]er\b/ 125 | /\b[hH]ere\b/ 126 | /\b[hH]ers\b/ 127 | /\b[hH]erself\b/ 128 | /\b[hH]es\b/ 129 | /\b[hH]im\b/ 130 | /\b[hH]imself\b/ 131 | /\b[hH]is\b/ 132 | /\b[hH]isself\b/ 133 | /\b[hH]m\b/ 134 | /\b[hH]ow\b/ 135 | /\b[iI]dem\b/ 136 | /\b[iI]f\b/ 137 | /\b[iI]i\b/ 138 | /\b[iI]ii\b/ 139 | /\b[iI]lk\b/ 140 | /\b[iI]n\b/ 141 | /\b[iI]nclude\b/ 142 | /\b[iI]ncluded\b/ 143 | /\b[iI]ncluding\b/ 144 | /\b[iI]ndeed\b/ 145 | /\b[iI]nside\b/ 146 | /\b[iI]nstead\b/ 147 | /\b[iI]nto\b/ 148 | /\b[iI]s\b/ 149 | /\b[iI]t\b/ 150 | /\b[iI]ts\b/ 151 | /\b[iI]tself\b/ 152 | /\b[iI]v\b/ 153 | /\b[jJ]r\b/ 154 | /\b[jJ]ust\b/ 155 | /\b[kK]ept\b/ 156 | /\b[kK]now\b/ 157 | /\b[lL]ast\b/ 158 | /\b[lL]ate\b/ 159 | /\b[lL]ater\b/ 160 | /\b[lL]ess\b/ 161 | /\b[lL]et\b/ 162 | /\b[lL]ike\b/ 163 | /\b[lL]ittle\b/ 164 | /\b[mM]ade\b/ 
165 | /\b[mM]ake\b/ 166 | /\b[mM]aking\b/ 167 | /\b[mM]any\b/ 168 | /\b[mM]ay\b/ 169 | /\b[mM]e\b/ 170 | /\b[mM]ight\b/ 171 | /\b[mM]ine\b/ 172 | /\b[mM]inus\b/ 173 | /\b[mM]m\b/ 174 | /\b[mM]ore\b/ 175 | /\b[mM]ost\b/ 176 | /\b[mM]ostly\b/ 177 | /\b[mM]r\b/ 178 | /\b[mM]rs\b/ 179 | /\b[mM]uch\b/ 180 | /\b[mM]ust\b/ 181 | /\b[mM]y\b/ 182 | /\b[mM]yself\b/ 183 | /\b[nN]aught\b/ 184 | /\b[nN]ear\b/ 185 | /\b[nN]eeded\b/ 186 | /\b[nN]eeds\b/ 187 | /\b[nN]either\b/ 188 | /\b[nN]ever\b/ 189 | /\b[nN]ew\b/ 190 | /\b[nN]ext\b/ 191 | /\b[nN]hs\b/ 192 | /\b[nN]ine\b/ 193 | /\b[nN]ineteen\b/ 194 | /\b[nN]inth\b/ 195 | /\b[nN]o\b/ 196 | /\b[nN]obody\b/ 197 | /\b[nN]on\b/ 198 | /\b[nN]one\b/ 199 | /\b[nN]or\b/ 200 | /\b[nN]ot\b/ 201 | /\b[nN]othing\b/ 202 | /\b[nN]otwithstanding\b/ 203 | /\b[nN]ow\b/ 204 | /\b[nN]s\b/ 205 | /\b[nN]t\b/ 206 | /\b[oO]f\b/ 207 | /\b[oO]ff\b/ 208 | /\b[oO]ften\b/ 209 | /\b[oO]n\b/ 210 | /\b[oO]nce\b/ 211 | /\b[oO]ne\b/ 212 | /\b[oO]neself\b/ 213 | /\b[oO]nly\b/ 214 | /\b[oO]nto\b/ 215 | /\b[oO]pposite\b/ 216 | /\b[oO]r\b/ 217 | /\b[oO]ther\b/ 218 | /\b[oO]thers\b/ 219 | /\b[oO]therwise\b/ 220 | /\b[oO]ught\b/ 221 | /\b[oO]ur\b/ 222 | /\b[oO]urself\b/ 223 | /\b[oO]urselves\b/ 224 | /\b[oO]ut\b/ 225 | /\b[oO]utside\b/ 226 | /\b[oO]ver\b/ 227 | /\b[oO]wn\b/ 228 | /\b[pP]art\b/ 229 | /\b[pP]articular\b/ 230 | /\b[pP]ast\b/ 231 | /\b[pP]e\b/ 232 | /\b[pP]ending\b/ 233 | /\b[pP]er\b/ 234 | /\b[pP]erhaps\b/ 235 | /\b[pP]lenty\b/ 236 | /\b[pP]lus\b/ 237 | /\b[pP]robably\b/ 238 | /\b[pP]uts\b/ 239 | /\b[qQ]uite\b/ 240 | /\b[rR]ather\b/ 241 | /\b[rR]eally\b/ 242 | /\b[rR]egarding\b/ 243 | /\b[rR]elate\b/ 244 | /\b[rR]ound\b/ 245 | /\b[sS]aid\b/ 246 | /\b[sS]ave\b/ 247 | /\b[sS]aw\b/ 248 | /\b[sS]ay\b/ 249 | /\b[sS]ays\b/ 250 | /\b[sS]econd\b/ 251 | /\b[sS]ee\b/ 252 | /\b[sS]eem\b/ 253 | /\b[sS]eems\b/ 254 | /\b[sS]een\b/ 255 | /\b[sS]elf\b/ 256 | /\b[sS]eventeen\b/ 257 | /\b[sS]eventh\b/ 258 | /\b[sS]everal\b/ 259 | /\b[sS]hall\b/ 260 | /\b[sS]he\b/ 261 | 
/\b[sS]hort\b/ 262 | /\b[sS]hould\b/ 263 | /\b[sS]ince\b/ 264 | /\b[sS]ix\b/ 265 | /\b[sS]ixteen\b/ 266 | /\b[sS]ixth\b/ 267 | /\b[sS]o\b/ 268 | /\b[sS]ome\b/ 269 | /\b[sS]omebody\b/ 270 | /\b[sS]omeone\b/ 271 | /\b[sS]omething\b/ 272 | /\b[sS]ometimes\b/ 273 | /\b[sS]omewhat\b/ 274 | /\b[sS]oon\b/ 275 | /\b[sS]ooner\b/ 276 | /\b[sS]r\b/ 277 | /\b[sS]uch\b/ 278 | /\b[sS]uchlike\b/ 279 | /\b[sS]uddenly\b/ 280 | /\b[sS]undry\b/ 281 | /\b[tT]ake\b/ 282 | /\b[tT]en\b/ 283 | /\b[tT]enth\b/ 284 | /\b[tT]han\b/ 285 | /\b[tT]hat\b/ 286 | /\b[tT]he\b/ 287 | /\b[tT]hee\b/ 288 | /\b[tT]heir\b/ 289 | /\b[tT]heirs\b/ 290 | /\b[tT]hem\b/ 291 | /\b[tT]hemselves\b/ 292 | /\b[tT]hen\b/ 293 | /\b[tT]hen\b/ 294 | /\b[tT]here\b/ 295 | /\b[tT]hey\b/ 296 | /\b[tT]hine\b/ 297 | /\b[tT]hird\b/ 298 | /\b[tT]hirteen\b/ 299 | /\b[tT]his\b/ 300 | /\b[tT]hose\b/ 301 | /\b[tT]hou\b/ 302 | /\b[tT]hough\b/ 303 | /\b[tT]hree\b/ 304 | /\b[tT]hrice\b/ 305 | /\b[tT]hrough\b/ 306 | /\b[tT]hroughout\b/ 307 | /\b[tT]hus\b/ 308 | /\b[tT]hyself\b/ 309 | /\b[tT]ill\b/ 310 | /\b[tT]o\b/ 311 | /\b[tT]oo\b/ 312 | /\b[tT]otally\b/ 313 | /\b[tT]other\b/ 314 | /\b[tT]oward\b/ 315 | /\b[tT]owards\b/ 316 | /\b[tT]wain\b/ 317 | /\b[tT]welve\b/ 318 | /\b[tT]wenty\b/ 319 | /\b[tT]wice\b/ 320 | /\b[tT]wo\b/ 321 | /\b[uU]nder\b/ 322 | /\b[uU]nderneath\b/ 323 | /\b[uU]nless\b/ 324 | /\b[uU]nlike\b/ 325 | /\b[uU]ntil\b/ 326 | /\b[uU]p\b/ 327 | /\b[uU]pon\b/ 328 | /\b[uU]pper\b/ 329 | /\b[uU]s\b/ 330 | /\b[uU]se\b/ 331 | /\b[uU]sed\b/ 332 | /\b[uU]sually\b/ 333 | /\b[uU]x\b/ 334 | /\b[vV]arious\b/ 335 | /\b[vV]ersus\b/ 336 | /\b[vV]ery\b/ 337 | /\b[vV]i\b/ 338 | /\b[vV]ia\b/ 339 | /\b[vV]ii\b/ 340 | /\b[vV]iii\b/ 341 | /\b[vV]iiii\b/ 342 | /\b[vV]is-a-vis\b/ 343 | /\b[wW]a\b/ 344 | /\b[wW]ant\b/ 345 | /\b[wW]anted\b/ 346 | /\b[wW]ants\b/ 347 | /\b[wW]as\b/ 348 | /\b[wW]e\b/ 349 | /\b[wW]ell\b/ 350 | /\b[wW]ent\b/ 351 | /\b[wW]ere\b/ 352 | /\b[wW]hat\b/ 353 | /\b[wW]hatall\b/ 354 | /\b[wW]hatever\b/ 355 | 
"""Build and installation script for the ``libshorttext`` package."""
from distutils.core import setup
from distutils.command.install import install as DistutilsInstall
from distutils.command.clean import clean as Clean
import shutil
import subprocess
import os
from os.path import join


# (source directory, file name) of every native library that must be shipped
# next to the Python sources.  They are expected to exist once ``make`` has
# finished.
_NATIVE_LIBRARIES = (
    ('libshorttext/converter/stemmer', 'porter.so.1'),
    ('libshorttext/classifier/learner', 'util.so.1'),
    ('libshorttext/classifier/learner/liblinear', 'liblinear.so.1'),
)

# File name suffixes that ``CleanCommand`` deletes from the source tree.
_ARTIFACT_SUFFIXES = ('.o', '.a', '.so.1', '.pyd', '.dll', '.pyc')


class MakeCommand(DistutilsInstall):
    """``install`` command that first builds and stages the native libraries."""

    def run(self):
        """Run ``make``, copy the native libraries into ``build_lib``, install.

        Raises:
            subprocess.CalledProcessError: if ``make`` exits with a non-zero
                status.
            OSError: if ``make`` cannot be executed.
            IOError: if one of the expected libraries is missing after the
                build.
        """
        # os.system() silently discarded the exit status, so a broken build
        # still produced an installation without its native libraries.
        # check_call() aborts the installation instead.
        subprocess.check_call(['make'])
        for source_dir, library in _NATIVE_LIBRARIES:
            target_dir = join(self.build_lib, source_dir)
            self.mkpath(target_dir)
            # shutil.copy() replaces the shell ``cp`` call: it needs no
            # external program and raises when the source file is missing.
            shutil.copy(join(source_dir, library), target_dir)
        DistutilsInstall.run(self)


class CleanCommand(Clean):
    """``clean`` command that also removes artifacts from the source tree."""

    description = "Remove build artifacts from the source tree"

    def run(self):
        """Run the stock ``clean``, then delete ``build`` and stray artifacts.

        Files below ``libshorttext`` whose name ends with one of
        ``_ARTIFACT_SUFFIXES`` are unlinked and every ``__pycache__``
        directory is removed.
        """
        Clean.run(self)
        if os.path.exists('build'):
            shutil.rmtree('build')
        for dirpath, dirnames, filenames in os.walk('libshorttext'):
            for filename in filenames:
                # str.endswith() accepts a tuple of alternatives.
                if filename.endswith(_ARTIFACT_SUFFIXES):
                    os.unlink(join(dirpath, filename))
            if '__pycache__' in dirnames:
                shutil.rmtree(join(dirpath, '__pycache__'))
                # Prune in place so os.walk() does not try to descend into
                # the directory that was just deleted.
                dirnames.remove('__pycache__')


setup(
    name='libshorttext',
    version='1.1',
    packages=[
        '',
        'libshorttext',
        'libshorttext.analyzer',
        'libshorttext.converter',
        'libshorttext.converter.stemmer',
        'libshorttext.classifier',
        'libshorttext.classifier.learner',
        'libshorttext.classifier.learner.liblinear',
        'libshorttext.classifier.learner.liblinear.python',
    ],
    package_data={'libshorttext': [join('converter', 'stop-words', '*')]},
    url='',
    license='',
    author='',
    author_email='',
    description='',
    cmdclass={
        'install': MakeCommand,
        'clean': CleanCommand,
    },
)