├── .gitignore
├── .travis.yml
├── Makefile
├── README.md
├── libshorttext
    ├── __init__.py
    ├── analyzer
    │   ├── __init__.py
    │   ├── analyzer_impl.py
    │   └── selector.py
    ├── classifier
    │   ├── __init__.py
    │   ├── classifier_impl.py
    │   ├── grid.py
    │   └── learner
    │   │   ├── Makefile
    │   │   ├── __init__.py
    │   │   ├── learner_impl.py
    │   │   ├── liblinear
    │   │       ├── COPYRIGHT
    │   │       ├── Makefile
    │   │       ├── README
    │   │       ├── blas
    │   │       │   ├── Makefile
    │   │       │   ├── blas.h
    │   │       │   ├── blasp.h
    │   │       │   ├── daxpy.c
    │   │       │   ├── ddot.c
    │   │       │   ├── dnrm2.c
    │   │       │   └── dscal.c
    │   │       ├── heart_scale
    │   │       ├── linear.cpp
    │   │       ├── linear.def
    │   │       ├── linear.h
    │   │       ├── predict
    │   │       ├── predict.c
    │   │       ├── python
    │   │       │   ├── Makefile
    │   │       │   ├── README
    │   │       │   ├── liblinear.py
    │   │       │   └── liblinearutil.py
    │   │       ├── train
    │   │       ├── train.c
    │   │       ├── tron.cpp
    │   │       └── tron.h
    │   │   ├── test
    │   │   ├── test.cpp
    │   │   └── util.c
    └── converter
    │   ├── __init__.py
    │   ├── converter_impl.py
    │   ├── stemmer
    │       ├── Makefile
    │       ├── __init__.py
    │       ├── porter.c
    │       └── porter.py
    │   └── stop-words
    │       ├── stoplist-nsp.regex
    │       └── stoplist-nsp.regex.pickle
└── setup.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.pyc
 2 | *.o
 3 | *.so
 4 | *.pyd
 5 | *~
 6 | .#*
 7 | *.lprof
 8 | *.swp
 9 | *.swo
10 | .DS_Store
11 | build
12 | .idea


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | 
 3 | python:
 4 |     - "2.6"
 5 |     - "2.7"
 6 | 
 7 | branches:
 8 |   only:
 9 |     - master
10 | 
11 | before_script:
12 |     - python setup.py install
13 | 
14 | script:
15 |     - python -c "from libshorttext.analyzer import *; from libshorttext.classifier import *; from libshorttext.converter import *"


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | 
 2 | all: code 
 3 | 
 4 | code: stemmer learner
 5 | 
 6 | stemmer:
 7 | 	make -C libshorttext/converter/stemmer
 8 | 
 9 | learner:
10 | 	make -C libshorttext/classifier/learner
11 | 
12 | clean:
13 | 
14 | cleanclean:
15 | 	rm -rf *.svm *.converter *.model *.config *.out *.pyc
16 | 	make -C doc clean
17 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | python-libshorttext
 2 | ===================
 3 | 
 4 | [![Build Status](https://travis-ci.org/2shou/python-libshorttext.svg?branch=master)](https://travis-ci.org/2shou/python-libshorttext)
 5 | 
 6 | An easy-install script for LibShortText
 7 | 
 8 | I recommend [TextGrocery](https://github.com/2shou/TextGrocery) for beginners, which provides a more elegant API for LibShortText.
 9 | 
10 | [LibShortText](http://www.csie.ntu.edu.tw/~cjlin/libshorttext/) is a high-performance classifier for short-text such as titles, questions, sentences, and short messages.
11 | 
12 | This script provides an easy way to install LibShortText.
13 | 
14 | Notice
15 | ------
16 | It only works on Unix-based systems such as Linux or Mac OS, and it requires Python 2.6 or newer.
17 | 
18 | Install
19 | -------
20 | 
21 |     $ python setup.py install
22 | 


--------------------------------------------------------------------------------
/libshorttext/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | LibShortText is a package for short text classification. It supports training,
3 | testing, and analysis tools.
4 | """
5 | 


--------------------------------------------------------------------------------
/libshorttext/analyzer/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | :mod:`analyzer` is used for micro (for a single text instance) or macro (e.g., 
 3 | accuracy) analysis. Users can use :class:`InstanceSet` to specify the scope
 4 | to analyze by :class:`Analyzer`.
 5 | 
 6 | ::
 7 | 	
 8 | 	>>> from libshorttext.analyzer import *
 9 | 	>>> 
10 | 	>>> # load instances from an analyzable predict result file
11 | 	>>> insts = InstanceSet('prediction_result_path')
12 | 	>>> # select instances whose true and predicted labels are both among the specified labels
13 | 	>>> insts = insts.select(with_labels(['Books', 'Music', 'Art']))
14 | 	>>> 
15 | 	>>> # create an analyzer
16 | 	>>> analyzer = Analyzer('model_path')
17 | 	>>> analyzer.gen_confusion_table(insts)
18 | 	         Books  Music  Art
19 | 	Books      169      1    0
20 | 	Music        2    214    0
21 | 	Art          6      0  162
22 | 
23 | To use the analysis tools, an analyzable result and a model are required. Refer to
24 | :class:`libshorttext.classifier.PredictionResult` and 
25 | :class:`libshorttext.classifier.TextModel`.
26 | 
27 | """
28 | 
# Re-export the names listed in each implementation module's ``__all__``,
# then remove the submodule name itself from the package namespace.
from .analyzer_impl import *
del analyzer_impl

from .selector import *
del selector
34 | 


--------------------------------------------------------------------------------
/libshorttext/analyzer/analyzer_impl.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | import sys, os
  3 | from collections import defaultdict 
  4 | from ..classifier import *
  5 | __all__ = ['TextInstance', 'InstanceSet', 'Analyzer']
  6 | 
# Python 2/3 compatibility shim: make the names ``xrange`` and ``izip``
# available under both major versions.
if sys.version_info[0] >= 3:
	xrange = range
	izip = zip
else:
	# Python 2 already has a builtin ``xrange``; only ``izip`` needs importing.
	from itertools import izip
 12 | 
 13 | class TextInstance:
 14 | 	'''
 15 | 	:class:`TextInstance` represents a text instance. It includes the index, 
 16 | 	the true label, the predicted label, the text, and the decision values 
 17 | 	of the text instance. Normally you do not directly create an instance. 
 18 | 	Instead, it is usually manipulated by :class:`InstanceSet`. For 
 19 | 	more information, please see the usage in :class:`InstanceSet`.
 20 | 	'''
 21 | 
 22 | 	def __init__(self, idx, true_y = '', predicted_y = '', text = '', extra_svm_feats = [], decvals = None):
 23 | 		self.idx = idx #: Instance index in the text source.
 24 | 		
 25 | 		#: The true label (if provided in the text source in the prediction phase).
 26 | 		self.true_y = true_y 
 27 | 		
 28 | 		#: The predicted label.
 29 | 		self.predicted_y = predicted_y
 30 | 
 31 | 		#: The original text. The value is an empty :class:`str` 
 32 | 		#: (``''``) at the beginning. The value is filled after
 33 | 		#: :func:`PredInst.load_text` is called.
 34 | 		self.text = text
 35 | 		#: The extra svm features. The value is an empty :class:`str`
 36 | 		#: at the beginning. The value is filled after 
 37 | 		#: :func:`PredInst.load_text` is called.
 38 | 		self.extra_svm_feats = extra_svm_feats
 39 | 
 40 | 		#: A :class:`list` of decision values. The length should be the
 41 | 		#: number of classes.
 42 | 		self.decvals = decvals
 43 | 
 44 | 	def __str__(self):
 45 | 		string = '''text = {text}
 46 | true label = {true_y}
 47 | predicted label = {predicted_y}
 48 | '''.format(text = self.text, true_y = self.true_y, predicted_y = self.predicted_y)
 49 | 		if self.extra_svm_feats:
 50 | 			string += 'extra svm features = {extra}\n'.format(extra = self.extra_svm_feats)
 51 | 		return string
 52 | 	
 53 | class InstanceSet:
 54 | 	'''
 55 | 	:class:`InstanceSet` is a group of :class:`TextInstance` instances. It is used to
 56 | 	get a subset of interested data. It should be initialized with a prediction
 57 | 	result file (and a testing data). By default, the path to the testing data
 58 | 	is stored in the prediction result file so you can only give the path to
 59 | 	prediction result file.
 60 | 
 61 | 		>>> from libshorttext.analyzer import *
 62 | 		>>> insts = InstanceSet('prediction_result_path')
 63 | 
 64 | 	If you have moved testing data, then you must re-assign the path to testing
 65 | 	data.
 66 | 
 67 | 		>>> from libshorttext.analyzer import *
 68 | 		>>> insts = InstanceSet('prediction_result_path', 'testing_data_path')
 69 | 	'''
 70 | 
 71 | 	def __init__(self, rst_src = None, text_src = None):
 72 | 		self.insts = None
 73 | 		self.correct = None
 74 | 		self.filepath = None
 75 | 		self.extra_svm_files = []
 76 | 		self.true_labels = None
 77 | 		self.predict_labels = None
 78 | 		self.quantity = None
 79 | 		self.selectors = []
 80 | 		if rst_src is not None:
 81 | 			self._load(rst_src, text_src)
 82 | 
 83 | 	def __iter__(self):
 84 | 		return iter(self.insts)
 85 | 
 86 | 	def __getitem__(self, idx):
 87 | 		return self.insts[idx]
 88 | 	
 89 | 	def select(self, *sel_funcs):
 90 | 		'''
 91 | 		This function helps users find interested data. The arguments 
 92 | 		are `selector functions`, where both the argument and returned
 93 | 		values are lists. There are several build-in selector functions.
 94 | 		Refer to :ref:`selectorfunctions`.
 95 | 		
 96 | 		>>> from libshorttext.analyzer import *
 97 | 		>>> insts = InstanceSet('prediction_result_path')
 98 | 		>>> insts1 = insts.select(wrong, with_labels(['Books', 'Music'])) 
 99 | 		'''
100 | 		### How to link to the section??
101 | 		insts = self.insts
102 | 		selectors = self.selectors[:]
103 | 		for sel_func in sel_funcs:
104 | 			insts = sel_func(insts)
105 | 			selectors.append(sel_func._libshorttext_msg or '')
106 | 		#if not insts:
107 | 		#	raise Exception("No instance selected.")
108 | 		sel_insts = InstanceSet()
109 | 		sel_insts.filepath = self.filepath
110 | 		sel_insts.extra_svm_files = self.extra_svm_files
111 | 		sel_insts.selectors = selectors
112 | 		sel_insts.insts = insts
113 | 		return sel_insts
114 | 
115 | 	def load_text(self):
116 | 		'''
117 | 		The text of instances are not stored in the prediction result file,
118 | 		so you need to call this function to load texts from testing data.
119 | 
120 | 		>>> from libshorttext.analyzer import *
121 | 		>>> insts = InstanceSet('prediction_result_path')
122 | 		>>> insts.load_text()
123 | 
124 | 		This method also load the extra svm features if extra svm files
125 | 		are used when training.
126 | 		'''
127 | 		EMPTY_MESSAGE = '**None**'
128 | 		sorted_insts = sorted(self.insts, key = lambda inst: inst.idx)
129 | 		i = 0
130 | 		for idx, lines in enumerate(izip(*([open(self.filepath, 'r')] + [open(f, 'r') for f in self.extra_svm_files]))):
131 | 			line = lines[0]
132 | 			extra_svm_feats = lines[1:]
133 | 			nr_extra_svm_feats = len(extra_svm_feats)
134 | 			if idx > sorted_insts[-1].idx:
135 | 				break
136 | 			if idx == sorted_insts[i].idx:
137 | 				try:
138 | 					sorted_insts[i].text = line.split('\t',1)[1].strip()
139 | 				except:
140 | 					sorted_insts[i].text = EMPTY_MESSAGE
141 | 
142 | 				sorted_insts[i].extra_svm_feats = [None] * nr_extra_svm_feats
143 | 				for j, extra_svm_feat in enumerate(extra_svm_feats):
144 | 					try:
145 | 						sorted_insts[i].extra_svm_feats[j] = dict(map(lambda t: (int(t[0]), float(t[1])), [feat.split(':') for feat in extra_svm_feat.split(None, 1)[1].split()]))
146 | 					except:
147 | 						sorted_insts[i].extra_svm_feats[j] = EMPTY_MESSAGE
148 | 				i += 1
149 | 			
150 | 	def _load(self, src, text_src):
151 | 		if isinstance(src, PredictionResult):
152 | 			pass
153 | 		elif isinstance(src, str):
154 | 			result = PredictionResult()
155 | 			result.load(src)
156 | 		else:
157 | 			raise Exception('"result" should be PredictionResult or string.')
158 | 	
159 | 		if not result.analyzable():
160 | 			raise ValueError('The given result is not analyzable.')
161 | 	
162 | 		# +++ Need to move to another place.			   
163 | 		#if self.model._hashcode != result.model_id:
164 | 		#	sys.stderr.write('Warning: model ID is different from that in the predicted result. Do you use a different model to analyze?\n')
165 | 	
166 | 		if text_src is None:
167 | 			self.filepath = result.text_src
168 | 		else:
169 | 			self.filepath = text_src
170 | 		self.extra_svm_files = result.extra_svm_files
171 | 		predicted_y = result.predicted_y
172 | 		self.acc = result.get_accuracy()
173 | 		decvals = result.decvals
174 | 		true_y = result.true_y
175 | 				   
176 | 		self.insts, self.true_labels, self.predict_labels = [], set(), set()
177 | 		for idx in range(len(true_y)):
178 | 			self.insts += [TextInstance(idx, true_y = true_y[idx], predicted_y = predicted_y[idx], decvals = list(decvals[idx]))]
179 | 			self.true_labels.add(true_y[idx])
180 | 			self.predict_labels.add(predicted_y[idx])
181 | 	
class Analyzer:
	'''
	:class:`Analyzer` is a tool for analyzing a group of instances, which is
	controlled by :class:`InstanceSet`. Typically :class:`Analyzer` is initialized
	with a path to a model.

		>>> from libshorttext.analyzer import *
		>>> analyzer = Analyzer('model_path')

	It can also be initialized with a :class:`libshorttext.classifier.TextModel`
	instance.

		>>> from libshorttext.analyzer import *
		>>> from libshorttext.classifier import *
		>>> text_model = TextModel('model_path')
		>>> analyzer = Analyzer(text_model)

	You can also construct an analyzer without a model. However,
	model-dependent functions cannot be used.

		>>> from libshorttext.analyzer import *
		>>> analyzer = Analyzer()
	'''

	def __init__(self, model = None):
		self.labels = None
		self.model = None
		if model is not None:
			self.load_model(model)

	def load_model(self, model):
		'''
		:func:`load_model` is used to load a model into
		:class:`Analyzer`. If you did not load a model in the constructor or if you
		would like to use another model, you can use this function.

		There are two ways to load a model: from an instance of
		:class:`libshorttext.classifier.TextModel` or a path to a model.
		Any other type raises :class:`Exception`.

		>>> from libshorttext.analyzer import *
		>>> analyzer = Analyzer('original_model_path')
		>>> analyzer.load_model('new_model_path')
		'''

		if isinstance(model, TextModel):
			self.model = model
		elif isinstance(model, str):
			self.model = TextModel()
			self.model.load(model)
		else:
			raise Exception('"model" should be TextModel or string.')
		self.labels = self.model.get_labels()

	@staticmethod
	def _open_output(output):
		'''
		Resolve *output* into ``(stream, owned)``. A :class:`str` is treated
		as a path and opened for writing (``owned`` is ``True``, so the
		caller must close it); anything else is passed through unchanged.
		'''
		if isinstance(output, str):
			return open(output, 'w'), True
		return output, False

	def analyze_single(self, target, amount = 5, output = None, extra_svm_feats = None):
		'''
		:func:`analyze_single` is used to analyze a single instance. It prints
		weights of all features in some classes (default 5). The output is
		sorted according to decision values in descending order. *target* can be an
		instance or a string that you want to analyze. *amount* is how many classes
		(columns) you want to print; ``0`` prints all classes. If *output* is
		specified by a path to a file, the result will be outputted to the file
		instead of on the screen.

			>>> from libshorttext.analyzer import *
			>>> analyzer = Analyzer('model_path')
			>>> insts = InstanceSet('prediction_result_path')
			>>> insts.load_text()
			>>> analyzer.analyze_single(insts[61], 3)
			                    Jewelry & Watches  Cameras & Photo  Coins & Paper Money
			pb                          7.589e-19        2.041e-01            0.000e+00
			green                      -8.897e-02        1.227e-02           -1.507e-01
			mm                          5.922e-01        6.731e-01            1.256e-03
			onyx silver                 1.382e-01       -6.198e-02           -4.743e-19
			48                         -1.792e-02        2.188e-02           -1.346e-04
			pendant                     1.107e+00       -1.039e-01           -1.409e-01
			silver pendant              2.455e-01       -7.826e-02           -8.379e-02
			silver                      8.533e-01       -2.205e-02            8.076e-01
			onyx                        1.520e-01       -6.198e-02           -4.743e-19
			**decval**                  9.937e-01        1.944e-01            1.444e-01
			>>> analyzer.analyze_single('MICKEY MOUSE POT STAKE', 3)
			                Home & Garden  Video Games & Consoles  Computers/Tablets & Networking
			mickey              9.477e-02              -3.168e-02                       6.722e-02
			mouse               2.119e-01               2.039e-01                      -2.212e-02
			pot                 8.897e-01              -5.167e-02                      -2.466e-02
			stake               4.057e-01              -2.147e-02                      -3.699e-02
			mickey mouse        1.146e-01              -3.168e-02                       6.784e-02
			mouse pot           4.041e-01              -2.147e-02                      -1.588e-02
			pot stake           5.363e-01              -2.147e-02                      -1.588e-02
			**decval**          1.004e+00               9.255e-03                       7.385e-03


		If *target* is a :class:`str` and extra svm files are used in
		training, the same number of extra svm features can be
		specified in *extra_svm_feats*. Extra svm features should be
		a list of dictionaries. If *target* is a :class:`TextInstance`,
		the extra features in the :class:`TextInstance` will be used.

		Raises :class:`Exception` if no model is loaded, if *target* is
		neither a string nor a :class:`TextInstance`, if the instance has no
		text loaded, or if the model yields no features/weights for it.
		'''
		if self.model is None:
			raise Exception('Model not loaded.')
		if extra_svm_feats is None:
			# Fresh list per call instead of a shared mutable default.
			extra_svm_feats = []

		if isinstance(target, str):
			text = target
			true_y = None
			result = predict_single_text(text, self.model, extra_svm_feats = extra_svm_feats)
			decvals = result.decvals
		elif isinstance(target, TextInstance):
			# The text of a TextInstance defaults to '' until it is loaded,
			# so test for emptiness rather than for None.
			if not target.text:
				raise Exception('Please load texts first.')
			text, extra_svm_feats, true_y = target.text, target.extra_svm_feats, target.true_y
			decvals = target.decvals
		else:
			raise Exception('"target" should be TextInstance or string.')

		features, weights, labels = self.model.get_weight(text, extra_svm_feats = extra_svm_feats)
		if not features or not weights:
			raise Exception('Invalid instance.')

		# Column order: classes sorted by decision value, largest first.
		sorted_idx = sorted(range(len(labels)), key = lambda i: decvals[i], reverse = True)
		labels = [labels[idx] for idx in sorted_idx]

		# One row per feature plus a final row holding the decision values.
		row_names = [' '.join(feature) for feature in features] + ['**decval**']
		weights_table = [
			['{0:.3e}'.format(weights[feat][idx]) for idx in sorted_idx]
			for feat in range(len(features))
		]
		weights_table.append(['{0:.3e}'.format(decvals[idx]) for idx in sorted_idx])

		if amount != 0:
			labels = labels[:amount]

		# Open the file only after validation so a failed call does not
		# truncate it, and close it if it was opened here.
		stream, owned = self._open_output(output)
		try:
			draw_table(row_names, labels, weights_table, stream)
			if true_y is not None:
				# Goes to the same destination as the table.
				write('True label: {0}'.format(true_y), stream)
		finally:
			if owned:
				stream.close()

	def _calculate_info(self, pred_insts):
		'''
		Compute and store on *pred_insts* its ``quantity``, ``correct``
		count, and the sets ``true_labels`` / ``predict_labels``.
		'''
		pred_insts.quantity = len(pred_insts.insts)
		pred_insts.true_labels, pred_insts.predict_labels, pred_insts.correct = \
			set(), set(), 0
		for inst in pred_insts.insts:
			pred_insts.true_labels.add(inst.true_y)
			pred_insts.predict_labels.add(inst.predicted_y)
			if inst.true_y == inst.predicted_y:
				pred_insts.correct += 1

	def info(self, pred_insts, output = None):
		'''
		:func:`info` gets information about a group of instances (an object
		of :class:`InstanceSet`). *pred_insts* is the target instances. If *output*
		is specified by a path to a file, the result will be outputted to the file
		instead of on the screen. Labels are listed in sorted order; the
		accuracy of an empty set is reported as ``0.0``.

			>>> from libshorttext.analyzer import *
			>>> analyzer = Analyzer('model_path')
			>>> insts = InstanceSet('prediction_result_path')
			>>> insts = insts.select(with_labels(['Books', 'Music', 'Art']))
			>>> analyzer.info(insts)
			Number of instances: 554
			Accuracy: 0.983754512635 (545/554)
			True labels: "Art"  "Books"  "Music"
			Predicted labels: "Art"  "Books"  "Music"
			Text source: /home/guestwalk/working/short_text/svn/software-dev/test_file
			Selectors:
			-> labels: "Books", "Music", "Art"
		'''
		if pred_insts.quantity is None:
			self._calculate_info(pred_insts)
		# Guard against an empty selection (would divide by zero).
		if pred_insts.quantity:
			acc = float(pred_insts.correct)/pred_insts.quantity
		else:
			acc = 0.0

		template = (
			'Number of instances: {quantity}\n'
			'Accuracy: {acc} ({correct}/{quantity}) \n'
			'True labels: {true_y}\n'
			'Predicted labels: {predicted_y}\n'
			'Text source: {text_src}\n'
			'Selectors: \n-> {selectors}'
		)
		string = template.format(
			quantity = pred_insts.quantity,
			correct = pred_insts.correct,
			acc = acc,
			true_y = '"' + '"  "'.join(sorted(pred_insts.true_labels)) + '"',
			predicted_y = '"' + '"  "'.join(sorted(pred_insts.predict_labels)) + '"',
			text_src = os.path.abspath(pred_insts.filepath),
			selectors = '\n-> '.join(pred_insts.selectors))

		stream, owned = self._open_output(output)
		try:
			write(string, stream)
		finally:
			if owned:
				stream.close()

	def gen_confusion_table(self, pred_insts, output = None):
		'''
		:func:`gen_confusion_table` generates a confusion table of a group of
		predicted instances *pred_insts*. If *output* is specified by a path
		to a file, the result will be outputted to the file instead of
		on the screen. Rows are true labels and columns are predicted
		labels, both in sorted order.

			>>> from libshorttext.analyzer import *
			>>> analyzer = Analyzer('model_path')
			>>> insts = InstanceSet('prediction_result_path')
			>>> insts = insts.select(with_labels(['Books', 'Music', 'Art']))
			>>> analyzer.gen_confusion_table(insts)
			         Art  Books  Music
			Art      162      6      0
			Books      0    169      1
			Music      0      2    214
		'''
		if pred_insts.quantity is None:
			self._calculate_info(pred_insts)
		# Sorted so the table layout does not depend on set iteration order.
		labels = sorted(pred_insts.true_labels.union(pred_insts.predict_labels))
		labels_dic = dict((label, position) for position, label in enumerate(labels))

		counts = [[0] * len(labels) for _ in labels]
		for inst in pred_insts.insts:
			if inst.true_y in labels_dic and inst.predicted_y in labels_dic:
				counts[labels_dic[inst.true_y]][labels_dic[inst.predicted_y]] += 1
		confusion_table = [[str(cell) for cell in row] for row in counts]

		stream, owned = self._open_output(output)
		try:
			draw_table(labels, labels, confusion_table, stream)
		finally:
			if owned:
				stream.close()
		# Existing behaviour kept for callers: a stream passed in by the
		# caller is closed once the table has been written.
		if stream is not None and not owned:
			stream.close()
411 | 	
def write(string, output = None):
	'''
	Emit *string* followed by a newline: to the file-like object *output*
	when one is given, otherwise to standard output via ``print``.
	'''
	if output is not None:
		output.write(string + '\n')
		return
	print(string)
417 | 
418 | 		
def draw_table(rows, columns, table, output = None):
	'''
	Write a text table through :func:`write`.

	*rows* and *columns* are iterables of header strings; *table* is
	indexed as ``table[row_idx][col_idx]`` and must hold strings. Row
	headers are left-justified, every other cell is right-justified, and
	each column is as wide as its longest entry plus two spaces of padding.
	With no rows only the header line is written.
	'''
	offset = 2
	# Materialize once so sets and one-shot iterables can be traversed
	# several times in a stable order.
	rows = list(rows)
	columns = list(columns)

	# ``+ [0]`` keeps max() valid when there are no rows at all.
	title_width = max([len(row) for row in rows] + [0]) + offset
	column_widths = [
		max([len(table[row_idx][col_idx]) for row_idx in range(len(rows))] + [len(column)]) + offset
		for col_idx, column in enumerate(columns)
	]

	header = ''.ljust(title_width) + ''.join(
		column.rjust(column_widths[col_idx]) for col_idx, column in enumerate(columns))
	write(header, output)

	for row_idx, row in enumerate(rows):
		cells = ''.join(
			table[row_idx][col_idx].rjust(column_widths[col_idx]) for col_idx in range(len(columns)))
		write(row.ljust(title_width) + cells, output)
438 | 


--------------------------------------------------------------------------------
/libshorttext/analyzer/selector.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | from random import sample
  4 | 
  5 | __all__ = ['wrong', 'with_labels', 'sort_by_dec', 'subset', 'selectorize', 'reverse']
  6 | 
  7 | def selectorize(option = 'general', comment = None):
  8 | 	"""
  9 | 	A function decorator which returns a function wrapper to generate a 
 10 | 	selector function.
 11 | 	
 12 | 	*option* can be ``'select'``, ``'sort'``, or ``'general'``. See the 
 13 | 	following table.
 14 | 	
 15 | 	+---------------+-----------------------------------------------------+
 16 | 	|   *option*    |      What should the defined function do?           |
 17 | 	+===============+=====================================================+
 18 | 	| ``'select'``  | The defined function should decide whether an       |
 19 | 	|               | instance should be selected or not. Therefore, the  |
 20 | 	|               | input is a :class:`TextInstance`, and the output    | 
 21 | 	|               | should be ``True`` or ``False``. ``True`` means that|
 22 | 	|               | this instance should be selected.                   |
 23 | 	+---------------+-----------------------------------------------------+
 24 | 	| ``'sort'``    | The defined function should return the key of an    |
 25 | 	|               | :class:`TextInstance` for sorting. The input is a   |
 26 | 	|               | :class:`TextInstance`, and the output should be a   |
 27 | 	|               | value or an object that is comparable.              |
 28 | 	+---------------+-----------------------------------------------------+
 29 | 	| ``'general'`` | Equivalent to the original function without applying| 
 30 | 	|               | the function wrapper. Therefore, the defined        |
 31 | 	|               | function's input and output are a list of           |
 32 | 	|               | :class:`TextInstance`.                              |
 33 | 	+---------------+-----------------------------------------------------+
 34 | 
 35 | 	For example, :func:`wrong` is equivalent to the following function::
 36 | 
 37 | 		@selectorize('select', 'Select wrongly predicted instances')
 38 | 		def wrong(inst):
 39 | 			return inst.true_y !=  inst.predicted_y
 40 | 	
 41 | 	And, :func:`sort_by_dec` is equivalent to the following function::
 42 | 		
 43 | 		@selectorize('sort', 'Sort by maximum decision values.')
 44 | 		def sort_by_dec(inst):
 45 | 			return max(inst.decvals)
 46 | 	
 47 | 	*comment* is the argument of the comment on the function, which will
 48 | 	be shown by the :meth:`libshorttext.analyzer.Analyzer.info`. See the
 49 | 	following example.
 50 | 
 51 | 	::
 52 | 
 53 | 		>>> from libshorttext.analyzer import *
 54 | 		>>> 
 55 | 		>>> @selectorize(comment = 'foo function')
 56 | 		>>> def foo(x):
 57 | 		>>> 	return x
 58 | 		>>> 
 59 | 		>>> insts = InstanceSet('predict_result_path').select(foo)
 60 | 		>>> Analyzer('model_path').info(insts)
 61 | 		[output skipped]
 62 | 		Selectors :
 63 | 		-> foo function
 64 | 	"""
 65 | 
 66 | 	def inner_func(input_func):
 67 | 		if option == "select":
 68 | 			def inner_func2(insts):
 69 | 				return list(filter(input_func, insts))
 70 | 		elif option == "sort":
 71 | 			def inner_func2(insts):
 72 | 				return sorted(insts, key = input_func)
 73 | 		elif option == "general":
 74 | 			inner_func2 = input_func
 75 | 		else:
 76 | 			raise Exception("No such setting.")
 77 | 		
 78 | 		if input_func is None or comment is None:
 79 | 			inner_func2._libshorttext_msg = "user-defined selector function"
 80 | 		else:
 81 | 			inner_func2._libshorttext_msg = comment
 82 | 		
 83 | 		inner_func2.__doc__ = input_func.__doc__
 84 | 		
 85 | 		return inner_func2
 86 | 	return inner_func
 87 | 
 88 | @selectorize('select', 'Select wrongly predicted instances')
 89 | def wrong(inst):
 90 | 	'''
 91 | 	Select wrongly predicted instances. It assumes that the labels in the
 92 | 	test data are true labels. 
 93 | 	
 94 | 	This function should be passed to :meth:`InstanceSet.select` without any 
 95 | 	argument.
 96 | 
 97 | 	>>> insts = InstanceSet('prediction_result_path').select(wrong)
 98 | 	'''
 99 | 	return inst.true_y !=  inst.predicted_y
100 | 
def with_labels(labels, target = 'both'):
	'''
	Select instances with specified labels. *labels* is an iterable object
	of :class:`str` instances, which represent the label names. A single
	string is treated as one label name.

	*target* can be ``'true'``, ``'predict'``, ``'both'``, ``'or'``. If
	*target* is ``'true'``, then this function finds instances based on the
	true label specified in the test data. If *target* is
	``'predict'``, it finds instances based on the predicted labels.
	``'both'`` and ``'or'`` find the intersection and the union of
	``'true'`` and ``'predict'``, respectively. The default value of
	``'target'`` is ``'both'``. Any other value makes the returned selector
	raise :class:`Exception` when it is applied to an instance.

	The following example selects instances where both the true label and
	the predicted label are ``'Music'`` or ``'Books'``.

	>>> insts = InstanceSet('prediction_result_path').select(with_labels(['Books', 'Music']))
	'''
	try:
		string_types = (str, unicode)
	except NameError:
		# Python 3: there is no separate ``unicode`` type.
		string_types = (str,)
	if isinstance(labels, string_types):
		# One label name, not an iterable of characters.
		labels = [labels]

	# Materialize once: a one-shot iterable would otherwise be exhausted by
	# the join below and match nothing afterwards.
	label_names = list(labels)
	wanted = frozenset(label_names)

	@selectorize('select', 'labels: "{0}"'.format('", "'.join(label_names)))
	def inner_func(inst):
		if target == 'both':
			return inst.true_y in wanted and inst.predicted_y in wanted
		elif target == 'or':
			return inst.true_y in wanted or inst.predicted_y in wanted
		elif target == 'true':
			return inst.true_y in wanted
		elif target == 'predict':
			return inst.predicted_y in wanted
		else:
			raise Exception("No such setting.")
	return inner_func
132 | 
@selectorize('sort', 'Sort by maximum decision values.')
def sort_by_dec(inst):
	'''
	Order instances by their largest decision value, smallest first. Chain
	it with :func:`reverse` to get the largest values first.

	>>> insts = InstanceSet('prediction_result_path').select(sort_by_dec, reverse)

	Pass this function itself (do not call it) to :meth:`InstanceSet.select`.
	'''
	largest_decval = max(inst.decvals)
	return largest_decval
145 | 
def subset(amount, method = 'top'):
	'''
	Build a selector that keeps *amount* instances of an
	:class:`InstanceSet`. With *method* ``'top'`` the first *amount*
	instances are kept; with ``'random'`` they are drawn at random. When
	*amount* exceeds the number of instances, all of them are returned.
	Any other *method* makes the selector raise :class:`Exception`.

	``'top'`` is most useful right after :func:`sort_by_dec`. The example
	below keeps the ten instances with the smallest decision values of the
	predicted label.

	>>> insts = InstanceSet('prediction_result_path').select(sort_by_dec, subset(10))
	'''
	description = 'Select {0} instances in {1}.'.format(amount, method)

	@selectorize(comment = description)
	def inner_func(insts):
		# Asking for more than is available returns the input unchanged.
		if amount > len(insts):
			return insts
		if method == 'random':
			return sample(insts, amount)
		if method == 'top':
			return insts[:amount]
		raise Exception("No such setting.")
	return inner_func
172 | 
173 | 
@selectorize(comment = 'Reverse the order of instances')
def reverse(insts):
	"""
	Return a new list holding the instances in reversed order.

	Pass this function itself to :meth:`InstanceSet.select`; do not call it.

	>>> insts = InstanceSet('prediction_result_path').select(reverse)
	"""
	return [inst for inst in reversed(insts)]
185 | 


--------------------------------------------------------------------------------
/libshorttext/classifier/__init__.py:
--------------------------------------------------------------------------------
"""
The :mod:`classifier` module is a high-level interface for training on
short-text data. Members of :mod:`classifier` include :class:`TextModel` and
its utility functions. A :class:`TextModel` is obtained in training and then
used in prediction.

The standard way to get a :class:`TextModel` instance is via function
:func:`train_text` or :func:`train_converted_text`, which train on
text data (refer to :ref:`dataset`) or LIBSVM-format data, respectively.

	>>> from libshorttext.classifier import *
	>>> # train a model
	>>> m, svm_file = train_text('train_file')
	>>> # save the model to a file
	>>> m.save('model_path')

After obtaining a :class:`TextModel`, users can use :func:`predict_text` or
:func:`predict_single_text` to predict the label of a new short text.

	>>> from libshorttext.classifier import *
	>>> # load a model from a file
	>>> m = TextModel('model_path')
	>>> # predict a sentence
	>>> result = predict_single_text('This is a sentence.', m)

Another class in module :mod:`classifier` is :class:`PredictionResult`, which
is a wrapper of prediction results. Both :func:`predict_text` and
:func:`predict_single_text` return a :class:`PredictionResult` object.

:mod:`classifier` does not access LIBLINEAR's low-level train and predict
utilities directly. All jobs are passed to a submodule called :mod:`learner`,
which is a middle-level classifier and communicates between :mod:`classifier`
and LIBLINEAR. Users can also use the :mod:`learner` module directly, without
:mod:`classifier`, for more complicated usages.
"""


# Re-export the implementation's public names at package level, then drop
# the submodule name itself from the package namespace.
from .classifier_impl import *
del classifier_impl
39 | 


--------------------------------------------------------------------------------
/libshorttext/classifier/grid.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | __all__ = ['find_parameters']
  3 | 
  4 | import os, sys, traceback, getpass, time, re
  5 | from threading import Thread
  6 | from subprocess import *
  7 | 
  8 | if sys.version_info[0] < 3:
  9 | 	from Queue import Queue
 10 | else:
 11 | 	from queue import Queue
 12 | 
# Hosts for which find_parameters starts one TelnetWorker thread each.
telnet_workers = []
# Hosts for which find_parameters starts one SSHWorker thread each.
ssh_workers = []
# Number of LocalWorker threads find_parameters starts on this machine.
nr_local_worker = 1
 16 | 
 17 | class GridOption:
 18 | 	def __init__(self, dataset_pathname, options):
 19 | 		dirname = os.path.dirname(__file__)
 20 | 		if sys.platform != 'win32':
 21 | 			self.svmtrain_pathname = os.path.join(dirname, '../svm-train')
 22 | 			self.gnuplot_pathname = '/usr/bin/gnuplot'
 23 | 		else:
 24 | 			# example for windows
 25 | 			self.svmtrain_pathname = os.path.join(dirname, r'..\windows\svm-train.exe')
 26 | 			# svmtrain_pathname = r'c:\Program Files\libsvm\windows\svm-train.exe'
 27 | 			self.gnuplot_pathname = r'c:\tmp\gnuplot\binary\pgnuplot.exe'
 28 | 		self.fold = 5
 29 | 		self.c_begin, self.c_end, self.c_step = -5,  15,  2
 30 | 		self.g_begin, self.g_end, self.g_step =  3, -15, -2
 31 | 		self.grid_with_c, self.grid_with_g = True, True
 32 | 		self.dataset_pathname = dataset_pathname
 33 | 		self.dataset_title = os.path.split(dataset_pathname)[1]
 34 | 		self.out_pathname = '{0}.out'.format(self.dataset_title)
 35 | 		self.png_pathname = '{0}.png'.format(self.dataset_title)
 36 | 		self.pass_through_string = ' '
 37 | 		self.resume_pathname = None
 38 | 		self.parse_options(options)
 39 | 
 40 | 	def parse_options(self, options):
 41 | 		if type(options) == str:
 42 | 			options = options.split()
 43 | 		i = 0
 44 | 		pass_through_options = []
 45 | 		
 46 | 		while i < len(options):
 47 | 			if options[i] == '-log2c':
 48 | 				i = i + 1
 49 | 				if options[i] == 'null':
 50 | 					self.grid_with_c = False
 51 | 				else:
 52 | 					self.c_begin, self.c_end, self.c_step = map(float,options[i].split(','))
 53 | 			elif options[i] == '-log2g':
 54 | 				i = i + 1
 55 | 				if options[i] == 'null':
 56 | 					self.grid_with_g = False
 57 | 				else:
 58 | 					self.g_begin, self.g_end, self.g_step = map(float,options[i].split(','))
 59 | 			elif options[i] == '-v':
 60 | 				i = i + 1
 61 | 				self.fold = options[i]
 62 | 			elif options[i] in ('-c','-g'):
 63 | 				raise ValueError('Use -log2c and -log2g.')
 64 | 			elif options[i] == '-svmtrain':
 65 | 				i = i + 1
 66 | 				self.svmtrain_pathname = options[i]
 67 | 			elif options[i] == '-gnuplot':
 68 | 				i = i + 1
 69 | 				if options[i] == 'null':
 70 | 					self.gnuplot_pathname = None
 71 | 				else:	
 72 | 					self.gnuplot_pathname = options[i]
 73 | 			elif options[i] == '-out':
 74 | 				i = i + 1
 75 | 				if options[i] == 'null':
 76 | 					self.out_pathname = None
 77 | 				else:
 78 | 					self.out_pathname = options[i]
 79 | 			elif options[i] == '-png':
 80 | 				i = i + 1
 81 | 				self.png_pathname = options[i]
 82 | 			elif options[i] == '-resume':
 83 | 				if i == (len(options)-1) or options[i+1].startswith('-'):
 84 | 					self.resume_pathname = self.dataset_title + '.out'
 85 | 				else:
 86 | 					i = i + 1
 87 | 					self.resume_pathname = options[i]
 88 | 			else:
 89 | 				pass_through_options.append(options[i])
 90 | 			i = i + 1
 91 | 
 92 | 		self.pass_through_string = ' '.join(pass_through_options)
 93 | 		if not os.path.exists(self.svmtrain_pathname):
 94 | 			raise IOError('svm-train executable not found')
 95 | 		if not os.path.exists(self.dataset_pathname):
 96 | 			raise IOError('dataset not found')
 97 | 		if self.resume_pathname and not os.path.exists(self.resume_pathname):
 98 | 			raise IOError('file for resumption not found')
 99 | 		if not self.grid_with_c and not self.grid_with_g:
100 | 			raise ValueError('-log2c and -log2g should not be null simultaneously')
101 | 		if self.gnuplot_pathname and not os.path.exists(self.gnuplot_pathname):
102 | 			sys.stderr.write('gnuplot executable not found\n')
103 | 			self.gnuplot_pathname = None
104 | 
def redraw(db,best_param,gnuplot,options,tofile=False):
	"""
	Send gnuplot the commands and data for a contour plot of the results.

	*db* is a list of ``(log2c, log2g, rate)`` entries; it is sorted in
	place. *best_param* is ``(best_log2c, best_log2g, best_rate)``.
	*gnuplot* is a writable binary stream (bytes are written to it and it
	is flushed at the end). *options* supplies the axis ranges, the title
	and the PNG path. With *tofile* true the plot goes to
	``options.png_pathname``; otherwise to the platform's interactive
	terminal. Nothing is written when *db* is empty or when all entries
	share the same c, the same g, or the same rate.
	"""
	if len(db) == 0: return
	# Lowest contour level: 3 below the (rounded) best rate seen so far.
	begin_level = round(max(x[2] for x in db)) - 3
	step_size = 0.5

	best_log2c,best_log2g,best_rate = best_param

	# if newly obtained c, g, or cv values are the same,
	# then stop redrawing the contour.
	if all(x[0] == db[0][0]  for x in db): return
	if all(x[1] == db[0][1]  for x in db): return
	if all(x[2] == db[0][2]  for x in db): return

	if tofile:
		gnuplot.write(b"set term png transparent small linewidth 2 medium enhanced\n")
		# Backslashes in the path are doubled for the quoted gnuplot string.
		gnuplot.write("set output \"{0}\"\n".format(options.png_pathname.replace('\\','\\\\')).encode())
		#gnuplot.write(b"set term postscript color solid\n")
		#gnuplot.write("set output \"{0}.ps\"\n".format(options.dataset_title).encode().encode())
	elif sys.platform == 'win32':
		gnuplot.write(b"set term windows\n")
	else:
		gnuplot.write( b"set term x11\n")
	gnuplot.write(b"set xlabel \"log2(C)\"\n")
	gnuplot.write(b"set ylabel \"log2(gamma)\"\n")
	gnuplot.write("set xrange [{0}:{1}]\n".format(options.c_begin,options.c_end).encode())
	gnuplot.write("set yrange [{0}:{1}]\n".format(options.g_begin,options.g_end).encode())
	gnuplot.write(b"set contour\n")
	gnuplot.write("set cntrparam levels incremental {0},{1},100\n".format(begin_level,step_size).encode())
	gnuplot.write(b"unset surface\n")
	gnuplot.write(b"unset ztics\n")
	gnuplot.write(b"set view 0,0\n")
	gnuplot.write("set title \"{0}\"\n".format(options.dataset_title).encode())
	gnuplot.write(b"unset label\n")
	# NOTE: the backslash-newline below is inside the string literal, so the
	# leading whitespace of the next source line is part of what is sent.
	gnuplot.write("set label \"Best log2(C) = {0}  log2(gamma) = {1}  accuracy = {2}%\" \
				  at screen 0.5,0.85 center\n". \
				  format(best_log2c, best_log2g, best_rate).encode())
	gnuplot.write("set label \"C = {0}  gamma = {1}\""
				  " at screen 0.5,0.8 center\n".format(2**best_log2c, 2**best_log2g).encode())
	gnuplot.write(b"set key at screen 0.9,0.9\n")
	gnuplot.write(b"splot \"-\" with lines\n")
	
	# Group rows by c (ascending); within one c, g goes from large to small.
	db.sort(key = lambda x:(x[0], -x[1]))

	prevc = db[0][0]
	for line in db:
		# A blank line separates the data of consecutive c values.
		if prevc != line[0]:
			gnuplot.write(b"\n")
			prevc = line[0]
		gnuplot.write("{0[0]} {0[1]} {0[2]}\n".format(line).encode())
	gnuplot.write(b"e\n")
	gnuplot.write(b"\n") # force gnuplot back to prompt when term set failure
	gnuplot.flush()
157 | 
158 | 
def calculate_jobs(options):
	"""
	Plan the grid search described by *options*.

	Returns ``(jobs, resumed_jobs)``. *jobs* is a list of lines, each a list
	of ``(log2c, log2g)`` pairs, ordered so that both axes are refined
	gradually from the middle of their ranges outwards. An axis that is not
	searched contributes the single value ``None``. *resumed_jobs* maps
	``(log2c, log2g)`` to the rate read from ``options.resume_pathname``;
	it is empty when no resume file is set.

	Raises :class:`ValueError` when a range step is zero (such a range
	would never terminate).
	"""

	def range_f(begin,end,step):
		# like range, but works on non-integer too
		if step == 0:
			raise ValueError('The step of a parameter range must be non-zero.')
		seq = []
		while True:
			if step > 0 and begin > end: break
			if step < 0 and begin < end: break
			seq.append(begin)
			begin = begin + step
		return seq

	def permute_sequence(seq):
		# Reorder so the middle element comes first, followed by the
		# recursively permuted left and right halves, interleaved.
		n = len(seq)
		if n <= 1: return seq

		mid = int(n/2)
		left = permute_sequence(seq[:mid])
		right = permute_sequence(seq[mid+1:])

		ret = [seq[mid]]
		while left or right:
			if left: ret.append(left.pop(0))
			if right: ret.append(right.pop(0))

		return ret

	c_seq = permute_sequence(range_f(options.c_begin,options.c_end,options.c_step))
	g_seq = permute_sequence(range_f(options.g_begin,options.g_end,options.g_step))

	if not options.grid_with_c:
		c_seq = [None]
	if not options.grid_with_g:
		g_seq = [None]

	nr_c = float(len(c_seq))
	nr_g = float(len(g_seq))
	i, j = 0, 0
	jobs = []

	# Advance whichever axis is proportionally behind, pairing the newly
	# added value with every value already taken from the other axis.
	while i < nr_c or j < nr_g:
		if i/nr_c < j/nr_g:
			# increase C resolution
			line = [(c_seq[i],g_seq[k]) for k in range(0,j)]
			i = i + 1
		else:
			# increase g resolution
			line = [(c_seq[k],g_seq[j]) for k in range(0,i)]
			j = j + 1
		jobs.append(line)

	resumed_jobs = {}

	if options.resume_pathname is None:
		return jobs, resumed_jobs

	# Each useful line looks like "log2c=<c> log2g=<g> rate=<rate>"; either
	# of the two parameters may be absent. Lines without a rate are skipped.
	with open(options.resume_pathname, 'r') as resume_file:
		for line in resume_file:
			line = line.strip()
			rst = re.findall(r'rate=([0-9.]+)',line)
			if not rst:
				continue
			rate = float(rst[0])

			c, g = None, None
			rst = re.findall(r'log2c=([0-9.-]+)',line)
			if rst:
				c = float(rst[0])
			rst = re.findall(r'log2g=([0-9.-]+)',line)
			if rst:
				g = float(rst[0])

			resumed_jobs[(c,g)] = rate

	return jobs, resumed_jobs
239 | 
240 | 	
class WorkerStopToken:
	"""
	Sentinel placed in the job queue (the class object itself is the token)
	to notify the workers to stop, or to signal that a worker is dead.
	"""
243 | 
class Worker(Thread):
	"""
	Base class of the grid-search worker threads.

	A worker repeatedly takes a ``(log2c, log2g)`` job from *job_queue*,
	evaluates it with :meth:`run_one` (provided by subclasses) and puts
	``(name, log2c, log2g, rate)`` into *result_queue*.
	"""
	def __init__(self,name,job_queue,result_queue,options):
		Thread.__init__(self)
		self.name = name
		self.job_queue = job_queue
		self.result_queue = result_queue
		self.options = options

	def run(self):
		"""
		Process jobs until the stop token is seen or a job fails.

		On the stop token, the token is put back so that the other workers
		see it too. On failure, the traceback is printed, the job is put
		back for another worker, and this worker quits.
		"""
		while True:
			(cexp,gexp) = self.job_queue.get()
			if cexp is WorkerStopToken:
				self.job_queue.put((cexp,gexp))
				# print('worker {0} stop.'.format(self.name))
				break
			try:
				# Jobs carry exponents; svm-train expects the actual values.
				c = 2.0**cexp if cexp is not None else None
				g = 2.0**gexp if gexp is not None else None
				rate = self.run_one(c,g)
				if rate is None: raise RuntimeError('get no rate')
			except Exception:
				# we failed, let others do that and we just quit
				traceback.print_exc()

				self.job_queue.put((cexp,gexp))
				sys.stderr.write('worker {0} quit.\n'.format(self.name))
				break
			else:
				self.result_queue.put((self.name,cexp,gexp,rate))

	def get_cmd(self,c,g):
		"""
		Return the svm-train command line for the parameter values *c* and
		*g*. ``-c`` / ``-g`` are only included for the axes being searched.
		"""
		options=self.options
		cmdline = options.svmtrain_pathname
		if options.grid_with_c:
			cmdline += ' -c {0} '.format(c)
		if options.grid_with_g:
			cmdline += ' -g {0} '.format(g)
		cmdline += ' -v {0} {1} {2} '.format\
			(options.fold,options.pass_through_string,options.dataset_pathname)
		return cmdline
288 | 		
class LocalWorker(Worker):
	"""Worker that runs svm-train as a child process on this machine."""
	def run_one(self,c,g):
		"""
		Run one cross validation and return the reported rate as a float,
		or ``None`` when the output contains no line with ``Cross``.
		"""
		cmdline = self.get_cmd(c,g)
		process = Popen(cmdline,shell=True,stdout=PIPE,stderr=PIPE,stdin=PIPE)
		# communicate() drains both output pipes, closes all three pipes and
		# waits for the child, so no process or file descriptor is left behind.
		output = process.communicate()[0]
		for line in output.splitlines():
			if str(line).find('Cross') != -1:
				# Last token is the rate followed by '%'; drop the sign.
				return float(line.split()[-1][0:-1])
296 | 
class SSHWorker(Worker):
	"""
	Worker that runs svm-train on *host* through ``ssh``, after changing to
	the directory that was current when the worker was created.
	"""
	def __init__(self,name,job_queue,result_queue,host,options):
		Worker.__init__(self,name,job_queue,result_queue,options)
		self.host = host
		self.cwd = os.getcwd()
	def run_one(self,c,g):
		"""
		Run one cross validation remotely and return the reported rate as a
		float, or ``None`` when the output contains no line with ``Cross``.
		"""
		cmdline = 'ssh -x -t -t {0} "cd {1}; {2}"'.format\
			(self.host,self.cwd,self.get_cmd(c,g))
		process = Popen(cmdline,shell=True,stdout=PIPE,stderr=PIPE,stdin=PIPE)
		try:
			# stdin stays open while reading, as ssh runs with a forced tty.
			for line in process.stdout.readlines():
				if str(line).find('Cross') != -1:
					# Last token is the rate followed by '%'; drop the sign.
					return float(line.split()[-1][0:-1])
		finally:
			# Release the pipes and reap the child so neither leaks.
			for pipe in (process.stdin, process.stdout, process.stderr):
				pipe.close()
			process.wait()
309 | 
class TelnetWorker(Worker):
	"""
	Worker that logs in to *host* via telnet and runs svm-train there.

	All data exchanged with :mod:`telnetlib` is bytes, which is what the
	library requires under Python 3 (and what ``str`` already is under
	Python 2).
	"""
	def __init__(self,name,job_queue,result_queue,host,username,password,options):
		Worker.__init__(self,name,job_queue,result_queue,options)
		self.host = host
		self.username = username
		self.password = password

	@staticmethod
	def _to_bytes(text):
		# Pass bytes through untouched; encode text strings.
		if isinstance(text, bytes):
			return text
		return text.encode()

	def run(self):
		"""Log in, change to the current directory, process jobs, log out."""
		import telnetlib
		to_bytes = self._to_bytes
		self.tn = tn = telnetlib.Telnet(self.host)
		tn.read_until(b'login: ')
		tn.write(to_bytes(self.username) + b'\n')
		tn.read_until(b'Password: ')
		tn.write(to_bytes(self.password) + b'\n')

		# XXX: how to know whether login is successful?
		tn.read_until(to_bytes(self.username))
		#
		print('login ok', self.host)
		tn.write(b'cd ' + to_bytes(os.getcwd()) + b'\n')
		Worker.run(self)
		tn.write(b'exit\n')
	def run_one(self,c,g):
		"""
		Run one cross validation in the telnet session and return the
		reported rate as a float, or ``None`` if no ``Cross`` line is found.
		"""
		cmdline = self.get_cmd(c,g)
		self.tn.write(self._to_bytes(cmdline) + b'\n')
		(idx,matchm,output) = self.tn.expect([b'Cross.*\n'])
		for line in output.split(b'\n'):
			if b'Cross' in line:
				# Last token is the rate followed by '%'; drop the sign.
				return float(line.split()[-1][0:-1])
338 | 			
def find_parameters(dataset_pathname, options=''):
	"""
	Grid-search the svm-train parameters C and gamma by cross validation.

	*dataset_pathname* is the data file; *options* is a string or list of
	grid options and svm-train options (see :class:`GridOption`). Jobs are
	distributed over the telnet, ssh and local workers configured by the
	module-level ``telnet_workers``, ``ssh_workers`` and ``nr_local_worker``.
	Progress is printed, optionally written to the output file and plotted
	with gnuplot.

	Returns ``(best_rate, best_param)`` where *best_param* is a dict holding
	the best value under ``'c'`` and/or ``'g'`` for the axes searched.
	"""

	def update_param(c,g,rate,best_c,best_g,best_rate,worker,resumed):
		# Prefer a higher rate; on a tie with the same g, prefer the smaller c.
		if (rate > best_rate) or (rate==best_rate and g==best_g and c<best_c):
			best_rate,best_c,best_g = rate,c,g
		stdout_str = '[{0}] {1} {2} (best '.format(worker,' '.join(map(str,[c,g])),rate)
		output_str = ''
		if c is not None:
			stdout_str += 'c={0}, '.format(2.0**best_c)
			output_str += 'log2c={0} '.format(c)
		if g is not None:
			stdout_str += 'g={0}, '.format(2.0**best_g)
			output_str += 'log2g={0} '.format(g)
		stdout_str += 'rate={0})'.format(best_rate)
		print(stdout_str)
		# Resumed results are already in the file; do not write them again.
		if options.out_pathname and not resumed:
			output_str += 'rate={0}\n'.format(rate)
			result_file.write(output_str)
			result_file.flush()

		return best_c,best_g,best_rate

	options = GridOption(dataset_pathname, options)

	if options.gnuplot_pathname:
		gnuplot = Popen(options.gnuplot_pathname,stdin = PIPE,stdout=PIPE,stderr=PIPE).stdin
	else:
		gnuplot = None

	# put jobs in queue

	jobs,resumed_jobs = calculate_jobs(options)
	job_queue = Queue(0)
	result_queue = Queue(0)

	for (c,g) in resumed_jobs:
		result_queue.put(('resumed',c,g,resumed_jobs[(c,g)]))

	for line in jobs:
		for (c,g) in line:
			if (c,g) not in resumed_jobs:
				job_queue.put((c,g))

	# hack the queue to become a stack --
	# this is important when some thread
	# failed and re-put a job. It we still
	# use FIFO, the job will be put
	# into the end of the queue, and the graph
	# will only be updated in the end

	job_queue._put = job_queue.queue.appendleft

	# fire telnet workers

	if telnet_workers:
		username = getpass.getuser()
		password = getpass.getpass()
		for host in telnet_workers:
			worker = TelnetWorker(host,job_queue,result_queue,
					 host,username,password,options)
			worker.start()

	# fire ssh workers

	if ssh_workers:
		for host in ssh_workers:
			worker = SSHWorker(host,job_queue,result_queue,host,options)
			worker.start()

	# fire local workers

	for i in range(nr_local_worker):
		worker = LocalWorker('local',job_queue,result_queue,options)
		worker.start()

	# gather results

	done_jobs = {}
	result_file = None

	# From here on worker threads are running: whatever happens, close the
	# output file and hand out the stop token, otherwise the workers would
	# block on the job queue forever.
	try:
		if options.out_pathname:
			if options.resume_pathname:
				result_file = open(options.out_pathname, 'a')
			else:
				result_file = open(options.out_pathname, 'w')

		db = []
		best_rate = -1
		best_c,best_g = None,None

		for (c,g) in resumed_jobs:
			rate = resumed_jobs[(c,g)]
			best_c,best_g,best_rate = update_param(c,g,rate,best_c,best_g,best_rate,'resumed',True)

		for line in jobs:
			for (c,g) in line:
				# Results arrive in any order; consume them until the one
				# needed for this position of the plan is available.
				while (c,g) not in done_jobs:
					(worker,c1,g1,rate1) = result_queue.get()
					done_jobs[(c1,g1)] = rate1
					if (c1,g1) not in resumed_jobs:
						best_c,best_g,best_rate = update_param(c1,g1,rate1,best_c,best_g,best_rate,worker,False)
				db.append((c,g,done_jobs[(c,g)]))
			if gnuplot and options.grid_with_c and options.grid_with_g:
				redraw(db,[best_c, best_g, best_rate],gnuplot,options)
				redraw(db,[best_c, best_g, best_rate],gnuplot,options,True)
	finally:
		if result_file is not None:
			result_file.close()
		job_queue.put((WorkerStopToken,None))

	best_param, best_cg  = {}, []
	if best_c is not None:
		best_param['c'] = 2.0**best_c
		best_cg += [2.0**best_c]
	if best_g is not None:
		best_param['g'] = 2.0**best_g
		best_cg += [2.0**best_g]
	print('{0} {1}'.format(' '.join(map(str,best_cg)), best_rate))

	return best_rate, best_param
460 | 
461 | 
if __name__ == '__main__':

	def exit_with_help():
		# Print the usage text and terminate with exit status 1.
		usage = """\
Usage: grid.py [grid_options] [svm_options] dataset

grid_options :
-log2c {begin,end,step | "null"} : set the range of c (default -5,15,2)
    begin,end,step -- c_range = 2^{begin,...,begin+k*step,...,end}
    "null"         -- do not grid with c
-log2g {begin,end,step | "null"} : set the range of g (default 3,-15,-2)
    begin,end,step -- g_range = 2^{begin,...,begin+k*step,...,end}
    "null"         -- do not grid with g
-v n : n-fold cross validation (default 5)
-svmtrain pathname : set svm executable path and name
-gnuplot {pathname | "null"} :
    pathname -- set gnuplot executable path and name
    "null"   -- do not plot 
-out {pathname | "null"} : (default dataset.out)
    pathname -- set output file path and name
    "null"   -- do not output file
-png pathname : set graphic output file path and name (default dataset.png)
-resume [pathname] : resume the grid task using an existing output file (default pathname is dataset.out)
    This is experimental. Try this option only if some parameters have been checked for the SAME data.

svm_options : additional options for svm-train"""
		print(usage)
		sys.exit(1)

	# At least the data set must be given on the command line.
	if not sys.argv[1:]:
		exit_with_help()
	# The last argument is the data set; everything before it is an option.
	dataset_pathname, options = sys.argv[-1], sys.argv[1:-1]
	try:
		find_parameters(dataset_pathname, options)
	except (IOError,ValueError) as err:
		sys.stderr.write(str(err) + '\n')
		sys.stderr.write('Try "grid.py" for more information.\n')
		sys.exit(1)
500 | 


--------------------------------------------------------------------------------
/libshorttext/classifier/learner/Makefile:
--------------------------------------------------------------------------------
 1 | all = lib
 2 | TARGET = util
 3 | SHVER = 1
 4 | OS = $(shell uname)
 5 | 
 6 | all: lib  test
 7 | 	make -C liblinear
 8 | 	make -C liblinear/python
 9 | 
10 | test: util.c
11 | 	g++ -fPIC -Iliblinear test.cpp -o test
12 | lib: ${TARGET}.o
13 | 	if [ "$(OS)" = "Darwin" ]; then \
14 | 		SHARED_LIB_FLAG="-dynamiclib -Wl,-install_name,${TARGET}.so.$(SHVER)"; \
15 | 	else \
16 | 		SHARED_LIB_FLAG="-shared -Wl,-soname,${TARGET}.so.$(SHVER)"; \
17 | 	fi; \
18 | 	gcc $${SHARED_LIB_FLAG} ${TARGET}.o -o ${TARGET}.so.1
19 | 
20 | ${TARGET}.o: ${TARGET}.c
21 | 	gcc -fPIC -Iliblinear -O3 -c -o ${TARGET}.o ${TARGET}.c
22 | 
23 | clean:
24 | 	rm -rf ${TARGET}.o ${TARGET}.so.1 *pyc test
25 | 	make -C liblinear clean
26 | 


--------------------------------------------------------------------------------
/libshorttext/classifier/learner/__init__.py:
--------------------------------------------------------------------------------
"""
The middle-level classifier :mod:`learner` is used to train on or predict
LIBSVM-format data. This module extends the LIBLINEAR python interface to
provide more utilities such as various feature representations and
instance-wise normalization. The only difference between this module and the
standard LIBLINEAR python interface is that :mod:`learner` provides more
utilities, e.g., instance normalization, tf-idf, and binary feature
representation. We call it a middle-level classifier because it provides an
interface between :mod:`libshorttext.classifier` and LIBLINEAR.
Note that some of the utilities of :mod:`learner` are implemented in C
for efficiency.

.. note::

	If the data set is in text format, use :mod:`libshorttext.classifier`
	rather than :mod:`learner`.

:mod:`learner` has three utility functions and one model class. If users
want to replace the :mod:`learner` module by their own implementation, they
need to implement the three utility functions and :class:`LearnerModel`,
which will be used by :mod:`libshorttext.classifier` and
:mod:`libshorttext.analyzer`.
"""


# Re-export the implementation's public names at package level, then drop
# the submodule name itself from the package namespace.
from .learner_impl import *
del learner_impl
26 | 


--------------------------------------------------------------------------------
/libshorttext/classifier/learner/learner_impl.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | from ctypes import *
  4 | from ctypes.util import find_library
  5 | import sys
  6 | import os
  7 | from os import path
  8 | import shutil
  9 | 
# Python 2/3 compatibility: make the Python 2 names used in this module
# (xrange, cPickle, izip, unicode) available under Python 3 as well.
if sys.version_info[0] >= 3:
	xrange = range
	import pickle as cPickle
	izip = zip
	# Stand-in for Python 2's unicode(): returns the string unchanged and
	# ignores the second argument.
	def unicode(string, setting):
		return string
else :
	import cPickle
	from itertools import izip

# Load the C helper library from the directory containing this module.
util = CDLL(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'util.so.1'))

# LIBLINEAR location: $LIBLINEAR_HOME when set to a non-empty value,
# otherwise the 'liblinear' directory next to this module.
LIBLINEAR_HOME = os.environ.get('LIBLINEAR_HOME') or os.path.dirname(os.path.abspath(__file__)) + '/liblinear'
# Put it first on the search path so the two imports below resolve there.
sys.path = [LIBLINEAR_HOME, LIBLINEAR_HOME + '/python'] + sys.path

import liblinear
from liblinearutil import train as liblinear_train, predict as liblinear_predict, save_model as liblinear_save_model, load_model as liblinear_load_model

# Names exported by `from learner_impl import *`.
__all__ = ['LearnerParameter', 'LearnerModel', 
		'train', 'predict_one', 'predict', 'LIBLINEAR_HOME']
 30 | 
 31 | def print_debug(src):
 32 | 	if os.environ.get('SHORTTEXTDEBUG'):
 33 | 		print('[DEBUG]: ' + src)
 34 | 
 35 | def fillprototype(f, restype, argtypes): 
 36 | 	f.restype = restype
 37 | 	f.argtypes = argtypes
 38 | 
 39 | def genFields(names, types): 
 40 | 	return list(zip(names, types))
 41 | 
 42 | #--------------Interface to util---------------
class SVMProblem(Structure):
	"""
	ctypes structure returned by ``util.read_problem``: a LIBLINEAR problem
	together with a pointer to feature nodes (``x_space``) and a 64-bit
	integer ``n_x_space``.
	"""
	# Field names and their ctypes types, in declaration order.
	# NOTE(review): presumably this mirrors the struct defined in util.c --
	# keep names, types and order in sync with it.
	_names = ["prob", "x_space", "n_x_space"]
	_types = [liblinear.problem, POINTER(liblinear.feature_node), c_int64]
	_fields_ = genFields(_names, _types)

	def __del__ (self):
		# Hand the structure back to the C library when Python drops it.
		print_debug('SVMProblem delete:%s'% id(self))
		util.freeSVMProblem(self)
 51 | 
 52 | def read_SVMProblem(src):
 53 | 	status = c_int64()
 54 | 	svmprob = util.read_problem(src.encode(), 0, pointer(status))  # bias = 0 is required
 55 | 
 56 | 	status = status.value
 57 | 
 58 | 	if status == 0:	
 59 | 		print_debug('SVMProblem construct:%s'% id(svmprob))
 60 | 		return svmprob
 61 | 
 62 | 	if status == -1:
 63 | 		raise IOError("Can not open file " + src + ".")
 64 | 
 65 | 	if status == -2:
 66 | 		raise MemoryError("Memory Exhausted. Try to restart python.")
 67 | 
 68 | 	raise ValueError("Wrong file format in line " + str(status) + ".")
 69 | 
 70 | 
# Declare the return type and argument types of the functions exported by
# the util library, so that ctypes converts arguments and results correctly.
fillprototype(util.read_problem, SVMProblem, [c_char_p, c_double, POINTER(c_int64)])
fillprototype(util.freeSVMProblem, None, [SVMProblem])
fillprototype(util.compute_idf, c_double, [POINTER(liblinear.problem), POINTER(c_double)])
fillprototype(util.normalize, None, [POINTER(liblinear.problem), c_int, c_int, c_int, c_int, POINTER(c_double)])
 75 | 
 76 | class LearnerProblem(liblinear.problem):
 77 | 	def __init__(self, src):
 78 | 		#svmprob = util.read_problem(src.encode(), 0)  # bias = 0 is required
 79 | 		svmprob = read_SVMProblem(src)  # bias = 0 is required
 80 | 		self.x = svmprob.prob.x
 81 | 		self.y = svmprob.prob.y
 82 | 		self.l = svmprob.prob.l
 83 | 		self.n = svmprob.prob.n
 84 | 		self.bias = svmprob.prob.bias
 85 | 		self.x_space = svmprob.x_space
 86 | 		self.n_x_space = svmprob.n_x_space
 87 | 		print_debug('LearnerProblem construct:%s'% id(svmprob))
 88 | 
 89 | 	def set_bias(self, bias):
 90 | 		if self.bias == bias:
 91 | 			return
 92 | 		node = liblinear.feature_node(self.n, bias) 
 93 | 		if bias >= 0 and self.bias < 0: 
 94 | 			self.n += 1
 95 | 			node = liblinear.feature_node(self.n, bias)
 96 | 		if bias < 0 and self.bias >= 0: 
 97 | 			self.n -= 1
 98 | 			node = liblinear.feature_node(-1, bias)
 99 | 
100 | 		for i in range(1,self.l):
101 | 			self.x[i][-2] = node
102 | 		self.x_space[self.n_x_space-2] = node
103 | 		self.bias = bias
104 | 
105 | 	def normalize(self, learner_param, idf):
106 | 		print_debug ("normal parameters: bin_feat {0}, inst_norm {1}, tf {2}, idf {3}\n".format(learner_param.binary_feature,
107 | 			learner_param.inst_normalization,
108 | 			learner_param.term_frequency,
109 | 			learner_param.inverse_document_frequency,
110 | 		))
111 | 		util.normalize(pointer(self),
112 | 			learner_param.binary_feature,
113 | 			learner_param.inst_normalization,
114 | 			learner_param.term_frequency,
115 | 			learner_param.inverse_document_frequency,
116 | 			idf)
117 | 
118 | 	@staticmethod
119 | 	def normalize_one(xi, learner_param, idf):
120 | 		"""
121 | 		The maximum index of xi should be less
122 | 		or equal to the weight vector size.
123 | 		"""
124 | 		norm = 0
125 | 		word_count = 0
126 | 		i = 0
127 | 		while xi[i].index != -1:
128 | 			idx = xi[i].index-1
129 | 			if learner_param.binary_feature:
130 | 				xi[i].value = xi[i].value != 0
131 | 
132 | 			word_count += abs(xi[i].value)
133 | 
134 | 			if learner_param.inverse_document_frequency and idx < len(idf):
135 | 				xi[i].value *= idf[idx]
136 | 
137 | 			norm += xi[i].value * xi[i].value
138 | 			i += 1
139 | 
140 | 		norm **= .5
141 | 
142 | 
143 | 		if learner_param.term_frequency:
144 | 			i = 0
145 | 			while xi[i].index != -1:
146 | 				xi[i].value /= word_count
147 | 				i += 1
148 | 
149 | 		if learner_param.inst_normalization:
150 | 			i = 0
151 | 			while xi[i].index != -1:
152 | 				xi[i].value /= norm 
153 | 				i += 1
154 | 
155 | 	def compute_idf(self):
156 | 		idf = (c_double * self.n)()
157 | 		util.compute_idf(self, idf)
158 | 		return idf
159 | 
class LearnerParameter(liblinear.parameter):
	r"""
	:class:`LearnerParameter` is the parameter structure used by 
	:class:`LearnerModel`. It consists of normalization parameters and 
	LIBLINEAR parameters.

	*liblinear_opts* is passed on to LIBLINEAR; *learner_opts* is a
	:class:`str` or a :class:`list` of :class:`str`. For example, you can
	write either
	
	>>> param = LearnerParameter('-N 1 -T 1', '-c 2 -e 1e-2')

	or

	>>> param = LearnerParameter(['-N', '1', '-T', '1'], ['-c', '2', '-e', '1e-2'])

	*liblinear_opts* is LIBLINEAR's parameters. Refer to LIBLINEAR's 
	document for more details. *learner_opts* includes options for feature
	representation and instance-wise normalization. The preprocessor of
	LibShortText converts text files to LIBSVM-format data, where the 
	features are word counts. All *value* in the options should be either 
	``1`` or ``0``, where ``1`` enables the option.

		========== ====================================================
		options    explanation when *value* is ``1``
		========== ====================================================
		-D *value* Binary representation. All non-zero values are 
		           treated as 1. Default is enabled.
		-T *value* Term frequency. The data are divided by the feature
		           sum. That is, 
		           :math:`x_i \leftarrow (x_i)/\sum_j |x_j|`,
		           where :math:`x` is the training instance and 
		           :math:`x_i` is the :math:`i`-th feature of :math:`x`.
		           Default is disabled.
		-I *value* Inverse document frequency (idf). Default is 
		           disabled.
		-N *value* Instance normalization. The training instances are 
		           normalized to unit vectors before training. Default
		           is enabled.
		========== ====================================================
			   
	Note that if more than one option is enabled, then they are done in the
	order: binary representation, term frequency, IDF, and instance 
	normalization. The following example is tf-idf representation without
	instance normalization.

	>>> param = LearnerParameter('-D 0 -T 1 -I 1 -N 0', liblinear_opts)

	"""
	def __init__(self, learner_opts = '', liblinear_opts = ''):
		self.parse_options(learner_opts, liblinear_opts)
		
	def set_to_default_values(self):
		"""
		Set the options to their default values 
		(``'-D 1 -T 0 -I 0 -N 1'``), together with LIBLINEAR's defaults.
		"""
		liblinear.parameter.set_to_default_values(self) 
		self.binary_feature = 1
		self.inst_normalization = 1
		self.term_frequency = 0
		self.inverse_document_frequency = 0
	
	def parse_options(self, learner_opts, liblinear_opts):
		"""
		Set the options to the specific values.

		Raises :class:`TypeError` when *learner_opts* is neither a list nor
		a string, and :class:`ValueError` for an unknown option, an option
		without a value, or a value that is not an integer.
		"""
		
		self.raw_options = (learner_opts, liblinear_opts)
		if isinstance(learner_opts, list):
			argv = learner_opts
		elif isinstance(learner_opts, str):
			argv = learner_opts.split()
		else:
			raise TypeError("Wrong types")
		self.set_to_default_values()
		liblinear.parameter.parse_options(self, liblinear_opts)
		
		# Option flag -> attribute that stores its integer value.
		flag_to_attr = {
			"-D": "binary_feature",
			"-N": "inst_normalization",
			"-I": "inverse_document_frequency",
			"-T": "term_frequency",
		}
		i = 0
		while i < len(argv):
			flag = argv[i]
			if flag not in flag_to_attr:
				raise ValueError('No option ' + flag) 
			if i + 1 >= len(argv):
				raise ValueError('Option ' + flag + ' requires a value')
			setattr(self, flag_to_attr[flag], int(argv[i + 1]))
			# Skip the flag and its value.
			i = i + 2
254 | 
255 | 
class LearnerModel(liblinear.model):
	"""
	:class:`LearnerModel` is a middle-level classification model. It 
	inherits from :class:`liblinear.model` by having two more members:
	a :class:`LearnerParameter` instance and an inverse document frequency list.

	We do not recommend users to create a :class:`LearnerModel` by themselves. 
	Instead, users should create and manipulate a :class:`LearnerModel`
	via :func:`train`, :func:`predict`, and :func:`predict_one`.
	
	If users want to redefine :class:`LearnerModel`, they must 
	implement the following four methods used by 
	:mod:`libshorttext.classifier` and :mod:`libshorttext.analyzer`:
	:meth:`get_weight`, :meth:`get_labels`, :meth:`load` and :meth:`save`.
	"""

	def _reconstruct_label_idx(self):
		"""
		Rebuild ``self.labelidx``, a dict mapping each class label to
		its position in the label array of the wrapped model. Nothing
		is done when ``self.c_model`` is ``None``.
		"""
		if self.c_model is not None:
			nr_class = self.c_model.nr_class
			self.labelidx = dict(zip(self.c_model.label[:nr_class], range(nr_class)))


	def __init__(self, c_model, param = None, idf = None):
		"""
		constructor of :class:`LearnerModel`.

		*c_model* is either the path of a model directory (a
		:class:`str`), in which case the model is read with
		:meth:`load` and the other arguments are ignored, or a
		:class:`liblinear.model`. In the latter case *param* is
		required and must be a :class:`LearnerParameter` or the tuple
		``(learner_opts, liblinear_opts)``. *idf* is an optional
		sequence of inverse document frequencies.

		Raises :class:`ValueError` if a model is given without *param*
		and :class:`TypeError` if *c_model* or *param* has an
		unsupported type.
		"""
		
		print_debug('c_model(%s), self(%s)' % (id(c_model), id(self)))

		if isinstance(c_model, str):
			# load() calls __init__ again with the loaded model.
			self.load(c_model)
			return
		elif isinstance(c_model, liblinear.model):
			if param is None:
				raise ValueError("param can not be None if model is given.")
		else:
			raise TypeError("c_model should be model file name or a model.")

		self.c_model = c_model # prevent GC
		
		if isinstance(param, LearnerParameter):
			self.param_options = param.raw_options
		elif isinstance(param, tuple):
			self.param_options = param
		else:
			raise TypeError("param should be a LearnerParameter or a tuple.")
		
		if idf is not None:
			# Keep one value per feature, plus one slot when a bias
			# term is used (bias >= 0).
			self.idf = idf[:self.c_model.nr_feature + (self.c_model.bias >= 0)] 
		else:
			self.idf = None

		# Mirror the fields listed in c_model._names on this object.
		for attr in c_model._names:
			setattr(self, attr, getattr(c_model, attr))

		self._reconstruct_label_idx()

	def get_weight(self, j, k):
		"""
		Return the weight of feature *j* and label *k*.

		*j* is a 1-based feature index and *k* is a class label known
		to the model (:class:`KeyError` otherwise).
		"""
		# NOTE(review): the stride is nr_class; confirm this also holds
		# for two-class models, where LIBLINEAR may store one weight
		# per feature.
		return self.c_model.w[(j-1)*self.c_model.nr_class + self.labelidx[k]]

	def get_labels(self):
		"""
		Return the labels of this model.
		"""
		return self.label[:self.nr_class]

	def load(self, model_dir):
		"""
		Load the contents from a :class:`TextModel` directory. 

		The directory is expected to contain ``liblinear_model``,
		``options.pickle`` and ``idf.pickle``, as written by
		:meth:`save`.
		"""
		
		self.c_model = liblinear_load_model(path.join(model_dir,'liblinear_model'))
		
		# SECURITY: the two files below are unpickled. Only load model
		# directories that come from a trusted source.
		options_file = path.join(model_dir,'options.pickle')
		with open(options_file,'rb') as f:
			self.param_options = cPickle.load(f)
		
		idf_file = path.join(model_dir,'idf.pickle')
		with open(idf_file,'rb') as f:
			self.idf = cPickle.load(f)
		
		self.__init__(self.c_model, LearnerParameter(self.param_options[0], self.param_options[1]), self.idf)

	def save(self, model_dir, force=False):
		"""
		Save the model to a directory. If *force* is set to ``True``, 
		the existing directory will be overwritten; otherwise, 
		:class:`OSError` will be raised.
		"""

		if path.exists(model_dir): 
			if force: 
				shutil.rmtree(model_dir)
			else : 
				raise OSError('Please use force option to overwrite the existing files.')
		os.mkdir(model_dir)

		liblinear_save_model(path.join(model_dir,'liblinear_model'), self.c_model) 

		# Close each file explicitly so the pickles are flushed to disk
		# before this method returns.
		options_file = path.join(model_dir,'options.pickle')
		with open(options_file,'wb') as f:
			cPickle.dump(self.param_options, f, -1)
		
		idf_file = path.join(model_dir,'idf.pickle')
		with open(idf_file,'wb') as f:
			cPickle.dump(self.idf, f, -1)

	def __str__(self):
		"""
		Return ``'LearnerModel: '`` followed by the learner options
		(``'default'`` when they are empty), or ``'empty LearnerModel'``
		when no options are attached.
		"""
		options = getattr(self, 'param_options', None)
		if isinstance(options, tuple) and len(options) > 0:
			learner_opts = options[0]
			# Options may be stored as a list of tokens.
			if isinstance(learner_opts, list):
				learner_opts = ' '.join(str(token) for token in learner_opts)
			return 'LearnerModel: ' + (learner_opts or 'default')
		else:
			return 'empty LearnerModel'
367 | 
def train(data_file_name, learner_opts="", liblinear_opts=""):
	"""
	Train on a LIBSVM-format file and return a :class:`LearnerModel`.

	*data_file_name* is the file path of the LIBSVM-format data.
	*learner_opts* is a :class:`str`; refer to :ref:`learner_param`.
	*liblinear_opts* is a :class:`str` of LIBLINEAR's parameters; refer
	to LIBLINEAR's document.

	When the parsed parameters have ``cross_validation`` set, the value
	returned by ``liblinear_train`` is returned unchanged instead of a
	:class:`LearnerModel`.
	"""

	prob = LearnerProblem(data_file_name)
	param = LearnerParameter(learner_opts, liblinear_opts)

	# idf is only computed when the -I option is enabled.
	idf = prob.compute_idf() if param.inverse_document_frequency else None
	prob.normalize(param, idf)

	trained = liblinear_train(prob, param)
	if param.cross_validation:
		return trained

	trained.x_space = None  # This is required to reduce the memory usage...
	return LearnerModel(trained, param, idf)
391 | 
def predict_one(xi, m):
	"""
	Predict a single instance *xi* with :class:`LearnerModel` *m* and
	return the label together with a :class:`c_double` array of decision
	values.

	*xi* can be a :class:`list` or a :class:`dict` as in LIBLINEAR python 
	interface. It can also be a LIBLINEAR feature_node array. Any other
	type raises :class:`TypeError`.

	.. note::

		This function is designed to analyze the result of one instance.
		It has a severe efficiency issue and should be used only by
		:func:`libshorttext.classifier.predict_single_text`. If many 
		instances need to be predicted, they should be stored in a file
		and predicted by :func:`predict`.

	.. warning::

		The content of *xi* may be **changed** after the function call.
	"""

	if not isinstance(xi, (list, dict)):
		if not isinstance(xi, POINTER(liblinear.feature_node)):
			raise TypeError("xi should be a test instance")
	else:
		xi = liblinear.gen_feature_nodearray(xi)[0]

	options = m.param_options
	param = LearnerParameter(options[0], options[1])

	if m.bias >= 0:
		bias_index = m.nr_feature + 1

		# Find the terminating node (index == -1).
		end = 0
		while xi[end].index != -1:
			end += 1

		# Already has bias, or bias reserved.
		# Actually this statement should be true if
		# the data is read by read_SVMProblem.
		if end > 0 and xi[end-1].index == bias_index:
			end -= 1

		# Write the bias node followed by a fresh terminator.
		xi[end] = liblinear.feature_node(bias_index, m.bias)
		xi[end+1] = liblinear.feature_node(-1, 0)

	LearnerProblem.normalize_one(xi, param, m.idf)

	scores = (c_double * m.nr_class)()
	predicted = liblinear.liblinear.predict_values(m, xi, scores)

	return predicted, scores
439 | 
def predict(data_file_name, m, liblinear_opts=""):
	"""
	Return a quadruple: the predicted labels, the accuracy, the decision values, and the
	true labels in the test data file (obtained through the :class:`LearnerModel` *m*).

	The predicted labels and true labels in the file are :class:`list`. The accuracy is 
	evaluated by assuming that the labels in the file are the true label. If the
	file contains no instance, the accuracy is ``0.0``.

	The decision values are in a :class:`list`, where the length is the same as the number
	of test instances. Each element in the list is a :class:`list` of floats, and the 
	values in it are an instance's decision values in different classes.
	For example, the decision value of instance i and class k can be obtained by

	>>> predicted_label, accuracy, all_dec_values, label = predict('svm_file', model)
	>>> print(all_dec_values[i][k])

	*liblinear_opts* is accepted but not used by this function.
	"""

	learner_prob = LearnerProblem(data_file_name)
	learner_param = LearnerParameter(m.param_options[0], m.param_options[1])

	# normalize() receives the idf values as a c_double array.
	idf = None
	if m.idf:
		idf = (c_double * len(m.idf))()
		for i, value in enumerate(m.idf):
			idf[i] = value
	learner_prob.normalize(learner_param, idf)

	all_dec_values = []
	correct = 0
	py = []  # predicted y
	ty = []  # true y

	nr_class = m.nr_class
	# One buffer is reused for every instance; the slice taken below
	# copies its current content into a new list.
	dec_values = (c_double * nr_class)()

	for i in range(learner_prob.l):
		label = liblinear.liblinear.predict_values(m, learner_prob.x[i], dec_values)
		true_label = learner_prob.y[i]
		all_dec_values.append(dec_values[:nr_class])
		py.append(label)
		ty.append(true_label)

		if label == true_label:
			correct += 1

	# Guard against an empty test set, which used to raise
	# ZeroDivisionError.
	if learner_prob.l:
		acc = correct / float(learner_prob.l)
	else:
		acc = 0.0

	return py, acc, all_dec_values, ty
486 | 
487 | 
488 | 
if __name__ == '__main__':
	# Command-line entry point: split the arguments into learner options
	# and LIBLINEAR options, then train on the file named last.
	argv = sys.argv
	if len(argv) < 2: #4 or '-v' not in argv:
		print("{0} -v fold [other liblinear_options] [learner_opts] training-data".format(argv[0]))
		sys.exit(-1)

	data_file_name = argv[-1]
	learner_opts = []
	liblinear_opts = []

	# argv[0] is the program name and argv[-1] is the data file, so only
	# the tokens in between are options.
	i = 1
	while i < len(argv)-1:
		if argv[i] in ("-D", "-N", "-I", "-T"):
			# A learner flag takes the following token as its value.
			learner_opts.extend(argv[i:i+2])
			i += 2
		else:
			liblinear_opts.append(argv[i])
			i += 1

	m = train(data_file_name, learner_opts, liblinear_opts)
505 | 


--------------------------------------------------------------------------------
/libshorttext/classifier/learner/liblinear/COPYRIGHT:
--------------------------------------------------------------------------------
 1 | 
 2 | Copyright (c) 2007-2012 The LIBLINEAR Project.
 3 | All rights reserved.
 4 | 
 5 | Redistribution and use in source and binary forms, with or without
 6 | modification, are permitted provided that the following conditions
 7 | are met:
 8 | 
 9 | 1. Redistributions of source code must retain the above copyright
10 | notice, this list of conditions and the following disclaimer.
11 | 
12 | 2. Redistributions in binary form must reproduce the above copyright
13 | notice, this list of conditions and the following disclaimer in the
14 | documentation and/or other materials provided with the distribution.
15 | 
16 | 3. Neither name of copyright holders nor the names of its contributors
17 | may be used to endorse or promote products derived from this software
18 | without specific prior written permission.
19 | 
20 | 
21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 | A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR
25 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
26 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
27 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 | 


--------------------------------------------------------------------------------
/libshorttext/classifier/learner/liblinear/Makefile:
--------------------------------------------------------------------------------
 1 | CXX ?= g++
 2 | CC ?= gcc
 3 | CFLAGS = -Wall -Wconversion -O3 -fPIC
 4 | LIBS = blas/blas.a
 5 | SHVER = 1
 6 | OS = $(shell uname)
 7 | #LIBS = -lblas
 8 | 
 9 | all: train predict
10 | 
11 | lib: linear.o tron.o blas/blas.a
12 | 	if [ "$(OS)" = "Darwin" ]; then \
13 | 		SHARED_LIB_FLAG="-dynamiclib -Wl,-install_name,liblinear.so.$(SHVER)"; \
14 | 	else \
15 | 		SHARED_LIB_FLAG="-shared -Wl,-soname,liblinear.so.$(SHVER)"; \
16 | 	fi; \
17 | 	$(CXX) $${SHARED_LIB_FLAG} linear.o tron.o blas/blas.a -o liblinear.so.$(SHVER)
18 | 
19 | train: tron.o linear.o train.c blas/blas.a
20 | 	$(CXX) $(CFLAGS) -o train train.c tron.o linear.o $(LIBS)
21 | 
22 | predict: tron.o linear.o predict.c blas/blas.a
23 | 	$(CXX) $(CFLAGS) -o predict predict.c tron.o linear.o $(LIBS)
24 | 
25 | tron.o: tron.cpp tron.h
26 | 	$(CXX) $(CFLAGS) -c -o tron.o tron.cpp
27 | 
28 | linear.o: linear.cpp linear.h
29 | 	$(CXX) $(CFLAGS) -c -o linear.o linear.cpp
30 | 
31 | blas/blas.a: blas/*.c blas/*.h
32 | 	make -C blas OPTFLAGS='$(CFLAGS)' CC='$(CC)';
33 | 
34 | clean:
35 | 	make -C blas clean
36 | 	make -C matlab clean
37 | 	rm -f *~ tron.o linear.o train predict liblinear.so.$(SHVER)
38 | 


--------------------------------------------------------------------------------
/libshorttext/classifier/learner/liblinear/README:
--------------------------------------------------------------------------------
  1 | LIBLINEAR is a simple package for solving large-scale regularized linear 
  2 | classification and regression. It currently supports 
  3 | - L2-regularized logistic regression/L2-loss support vector classification/L1-loss support vector classification
  4 | - L1-regularized L2-loss support vector classification/L1-regularized logistic regression
  5 | - L2-regularized L2-loss support vector regression/L1-loss support vector regression. 
  6 | This document explains the usage of LIBLINEAR.
  7 | 
  8 | To get started, please read the ``Quick Start'' section first.
  9 | For developers, please check the ``Library Usage'' section to learn
 10 | how to integrate LIBLINEAR in your software.
 11 | 
 12 | Table of Contents
 13 | =================
 14 | 
 15 | - When to use LIBLINEAR but not LIBSVM
 16 | - Quick Start
 17 | - Installation
 18 | - `train' Usage
 19 | - `predict' Usage
 20 | - Examples
 21 | - Library Usage
 22 | - Additional Information
 23 | - MATLAB/OCTAVE interface
 24 | - PYTHON interface
 25 | 
 26 | When to use LIBLINEAR but not LIBSVM
 27 | ====================================
 28 | 
 29 | For some large data sets, training with or without nonlinear mappings
 30 | gives similar performance.  Without using kernels, one can
 31 | efficiently train a much larger set via linear classification/regression.  
 32 | These data usually have a large number of features. Document classification
 33 | is an example.
 34 | 
 35 | Warning: While generally liblinear is very fast, its default solver
 36 | may be slow under certain situations (e.g., data not scaled or C is
 37 | large). See Appendix B of our SVM guide about how to handle such
 38 | cases.
 39 | http://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf
 40 | 
 41 | Warning: If you are a beginner and your data sets are not large, you
 42 | should consider LIBSVM first.
 43 | 
 44 | LIBSVM page:
 45 | http://www.csie.ntu.edu.tw/~cjlin/libsvm
 46 | 
 47 | 
 48 | Quick Start
 49 | ===========
 50 | 
 51 | See the section ``Installation'' for installing LIBLINEAR.
 52 | 
 53 | After installation, there are programs `train' and `predict' for
 54 | training and testing, respectively.
 55 | 
 56 | About the data format, please check the README file of LIBSVM. Note
 57 | that feature index must start from 1 (but not 0).
 58 | 
 59 | A sample classification data included in this package is `heart_scale'.
 60 | 
 61 | Type `train heart_scale', and the program will read the training
 62 | data and output the model file `heart_scale.model'. If you have a test
 63 | set called heart_scale.t, then type `predict heart_scale.t
 64 | heart_scale.model output' to see the prediction accuracy. The `output'
 65 | file contains the predicted class labels.
 66 | 
 67 | For more information about `train' and `predict', see the sections
 68 | `train' Usage and `predict' Usage.
 69 | 
 70 | To obtain good performances, sometimes one needs to scale the
 71 | data. Please check the program `svm-scale' of LIBSVM. For large and
 72 | sparse data, use `-l 0' to keep the sparsity.
 73 | 
 74 | Installation
 75 | ============
 76 | 
 77 | On Unix systems, type `make' to build the `train' and `predict'
 78 | programs. Run them without arguments to show the usages.
 79 | 
 80 | This software uses some level-1 BLAS subroutines. The needed functions are
 81 | included in this package.  If a BLAS library is available on your
 82 | machine, you may use it by modifying the Makefile: Unmark the following line
 83 | 
 84 |         #LIBS ?= -lblas
 85 | 
 86 | and mark
 87 | 
 88 |         LIBS ?= blas/blas.a
 89 | 
 90 | `train' Usage
 91 | =============
 92 | 
 93 | Usage: train [options] training_set_file [model_file]
 94 | options:
 95 | -s type : set type of solver (default 1)
 96 |   for multi-class classification
 97 | 	 0 -- L2-regularized logistic regression (primal)
 98 | 	 1 -- L2-regularized L2-loss support vector classification (dual)
 99 | 	 2 -- L2-regularized L2-loss support vector classification (primal)
100 | 	 3 -- L2-regularized L1-loss support vector classification (dual)
101 | 	 4 -- support vector classification by Crammer and Singer
102 | 	 5 -- L1-regularized L2-loss support vector classification
103 | 	 6 -- L1-regularized logistic regression
104 | 	 7 -- L2-regularized logistic regression (dual)
105 |   for regression
106 | 	11 -- L2-regularized L2-loss support vector regression (primal)
107 | 	12 -- L2-regularized L2-loss support vector regression (dual)
108 | 	13 -- L2-regularized L1-loss support vector regression (dual)
109 | -c cost : set the parameter C (default 1)
110 | -p epsilon : set the epsilon in loss function of epsilon-SVR (default 0.1)
111 | -e epsilon : set tolerance of termination criterion
112 | 	-s 0 and 2
113 | 		|f'(w)|_2 <= eps*min(pos,neg)/l*|f'(w0)|_2,
114 | 		where f is the primal function and pos/neg are # of
115 | 		positive/negative data (default 0.01)
116 | 	-s 11
117 | 		|f'(w)|_2 <= eps*|f'(w0)|_2 (default 0.001) 
118 | 	-s 1, 3, 4 and 7
119 | 		Dual maximal violation <= eps; similar to libsvm (default 0.1)
120 | 	-s 5 and 6
121 | 		|f'(w)|_inf <= eps*min(pos,neg)/l*|f'(w0)|_inf,
122 | 		where f is the primal function (default 0.01)
123 | 	-s 12 and 13
124 | 		|f'(alpha)|_1 <= eps |f'(alpha0)|,
125 | 		where f is the dual function (default 0.1)
126 | -B bias : if bias >= 0, instance x becomes [x; bias]; if < 0, no bias term added (default -1)
127 | -wi weight: weights adjust the parameter C of different classes (see README for details)
128 | -v n: n-fold cross validation mode
129 | -q : quiet mode (no outputs)
130 | 
131 | Option -v randomly splits the data into n parts and calculates cross
132 | validation accuracy on them.
133 | 
134 | Formulations:
135 | 
136 | For L2-regularized logistic regression (-s 0), we solve
137 | 
138 | min_w w^Tw/2 + C \sum log(1 + exp(-y_i w^Tx_i))
139 | 
140 | For L2-regularized L2-loss SVC dual (-s 1), we solve
141 | 
142 | min_alpha  0.5(alpha^T (Q + I/2/C) alpha) - e^T alpha
143 |     s.t.   0 <= alpha_i,
144 | 
145 | For L2-regularized L2-loss SVC (-s 2), we solve
146 | 
147 | min_w w^Tw/2 + C \sum max(0, 1- y_i w^Tx_i)^2
148 | 
149 | For L2-regularized L1-loss SVC dual (-s 3), we solve
150 | 
151 | min_alpha  0.5(alpha^T Q alpha) - e^T alpha
152 |     s.t.   0 <= alpha_i <= C,
153 | 
154 | For L1-regularized L2-loss SVC (-s 5), we solve
155 | 
156 | min_w \sum |w_j| + C \sum max(0, 1- y_i w^Tx_i)^2
157 | 
158 | For L1-regularized logistic regression (-s 6), we solve
159 | 
160 | min_w \sum |w_j| + C \sum log(1 + exp(-y_i w^Tx_i))
161 | 
162 | For L2-regularized logistic regression (-s 7), we solve
163 | 
164 | min_alpha  0.5(alpha^T Q alpha) + \sum alpha_i*log(alpha_i) + \sum (C-alpha_i)*log(C-alpha_i) - a constant
165 |     s.t.   0 <= alpha_i <= C,
166 | 
167 | where
168 | 
169 | Q is a matrix with Q_ij = y_i y_j x_i^T x_j.
170 | 
171 | For L2-regularized L2-loss SVR (-s 11), we solve
172 | 
173 | min_w w^Tw/2 + C \sum max(0, |y_i-w^Tx_i|-epsilon)^2
174 | 
175 | For L2-regularized L2-loss SVR dual (-s 12), we solve
176 | 
177 | min_beta  0.5(beta^T (Q + lambda I/2/C) beta) - y^T beta + \sum |beta_i|
178 | 
179 | For L2-regularized L1-loss SVR dual (-s 13), we solve
180 | 
181 | min_beta  0.5(beta^T Q beta) - y^T beta + \sum |beta_i|
182 |     s.t.   -C <= beta_i <= C,
183 | 
184 | where
185 | 
186 | Q is a matrix with Q_ij = x_i^T x_j.
187 | 
188 | If bias >= 0, w becomes [w; w_{n+1}] and x becomes [x; bias].
189 | 
190 | The primal-dual relationship implies that -s 1 and -s 2 give the same
191 | model, -s 0 and -s 7 give the same, and -s 11 and -s 12 give the same.
192 | 
193 | We implement 1-vs-the rest multi-class strategy for classification. 
194 | In training i vs. non_i, their C parameters are (weight from -wi)*C 
195 | and C, respectively. If there are only two classes, we train only one
196 | model. Thus weight1*C vs. weight2*C is used. See examples below.
197 | 
198 | We also implement multi-class SVM by Crammer and Singer (-s 4):
199 | 
200 | min_{w_m, \xi_i}  0.5 \sum_m ||w_m||^2 + C \sum_i \xi_i
201 |     s.t.  w^T_{y_i} x_i - w^T_m x_i >= e^m_i - \xi_i \forall m,i
202 | 
203 | where e^m_i = 0 if y_i  = m,
204 |       e^m_i = 1 if y_i != m,
205 | 
206 | Here we solve the dual problem:
207 | 
208 | min_{\alpha}  0.5 \sum_m ||w_m(\alpha)||^2 + \sum_i \sum_m e^m_i \alpha^m_i
209 |     s.t.  \alpha^m_i <= C^m_i \forall m,i , \sum_m \alpha^m_i=0 \forall i
210 | 
211 | where w_m(\alpha) = \sum_i \alpha^m_i x_i,
212 | and C^m_i = C if m  = y_i,
213 |     C^m_i = 0 if m != y_i.
214 | 
215 | `predict' Usage
216 | ===============
217 | 
218 | Usage: predict [options] test_file model_file output_file
219 | options:
220 | -b probability_estimates: whether to output probability estimates, 0 or 1 (default 0); currently for logistic regression only
221 | -q : quiet mode (no outputs)
222 | 
223 | Note that -b is only needed in the prediction phase. This is different
224 | from the setting of LIBSVM.
225 | 
226 | Examples
227 | ========
228 | 
229 | > train data_file
230 | 
231 | Train linear SVM with L2-loss function.
232 | 
233 | > train -s 0 data_file
234 | 
235 | Train a logistic regression model.
236 | 
237 | > train -v 5 -e 0.001 data_file
238 | 
239 | Do five-fold cross-validation using L2-loss svm.
240 | Use a smaller stopping tolerance 0.001 than the default
241 | 0.1 if you want more accurate solutions.
242 | 
243 | > train -c 10 -w1 2 -w2 5 -w3 2 four_class_data_file
244 | 
245 | Train four classifiers:
246 | positive        negative        Cp      Cn
247 | class 1         class 2,3,4.    20      10
248 | class 2         class 1,3,4.    50      10
249 | class 3         class 1,2,4.    20      10
250 | class 4         class 1,2,3.    10      10
251 | 
252 | > train -c 10 -w3 1 -w2 5 two_class_data_file
253 | 
254 | If there are only two classes, we train ONE model.
255 | The C values for the two classes are 10 and 50.
256 | 
257 | > predict -b 1 test_file data_file.model output_file
258 | 
259 | Output probability estimates (for logistic regression only).
260 | 
261 | Library Usage
262 | =============
263 | 
264 | - Function: model* train(const struct problem *prob,
265 |                 const struct parameter *param);
266 | 
267 |     This function constructs and returns a linear classification 
268 |     or regression model according to the given training data and 
269 |     parameters.
270 | 
271 |     struct problem describes the problem:
272 | 
273 |         struct problem
274 |         {
275 |             INT64 l, n;
276 |             INT64 *y;
277 |             struct feature_node **x;
278 |             double bias;
279 |         };
280 | 
281 |     where `l' is the number of training data. If bias >= 0, we assume
282 |     that one additional feature is added to the end of each data
283 |     instance. `n' is the number of feature (including the bias feature
284 |     if bias >= 0). `y' is an array containing the target values. (integers 
285 |     in classification, real numbers in regression) And `x' is an array 
286 |     of pointers, each of which points to a sparse representation (array 
287 |     of feature_node) of one training vector.
288 | 
289 |     For example, if we have the following training data:
290 | 
291 |     LABEL       ATTR1   ATTR2   ATTR3   ATTR4   ATTR5
292 |     -----       -----   -----   -----   -----   -----
293 |     1           0       0.1     0.2     0       0
294 |     2           0       0.1     0.3    -1.2     0
295 |     1           0.4     0       0       0       0
296 |     2           0       0.1     0       1.4     0.5
297 |     3          -0.1    -0.2     0.1     1.1     0.1
298 | 
299 |     and bias = 1, then the components of problem are:
300 | 
301 |     l = 5
302 |     n = 6
303 | 
304 |     y -> 1 2 1 2 3
305 | 
306 |     x -> [ ] -> (2,0.1) (3,0.2) (6,1) (-1,?)
307 |          [ ] -> (2,0.1) (3,0.3) (4,-1.2) (6,1) (-1,?)
308 |          [ ] -> (1,0.4) (6,1) (-1,?)
309 |          [ ] -> (2,0.1) (4,1.4) (5,0.5) (6,1) (-1,?)
310 |          [ ] -> (1,-0.1) (2,-0.2) (3,0.1) (4,1.1) (5,0.1) (6,1) (-1,?)
311 | 
312 |     struct parameter describes the parameters of a linear classification 
313 |     or regression model:
314 | 
315 |         struct parameter
316 |         {
317 |                 INT64 solver_type;
318 | 
319 |                 /* these are for training only */
320 |                 double eps;             /* stopping criteria */
321 |                 double C;
322 |                 INT64 nr_weight;
323 |                 INT64 *weight_label;
324 |                 double* weight;
325 |                 double p;
326 |         };
327 | 
328 |     solver_type can be one of L2R_LR, L2R_L2LOSS_SVC_DUAL, L2R_L2LOSS_SVC, L2R_L1LOSS_SVC_DUAL, MCSVM_CS, L1R_L2LOSS_SVC, L1R_LR, L2R_LR_DUAL, L2R_L2LOSS_SVR, L2R_L2LOSS_SVR_DUAL, L2R_L1LOSS_SVR_DUAL.
329 |   for classification
330 |     L2R_LR                L2-regularized logistic regression (primal)
331 |     L2R_L2LOSS_SVC_DUAL   L2-regularized L2-loss support vector classification (dual)
332 |     L2R_L2LOSS_SVC        L2-regularized L2-loss support vector classification (primal)
333 |     L2R_L1LOSS_SVC_DUAL   L2-regularized L1-loss support vector classification (dual)
334 |     MCSVM_CS              support vector classification by Crammer and Singer
335 |     L1R_L2LOSS_SVC        L1-regularized L2-loss support vector classification
336 |     L1R_LR                L1-regularized logistic regression
337 |     L2R_LR_DUAL           L2-regularized logistic regression (dual)
338 |   for regression
339 |     L2R_L2LOSS_SVR        L2-regularized L2-loss support vector regression (primal)
340 |     L2R_L2LOSS_SVR_DUAL   L2-regularized L2-loss support vector regression (dual)
341 |     L2R_L1LOSS_SVR_DUAL   L2-regularized L1-loss support vector regression (dual)
342 | 
343 |     C is the cost of constraints violation.
344 |     p is the sensitiveness of loss of support vector regression. 
345 |     eps is the stopping criterion.
346 | 
347 |     nr_weight, weight_label, and weight are used to change the penalty
348 |     for some classes (If the weight for a class is not changed, it is
349 |     set to 1). This is useful for training classifier using unbalanced
350 |     input data or with asymmetric misclassification cost.
351 | 
352 |     nr_weight is the number of elements in the array weight_label and
353 |     weight. Each weight[i] corresponds to weight_label[i], meaning that
354 |     the penalty of class weight_label[i] is scaled by a factor of weight[i].
355 | 
356 |     If you do not want to change penalty for any of the classes,
357 |     just set nr_weight to 0.
358 | 
359 |     *NOTE* To avoid wrong parameters, check_parameter() should be
360 |     called before train().
361 | 
362 |     struct model stores the model obtained from the training procedure:
363 | 
364 |         struct model
365 |         {
366 |                 struct parameter param;
367 |                 INT64 nr_class;           /* number of classes */
368 |                 INT64 nr_feature;
369 |                 double *w;
370 |                 INT64 *label;             /* label of each class */
371 |                 double bias;
372 |         };
373 | 
374 |      param describes the parameters used to obtain the model.
375 | 
376 |      nr_class and nr_feature are the number of classes and features, 
377 |      respectively. nr_class = 2 for regression. 
378 | 
379 |      The nr_feature*nr_class array w gives feature weights. We use one
380 |      against the rest for multi-class classification, so each feature
381 |      index corresponds to nr_class weight values. Weights are
382 |      organized in the following way
383 | 
384 |      +------------------+------------------+------------+
385 |      | nr_class weights | nr_class weights |  ...
386 |      | for 1st feature  | for 2nd feature  |
387 |      +------------------+------------------+------------+
388 | 
389 |      If bias >= 0, x becomes [x; bias]. The number of features is
390 |      increased by one, so w is a (nr_feature+1)*nr_class array. The
391 |      value of bias is stored in the variable bias.
392 | 
393 |      The array label stores class labels.
394 | 
395 | - Function: void cross_validation(const problem *prob, const parameter *param, INT64 nr_fold, double *target);
396 | 
397 |     This function conducts cross validation. Data are separated to
398 |     nr_fold folds. Under given parameters, sequentially each fold is
399 |     validated using the model from training the remaining. Predicted
400 |     labels in the validation process are stored in the array called
401 |     target.
402 | 
403 |     The format of prob is same as that for train().
404 | 
405 | - Function: double predict(const model *model_, const feature_node *x);
406 | 
407 |     For a classification model, the predicted class for x is returned.
408 |     For a regression model, the function value of x calculated using
409 |     the model is returned. 
410 | 
411 | - Function: double predict_values(const struct model *model_,
412 |             const struct feature_node *x, double* dec_values);
413 | 
414 |     This function gives nr_w decision values in the array dec_values. 
415 |     nr_w=1 if regression is applied or the number of classes is two. An exception is
416 |     multi-class svm by Crammer and Singer (-s 4), where nr_w = 2 if there are two classes. For all other situations, nr_w is the 
417 |     number of classes.
418 | 
419 |     We implement one-vs-the rest multi-class strategy (-s 0,1,2,3,5,6,7) 
420 |     and multi-class svm by Crammer and Singer (-s 4) for multi-class SVM.
421 |     The class with the highest decision value is returned.
422 | 
423 | - Function: double predict_probability(const struct model *model_,
424 |             const struct feature_node *x, double* prob_estimates);
425 | 
426 |     This function gives nr_class probability estimates in the array
427 |     prob_estimates. nr_class can be obtained from the function
428 |     get_nr_class. The class with the highest probability is
429 |     returned. Currently, we support only the probability outputs of
430 |     logistic regression.
431 | 
432 | - Function: INT64 get_nr_feature(const model *model_);
433 | 
434 |     The function gives the number of attributes of the model.
435 | 
436 | - Function: INT64 get_nr_class(const model *model_);
437 | 
438 |     The function gives the number of classes of the model.
439 |     For a regression model, 2 is returned.
440 | 
441 | - Function: void get_labels(const model *model_, INT64* label);
442 | 
    This function writes the class labels into an array called label.
444 |     For a regression model, label is unchanged.
445 | 
446 | - Function: const char *check_parameter(const struct problem *prob,
447 |             const struct parameter *param);
448 | 
449 |     This function checks whether the parameters are within the feasible
450 |     range of the problem. This function should be called before calling
451 |     train() and cross_validation(). It returns NULL if the
452 |     parameters are feasible, otherwise an error message is returned.
453 | 
454 | - Function: INT64 save_model(const char *model_file_name,
455 |             const struct model *model_);
456 | 
457 |     This function saves a model to a file; returns 0 on success, or -1
458 |     if an error occurs.
459 | 
460 | - Function: struct model *load_model(const char *model_file_name);
461 | 
462 |     This function returns a pointer to the model read from the file,
463 |     or a null pointer if the model could not be loaded.
464 | 
465 | - Function: void free_model_content(struct model *model_ptr);
466 | 
467 |     This function frees the memory used by the entries in a model structure.
468 | 
469 | - Function: void free_and_destroy_model(struct model **model_ptr_ptr);
470 | 
471 |     This function frees the memory used by a model and destroys the model
472 |     structure.
473 | 
474 | - Function: void destroy_param(struct parameter *param);
475 | 
476 |     This function frees the memory used by a parameter set.
477 | 
478 | - Function: void set_print_string_function(void (*print_func)(const char *));
479 | 
480 |     Users can specify their output format by a function. Use
481 |         set_print_string_function(NULL); 
482 |     for default printing to stdout.
483 | 
484 | 	
485 | MATLAB/OCTAVE Interface
486 | =======================
487 | 
488 | Please check the file README in the directory `matlab'.
489 | 
490 | PYTHON Interface
491 | ================
492 | 
493 | Please check the file README in the directory `python'.
494 | 
495 | Additional Information
496 | ======================
497 | 
498 | If you find LIBLINEAR helpful, please cite it as
499 | 
500 | R.-E. Fan, K.-W. Chang, C.-J. Hsieh, X.-R. Wang, and C.-J. Lin.
501 | LIBLINEAR: A Library for Large Linear Classification, Journal of
502 | Machine Learning Research 9(2008), 1871-1874. Software available at
503 | http://www.csie.ntu.edu.tw/~cjlin/liblinear
504 | 
505 | For any questions and comments, please send your email to
506 | cjlin@csie.ntu.edu.tw
507 | 
508 | 
509 | 


--------------------------------------------------------------------------------
/libshorttext/classifier/learner/liblinear/blas/Makefile:
--------------------------------------------------------------------------------
 1 | AR     = ar rcv
 2 | RANLIB = ranlib 
 3 | 
 4 | HEADERS = blas.h blasp.h
 5 | FILES = dnrm2.o daxpy.o ddot.o dscal.o 
 6 | 
 7 | CFLAGS = $(OPTFLAGS) 
 8 | FFLAGS = $(OPTFLAGS)
 9 | 
10 | blas: $(FILES) $(HEADERS)
11 | 	$(AR) blas.a $(FILES)  
12 | 	$(RANLIB) blas.a
13 | 
14 | clean:
15 | 	- rm -f *.o
16 | 	- rm -f *.a
17 | 	- rm -f *~
18 | 
19 | .c.o:
20 | 	$(CC) $(CFLAGS) -c $*.c
21 | 
22 | 
23 | 


--------------------------------------------------------------------------------
/libshorttext/classifier/learner/liblinear/blas/blas.h:
--------------------------------------------------------------------------------
 1 | /* blas.h  --  C header file for BLAS                         Ver 1.0 */
 2 | /* Jesse Bennett                                       March 23, 2000 */
 3 | 
 4 | /**  barf  [ba:rf]  2.  "He suggested using FORTRAN, and everybody barfed."
 5 | 
 6 | 	- From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */
 7 | 
/* Include guard: everything below is processed once per translation unit. */
#ifndef BLAS_INCLUDE
#define BLAS_INCLUDE

#include "stdint.h"
/* INT64 is the integer type used throughout the BLAS prototypes.
   blasp.h and linear.h carry the same guarded typedef, so INT64_DEFINED
   keeps it from being defined twice whichever header is seen first. */
#ifndef INT64_DEFINED
typedef int64_t INT64;
#define INT64_DEFINED
#endif
/* Data types specific to BLAS implementation */
/* Single- and double-precision complex values as an (r, i) pair of
   fields -- presumably the real and imaginary parts. */
typedef struct { float r, i; } fcomplex;
typedef struct { double r, i; } dcomplex;
/* Boolean type, stored as a 64-bit integer (see FALSE/TRUE below). */
typedef INT64 blasbool;

/* Included only after fcomplex/dcomplex exist, because the prototypes in
   blasp.h use those types and blasp.h does not define them itself. */
#include "blasp.h"    /* Prototypes for all BLAS functions */

#define FALSE 0
#define TRUE  1

/* Macro functions */
/* NOTE: each argument appears twice in the expansion, so an argument with
   side effects (e.g. i++) may be evaluated twice. */
#define MIN(a,b) ((a) <= (b) ? (a) : (b))
#define MAX(a,b) ((a) >= (b) ? (a) : (b))

#endif
31 | 


--------------------------------------------------------------------------------
/libshorttext/classifier/learner/liblinear/blas/blasp.h:
--------------------------------------------------------------------------------
  1 | /* blasp.h  --  C prototypes for BLAS                         Ver 1.0 */
  2 | /* Jesse Bennett                                       March 23, 2000 */
  3 | 
  4 | /* Functions  listed in alphabetical order */
#include <stdint.h>
/* Same guarded INT64 typedef as in blas.h and linear.h; INT64_DEFINED
   ensures it is emitted only once. */
#ifndef INT64_DEFINED
typedef int64_t INT64;
#define INT64_DEFINED
#endif

/* NOTE: fcomplex and dcomplex are not defined in this file. blas.h defines
   them immediately before including this header, so include blas.h rather
   than this file directly. */

/* The two branches below declare the same nine functions with different
   return conventions:
     - F2C_COMPAT defined: the complex dot products return void and write
       their result through a leading `dotval` pointer, and the
       single-precision real functions are declared as returning double.
     - otherwise: complex results are returned by value and the
       single-precision real functions return float.
   NOTE(review): presumably the first form matches the calling convention
   of f2c-translated Fortran -- confirm before defining F2C_COMPAT. */
#ifdef F2C_COMPAT

void cdotc_(fcomplex *dotval, INT64 *n, fcomplex *cx, INT64 *incx,
            fcomplex *cy, INT64 *incy);

void cdotu_(fcomplex *dotval, INT64 *n, fcomplex *cx, INT64 *incx,
            fcomplex *cy, INT64 *incy);

double sasum_(INT64 *n, float *sx, INT64 *incx);

double scasum_(INT64 *n, fcomplex *cx, INT64 *incx);

double scnrm2_(INT64 *n, fcomplex *x, INT64 *incx);

double sdot_(INT64 *n, float *sx, INT64 *incx, float *sy, INT64 *incy);

double snrm2_(INT64 *n, float *x, INT64 *incx);

void zdotc_(dcomplex *dotval, INT64 *n, dcomplex *cx, INT64 *incx,
            dcomplex *cy, INT64 *incy);

void zdotu_(dcomplex *dotval, INT64 *n, dcomplex *cx, INT64 *incx,
            dcomplex *cy, INT64 *incy);

#else

fcomplex cdotc_(INT64 *n, fcomplex *cx, INT64 *incx, fcomplex *cy, INT64 *incy);

fcomplex cdotu_(INT64 *n, fcomplex *cx, INT64 *incx, fcomplex *cy, INT64 *incy);

float sasum_(INT64 *n, float *sx, INT64 *incx);

float scasum_(INT64 *n, fcomplex *cx, INT64 *incx);

float scnrm2_(INT64 *n, fcomplex *x, INT64 *incx);

float sdot_(INT64 *n, float *sx, INT64 *incx, float *sy, INT64 *incy);

float snrm2_(INT64 *n, float *x, INT64 *incx);

dcomplex zdotc_(INT64 *n, dcomplex *cx, INT64 *incx, dcomplex *cy, INT64 *incy);

dcomplex zdotu_(INT64 *n, dcomplex *cx, INT64 *incx, dcomplex *cy, INT64 *incy);

#endif
 56 | 
 57 | /* Remaining functions listed in alphabetical order */
 58 | 
 59 | INT64 caxpy_(INT64 *n, fcomplex *ca, fcomplex *cx, INT64 *incx, fcomplex *cy,
 60 |            INT64 *incy);
 61 | 
 62 | INT64 ccopy_(INT64 *n, fcomplex *cx, INT64 *incx, fcomplex *cy, INT64 *incy);
 63 | 
 64 | INT64 cgbmv_(char *trans, INT64 *m, INT64 *n, INT64 *kl, INT64 *ku,
 65 |            fcomplex *alpha, fcomplex *a, INT64 *lda, fcomplex *x, INT64 *incx,
 66 |            fcomplex *beta, fcomplex *y, INT64 *incy);
 67 | 
 68 | INT64 cgemm_(char *transa, char *transb, INT64 *m, INT64 *n, INT64 *k,
 69 |            fcomplex *alpha, fcomplex *a, INT64 *lda, fcomplex *b, INT64 *ldb,
 70 |            fcomplex *beta, fcomplex *c, INT64 *ldc);
 71 | 
 72 | INT64 cgemv_(char *trans, INT64 *m, INT64 *n, fcomplex *alpha, fcomplex *a,
 73 |            INT64 *lda, fcomplex *x, INT64 *incx, fcomplex *beta, fcomplex *y,
 74 |            INT64 *incy);
 75 | 
 76 | INT64 cgerc_(INT64 *m, INT64 *n, fcomplex *alpha, fcomplex *x, INT64 *incx,
 77 |            fcomplex *y, INT64 *incy, fcomplex *a, INT64 *lda);
 78 | 
 79 | INT64 cgeru_(INT64 *m, INT64 *n, fcomplex *alpha, fcomplex *x, INT64 *incx,
 80 |            fcomplex *y, INT64 *incy, fcomplex *a, INT64 *lda);
 81 | 
 82 | INT64 chbmv_(char *uplo, INT64 *n, INT64 *k, fcomplex *alpha, fcomplex *a,
 83 |            INT64 *lda, fcomplex *x, INT64 *incx, fcomplex *beta, fcomplex *y,
 84 |            INT64 *incy);
 85 | 
 86 | INT64 chemm_(char *side, char *uplo, INT64 *m, INT64 *n, fcomplex *alpha,
 87 |            fcomplex *a, INT64 *lda, fcomplex *b, INT64 *ldb, fcomplex *beta,
 88 |            fcomplex *c, INT64 *ldc);
 89 | 
 90 | INT64 chemv_(char *uplo, INT64 *n, fcomplex *alpha, fcomplex *a, INT64 *lda,
 91 |            fcomplex *x, INT64 *incx, fcomplex *beta, fcomplex *y, INT64 *incy);
 92 | 
 93 | INT64 cher_(char *uplo, INT64 *n, float *alpha, fcomplex *x, INT64 *incx,
 94 |           fcomplex *a, INT64 *lda);
 95 | 
 96 | INT64 cher2_(char *uplo, INT64 *n, fcomplex *alpha, fcomplex *x, INT64 *incx,
 97 |            fcomplex *y, INT64 *incy, fcomplex *a, INT64 *lda);
 98 | 
 99 | INT64 cher2k_(char *uplo, char *trans, INT64 *n, INT64 *k, fcomplex *alpha,
100 |             fcomplex *a, INT64 *lda, fcomplex *b, INT64 *ldb, float *beta,
101 |             fcomplex *c, INT64 *ldc);
102 | 
103 | INT64 cherk_(char *uplo, char *trans, INT64 *n, INT64 *k, float *alpha,
104 |            fcomplex *a, INT64 *lda, float *beta, fcomplex *c, INT64 *ldc);
105 | 
106 | INT64 chpmv_(char *uplo, INT64 *n, fcomplex *alpha, fcomplex *ap, fcomplex *x,
107 |            INT64 *incx, fcomplex *beta, fcomplex *y, INT64 *incy);
108 | 
109 | INT64 chpr_(char *uplo, INT64 *n, float *alpha, fcomplex *x, INT64 *incx,
110 |           fcomplex *ap);
111 | 
112 | INT64 chpr2_(char *uplo, INT64 *n, fcomplex *alpha, fcomplex *x, INT64 *incx,
113 |            fcomplex *y, INT64 *incy, fcomplex *ap);
114 | 
115 | INT64 crotg_(fcomplex *ca, fcomplex *cb, float *c, fcomplex *s);
116 | 
117 | INT64 cscal_(INT64 *n, fcomplex *ca, fcomplex *cx, INT64 *incx);
118 | 
119 | INT64 csscal_(INT64 *n, float *sa, fcomplex *cx, INT64 *incx);
120 | 
121 | INT64 cswap_(INT64 *n, fcomplex *cx, INT64 *incx, fcomplex *cy, INT64 *incy);
122 | 
123 | INT64 csymm_(char *side, char *uplo, INT64 *m, INT64 *n, fcomplex *alpha,
124 |            fcomplex *a, INT64 *lda, fcomplex *b, INT64 *ldb, fcomplex *beta,
125 |            fcomplex *c, INT64 *ldc);
126 | 
127 | INT64 csyr2k_(char *uplo, char *trans, INT64 *n, INT64 *k, fcomplex *alpha,
128 |             fcomplex *a, INT64 *lda, fcomplex *b, INT64 *ldb, fcomplex *beta,
129 |             fcomplex *c, INT64 *ldc);
130 | 
131 | INT64 csyrk_(char *uplo, char *trans, INT64 *n, INT64 *k, fcomplex *alpha,
132 |            fcomplex *a, INT64 *lda, fcomplex *beta, fcomplex *c, INT64 *ldc);
133 | 
134 | INT64 ctbmv_(char *uplo, char *trans, char *diag, INT64 *n, INT64 *k,
135 |            fcomplex *a, INT64 *lda, fcomplex *x, INT64 *incx);
136 | 
137 | INT64 ctbsv_(char *uplo, char *trans, char *diag, INT64 *n, INT64 *k,
138 |            fcomplex *a, INT64 *lda, fcomplex *x, INT64 *incx);
139 | 
140 | INT64 ctpmv_(char *uplo, char *trans, char *diag, INT64 *n, fcomplex *ap,
141 |            fcomplex *x, INT64 *incx);
142 | 
143 | INT64 ctpsv_(char *uplo, char *trans, char *diag, INT64 *n, fcomplex *ap,
144 |            fcomplex *x, INT64 *incx);
145 | 
146 | INT64 ctrmm_(char *side, char *uplo, char *transa, char *diag, INT64 *m,
147 |            INT64 *n, fcomplex *alpha, fcomplex *a, INT64 *lda, fcomplex *b,
148 |            INT64 *ldb);
149 | 
150 | INT64 ctrmv_(char *uplo, char *trans, char *diag, INT64 *n, fcomplex *a,
151 |            INT64 *lda, fcomplex *x, INT64 *incx);
152 | 
153 | INT64 ctrsm_(char *side, char *uplo, char *transa, char *diag, INT64 *m,
154 |            INT64 *n, fcomplex *alpha, fcomplex *a, INT64 *lda, fcomplex *b,
155 |            INT64 *ldb);
156 | 
157 | INT64 ctrsv_(char *uplo, char *trans, char *diag, INT64 *n, fcomplex *a,
158 |            INT64 *lda, fcomplex *x, INT64 *incx);
159 | 
160 | INT64 daxpy_(INT64 *n, double *sa, double *sx, INT64 *incx, double *sy,
161 |            INT64 *incy);
162 | 
163 | INT64 dcopy_(INT64 *n, double *sx, INT64 *incx, double *sy, INT64 *incy);
164 | 
165 | INT64 dgbmv_(char *trans, INT64 *m, INT64 *n, INT64 *kl, INT64 *ku,
166 |            double *alpha, double *a, INT64 *lda, double *x, INT64 *incx,
167 |            double *beta, double *y, INT64 *incy);
168 | 
169 | INT64 dgemm_(char *transa, char *transb, INT64 *m, INT64 *n, INT64 *k,
170 |            double *alpha, double *a, INT64 *lda, double *b, INT64 *ldb,
171 |            double *beta, double *c, INT64 *ldc);
172 | 
173 | INT64 dgemv_(char *trans, INT64 *m, INT64 *n, double *alpha, double *a,
174 |            INT64 *lda, double *x, INT64 *incx, double *beta, double *y, 
175 |            INT64 *incy);
176 | 
177 | INT64 dger_(INT64 *m, INT64 *n, double *alpha, double *x, INT64 *incx,
178 |           double *y, INT64 *incy, double *a, INT64 *lda);
179 | 
180 | INT64 drot_(INT64 *n, double *sx, INT64 *incx, double *sy, INT64 *incy,
181 |           double *c, double *s);
182 | 
183 | INT64 drotg_(double *sa, double *sb, double *c, double *s);
184 | 
185 | INT64 dsbmv_(char *uplo, INT64 *n, INT64 *k, double *alpha, double *a,
186 |            INT64 *lda, double *x, INT64 *incx, double *beta, double *y, 
187 |            INT64 *incy);
188 | 
189 | INT64 dscal_(INT64 *n, double *sa, double *sx, INT64 *incx);
190 | 
191 | INT64 dspmv_(char *uplo, INT64 *n, double *alpha, double *ap, double *x,
192 |            INT64 *incx, double *beta, double *y, INT64 *incy);
193 | 
194 | INT64 dspr_(char *uplo, INT64 *n, double *alpha, double *x, INT64 *incx,
195 |           double *ap);
196 | 
197 | INT64 dspr2_(char *uplo, INT64 *n, double *alpha, double *x, INT64 *incx,
198 |            double *y, INT64 *incy, double *ap);
199 | 
200 | INT64 dswap_(INT64 *n, double *sx, INT64 *incx, double *sy, INT64 *incy);
201 | 
202 | INT64 dsymm_(char *side, char *uplo, INT64 *m, INT64 *n, double *alpha,
203 |            double *a, INT64 *lda, double *b, INT64 *ldb, double *beta,
204 |            double *c, INT64 *ldc);
205 | 
206 | INT64 dsymv_(char *uplo, INT64 *n, double *alpha, double *a, INT64 *lda,
207 |            double *x, INT64 *incx, double *beta, double *y, INT64 *incy);
208 | 
209 | INT64 dsyr_(char *uplo, INT64 *n, double *alpha, double *x, INT64 *incx,
210 |           double *a, INT64 *lda);
211 | 
212 | INT64 dsyr2_(char *uplo, INT64 *n, double *alpha, double *x, INT64 *incx,
213 |            double *y, INT64 *incy, double *a, INT64 *lda);
214 | 
215 | INT64 dsyr2k_(char *uplo, char *trans, INT64 *n, INT64 *k, double *alpha,
216 |             double *a, INT64 *lda, double *b, INT64 *ldb, double *beta,
217 |             double *c, INT64 *ldc);
218 | 
219 | INT64 dsyrk_(char *uplo, char *trans, INT64 *n, INT64 *k, double *alpha,
220 |            double *a, INT64 *lda, double *beta, double *c, INT64 *ldc);
221 | 
222 | INT64 dtbmv_(char *uplo, char *trans, char *diag, INT64 *n, INT64 *k,
223 |            double *a, INT64 *lda, double *x, INT64 *incx);
224 | 
225 | INT64 dtbsv_(char *uplo, char *trans, char *diag, INT64 *n, INT64 *k,
226 |            double *a, INT64 *lda, double *x, INT64 *incx);
227 | 
228 | INT64 dtpmv_(char *uplo, char *trans, char *diag, INT64 *n, double *ap,
229 |            double *x, INT64 *incx);
230 | 
231 | INT64 dtpsv_(char *uplo, char *trans, char *diag, INT64 *n, double *ap,
232 |            double *x, INT64 *incx);
233 | 
234 | INT64 dtrmm_(char *side, char *uplo, char *transa, char *diag, INT64 *m,
235 |            INT64 *n, double *alpha, double *a, INT64 *lda, double *b, 
236 |            INT64 *ldb);
237 | 
238 | INT64 dtrmv_(char *uplo, char *trans, char *diag, INT64 *n, double *a,
239 |            INT64 *lda, double *x, INT64 *incx);
240 | 
241 | INT64 dtrsm_(char *side, char *uplo, char *transa, char *diag, INT64 *m,
242 |            INT64 *n, double *alpha, double *a, INT64 *lda, double *b, 
243 |            INT64 *ldb);
244 | 
245 | INT64 dtrsv_(char *uplo, char *trans, char *diag, INT64 *n, double *a,
246 |            INT64 *lda, double *x, INT64 *incx);
247 | 
248 | 
249 | INT64 saxpy_(INT64 *n, float *sa, float *sx, INT64 *incx, float *sy, INT64 *incy);
250 | 
251 | INT64 scopy_(INT64 *n, float *sx, INT64 *incx, float *sy, INT64 *incy);
252 | 
253 | INT64 sgbmv_(char *trans, INT64 *m, INT64 *n, INT64 *kl, INT64 *ku,
254 |            float *alpha, float *a, INT64 *lda, float *x, INT64 *incx,
255 |            float *beta, float *y, INT64 *incy);
256 | 
257 | INT64 sgemm_(char *transa, char *transb, INT64 *m, INT64 *n, INT64 *k,
258 |            float *alpha, float *a, INT64 *lda, float *b, INT64 *ldb,
259 |            float *beta, float *c, INT64 *ldc);
260 | 
261 | INT64 sgemv_(char *trans, INT64 *m, INT64 *n, float *alpha, float *a,
262 |            INT64 *lda, float *x, INT64 *incx, float *beta, float *y, 
263 |            INT64 *incy);
264 | 
265 | INT64 sger_(INT64 *m, INT64 *n, float *alpha, float *x, INT64 *incx,
266 |           float *y, INT64 *incy, float *a, INT64 *lda);
267 | 
268 | INT64 srot_(INT64 *n, float *sx, INT64 *incx, float *sy, INT64 *incy,
269 |           float *c, float *s);
270 | 
271 | INT64 srotg_(float *sa, float *sb, float *c, float *s);
272 | 
273 | INT64 ssbmv_(char *uplo, INT64 *n, INT64 *k, float *alpha, float *a,
274 |            INT64 *lda, float *x, INT64 *incx, float *beta, float *y, 
275 |            INT64 *incy);
276 | 
277 | INT64 sscal_(INT64 *n, float *sa, float *sx, INT64 *incx);
278 | 
279 | INT64 sspmv_(char *uplo, INT64 *n, float *alpha, float *ap, float *x,
280 |            INT64 *incx, float *beta, float *y, INT64 *incy);
281 | 
282 | INT64 sspr_(char *uplo, INT64 *n, float *alpha, float *x, INT64 *incx,
283 |           float *ap);
284 | 
285 | INT64 sspr2_(char *uplo, INT64 *n, float *alpha, float *x, INT64 *incx,
286 |            float *y, INT64 *incy, float *ap);
287 | 
288 | INT64 sswap_(INT64 *n, float *sx, INT64 *incx, float *sy, INT64 *incy);
289 | 
290 | INT64 ssymm_(char *side, char *uplo, INT64 *m, INT64 *n, float *alpha,
291 |            float *a, INT64 *lda, float *b, INT64 *ldb, float *beta,
292 |            float *c, INT64 *ldc);
293 | 
294 | INT64 ssymv_(char *uplo, INT64 *n, float *alpha, float *a, INT64 *lda,
295 |            float *x, INT64 *incx, float *beta, float *y, INT64 *incy);
296 | 
297 | INT64 ssyr_(char *uplo, INT64 *n, float *alpha, float *x, INT64 *incx,
298 |           float *a, INT64 *lda);
299 | 
300 | INT64 ssyr2_(char *uplo, INT64 *n, float *alpha, float *x, INT64 *incx,
301 |            float *y, INT64 *incy, float *a, INT64 *lda);
302 | 
303 | INT64 ssyr2k_(char *uplo, char *trans, INT64 *n, INT64 *k, float *alpha,
304 |             float *a, INT64 *lda, float *b, INT64 *ldb, float *beta,
305 |             float *c, INT64 *ldc);
306 | 
307 | INT64 ssyrk_(char *uplo, char *trans, INT64 *n, INT64 *k, float *alpha,
308 |            float *a, INT64 *lda, float *beta, float *c, INT64 *ldc);
309 | 
310 | INT64 stbmv_(char *uplo, char *trans, char *diag, INT64 *n, INT64 *k,
311 |            float *a, INT64 *lda, float *x, INT64 *incx);
312 | 
313 | INT64 stbsv_(char *uplo, char *trans, char *diag, INT64 *n, INT64 *k,
314 |            float *a, INT64 *lda, float *x, INT64 *incx);
315 | 
316 | INT64 stpmv_(char *uplo, char *trans, char *diag, INT64 *n, float *ap,
317 |            float *x, INT64 *incx);
318 | 
319 | INT64 stpsv_(char *uplo, char *trans, char *diag, INT64 *n, float *ap,
320 |            float *x, INT64 *incx);
321 | 
322 | INT64 strmm_(char *side, char *uplo, char *transa, char *diag, INT64 *m,
323 |            INT64 *n, float *alpha, float *a, INT64 *lda, float *b, 
324 |            INT64 *ldb);
325 | 
326 | INT64 strmv_(char *uplo, char *trans, char *diag, INT64 *n, float *a,
327 |            INT64 *lda, float *x, INT64 *incx);
328 | 
329 | INT64 strsm_(char *side, char *uplo, char *transa, char *diag, INT64 *m,
330 |            INT64 *n, float *alpha, float *a, INT64 *lda, float *b, 
331 |            INT64 *ldb);
332 | 
333 | INT64 strsv_(char *uplo, char *trans, char *diag, INT64 *n, float *a,
334 |            INT64 *lda, float *x, INT64 *incx);
335 | 
336 | INT64 zaxpy_(INT64 *n, dcomplex *ca, dcomplex *cx, INT64 *incx, dcomplex *cy,
337 |            INT64 *incy);
338 | 
339 | INT64 zcopy_(INT64 *n, dcomplex *cx, INT64 *incx, dcomplex *cy, INT64 *incy);
340 | 
341 | INT64 zdscal_(INT64 *n, double *sa, dcomplex *cx, INT64 *incx);
342 | 
343 | INT64 zgbmv_(char *trans, INT64 *m, INT64 *n, INT64 *kl, INT64 *ku,
344 |            dcomplex *alpha, dcomplex *a, INT64 *lda, dcomplex *x, INT64 *incx,
345 |            dcomplex *beta, dcomplex *y, INT64 *incy);
346 | 
347 | INT64 zgemm_(char *transa, char *transb, INT64 *m, INT64 *n, INT64 *k,
348 |            dcomplex *alpha, dcomplex *a, INT64 *lda, dcomplex *b, INT64 *ldb,
349 |            dcomplex *beta, dcomplex *c, INT64 *ldc);
350 | 
351 | INT64 zgemv_(char *trans, INT64 *m, INT64 *n, dcomplex *alpha, dcomplex *a,
352 |            INT64 *lda, dcomplex *x, INT64 *incx, dcomplex *beta, dcomplex *y,
353 |            INT64 *incy);
354 | 
355 | INT64 zgerc_(INT64 *m, INT64 *n, dcomplex *alpha, dcomplex *x, INT64 *incx,
356 |            dcomplex *y, INT64 *incy, dcomplex *a, INT64 *lda);
357 | 
358 | INT64 zgeru_(INT64 *m, INT64 *n, dcomplex *alpha, dcomplex *x, INT64 *incx,
359 |            dcomplex *y, INT64 *incy, dcomplex *a, INT64 *lda);
360 | 
361 | INT64 zhbmv_(char *uplo, INT64 *n, INT64 *k, dcomplex *alpha, dcomplex *a,
362 |            INT64 *lda, dcomplex *x, INT64 *incx, dcomplex *beta, dcomplex *y,
363 |            INT64 *incy);
364 | 
365 | INT64 zhemm_(char *side, char *uplo, INT64 *m, INT64 *n, dcomplex *alpha,
366 |            dcomplex *a, INT64 *lda, dcomplex *b, INT64 *ldb, dcomplex *beta,
367 |            dcomplex *c, INT64 *ldc);
368 | 
369 | INT64 zhemv_(char *uplo, INT64 *n, dcomplex *alpha, dcomplex *a, INT64 *lda,
370 |            dcomplex *x, INT64 *incx, dcomplex *beta, dcomplex *y, INT64 *incy);
371 | 
372 | INT64 zher_(char *uplo, INT64 *n, double *alpha, dcomplex *x, INT64 *incx,
373 |           dcomplex *a, INT64 *lda);
374 | 
375 | INT64 zher2_(char *uplo, INT64 *n, dcomplex *alpha, dcomplex *x, INT64 *incx,
376 |            dcomplex *y, INT64 *incy, dcomplex *a, INT64 *lda);
377 | 
378 | INT64 zher2k_(char *uplo, char *trans, INT64 *n, INT64 *k, dcomplex *alpha,
379 |             dcomplex *a, INT64 *lda, dcomplex *b, INT64 *ldb, double *beta,
380 |             dcomplex *c, INT64 *ldc);
381 | 
382 | INT64 zherk_(char *uplo, char *trans, INT64 *n, INT64 *k, double *alpha,
383 |            dcomplex *a, INT64 *lda, double *beta, dcomplex *c, INT64 *ldc);
384 | 
385 | INT64 zhpmv_(char *uplo, INT64 *n, dcomplex *alpha, dcomplex *ap, dcomplex *x,
386 |            INT64 *incx, dcomplex *beta, dcomplex *y, INT64 *incy);
387 | 
388 | INT64 zhpr_(char *uplo, INT64 *n, double *alpha, dcomplex *x, INT64 *incx,
389 |           dcomplex *ap);
390 | 
391 | INT64 zhpr2_(char *uplo, INT64 *n, dcomplex *alpha, dcomplex *x, INT64 *incx,
392 |            dcomplex *y, INT64 *incy, dcomplex *ap);
393 | 
394 | INT64 zrotg_(dcomplex *ca, dcomplex *cb, double *c, dcomplex *s);
395 | 
396 | INT64 zscal_(INT64 *n, dcomplex *ca, dcomplex *cx, INT64 *incx);
397 | 
398 | INT64 zswap_(INT64 *n, dcomplex *cx, INT64 *incx, dcomplex *cy, INT64 *incy);
399 | 
400 | INT64 zsymm_(char *side, char *uplo, INT64 *m, INT64 *n, dcomplex *alpha,
401 |            dcomplex *a, INT64 *lda, dcomplex *b, INT64 *ldb, dcomplex *beta,
402 |            dcomplex *c, INT64 *ldc);
403 | 
404 | INT64 zsyr2k_(char *uplo, char *trans, INT64 *n, INT64 *k, dcomplex *alpha,
405 |             dcomplex *a, INT64 *lda, dcomplex *b, INT64 *ldb, dcomplex *beta,
406 |             dcomplex *c, INT64 *ldc);
407 | 
408 | INT64 zsyrk_(char *uplo, char *trans, INT64 *n, INT64 *k, dcomplex *alpha,
409 |            dcomplex *a, INT64 *lda, dcomplex *beta, dcomplex *c, INT64 *ldc);
410 | 
411 | INT64 ztbmv_(char *uplo, char *trans, char *diag, INT64 *n, INT64 *k,
412 |            dcomplex *a, INT64 *lda, dcomplex *x, INT64 *incx);
413 | 
414 | INT64 ztbsv_(char *uplo, char *trans, char *diag, INT64 *n, INT64 *k,
415 |            dcomplex *a, INT64 *lda, dcomplex *x, INT64 *incx);
416 | 
417 | INT64 ztpmv_(char *uplo, char *trans, char *diag, INT64 *n, dcomplex *ap,
418 |            dcomplex *x, INT64 *incx);
419 | 
420 | INT64 ztpsv_(char *uplo, char *trans, char *diag, INT64 *n, dcomplex *ap,
421 |            dcomplex *x, INT64 *incx);
422 | 
423 | INT64 ztrmm_(char *side, char *uplo, char *transa, char *diag, INT64 *m,
424 |            INT64 *n, dcomplex *alpha, dcomplex *a, INT64 *lda, dcomplex *b,
425 |            INT64 *ldb);
426 | 
427 | INT64 ztrmv_(char *uplo, char *trans, char *diag, INT64 *n, dcomplex *a,
428 |            INT64 *lda, dcomplex *x, INT64 *incx);
429 | 
430 | INT64 ztrsm_(char *side, char *uplo, char *transa, char *diag, INT64 *m,
431 |            INT64 *n, dcomplex *alpha, dcomplex *a, INT64 *lda, dcomplex *b,
432 |            INT64 *ldb);
433 | 
434 | INT64 ztrsv_(char *uplo, char *trans, char *diag, INT64 *n, dcomplex *a,
435 |            INT64 *lda, dcomplex *x, INT64 *incx);
436 | 


--------------------------------------------------------------------------------
/libshorttext/classifier/learner/liblinear/blas/daxpy.c:
--------------------------------------------------------------------------------
 1 | #include "blas.h"
 2 | 
 3 | INT64 daxpy_(INT64 *n, double *sa, double *sx, INT64 *incx, double *sy,
 4 |            INT64 *incy)
 5 | {
 6 |   INT64 i, m, ix, iy, nn, iincx, iincy;
 7 |   register double ssa;
 8 | 
 9 |   /* constant times a vector plus a vector.   
10 |      uses unrolled loop for increments equal to one.   
11 |      jack dongarra, linpack, 3/11/78.   
12 |      modified 12/3/93, array(1) declarations changed to array(*) */
13 | 
14 |   /* Dereference inputs */
15 |   nn = *n;
16 |   ssa = *sa;
17 |   iincx = *incx;
18 |   iincy = *incy;
19 | 
20 |   if( nn > 0 && ssa != 0.0 )
21 |   {
22 |     if (iincx == 1 && iincy == 1) /* code for both increments equal to 1 */
23 |     {
24 |       m = nn-3;
25 |       for (i = 0; i < m; i += 4)
26 |       {
27 |         sy[i] += ssa * sx[i];
28 |         sy[i+1] += ssa * sx[i+1];
29 |         sy[i+2] += ssa * sx[i+2];
30 |         sy[i+3] += ssa * sx[i+3];
31 |       }
32 |       for ( ; i < nn; ++i) /* clean-up loop */
33 |         sy[i] += ssa * sx[i];
34 |     }
35 |     else /* code for unequal increments or equal increments not equal to 1 */
36 |     {
37 |       ix = iincx >= 0 ? 0 : (1 - nn) * iincx;
38 |       iy = iincy >= 0 ? 0 : (1 - nn) * iincy;
39 |       for (i = 0; i < nn; i++)
40 |       {
41 |         sy[iy] += ssa * sx[ix];
42 |         ix += iincx;
43 |         iy += iincy;
44 |       }
45 |     }
46 |   }
47 | 
48 |   return 0;
49 | } /* daxpy_ */
50 | 


--------------------------------------------------------------------------------
/libshorttext/classifier/learner/liblinear/blas/ddot.c:
--------------------------------------------------------------------------------
 1 | #include "blas.h"
 2 | 
 3 | double ddot_(INT64 *n, double *sx, INT64 *incx, double *sy, INT64 *incy)
 4 | {
 5 |   INT64 i, m, nn, iincx, iincy;
 6 |   double stemp;
 7 |   INT64 ix, iy;
 8 | 
 9 |   /* forms the dot product of two vectors.   
10 |      uses unrolled loops for increments equal to one.   
11 |      jack dongarra, linpack, 3/11/78.   
12 |      modified 12/3/93, array(1) declarations changed to array(*) */
13 | 
14 |   /* Dereference inputs */
15 |   nn = *n;
16 |   iincx = *incx;
17 |   iincy = *incy;
18 | 
19 |   stemp = 0.0;
20 |   if (nn > 0)
21 |   {
22 |     if (iincx == 1 && iincy == 1) /* code for both increments equal to 1 */
23 |     {
24 |       m = nn-4;
25 |       for (i = 0; i < m; i += 5)
26 |         stemp += sx[i] * sy[i] + sx[i+1] * sy[i+1] + sx[i+2] * sy[i+2] +
27 |                  sx[i+3] * sy[i+3] + sx[i+4] * sy[i+4];
28 | 
29 |       for ( ; i < nn; i++)        /* clean-up loop */
30 |         stemp += sx[i] * sy[i];
31 |     }
32 |     else /* code for unequal increments or equal increments not equal to 1 */
33 |     {
34 |       ix = 0;
35 |       iy = 0;
36 |       if (iincx < 0)
37 |         ix = (1 - nn) * iincx;
38 |       if (iincy < 0)
39 |         iy = (1 - nn) * iincy;
40 |       for (i = 0; i < nn; i++)
41 |       {
42 |         stemp += sx[ix] * sy[iy];
43 |         ix += iincx;
44 |         iy += iincy;
45 |       }
46 |     }
47 |   }
48 | 
49 |   return stemp;
50 | } /* ddot_ */
51 | 


--------------------------------------------------------------------------------
/libshorttext/classifier/learner/liblinear/blas/dnrm2.c:
--------------------------------------------------------------------------------
 1 | #include <math.h>  /* Needed for fabs() and sqrt() */
 2 | #include "blas.h"
 3 | 
 4 | double dnrm2_(INT64 *n, double *x, INT64 *incx)
 5 | {
 6 |   INT64 ix, nn, iincx;
 7 |   double norm, scale, absxi, ssq, temp;
 8 | 
 9 | /*  DNRM2 returns the euclidean norm of a vector via the function   
10 |     name, so that   
11 | 
12 |        DNRM2 := sqrt( x'*x )   
13 | 
14 |     -- This version written on 25-October-1982.   
15 |        Modified on 14-October-1993 to inline the call to SLASSQ.   
16 |        Sven Hammarling, Nag Ltd.   */
17 | 
18 |   /* Dereference inputs */
19 |   nn = *n;
20 |   iincx = *incx;
21 | 
22 |   if( nn > 0 && iincx > 0 )
23 |   {
24 |     if (nn == 1)
25 |     {
26 |       norm = fabs(x[0]);
27 |     }  
28 |     else
29 |     {
30 |       scale = 0.0;
31 |       ssq = 1.0;
32 | 
33 |       /* The following loop is equivalent to this call to the LAPACK 
34 |          auxiliary routine:   CALL SLASSQ( N, X, INCX, SCALE, SSQ ) */
35 | 
36 |       for (ix=(nn-1)*iincx; ix>=0; ix-=iincx)
37 |       {
38 |         if (x[ix] != 0.0)
39 |         {
40 |           absxi = fabs(x[ix]);
41 |           if (scale < absxi)
42 |           {
43 |             temp = scale / absxi;
44 |             ssq = ssq * (temp * temp) + 1.0;
45 |             scale = absxi;
46 |           }
47 |           else
48 |           {
49 |             temp = absxi / scale;
50 |             ssq += temp * temp;
51 |           }
52 |         }
53 |       }
54 |       norm = scale * sqrt(ssq);
55 |     }
56 |   }
57 |   else
58 |     norm = 0.0;
59 | 
60 |   return norm;
61 | 
62 | } /* dnrm2_ */
63 | 


--------------------------------------------------------------------------------
/libshorttext/classifier/learner/liblinear/blas/dscal.c:
--------------------------------------------------------------------------------
 1 | #include "blas.h"
 2 | 
 3 | INT64 dscal_(INT64 *n, double *sa, double *sx, INT64 *incx)
 4 | {
 5 |   INT64 i, m, nincx, nn, iincx;
 6 |   double ssa;
 7 | 
 8 |   /* scales a vector by a constant.   
 9 |      uses unrolled loops for increment equal to 1.   
10 |      jack dongarra, linpack, 3/11/78.   
11 |      modified 3/93 to return if incx .le. 0.   
12 |      modified 12/3/93, array(1) declarations changed to array(*) */
13 | 
14 |   /* Dereference inputs */
15 |   nn = *n;
16 |   iincx = *incx;
17 |   ssa = *sa;
18 | 
19 |   if (nn > 0 && iincx > 0)
20 |   {
21 |     if (iincx == 1) /* code for increment equal to 1 */
22 |     {
23 |       m = nn-4;
24 |       for (i = 0; i < m; i += 5)
25 |       {
26 |         sx[i] = ssa * sx[i];
27 |         sx[i+1] = ssa * sx[i+1];
28 |         sx[i+2] = ssa * sx[i+2];
29 |         sx[i+3] = ssa * sx[i+3];
30 |         sx[i+4] = ssa * sx[i+4];
31 |       }
32 |       for ( ; i < nn; ++i) /* clean-up loop */
33 |         sx[i] = ssa * sx[i];
34 |     }
35 |     else /* code for increment not equal to 1 */
36 |     {
37 |       nincx = nn * iincx;
38 |       for (i = 0; i < nincx; i += iincx)
39 |         sx[i] = ssa * sx[i];
40 |     }
41 |   }
42 | 
43 |   return 0;
44 | } /* dscal_ */
45 | 


--------------------------------------------------------------------------------
/libshorttext/classifier/learner/liblinear/linear.def:
--------------------------------------------------------------------------------
 1 | LIBRARY liblinear
 2 | EXPORTS
 3 | 	train	@1
 4 | 	cross_validation	@2
 5 | 	save_model	@3
 6 | 	load_model	@4
 7 | 	get_nr_feature	@5
 8 | 	get_nr_class	@6
 9 | 	get_labels	@7
10 | 	predict_values	@8
11 | 	predict	@9
12 | 	predict_probability	@10
13 | 	free_and_destroy_model	@11
14 | 	free_model_content	@12
15 | 	destroy_param	@13
16 | 	check_parameter	@14
17 | 	check_probability_model	@15
18 | 	set_print_string_function	@16
19 | 


--------------------------------------------------------------------------------
/libshorttext/classifier/learner/liblinear/linear.h:
--------------------------------------------------------------------------------
 1 | #include <stdint.h>
 2 | 
 3 | #ifndef _LIBLINEAR_H
 4 | #define _LIBLINEAR_H
 5 | #ifndef INT64_DEFINED
 6 | typedef int64_t INT64;
 7 | #define INT64_DEFINED
 8 | #endif
 9 | 
10 | #ifdef __cplusplus
11 | extern "C" {
12 | #endif
13 | 
/* One (index, value) entry of a sparse feature vector.
 * predict.c builds vectors as arrays of these and marks the end of a
 * vector with an entry whose index is -1. */
struct feature_node
{
	INT64 index;	/* feature index; -1 marks the end of a vector */
	double value;	/* feature value */
};
19 | 
/* A training set handed to train() / cross_validation().
 * NOTE(review): field meanings below are taken from how the Python wrapper
 * fills this structure (l = len(y), n = largest feature index, plus one when
 * a bias term is enabled) -- confirm against linear.cpp. */
struct problem
{
	INT64 l, n;                 /* presumably: number of instances, number of features */
	double *y;                  /* one target value per instance */
	struct feature_node **x;    /* one feature vector per instance */
	double bias;            /* < 0 if no bias term */  
};
27 | 
/* Solver identifiers. The first eight take the values 0..7; the regression
 * solvers restart at 11, so 8..10 are unassigned. The Python wrapper's
 * SOLVER_TYPE list mirrors these positions and must be kept in sync. */
enum { L2R_LR, L2R_L2LOSS_SVC_DUAL, L2R_L2LOSS_SVC, L2R_L1LOSS_SVC_DUAL, MCSVM_CS, L1R_L2LOSS_SVC, L1R_LR, L2R_LR_DUAL, L2R_L2LOSS_SVR = 11, L2R_L2LOSS_SVR_DUAL, L2R_L1LOSS_SVR_DUAL }; /* solver_type */
29 | 
/* Training configuration; also embedded in struct model.
 * The Python wrapper declares the same fields in the same order, so any
 * change to this layout must be mirrored in python/liblinear.py. */
struct parameter
{
	INT64 solver_type;	/* one of the solver_type enum values */

	/* these are for training only */
	double eps;	        /* stopping criteria */
	double C;		/* set by the -c option in the Python wrapper; presumably the cost parameter */
	INT64 nr_weight;	/* number of entries in weight_label and weight */
	INT64 *weight_label;	/* class labels that receive a custom weight (-wN options) */
	double* weight;		/* weight for the label at the same position in weight_label */
	double p;		/* set by the -p option; NOTE(review): presumably the SVR loss epsilon -- confirm */
};
42 | 
/* A trained (or loaded) model. predict.c reads param.solver_type, nr_class
 * and bias directly; other fields are accessed through the get_* helpers. */
struct model
{
	struct parameter param;	/* parameters the model was trained with */
	INT64 nr_class;		/* number of classes */
	INT64 nr_feature;	/* number of features, as reported by get_nr_feature() */
	double *w;		/* weight values; layout is defined in linear.cpp (not visible here) */
	INT64 *label;		/* label of each class */
	double bias;		/* predict.c appends a bias feature to each instance when this is >= 0 */
};
52 | 
/* Training. */
struct model* train(const struct problem *prob, const struct parameter *param);
void cross_validation(const struct problem *prob, const struct parameter *param, INT64 nr_fold, double *target);

/* Prediction for a single instance x (a feature_node array ended by index -1).
 * predict.c treats the return value of predict()/predict_probability() as the
 * predicted label and reads nr_class entries from prob_estimates afterwards. */
double predict_values(const struct model *model_, const struct feature_node *x, double* dec_values);
double predict(const struct model *model_, const struct feature_node *x);
double predict_probability(const struct model *model_, const struct feature_node *x, double* prob_estimates);

/* Model persistence. predict.c treats a null return from load_model() as
 * failure to open the model file. */
INT64 save_model(const char *model_file_name, const struct model *model_);
struct model *load_model(const char *model_file_name);

/* Model introspection. Callers pass get_labels() a buffer with room for
 * get_nr_class() entries. */
INT64 get_nr_feature(const struct model *model_);
INT64 get_nr_class(const struct model *model_);
void get_labels(const struct model *model_, INT64* label);

/* Cleanup. */
void free_model_content(struct model *model_ptr);
void free_and_destroy_model(struct model **model_ptr_ptr);
void destroy_param(struct parameter *param);

/* Validation and logging. predict.c refuses probability output when
 * check_probability_model() returns zero. */
const char *check_parameter(const struct problem *prob, const struct parameter *param);
INT64 check_probability_model(const struct model *model);
void set_print_string_function(void (*print_func) (const char*));
74 | 
75 | #ifdef __cplusplus
76 | }
77 | #endif
78 | 
79 | #endif /* _LIBLINEAR_H */
80 | 
81 | 


--------------------------------------------------------------------------------
/libshorttext/classifier/learner/liblinear/predict:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/2shou/python-libshorttext/460773dbbefe7a82a9b544ca419242b68a1a0533/libshorttext/classifier/learner/liblinear/predict


--------------------------------------------------------------------------------
/libshorttext/classifier/learner/liblinear/predict.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <ctype.h>
  3 | #include <stdlib.h>
  4 | #include <string.h>
  5 | #include <errno.h>
  6 | #include "linear.h"
  7 | 
  8 | int print_null(const char *s,...) {return 0;}
  9 | 
/* Summary printer: printf by default, replaced by print_null when -q is given. */
static int (*info)(const char *fmt,...) = &printf;

/* Scratch feature vector reused for every input line; allocated in main()
 * and grown by do_predict() when a line has more features than fit. */
struct feature_node *x;
/* Current capacity of x, in nodes. */
INT64 max_nr_attr = 64;

/* Model loaded by main() and used by do_predict(). */
struct model* model_;
/* Non-zero when probability output was requested with -b. */
INT64 flag_predict_probability=0;
 17 | 
 18 | void exit_input_error(INT64 line_num)
 19 | {
 20 | 	fprintf(stderr,"Wrong input format at line %lld\n", (long long int)line_num);
 21 | 	exit(1);
 22 | }
 23 | 
 24 | static char *line = NULL;
 25 | static INT64 max_line_len;
 26 | 
 27 | static char* readline(FILE *input)
 28 | {
 29 | 	INT64 len;
 30 | 
 31 | 	if(fgets(line,max_line_len,input) == NULL)
 32 | 		return NULL;
 33 | 
 34 | 	while(strrchr(line,'\n') == NULL)
 35 | 	{
 36 | 		max_line_len *= 2;
 37 | 		line = (char *) realloc(line,max_line_len);
 38 | 		len = (INT64) strlen(line);
 39 | 		if(fgets(line+len,max_line_len-len,input) == NULL)
 40 | 			break;
 41 | 	}
 42 | 	return line;
 43 | }
 44 | 
 45 | void do_predict(FILE *input, FILE *output)
 46 | {
 47 | 	INT64 correct = 0;
 48 | 	INT64 total = 0;
 49 | 	double error = 0;
 50 | 	double sump = 0, sumt = 0, sumpp = 0, sumtt = 0, sumpt = 0;
 51 | 
 52 | 	INT64 nr_class=get_nr_class(model_);
 53 | 	double *prob_estimates=NULL;
 54 | 	INT64 j, n;
 55 | 	INT64 nr_feature=get_nr_feature(model_);
 56 | 	if(model_->bias>=0)
 57 | 		n=nr_feature+1;
 58 | 	else
 59 | 		n=nr_feature;
 60 | 
 61 | 	if(flag_predict_probability)
 62 | 	{
 63 | 		INT64 *labels;
 64 | 
 65 | 		if(!check_probability_model(model_))
 66 | 		{
 67 | 			fprintf(stderr, "probability output is only supported for logistic regression\n");
 68 | 			exit(1);
 69 | 		}
 70 | 
 71 | 		labels=(INT64 *) malloc(nr_class*sizeof(INT64));
 72 | 		get_labels(model_,labels);
 73 | 		prob_estimates = (double *) malloc(nr_class*sizeof(double));
 74 | 		fprintf(output,"labels");
 75 | 		for(j=0;j<nr_class;j++)
 76 | 			fprintf(output," %lld",(long long int)labels[j]);
 77 | 		fprintf(output,"\n");
 78 | 		free(labels);
 79 | 	}
 80 | 
 81 | 	max_line_len = 1024;
 82 | 	line = (char *)malloc(max_line_len*sizeof(char));
 83 | 	while(readline(input) != NULL)
 84 | 	{
 85 | 		INT64 i = 0;
 86 | 		double target_label, predict_label;
 87 | 		char *idx, *val, *label, *endptr;
 88 | 		INT64 inst_max_index = 0; // strtol gives 0 if wrong format
 89 | 
 90 | 		label = strtok(line," \t\n");
 91 | 		if(label == NULL) // empty line
 92 | 			exit_input_error(total+1);
 93 | 
 94 | 		target_label = strtod(label,&endptr);
 95 | 		if(endptr == label || *endptr != '\0')
 96 | 			exit_input_error(total+1);
 97 | 
 98 | 		while(1)
 99 | 		{
100 | 			if(i>=max_nr_attr-2)	// need one more for index = -1
101 | 			{
102 | 				max_nr_attr *= 2;
103 | 				x = (struct feature_node *) realloc(x,max_nr_attr*sizeof(struct feature_node));
104 | 			}
105 | 
106 | 			idx = strtok(NULL,":");
107 | 			val = strtok(NULL," \t");
108 | 
109 | 			if(val == NULL)
110 | 				break;
111 | 			errno = 0;
112 | 			x[i].index = (INT64) strtoll(idx,&endptr,10);
113 | 			if(endptr == idx || errno != 0 || *endptr != '\0' || x[i].index <= inst_max_index)
114 | 				exit_input_error(total+1);
115 | 			else
116 | 				inst_max_index = x[i].index;
117 | 
118 | 			errno = 0;
119 | 			x[i].value = strtod(val,&endptr);
120 | 			if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr)))
121 | 				exit_input_error(total+1);
122 | 
123 | 			// feature indices larger than those in training are not used
124 | 			if(x[i].index <= nr_feature)
125 | 				++i;
126 | 		}
127 | 
128 | 		if(model_->bias>=0)
129 | 		{
130 | 			x[i].index = n;
131 | 			x[i].value = model_->bias;
132 | 			i++;
133 | 		}
134 | 		x[i].index = -1;
135 | 
136 | 		if(flag_predict_probability)
137 | 		{
138 | 			INT64 j;
139 | 			predict_label = predict_probability(model_,x,prob_estimates);
140 | 			fprintf(output,"%g",predict_label);
141 | 			for(j=0;j<model_->nr_class;j++)
142 | 				fprintf(output," %g",prob_estimates[j]);
143 | 			fprintf(output,"\n");
144 | 		}
145 | 		else
146 | 		{
147 | 			predict_label = predict(model_,x);
148 | 			fprintf(output,"%g\n",predict_label);
149 | 		}
150 | 
151 | 		if(predict_label == target_label)
152 | 			++correct;
153 | 		error += (predict_label-target_label)*(predict_label-target_label);
154 | 		sump += predict_label;
155 | 		sumt += target_label;
156 | 		sumpp += predict_label*predict_label;
157 | 		sumtt += target_label*target_label;
158 | 		sumpt += predict_label*target_label;
159 | 		++total;
160 | 	}
161 | 	if(model_->param.solver_type==L2R_L2LOSS_SVR ||
162 | 	   model_->param.solver_type==L2R_L1LOSS_SVR_DUAL ||
163 | 	   model_->param.solver_type==L2R_L2LOSS_SVR_DUAL)
164 | 	{
165 | 		info("Mean squared error = %g (regression)\n",error/total);
166 | 		info("Squared correlation coefficient = %g (regression)\n",
167 | 			((total*sumpt-sump*sumt)*(total*sumpt-sump*sumt))/
168 | 			((total*sumpp-sump*sump)*(total*sumtt-sumt*sumt))
169 | 			);
170 | 	}
171 | 	else
172 | 		info("Accuracy = %g%% (%lld/%lld)\n",(double) correct/total*100,(long long int)correct,(long long int)total);
173 | 	if(flag_predict_probability)
174 | 		free(prob_estimates);
175 | }
176 | 
/* Print the command-line usage text to stdout and terminate with status 1. */
void exit_with_help()
{
	static const char usage[] =
	"Usage: predict [options] test_file model_file output_file\n"
	"options:\n"
	"-b probability_estimates: whether to output probability estimates, 0 or 1 (default 0); currently for logistic regression only\n"
	"-q : quiet mode (no outputs)\n";

	fputs(usage, stdout);
	exit(1);
}
187 | 
188 | int main(int argc, char **argv)
189 | {
190 | 	FILE *input, *output;
191 | 	INT64 i;
192 | 
193 | 	// parse options
194 | 	for(i=1;i<argc;i++)
195 | 	{
196 | 		if(argv[i][0] != '-') break;
197 | 		++i;
198 | 		switch(argv[i-1][1])
199 | 		{
200 | 			case 'b':
201 | 				flag_predict_probability = atoi(argv[i]);
202 | 				break;
203 | 			case 'q':
204 | 				info = &print_null;
205 | 				i--;
206 | 				break;
207 | 			default:
208 | 				fprintf(stderr,"unknown option: -%c\n", argv[i-1][1]);
209 | 				exit_with_help();
210 | 				break;
211 | 		}
212 | 	}
213 | 	if(i>=argc)
214 | 		exit_with_help();
215 | 	
216 | 	if(i!=argc-3 || argv[i+1][0] == '-' || argv[i+2][0] == '-')
217 | 		exit_with_help();
218 | 
219 | 	input = fopen(argv[i],"r");
220 | 	if(input == NULL)
221 | 	{
222 | 		fprintf(stderr,"can't open input file %s\n",argv[i]);
223 | 		exit(1);
224 | 	}
225 | 
226 | 	output = fopen(argv[i+2],"w");
227 | 	if(output == NULL)
228 | 	{
229 | 		fprintf(stderr,"can't open output file %s\n",argv[i+2]);
230 | 		exit(1);
231 | 	}
232 | 
233 | 	if((model_=load_model(argv[i+1]))==0)
234 | 	{
235 | 		fprintf(stderr,"can't open model file %s\n",argv[i+1]);
236 | 		exit(1);
237 | 	}
238 | 
239 | 	x = (struct feature_node *) malloc(max_nr_attr*sizeof(struct feature_node));
240 | 	do_predict(input, output);
241 | 	free_and_destroy_model(&model_);
242 | 	free(line);
243 | 	free(x);
244 | 	fclose(input);
245 | 	fclose(output);
246 | 	return 0;
247 | }
248 | 
249 | 


--------------------------------------------------------------------------------
/libshorttext/classifier/learner/liblinear/python/Makefile:
--------------------------------------------------------------------------------
# Build the LIBLINEAR shared library in the parent directory.
# "all" is a real target (the original "all = lib" only set a variable, so
# "make all" had no rule); $(MAKE) passes flags such as -j and -n to the sub-make.
.PHONY: all lib

all: lib

lib:
	$(MAKE) -C .. lib
5 | 


--------------------------------------------------------------------------------
/libshorttext/classifier/learner/liblinear/python/README:
--------------------------------------------------------------------------------
  1 | -------------------------------------
  2 | --- Python interface of LIBLINEAR ---
  3 | -------------------------------------
  4 | 
  5 | Table of Contents
  6 | =================
  7 | 
  8 | - Introduction
  9 | - Installation
 10 | - Quick Start
 11 | - Design Description
 12 | - Data Structures
 13 | - Utility Functions
 14 | - Additional Information
 15 | 
 16 | Introduction
 17 | ============
 18 | 
 19 | Python (http://www.python.org/) is a programming language suitable for rapid
 20 | development. This tool provides a simple Python interface to LIBLINEAR, a library
 21 | for large linear classification (http://www.csie.ntu.edu.tw/~cjlin/liblinear). The
 22 | interface is very easy to use as the usage is the same as that of LIBLINEAR. The
 23 | interface is developed with the built-in Python library "ctypes."
 24 | 
 25 | Installation
 26 | ============
 27 | 
 28 | On Unix systems, type
 29 | 
 30 | > make
 31 | 
 32 | The interface needs only LIBLINEAR shared library, which is generated by
 33 | the above command. We assume that the shared library is in the LIBLINEAR
 34 | main directory or in the system path.
 35 | 
 36 | Quick Start
 37 | ===========
 38 | 
 39 | There are two levels of usage. The high-level one uses utility functions
 40 | in liblinearutil.py and the usage is the same as the LIBLINEAR MATLAB interface.
 41 | 
 42 | >>> from liblinearutil import *
 43 | # Read data in LIBSVM format
 44 | >>> y, x = svm_read_problem('../heart_scale')
 45 | >>> m = train(y[:200], x[:200], '-c 4')
 46 | >>> p_label, p_acc, p_val = predict(y[200:], x[200:], m)
 47 | 
 48 | # Construct problem in python format
 49 | # Dense data
 50 | >>> y, x = [1,-1], [[1,0,1], [-1,0,-1]]
 51 | # Sparse data
 52 | >>> y, x = [1,-1], [{1:1, 3:1}, {1:-1,3:-1}]
 53 | >>> prob  = problem(y, x)
 54 | >>> param = parameter('-c 4 -B 1')
 55 | >>> m = train(prob, param)
 56 | 
 57 | # Other utility functions
 58 | >>> save_model('heart_scale.model', m)
 59 | >>> m = load_model('heart_scale.model')
 60 | >>> p_label, p_acc, p_val = predict(y, x, m, '-b 1')
 61 | >>> ACC, MSE, SCC = evaluations(y, p_label)
 62 | 
 63 | # Getting online help
 64 | >>> help(train)
 65 | 
 66 | The low-level use directly calls C interfaces imported by liblinear.py. Note that
 67 | all arguments and return values are in ctypes format. You need to handle them
 68 | carefully.
 69 | 
 70 | >>> from liblinear import *
 71 | >>> prob = problem([1,-1], [{1:1, 3:1}, {1:-1,3:-1}])
 72 | >>> param = parameter('-c 4')
 73 | >>> m = liblinear.train(prob, param) # m is a ctypes pointer to a model
 74 | # Convert a Python-format instance to feature_nodearray, a ctypes structure
 75 | >>> x0, max_idx = gen_feature_nodearray({1:1, 3:1})
 76 | >>> label = liblinear.predict(m, x0)
 77 | 
 78 | Design Description
 79 | ==================
 80 | 
 81 | There are two files liblinear.py and liblinearutil.py, which respectively correspond to
 82 | low-level and high-level use of the interface.
 83 | 
 84 | In liblinear.py, we adopt the Python built-in library "ctypes," so that
 85 | Python can directly access C structures and interface functions defined
 86 | in linear.h.
 87 | 
 88 | While advanced users can use structures/functions in liblinear.py, to
 89 | avoid handling ctypes structures, in liblinearutil.py we provide some easy-to-use
 90 | functions. The usage is similar to LIBLINEAR MATLAB interface.
 91 | 
 92 | Data Structures
 93 | ===============
 94 | 
 95 | Four data structures derived from linear.h are feature_node, problem,
 96 | parameter, and model. They all contain fields with the same names as in
 97 | linear.h. Access these fields carefully because you directly use a C structure
 98 | instead of a Python object. The following description introduces additional
 99 | fields and methods.
100 | 
101 | Before using the data structures, execute the following command to load the
102 | LIBLINEAR shared library:
103 | 
104 |     >>> from liblinear import *
105 | 
106 | - class feature_node:
107 | 
108 |     Construct a feature_node.
109 | 
110 |     >>> node = feature_node(idx, val)
111 | 
112 |     idx: an integer indicating the feature index.
113 | 
114 |     val: a float indicating the feature value.
115 | 
116 |     Show the index and the value of a node.
117 | 
118 |     >>> print(node) 
119 | 
120 | - Function: gen_feature_nodearray(xi [,feature_max=None [,issparse=True]])
121 | 
122 |     Generate a feature vector from a Python list/tuple or a dictionary:
123 | 
124 |     >>> xi, max_idx = gen_feature_nodearray({1:1, 3:1, 5:-2})
125 | 
126 |     xi: the returned feature_nodearray (a ctypes structure)
127 | 
128 |     max_idx: the maximal feature index of xi
129 | 
130 |     issparse: if issparse == True, zero feature values are removed. The default
131 |               value is True for the sparsity.
132 | 
133 |     feature_max: if feature_max is assigned, features with indices larger than
134 |                  feature_max are removed.
135 | 
136 | - class problem:
137 | 
138 |     Construct a problem instance
139 | 
140 |     >>> prob = problem(y, x [,bias=-1])
141 | 
142 |     y: a Python list/tuple of l labels (type must be int/double).
143 | 
144 |     x: a Python list/tuple of l data instances. Each element of x must be
145 |        an instance of list/tuple/dictionary type.
146 | 
147 |     bias: if bias >= 0, instance x becomes [x; bias]; if bias < 0, no bias
148 |           term is added (default -1)
149 | 
150 |     You can also modify the bias value by
151 | 
152 |     >>> prob.set_bias(1)
153 | 
154 |     Note that if your x contains sparse data (i.e., dictionary), the internal
155 |     ctypes data format is still sparse.
156 | 
157 | - class parameter:
158 | 
159 |     Construct a parameter instance
160 | 
161 |     >>> param = parameter('training_options')
162 | 
163 |     If 'training_options' is empty, LIBLINEAR default values are applied.
164 | 
165 |     Set param to LIBLINEAR default values.
166 | 
167 |     >>> param.set_to_default_values()
168 | 
169 |     Parse a string of options.
170 | 
171 |     >>> param.parse_options('training_options')
172 | 
173 |     Show values of parameters.
174 | 
175 |     >>> print(param)
176 | 
177 | - class model:
178 | 
179 |     There are two ways to obtain an instance of model:
180 | 
181 |     >>> model_ = train(y, x)
182 |     >>> model_ = load_model('model_file_name')
183 | 
184 |     Note that the returned structure of interface functions
185 |     liblinear.train and liblinear.load_model is a ctypes pointer of
186 |     model, which is different from the model object returned
187 |     by train and load_model in liblinearutil.py. We provide a
188 |     function toPyModel for the conversion:
189 | 
190 |     >>> model_ptr = liblinear.train(prob, param)
191 |     >>> model_ = toPyModel(model_ptr)
192 | 
193 |     If you obtain a model in a way other than the above approaches,
194 |     handle it carefully to avoid memory leak or segmentation fault.
195 | 
196 |     Some interface functions to access LIBLINEAR models are wrapped as
197 |     members of the class model:
198 | 
199 |     >>> nr_feature =  model_.get_nr_feature()
200 |     >>> nr_class = model_.get_nr_class()
201 |     >>> class_labels = model_.get_labels()
202 |     >>> is_prob_model = model_.is_probability_model()
203 | 
204 | Utility Functions
205 | =================
206 | 
207 | To use utility functions, type
208 | 
209 |     >>> from liblinearutil import *
210 | 
211 | The above command loads
212 |     train()            : train a linear model
213 |     predict()          : predict testing data
214 |     svm_read_problem() : read the data from a LIBSVM-format file.
215 |     load_model()       : load a LIBLINEAR model.
216 |     save_model()       : save model to a file.
217 |     evaluations()      : evaluate prediction results.
218 | 
219 | - Function: train
220 | 
221 |     There are three ways to call train()
222 | 
223 |     >>> model = train(y, x [, 'training_options'])
224 |     >>> model = train(prob [, 'training_options'])
225 |     >>> model = train(prob, param)
226 | 
227 |     y: a list/tuple of l training labels (type must be int/double).
228 | 
229 |     x: a list/tuple of l training instances. The feature vector of
230 |        each training instance is an instance of list/tuple or dictionary.
231 | 
232 |     training_options: a string in the same form as that for LIBLINEAR command
233 |                       mode.
234 | 
235 |     prob: a problem instance generated by calling
236 |           problem(y, x).
237 | 
238 |     param: a parameter instance generated by calling
239 |            parameter('training_options')
240 | 
241 |     model: the returned model instance. See linear.h for details of this
242 |            structure. If '-v' is specified, cross validation is
243 |            conducted and the returned model is just a scalar: cross-validation
244 |            accuracy for classification and mean-squared error for regression.
245 | 
246 |     To train the same data many times with different
247 |     parameters, the second and the third ways should be faster.
248 | 
249 |     Examples:
250 | 
251 |     >>> y, x = svm_read_problem('../heart_scale')
252 |     >>> prob = problem(y, x)
253 |     >>> param = parameter('-s 3 -c 5 -q')
254 |     >>> m = train(y, x, '-c 5')
255 |     >>> m = train(prob, '-w1 5 -c 5')
256 |     >>> m = train(prob, param)
257 |     >>> CV_ACC = train(y, x, '-v 3')
258 | 
259 | - Function: predict
260 | 
261 |     To predict testing data with a model, use
262 | 
263 |     >>> p_labels, p_acc, p_vals = predict(y, x, model [,'predicting_options'])
264 | 
265 |     y: a list/tuple of l true labels (type must be int/double). It is used
266 |        for calculating the accuracy. Use [] if true labels are
267 |        unavailable.
268 | 
269 |     x: a list/tuple of l predicting instances. The feature vector of
270 |        each predicting instance is an instance of list/tuple or dictionary.
271 | 
272 |     predicting_options: a string of predicting options in the same format as
273 |                         that of LIBLINEAR.
274 | 
275 |     model: a model instance.
276 | 
277 |     p_labels: a list of predicted labels
278 | 
279 |     p_acc: a tuple including accuracy (for classification), mean
280 |            squared error, and squared correlation coefficient (for
281 |            regression).
282 | 
283 |     p_vals: a list of decision values or probability estimates (if '-b 1' 
284 |             is specified). If k is the number of classes, for decision values,
285 |             each element includes results of predicting k binary-class
286 |             SVMs. If k = 2 and solver is not MCSVM_CS, only one decision value 
287 |             is returned. For probabilities, each element contains k values 
288 |             indicating the probability that the testing instance is in each class.
289 |             Note that the order of classes here is the same as 'model.label'
290 |             field in the model structure.
291 | 
292 |     Example:
293 | 
294 |     >>> m = train(y, x, '-c 5')
295 |     >>> p_labels, p_acc, p_vals = predict(y, x, m)
296 | 
297 | - Functions: svm_read_problem/load_model/save_model
298 | 
299 |     See the usage by examples:
300 | 
301 |     >>> y, x = svm_read_problem('data.txt')
302 |     >>> m = load_model('model_file')
303 |     >>> save_model('model_file', m)
304 | 
305 | - Function: evaluations
306 | 
307 |     Calculate some evaluations using the true values (ty) and predicted
308 |     values (pv):
309 | 
310 |     >>> (ACC, MSE, SCC) = evaluations(ty, pv)
311 | 
312 |     ty: a list of true values.
313 | 
314 |     pv: a list of predicted values.
315 | 
316 |     ACC: accuracy.
317 | 
318 |     MSE: mean squared error.
319 | 
320 |     SCC: squared correlation coefficient.
321 | 
322 | 
323 | Additional Information
324 | ======================
325 | 
326 | This interface was written by Hsiang-Fu Yu from Department of Computer
327 | Science, National Taiwan University. If you find this tool useful, please
328 | cite LIBLINEAR as follows
329 | 
330 | R.-E. Fan, K.-W. Chang, C.-J. Hsieh, X.-R. Wang, and C.-J. Lin.
331 | LIBLINEAR: A Library for Large Linear Classification, Journal of
332 | Machine Learning Research 9(2008), 1871-1874. Software available at
333 | http://www.csie.ntu.edu.tw/~cjlin/liblinear
334 | 
335 | For any question, please contact Chih-Jen Lin <cjlin@csie.ntu.edu.tw>,
336 | or check the FAQ page:
337 | 
338 | http://www.csie.ntu.edu.tw/~cjlin/liblinear/faq.html
339 | 


--------------------------------------------------------------------------------
/libshorttext/classifier/learner/liblinear/python/liblinear.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | from ctypes import *
  4 | from ctypes.util import find_library
  5 | from os import path
  6 | import sys
  7 | 
# Load the LIBLINEAR shared library built one directory above this module.
# Only this bundled path is tried; there is no system-wide lookup.
# NOTE(review): linear.h in this tree declares its integer fields as 64-bit
# INT64 and the structures below mirror that with c_int64, so a stock
# liblinear build is presumably not layout-compatible -- confirm before
# adding a find_library() fallback.
liblinear = CDLL(path.join(path.dirname(path.abspath(__file__)), '../liblinear.so.1'))
 10 | 
 11 | # Construct constants
 12 | SOLVER_TYPE = ['L2R_LR', 'L2R_L2LOSS_SVC_DUAL', 'L2R_L2LOSS_SVC', 'L2R_L1LOSS_SVC_DUAL',\
 13 | 		'MCSVM_CS', 'L1R_L2LOSS_SVC', 'L1R_LR', 'L2R_LR_DUAL', \
 14 | 		None, None, None, \
 15 | 		'L2R_L2LOSS_SVR', 'L2R_L2LOSS_SVR_DUAL', 'L2R_L1LOSS_SVR_DUAL']
 16 | for i, s in enumerate(SOLVER_TYPE): 
 17 | 	if s is not None: exec("%s = %d" % (s , i))
 18 | 
 19 | PRINT_STRING_FUN = CFUNCTYPE(None, c_char_p)
 20 | def print_null(s): 
 21 | 	return 
 22 | 
 23 | def genFields(names, types): 
 24 | 	return list(zip(names, types))
 25 | 
 26 | def fillprototype(f, restype, argtypes): 
 27 | 	f.restype = restype
 28 | 	f.argtypes = argtypes
 29 | 
 30 | class feature_node(Structure):
 31 | 	_names = ["index", "value"]
 32 | 	_types = [c_int64, c_double]
 33 | 	_fields_ = genFields(_names, _types)
 34 | 
 35 | 	def __str__(self):
 36 | 		return '%d:%g' % (self.index, self.value)
 37 | 
 38 | def gen_feature_nodearray(xi, feature_max=None, issparse=True):
 39 | 	if isinstance(xi, dict):
 40 | 		index_range = xi.keys()
 41 | 	elif isinstance(xi, (list, tuple)):
 42 | 		xi = [0] + xi  # idx should start from 1
 43 | 		index_range = range(1, len(xi))
 44 | 	else:
 45 | 		raise TypeError('xi should be a dictionary, list or tuple')
 46 | 
 47 | 	if feature_max:
 48 | 		assert(isinstance(feature_max, int))
 49 | 		index_range = filter(lambda j: j <= feature_max, index_range)
 50 | 	if issparse: 
 51 | 		index_range = filter(lambda j:xi[j] != 0, index_range)
 52 | 
 53 | 	index_range = sorted(index_range)
 54 | 	ret = (feature_node * (len(index_range)+2))()
 55 | 	ret[-1].index = -1 # for bias term
 56 | 	ret[-2].index = -1
 57 | 	for idx, j in enumerate(index_range):
 58 | 		ret[idx].index = j
 59 | 		ret[idx].value = xi[j]
 60 | 	max_idx = 0
 61 | 	if index_range : 
 62 | 		max_idx = index_range[-1]
 63 | 	return ret, max_idx
 64 | 
 65 | class problem(Structure):
 66 | 	_names = ["l", "n", "y", "x", "bias"]
 67 | 	_types = [c_int64, c_int64, POINTER(c_double), POINTER(POINTER(feature_node)), c_double]
 68 | 	_fields_ = genFields(_names, _types)
 69 | 
 70 | 	def __init__(self, y, x, bias = -1):
 71 | 		if len(y) != len(x) :
 72 | 			raise ValueError("len(y) != len(x)")
 73 | 		self.l = l = len(y)
 74 | 		self.bias = -1
 75 | 
 76 | 		max_idx = 0
 77 | 		x_space = self.x_space = []
 78 | 		for i, xi in enumerate(x):
 79 | 			tmp_xi, tmp_idx = gen_feature_nodearray(xi)
 80 | 			x_space += [tmp_xi]
 81 | 			max_idx = max(max_idx, tmp_idx)
 82 | 		self.n = max_idx
 83 | 
 84 | 		self.y = (c_double * l)()
 85 | 		for i, yi in enumerate(y): self.y[i] = y[i]
 86 | 
 87 | 		self.x = (POINTER(feature_node) * l)() 
 88 | 		for i, xi in enumerate(self.x_space): self.x[i] = xi
 89 | 
 90 | 		self.set_bias(bias)
 91 | 
 92 | 	def set_bias(self, bias):
 93 | 		if self.bias == bias:
 94 | 			return 
 95 | 		if bias >= 0 and self.bias < 0: 
 96 | 			self.n += 1
 97 | 			node = feature_node(self.n, bias)
 98 | 		if bias < 0 and self.bias >= 0: 
 99 | 			self.n -= 1
100 | 			node = feature_node(-1, bias)
101 | 
102 | 		for xi in self.x_space:
103 | 			xi[-2] = node
104 | 		self.bias = bias
105 | 
106 | 
class parameter(Structure):
	"""ctypes mirror of ``struct parameter`` in linear.h, configured from a
	LIBLINEAR-style option string.

	parameter(options=None)
	    options: a string such as '-s 3 -c 5 -q', or the same options
	    already split into a list. None or '' selects the defaults.

	Besides the C fields, instances carry Python-only settings: bias,
	cross_validation, nr_fold and print_func.
	"""
	_names = ["solver_type", "eps", "C", "nr_weight", "weight_label", "weight", "p"]
	_types = [c_int64, c_double, c_double, c_int64, POINTER(c_int64), POINTER(c_double), c_double]
	_fields_ = genFields(_names, _types)

	def __init__(self, options = None):
		if options is None:
			options = ''
		self.parse_options(options)

	def __str__(self):
		# C fields first, then the Python-only attributes stored on the instance.
		attrs = parameter._names + list(self.__dict__.keys())
		text = ''.join(' %s: %s\n' % (attr, getattr(self, attr)) for attr in attrs)
		return text.strip()

	def set_to_default_values(self):
		"""Reset every setting to its default. eps is set to infinity as a
		marker meaning "not chosen yet"; parse_options() replaces it with a
		solver-specific value."""
		self.solver_type = L2R_L2LOSS_SVC_DUAL
		self.eps = float('inf')
		self.C = 1
		self.p = 0.1
		self.nr_weight = 0
		self.weight_label = (c_int64 * 0)()
		self.weight = (c_double * 0)()
		self.bias = -1
		self.cross_validation = False
		self.nr_fold = 0
		self.print_func = None

	def parse_options(self, options):
		"""Reset to defaults, then apply ``options``.

		Recognised options: -s solver, -c C, -p p, -e eps, -B bias,
		-v folds (cross validation, folds >= 2), -wN weight (weight for
		class label N) and -q (quiet).

		Also installs the chosen print callback in the shared library.

		Raises TypeError when options is neither a list nor a str, and
		ValueError for an unknown option or a fold count below 2.
		"""
		if isinstance(options, list):
			argv = options
		elif isinstance(options, str):
			argv = options.split()
		else:
			raise TypeError("arg 1 should be a list or a str.")
		self.set_to_default_values()
		self.print_func = cast(None, PRINT_STRING_FUN)
		weight_label = []
		weight = []

		i = 0
		while i < len(argv) :
			if argv[i] == "-s":
				i = i + 1
				self.solver_type = int(argv[i])
			elif argv[i] == "-c":
				i = i + 1
				self.C = float(argv[i])
			elif argv[i] == "-p":
				i = i + 1
				self.p = float(argv[i])
			elif argv[i] == "-e":
				i = i + 1
				self.eps = float(argv[i])
			elif argv[i] == "-B":
				i = i + 1
				self.bias = float(argv[i])
			elif argv[i] == "-v":
				i = i + 1
				self.cross_validation = 1
				self.nr_fold = int(argv[i])
				if self.nr_fold < 2 :
					raise ValueError("n-fold cross validation: n must >= 2")
			elif argv[i].startswith("-w"):
				# "-wN value": N is the class label, value its weight
				i = i + 1
				self.nr_weight += 1
				weight_label += [int(argv[i-1][2:])]
				weight += [float(argv[i])]
			elif argv[i] == "-q":
				self.print_func = PRINT_STRING_FUN(print_null)
			else :
				raise ValueError("Wrong options")
			i += 1

		liblinear.set_print_string_function(self.print_func)
		self.weight_label = (c_int64*self.nr_weight)()
		self.weight = (c_double*self.nr_weight)()
		for k, (label, label_weight) in enumerate(zip(weight_label, weight)):
			self.weight[k] = label_weight
			self.weight_label[k] = label

		# eps still at its "unset" marker: pick the default for the solver
		if self.eps == float('inf'):
			if self.solver_type in [L2R_LR, L2R_L2LOSS_SVC]:
				self.eps = 0.01
			elif self.solver_type in [L2R_L2LOSS_SVR]:
				self.eps = 0.001
			elif self.solver_type in [L2R_L2LOSS_SVC_DUAL, L2R_L1LOSS_SVC_DUAL, MCSVM_CS, L2R_LR_DUAL]:
				self.eps = 0.1
			elif self.solver_type in [L1R_L2LOSS_SVC, L1R_LR]:
				self.eps = 0.01
			elif self.solver_type in [L2R_L2LOSS_SVR_DUAL, L2R_L1LOSS_SVR_DUAL]:
				self.eps = 0.1
205 | 
class model(Structure):
	"""ctypes mirror of ``struct model`` in linear.h.

	Instances created in Python are tagged 'python'; toPyModel() tags
	models that came from the C library with 'C' so that their memory is
	released when the Python object is deleted.
	"""
	_names = ["param", "nr_class", "nr_feature", "w", "label", "bias"]
	_types = [parameter, c_int64, c_int64, POINTER(c_double), POINTER(c_int64), c_double]
	_fields_ = genFields(_names, _types)

	def __init__(self):
		self.__createfrom__ = 'python'

	def __del__(self):
		# free memory created by C to avoid memory leak
		origin = getattr(self, '__createfrom__', None)
		if origin == 'C':
			liblinear.free_and_destroy_model(pointer(self))

	def get_nr_feature(self):
		"""Return the number of features reported by the library."""
		nr_feature = liblinear.get_nr_feature(self)
		return nr_feature

	def get_nr_class(self):
		"""Return the number of classes reported by the library."""
		nr_class = liblinear.get_nr_class(self)
		return nr_class

	def get_labels(self):
		"""Return the class labels as a Python list."""
		count = self.get_nr_class()
		buf = (c_int64 * count)()
		liblinear.get_labels(self, buf)
		return buf[:count]

	def is_probability_model(self):
		"""Return True when the library reports that probability output is supported."""
		status = liblinear.check_probability_model(self)
		return status == 1
233 | 
def toPyModel(model_ptr):
	"""
	toPyModel(model_ptr) -> model

	Dereference a ctypes POINTER(model) and return the pointed-to model,
	tagged as created by C (__createfrom__ = 'C').
	Raises ValueError when model_ptr is a null pointer.
	"""
	if not model_ptr:
		raise ValueError("Null pointer")
	py_model = model_ptr.contents
	py_model.__createfrom__ = 'C'
	return py_model
245 | 
# Register the signature of every LIBLINEAR entry point used by this wrapper.
# Each call passes the C function, its return type and its argument types.
# NOTE(review): fillprototype is defined outside this section; presumably it
# assigns restype/argtypes on the ctypes function object -- confirm there.

# Training and cross validation.
fillprototype(liblinear.train, POINTER(model), [POINTER(problem), POINTER(parameter)])
fillprototype(liblinear.cross_validation, None, [POINTER(problem), POINTER(parameter), c_int64, POINTER(c_double)])

# Prediction; the last POINTER(c_double) is an output buffer supplied by the caller.
fillprototype(liblinear.predict_values, c_double, [POINTER(model), POINTER(feature_node), POINTER(c_double)])
fillprototype(liblinear.predict, c_double, [POINTER(model), POINTER(feature_node)])
fillprototype(liblinear.predict_probability, c_double, [POINTER(model), POINTER(feature_node), POINTER(c_double)])

# Model persistence; file names are passed as C strings.
fillprototype(liblinear.save_model, c_int64, [c_char_p, POINTER(model)])
fillprototype(liblinear.load_model, POINTER(model), [c_char_p])

# Model introspection.
fillprototype(liblinear.get_nr_feature, c_int64, [POINTER(model)])
fillprototype(liblinear.get_nr_class, c_int64, [POINTER(model)])
fillprototype(liblinear.get_labels, None, [POINTER(model), POINTER(c_int64)])

# Memory management, validation and output redirection.
fillprototype(liblinear.free_model_content, None, [POINTER(model)])
fillprototype(liblinear.free_and_destroy_model, None, [POINTER(POINTER(model))])
fillprototype(liblinear.destroy_param, None, [POINTER(parameter)])
fillprototype(liblinear.check_parameter, c_char_p, [POINTER(problem), POINTER(parameter)])
fillprototype(liblinear.check_probability_model, c_int64, [POINTER(model)])
fillprototype(liblinear.set_print_string_function, None, [CFUNCTYPE(None, c_char_p)])
266 | 


--------------------------------------------------------------------------------
/libshorttext/classifier/learner/liblinear/python/liblinearutil.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import os, sys
  4 | sys.path = [os.path.dirname(os.path.abspath(__file__))] + sys.path 
  5 | from liblinear import *
  6 | 
  7 | def svm_read_problem(data_file_name):
  8 | 	"""
  9 | 	svm_read_problem(data_file_name) -> [y, x]
 10 | 
 11 | 	Read LIBSVM-format data from data_file_name and return labels y
 12 | 	and data instances x.
 13 | 	"""
 14 | 	prob_y = []
 15 | 	prob_x = []
 16 | 	for line in open(data_file_name):
 17 | 		line = line.split(None, 1)
 18 | 		# In case an instance with all zero features
 19 | 		if len(line) == 1: line += ['']
 20 | 		label, features = line
 21 | 		xi = {}
 22 | 		for e in features.split():
 23 | 			ind, val = e.split(":")
 24 | 			xi[int(ind)] = float(val)
 25 | 		prob_y += [float(label)]
 26 | 		prob_x += [xi]
 27 | 	return (prob_y, prob_x)
 28 | 
 29 | def load_model(model_file_name):
 30 | 	"""
 31 | 	load_model(model_file_name) -> model
 32 | 
 33 | 	Load a LIBLINEAR model from model_file_name and return.
 34 | 	"""
 35 | 	model = liblinear.load_model(model_file_name.encode())
 36 | 	if not model:
 37 | 		print("can't open model file %s" % model_file_name)
 38 | 		return None
 39 | 	model = toPyModel(model)
 40 | 	return model
 41 | 
 42 | def save_model(model_file_name, model):
 43 | 	"""
 44 | 	save_model(model_file_name, model) -> None
 45 | 
 46 | 	Save a LIBLINEAR model to the file model_file_name.
 47 | 	"""
 48 | 	liblinear.save_model(model_file_name.encode(), model)
 49 | 
 50 | def evaluations(ty, pv):
 51 | 	"""
 52 | 	evaluations(ty, pv) -> (ACC, MSE, SCC)
 53 | 
 54 | 	Calculate accuracy, mean squared error and squared correlation coefficient
 55 | 	using the true values (ty) and predicted values (pv).
 56 | 	"""
 57 | 	if len(ty) != len(pv):
 58 | 		raise ValueError("len(ty) must equal to len(pv)")
 59 | 	total_correct = total_error = 0
 60 | 	sumv = sumy = sumvv = sumyy = sumvy = 0
 61 | 	for v, y in zip(pv, ty):
 62 | 		if y == v:
 63 | 			total_correct += 1
 64 | 		total_error += (v-y)*(v-y)
 65 | 		sumv += v
 66 | 		sumy += y
 67 | 		sumvv += v*v
 68 | 		sumyy += y*y
 69 | 		sumvy += v*y
 70 | 	l = len(ty)
 71 | 	ACC = 100.0*total_correct/l
 72 | 	MSE = total_error/l
 73 | 	try:
 74 | 		SCC = ((l*sumvy-sumv*sumy)*(l*sumvy-sumv*sumy))/((l*sumvv-sumv*sumv)*(l*sumyy-sumy*sumy))
 75 | 	except:
 76 | 		SCC = float('nan')
 77 | 	return (ACC, MSE, SCC)
 78 | 
 79 | def train(arg1, arg2=None, arg3=None):
 80 | 	"""
 81 | 	train(y, x [, options]) -> model | ACC
 82 | 	train(prob [, options]) -> model | ACC
 83 | 	train(prob, param) -> model | ACC
 84 | 
 85 | 	Train a model from data (y, x) or a problem prob using
 86 | 	'options' or a parameter param.
 87 | 	If '-v' is specified in 'options' (i.e., cross validation)
 88 | 	either accuracy (ACC) or mean-squared error (MSE) is returned.
 89 | 
 90 | 	options:
 91 | 		-s type : set type of solver (default 1)
 92 | 		  for multi-class classification
 93 | 			 0 -- L2-regularized logistic regression (primal)
 94 | 			 1 -- L2-regularized L2-loss support vector classification (dual)
 95 | 			 2 -- L2-regularized L2-loss support vector classification (primal)
 96 | 			 3 -- L2-regularized L1-loss support vector classification (dual)
 97 | 			 4 -- support vector classification by Crammer and Singer
 98 | 			 5 -- L1-regularized L2-loss support vector classification
 99 | 			 6 -- L1-regularized logistic regression
100 | 			 7 -- L2-regularized logistic regression (dual)
101 | 		  for regression
102 | 			11 -- L2-regularized L2-loss support vector regression (primal)
103 | 			12 -- L2-regularized L2-loss support vector regression (dual)
104 | 			13 -- L2-regularized L1-loss support vector regression (dual)
105 | 		-c cost : set the parameter C (default 1)
106 | 		-p epsilon : set the epsilon in loss function of SVR (default 0.1)
107 | 		-e epsilon : set tolerance of termination criterion
108 | 			-s 0 and 2
109 | 				|f'(w)|_2 <= eps*min(pos,neg)/l*|f'(w0)|_2,
110 | 				where f is the primal function, (default 0.01)
111 | 			-s 11
112 | 				|f'(w)|_2 <= eps*|f'(w0)|_2 (default 0.001)
113 | 			-s 1, 3, 4, and 7
114 | 				Dual maximal violation <= eps; similar to liblinear (default 0.)
115 | 			-s 5 and 6
116 | 				|f'(w)|_inf <= eps*min(pos,neg)/l*|f'(w0)|_inf,
117 | 				where f is the primal function (default 0.01)
118 | 			-s 12 and 13
119 | 				|f'(alpha)|_1 <= eps |f'(alpha0)|,
120 | 				where f is the dual function (default 0.1)
121 | 		-B bias : if bias >= 0, instance x becomes [x; bias]; if < 0, no bias term added (default -1)
122 | 		-wi weight: weights adjust the parameter C of different classes (see README for details)
123 | 		-v n: n-fold cross validation mode
124 | 	    -q : quiet mode (no outputs)
125 | 	"""
126 | 	prob, param = None, None
127 | 	if isinstance(arg1, (list, tuple)):
128 | 		assert isinstance(arg2, (list, tuple))
129 | 		y, x, options = arg1, arg2, arg3
130 | 		prob = problem(y, x)
131 | 		param = parameter(options)
132 | 	elif isinstance(arg1, problem):
133 | 		prob = arg1
134 | 		if isinstance(arg2, parameter):
135 | 			param = arg2
136 | 		else :
137 | 			param = parameter(arg2)
138 | 	if prob == None or param == None :
139 | 		raise TypeError("Wrong types for the arguments")
140 | 
141 | 	prob.set_bias(param.bias)
142 | 	liblinear.set_print_string_function(param.print_func)
143 | 	err_msg = liblinear.check_parameter(prob, param)
144 | 	if err_msg :
145 | 		raise ValueError('Error: %s' % err_msg)
146 | 
147 | 	if param.cross_validation:
148 | 		l, nr_fold = prob.l, param.nr_fold
149 | 		target = (c_double * l)()
150 | 		liblinear.cross_validation(prob, param, nr_fold, target)
151 | 		ACC, MSE, SCC = evaluations(prob.y[:l], target[:l])
152 | 		if param.solver_type in [L2R_L2LOSS_SVR, L2R_L2LOSS_SVR_DUAL, L2R_L1LOSS_SVR_DUAL]:
153 | 			print("Cross Validation Mean squared error = %g" % MSE)
154 | 			print("Cross Validation Squared correlation coefficient = %g" % SCC)
155 | 			return MSE
156 | 		else:
157 | 			print("Cross Validation Accuracy = %g%%" % ACC)
158 | 			return ACC
159 | 	else :
160 | 		m = liblinear.train(prob, param)
161 | 		m = toPyModel(m)
162 | 
163 | 		return m
164 | 
def predict(y, x, m, options=""):
	"""
	predict(y, x, m [, options]) -> (p_labels, p_acc, p_vals)

	Predict data (y, x) with the SVM model m.
	If y is empty, zeros are used as true labels (the reported accuracy
	is then meaningless).

	options:
	    -b probability_estimates: whether to output probability estimates, 0 or 1 (default 0); currently for logistic regression only
	    -q quiet mode (no outputs)

	The return tuple contains
	p_labels: a list of predicted labels
	p_acc: a tuple including  accuracy (for classification), mean-squared
	       error, and squared correlation coefficient (for regression).
	p_vals: a list of decision values or probability estimates (if '-b 1'
	        is specified). If k is the number of classes, for decision values,
	        each element includes results of predicting k binary-class
	        SVMs. if k = 2 and solver is not MCSVM_CS, only one decision value
	        is returned. For probabilities, each element contains k values
	        indicating the probability that the testing instance is in each class.
	        Note that the order of classes here is the same as 'model.label'
	        field in the model structure.

	Raises ValueError for an unknown option and TypeError when '-b 1' is
	requested for a model that does not support probability output.
	"""

	def info(s):
		print(s)

	predict_probability = 0
	argv = options.split()
	i = 0
	while i < len(argv):
		if argv[i] == '-b':
			i += 1
			predict_probability = int(argv[i])
		elif argv[i] == '-q':
			info = print_null
		else:
			raise ValueError("Wrong options")
		i+=1

	solver_type = m.param.solver_type
	nr_class = m.get_nr_class()
	nr_feature = m.get_nr_feature()
	is_prob_model = m.is_probability_model()
	bias = m.bias
	# The bias node replaces the second-to-last slot of each feature array;
	# index -1 is written there when the model has no bias term.
	if bias >= 0:
		biasterm = feature_node(nr_feature+1, bias)
	else:
		biasterm = feature_node(-1, bias)
	pred_labels = []
	pred_values = []

	if predict_probability:
		if not is_prob_model:
			raise TypeError('probability output is only supported for logistic regression')
		prob_estimates = (c_double * nr_class)()
		for xi in x:
			xi, idx = gen_feature_nodearray(xi, feature_max=nr_feature)
			xi[-2] = biasterm
			label = liblinear.predict_probability(m, xi, prob_estimates)
			values = prob_estimates[:nr_class]
			pred_labels += [label]
			pred_values += [values]
	else:
		# A single decision value is produced only for a two-class model whose
		# solver is not MCSVM_CS (see the p_vals contract above); every other
		# model yields nr_class values, so the buffer must be that large.
		if nr_class == 2 and solver_type != MCSVM_CS:
			nr_classifier = 1
		else:
			nr_classifier = nr_class
		dec_values = (c_double * nr_classifier)()
		for xi in x:
			xi, idx = gen_feature_nodearray(xi, feature_max=nr_feature)
			xi[-2] = biasterm
			label = liblinear.predict_values(m, xi, dec_values)
			values = dec_values[:nr_classifier]
			pred_labels += [label]
			pred_values += [values]
	if len(y) == 0:
		y = [0] * len(x)
	ACC, MSE, SCC = evaluations(y, pred_labels)
	l = len(y)
	if solver_type in [L2R_L2LOSS_SVR, L2R_L2LOSS_SVR_DUAL, L2R_L1LOSS_SVR_DUAL]:
		info("Mean squared error = %g (regression)" % MSE)
		info("Squared correlation coefficient = %g (regression)" % SCC)
	else:
		info("Accuracy = %g%% (%d/%d) (classification)" % (ACC, int(l*ACC/100), l))

	return pred_labels, (ACC, MSE, SCC), pred_values
251 | 


--------------------------------------------------------------------------------
/libshorttext/classifier/learner/liblinear/train:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/2shou/python-libshorttext/460773dbbefe7a82a9b544ca419242b68a1a0533/libshorttext/classifier/learner/liblinear/train


--------------------------------------------------------------------------------
/libshorttext/classifier/learner/liblinear/train.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <math.h>
  3 | #include <stdlib.h>
  4 | #include <string.h>
  5 | #include <ctype.h>
  6 | #include <errno.h>
  7 | #include "linear.h"
  8 | #define Malloc(type,n) (type *)malloc((n)*sizeof(type))
  9 | #define INF HUGE_VAL
 10 | 
/* No-op print callback; parse_command_line() installs it for -q to silence library output. */
void print_null(const char *s) {}
 12 | 
 13 | void exit_with_help()
 14 | {
 15 | 	printf(
 16 | 	"Usage: train [options] training_set_file [model_file]\n"
 17 | 	"options:\n"
 18 | 	"-s type : set type of solver (default 1)\n"
 19 | 	"  for multi-class classification\n"
 20 | 	"	 0 -- L2-regularized logistic regression (primal)\n"
 21 | 	"	 1 -- L2-regularized L2-loss support vector classification (dual)\n"
 22 | 	"	 2 -- L2-regularized L2-loss support vector classification (primal)\n"
 23 | 	"	 3 -- L2-regularized L1-loss support vector classification (dual)\n"
 24 | 	"	 4 -- support vector classification by Crammer and Singer\n"
 25 | 	"	 5 -- L1-regularized L2-loss support vector classification\n"
 26 | 	"	 6 -- L1-regularized logistic regression\n"
 27 | 	"	 7 -- L2-regularized logistic regression (dual)\n"
 28 | 	"  for regression\n"
 29 | 	"	11 -- L2-regularized L2-loss support vector regression (primal)\n"
 30 | 	"	12 -- L2-regularized L2-loss support vector regression (dual)\n"
 31 | 	"	13 -- L2-regularized L1-loss support vector regression (dual)\n"
 32 | 	"-c cost : set the parameter C (default 1)\n"
 33 | 	"-p epsilon : set the epsilon in loss function of SVR (default 0.1)\n"
 34 | 	"-e epsilon : set tolerance of termination criterion\n"
 35 | 	"	-s 0 and 2\n"
 36 | 	"		|f'(w)|_2 <= eps*min(pos,neg)/l*|f'(w0)|_2,\n"
 37 | 	"		where f is the primal function and pos/neg are # of\n"
 38 | 	"		positive/negative data (default 0.01)\n"
 39 | 	"	-s 11\n"
 40 | 	"		|f'(w)|_2 <= eps*|f'(w0)|_2 (default 0.001)\n"
 41 | 	"	-s 1, 3, 4, and 7\n"
 42 | 	"		Dual maximal violation <= eps; similar to libsvm (default 0.1)\n"
 43 | 	"	-s 5 and 6\n"
 44 | 	"		|f'(w)|_1 <= eps*min(pos,neg)/l*|f'(w0)|_1,\n"
 45 | 	"		where f is the primal function (default 0.01)\n"
 46 | 	"	-s 12 and 13\n"
 47 | 	"		|f'(alpha)|_1 <= eps |f'(alpha0)|,\n"
 48 | 	"		where f is the dual function (default 0.1)\n"
 49 | 	"-B bias : if bias >= 0, instance x becomes [x; bias]; if < 0, no bias term added (default -1)\n"
 50 | 	"-wi weight: weights adjust the parameter C of different classes (see README for details)\n"
 51 | 	"-v n: n-fold cross validation mode\n"
 52 | 	"-q : quiet mode (no outputs)\n"
 53 | 	);
 54 | 	exit(1);
 55 | }
 56 | 
 57 | void exit_input_error(INT64 line_num)
 58 | {
 59 | 	fprintf(stderr,"Wrong input format at line %lld\n", (long long int)line_num);
 60 | 	exit(1);
 61 | }
 62 | 
 63 | static char *line = NULL;
 64 | static INT64 max_line_len;
 65 | 
 66 | static char* readline(FILE *input)
 67 | {
 68 | 	INT64 len;
 69 | 
 70 | 	if(fgets(line,max_line_len,input) == NULL)
 71 | 		return NULL;
 72 | 
 73 | 	while(strrchr(line,'\n') == NULL)
 74 | 	{
 75 | 		max_line_len *= 2;
 76 | 		line = (char *) realloc(line,max_line_len);
 77 | 		len = (INT64) strlen(line);
 78 | 		if(fgets(line+len,max_line_len-len,input) == NULL)
 79 | 			break;
 80 | 	}
 81 | 	return line;
 82 | }
 83 | 
// Forward declarations of the helpers defined after main().
void parse_command_line(INT64 argc, char **argv, char *input_file_name, char *model_file_name);
void read_problem(const char *filename);
void do_cross_validation();

// Program-wide state shared by main() and the helpers above.
struct feature_node *x_space;	// single pool holding all feature nodes; prob.x[i] points into it
struct parameter param;	// training parameters, filled in by parse_command_line()
struct problem prob;	// training data, filled in by read_problem()
struct model* model_;	// model returned by train() in main()
INT64 flag_cross_validation;	// set to 1 by the -v option
INT64 nr_fold;	// number of folds given with -v
double bias;	// value of -B; parse_command_line() defaults it to -1
 95 | 
 96 | int main(int argc, char **argv)
 97 | {
 98 | 	char input_file_name[1024];
 99 | 	char model_file_name[1024];
100 | 	const char *error_msg;
101 | 
102 | 	parse_command_line((INT64)argc, argv, input_file_name, model_file_name);
103 | 	read_problem(input_file_name);
104 | 	error_msg = check_parameter(&prob,&param);
105 | 
106 | 	if(error_msg)
107 | 	{
108 | 		fprintf(stderr,"ERROR: %s\n",error_msg);
109 | 		exit(1);
110 | 	}
111 | 
112 | 	if(flag_cross_validation)
113 | 	{
114 | 		do_cross_validation();
115 | 	}
116 | 	else
117 | 	{
118 | 		model_=train(&prob, &param);
119 | 		if(save_model(model_file_name, model_))
120 | 		{
121 | 			fprintf(stderr,"can't save model to file %s\n",model_file_name);
122 | 			exit(1);
123 | 		}
124 | 		free_and_destroy_model(&model_);
125 | 	}
126 | 	destroy_param(&param);
127 | 	free(prob.y);
128 | 	free(prob.x);
129 | 	free(x_space);
130 | 	free(line);
131 | 
132 | 	return 0;
133 | }
134 | 
135 | void do_cross_validation()
136 | {
137 | 	INT64 i;
138 | 	INT64 total_correct = 0;
139 | 	double total_error = 0;
140 | 	double sumv = 0, sumy = 0, sumvv = 0, sumyy = 0, sumvy = 0;
141 | 	double *target = Malloc(double, prob.l);
142 | 
143 | 	cross_validation(&prob,&param,nr_fold,target);
144 | 	if(param.solver_type == L2R_L2LOSS_SVR ||
145 | 	   param.solver_type == L2R_L1LOSS_SVR_DUAL ||
146 | 	   param.solver_type == L2R_L2LOSS_SVR_DUAL)
147 | 	{
148 | 		for(i=0;i<prob.l;i++)
149 | 		{
150 | 			double y = prob.y[i];
151 | 			double v = target[i];
152 | 			total_error += (v-y)*(v-y);
153 | 			sumv += v;
154 | 			sumy += y;
155 | 			sumvv += v*v;
156 | 			sumyy += y*y;
157 | 			sumvy += v*y;
158 | 		}
159 | 		printf("Cross Validation Mean squared error = %g\n",total_error/prob.l);
160 | 		printf("Cross Validation Squared correlation coefficient = %g\n",
161 | 			   ((prob.l*sumvy-sumv*sumy)*(prob.l*sumvy-sumv*sumy))/
162 | 			   ((prob.l*sumvv-sumv*sumv)*(prob.l*sumyy-sumy*sumy))
163 | 			   );
164 | 	}
165 | 	else
166 | 	{
167 | 		for(i=0;i<prob.l;i++)
168 | 			if(target[i] == prob.y[i])
169 | 				++total_correct;
170 | 		printf("Cross Validation Accuracy = %g%%\n",100.0*total_correct/prob.l);
171 | 	}
172 | 
173 | 	free(target);
174 | }
175 | 
176 | void parse_command_line(INT64 argc, char **argv, char *input_file_name, char *model_file_name)
177 | {
178 | 	INT64 i;
179 | 	void (*print_func)(const char*) = NULL;	// default printing to stdout
180 | 
181 | 	// default values
182 | 	param.solver_type = L2R_L2LOSS_SVC_DUAL;
183 | 	param.C = 1;
184 | 	param.eps = INF; // see setting below
185 | 	param.p = 0.1;
186 | 	param.nr_weight = 0;
187 | 	param.weight_label = NULL;
188 | 	param.weight = NULL;
189 | 	flag_cross_validation = 0;
190 | 	bias = -1;
191 | 
192 | 	// parse options
193 | 	for(i=1;i<argc;i++)
194 | 	{
195 | 		if(argv[i][0] != '-') break;
196 | 		if(++i>=argc)
197 | 			exit_with_help();
198 | 		switch(argv[i-1][1])
199 | 		{
200 | 			case 's':
201 | 				param.solver_type = atoi(argv[i]);
202 | 				break;
203 | 
204 | 			case 'c':
205 | 				param.C = atof(argv[i]);
206 | 				break;
207 | 
208 | 			case 'p':
209 | 				param.p = atof(argv[i]);
210 | 				break;
211 | 
212 | 			case 'e':
213 | 				param.eps = atof(argv[i]);
214 | 				break;
215 | 
216 | 			case 'B':
217 | 				bias = atof(argv[i]);
218 | 				break;
219 | 
220 | 			case 'w':
221 | 				++param.nr_weight;
222 | 				param.weight_label = (INT64 *) realloc(param.weight_label,sizeof(INT64)*param.nr_weight);
223 | 				param.weight = (double *) realloc(param.weight,sizeof(double)*param.nr_weight);
224 | 				param.weight_label[param.nr_weight-1] = atoi(&argv[i-1][2]);
225 | 				param.weight[param.nr_weight-1] = atof(argv[i]);
226 | 				break;
227 | 
228 | 			case 'v':
229 | 				flag_cross_validation = 1;
230 | 				nr_fold = atoi(argv[i]);
231 | 				if(nr_fold < 2)
232 | 				{
233 | 					fprintf(stderr,"n-fold cross validation: n must >= 2\n");
234 | 					exit_with_help();
235 | 				}
236 | 				break;
237 | 
238 | 			case 'q':
239 | 				print_func = &print_null;
240 | 				i--;
241 | 				break;
242 | 
243 | 			default:
244 | 				fprintf(stderr,"unknown option: -%c\n", argv[i-1][1]);
245 | 				exit_with_help();
246 | 				break;
247 | 		}
248 | 	}
249 | 
250 | 	set_print_string_function(print_func);
251 | 
252 | 	// determine filenames
253 | 	if(i>=argc)
254 | 		exit_with_help();
255 | 
256 | 	strcpy(input_file_name, argv[i]);
257 | 
258 | 	if(i<argc-1)
259 | 		strcpy(model_file_name,argv[++i]);
260 | 	else
261 | 	{
262 | 		char *p = strrchr(argv[i],'/');
263 | 		if(p==NULL)
264 | 			p = argv[i];
265 | 		else
266 | 			++p;
267 | 		sprintf(model_file_name,"%s.model",p);
268 | 	}
269 | 
270 | 	if(i<argc-1 || model_file_name[0] == '-')
271 | 		exit_with_help();
272 | 	
273 | 	if(param.eps == INF)
274 | 	{
275 | 		switch(param.solver_type)
276 | 		{
277 | 			case L2R_LR:
278 | 			case L2R_L2LOSS_SVC:
279 | 				param.eps = 0.01;
280 | 				break;
281 | 			case L2R_L2LOSS_SVR:
282 | 				param.eps = 0.001;
283 | 				break;
284 | 			case L2R_L2LOSS_SVC_DUAL:
285 | 			case L2R_L1LOSS_SVC_DUAL:
286 | 			case MCSVM_CS:
287 | 			case L2R_LR_DUAL:
288 | 				param.eps = 0.1;
289 | 				break;
290 | 			case L1R_L2LOSS_SVC:
291 | 			case L1R_LR:
292 | 				param.eps = 0.01;
293 | 				break;
294 | 			case L2R_L1LOSS_SVR_DUAL:
295 | 			case L2R_L2LOSS_SVR_DUAL:
296 | 				param.eps = 0.1;
297 | 				break;
298 | 		}
299 | 	}
300 | }
301 | 
// Read a problem (in libsvm format) from `filename` into the global `prob`.
//
// The file is scanned twice: the first pass counts instances and feature
// entries so that all storage can be allocated at once, the second pass
// parses the values. All feature nodes live in the single array x_space;
// prob.x[i] points at the first node of instance i, and every instance ends
// with a node whose index is -1, preceded by a bias node when prob.bias >= 0.
//
// Exits with status 1 if the file cannot be opened or a line is malformed
// (empty line, bad label, non-increasing or invalid index, bad value).
void read_problem(const char *filename)
{
	INT64 max_index, inst_max_index, i;
	INT64 elements, j;
	FILE *fp = fopen(filename,"r");
	char *endptr;
	char *idx, *val, *label;

	if(fp == NULL)
	{
		fprintf(stderr,"can't open input file %s\n",filename);
		exit(1);
	}

	// first pass: count instances (prob.l) and feature entries (elements)
	prob.l = 0;
	elements = 0;
	max_line_len = 1024;
	line = Malloc(char,max_line_len);
	while(readline(fp)!=NULL)
	{
		char *p = strtok(line," \t"); // label

		// features
		while(1)
		{
			p = strtok(NULL," \t");
			if(p == NULL || *p == '\n') // check '\n' as ' ' may be after the last feature
				break;
			elements++;
		}
		prob.l++;
	}
	rewind(fp);

	// one extra node per instance for the bias term, if enabled
	prob.bias=bias;
	if(prob.bias >= 0) elements += prob.l;

	prob.y = Malloc(double,prob.l);
	prob.x = Malloc(struct feature_node *,prob.l);
	// +prob.l: one terminating node (index -1) per instance
	x_space = Malloc(struct feature_node,elements+prob.l);

	// second pass: parse labels and index:value pairs; j is the write position in x_space
	max_index = 0;
	j=0;
	for(i=0;i<prob.l;i++)
	{
		inst_max_index = 0; // strtol gives 0 if wrong format
		readline(fp);
		prob.x[i] = &x_space[j];
		label = strtok(line," \t\n");
		if(label == NULL) // empty line
			exit_input_error(i+1);

		prob.y[i] = strtod(label,&endptr);
		if(endptr == label || *endptr != '\0')
			exit_input_error(i+1);

		while(1)
		{
			idx = strtok(NULL,":");
			val = strtok(NULL," \t");

			if(val == NULL)
				break;

			// indices must be valid integers and strictly increasing within an instance
			errno = 0;
			x_space[j].index = (INT64)strtoll(idx,&endptr,10);
			if(endptr == idx || errno != 0 || *endptr != '\0' || x_space[j].index <= inst_max_index)
				exit_input_error(i+1);
			else
				inst_max_index = x_space[j].index;

			// the value may be followed by whitespace (e.g. the trailing newline)
			errno = 0;
			x_space[j].value = strtod(val,&endptr);
			if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr)))
				exit_input_error(i+1);

			++j;
		}

		if(inst_max_index > max_index)
			max_index = inst_max_index;

		// bias node: only the value is known now; its index is patched in below
		if(prob.bias >= 0)
			x_space[j++].value = prob.bias;

		// terminating node of this instance
		x_space[j++].index = -1;
	}

	if(prob.bias >= 0)
	{
		// the bias feature gets the index right after the largest one seen
		prob.n=max_index+1;
		// prob.x[i]-1 is the terminator of instance i-1, so prob.x[i]-2 is its bias node
		for(i=1;i<prob.l;i++)
			(prob.x[i]-2)->index = prob.n;
		// bias node of the last instance
		x_space[j-2].index = prob.n;
	}
	else
		prob.n=max_index;

	fclose(fp);
}
403 | 


--------------------------------------------------------------------------------
/libshorttext/classifier/learner/liblinear/tron.cpp:
--------------------------------------------------------------------------------
  1 | #include <math.h>
  2 | #include <stdio.h>
  3 | #include <string.h>
  4 | #include <stdarg.h>
  5 | #include "tron.h"
  6 | #ifndef INT64_DEFINED
  7 | typedef int64_t INT64;
  8 | #define INT64_DEFINED
  9 | #endif
 10 | 
 11 | #ifndef min
 12 | template <class T> static inline T min(T x,T y) { return (x<y)?x:y; }
 13 | #endif
 14 | 
 15 | #ifndef max
 16 | template <class T> static inline T max(T x,T y) { return (x>y)?x:y; }
 17 | #endif
 18 | 
// BLAS level-1 routines used below, declared with C linkage.
// All scalar arguments (length n, increments, scale factors) are passed by pointer.
#ifdef __cplusplus
extern "C" {
#endif

extern double dnrm2_(INT64 *, double *, INT64 *);	// Euclidean norm of a vector
extern double ddot_(INT64 *, double *, INT64 *, double *, INT64 *);	// dot product of two vectors
extern INT64 daxpy_(INT64 *, double *, double *, INT64 *, double *, INT64 *);	// y += a*x
extern INT64 dscal_(INT64 *, double *, double *, INT64 *);	// x *= a

#ifdef __cplusplus
}
#endif
 31 | 
 32 | static void default_print(const char *buf)
 33 | {
 34 | 	fputs(buf,stdout);
 35 | 	fflush(stdout);
 36 | }
 37 | 
 38 | void TRON::info(const char *fmt,...)
 39 | {
 40 | 	char buf[BUFSIZ];
 41 | 	va_list ap;
 42 | 	va_start(ap,fmt);
 43 | 	vsprintf(buf,fmt,ap);
 44 | 	va_end(ap);
 45 | 	(*tron_print_string)(buf);
 46 | }
 47 | 
 48 | TRON::TRON(const function *fun_obj, double eps, INT64 max_iter)
 49 | {
 50 | 	this->fun_obj=const_cast<function *>(fun_obj);
 51 | 	this->eps=eps;
 52 | 	this->max_iter=max_iter;
 53 | 	tron_print_string = default_print;
 54 | }
 55 | 
// Nothing to release: fun_obj is not deleted here.
TRON::~TRON()
{
}
 59 | 
 60 | void TRON::tron(double *w)
 61 | {
 62 | 	// Parameters for updating the iterates.
 63 | 	double eta0 = 1e-4, eta1 = 0.25, eta2 = 0.75;
 64 | 
 65 | 	// Parameters for updating the trust region size delta.
 66 | 	double sigma1 = 0.25, sigma2 = 0.5, sigma3 = 4;
 67 | 
 68 | 	INT64 n = fun_obj->get_nr_variable();
 69 | 	INT64 i, cg_iter;
 70 | 	double delta, snorm, one=1.0;
 71 | 	double alpha, f, fnew, prered, actred, gs;
 72 | 	INT64 search = 1, iter = 1, inc = 1;
 73 | 	double *s = new double[n];
 74 | 	double *r = new double[n];
 75 | 	double *w_new = new double[n];
 76 | 	double *g = new double[n];
 77 | 
 78 | 	for (i=0; i<n; i++)
 79 | 		w[i] = 0;
 80 | 
 81 |         f = fun_obj->fun(w);
 82 | 	fun_obj->grad(w, g);
 83 | 	delta = dnrm2_(&n, g, &inc);
 84 | 	double gnorm1 = delta;
 85 | 	double gnorm = gnorm1;
 86 | 
 87 | 	if (gnorm <= eps*gnorm1)
 88 | 		search = 0;
 89 | 
 90 | 	iter = 1;
 91 | 
 92 | 	while (iter <= max_iter && search)
 93 | 	{
 94 | 		cg_iter = trcg(delta, g, s, r);
 95 | 
 96 | 		memcpy(w_new, w, sizeof(double)*n);
 97 | 		daxpy_(&n, &one, s, &inc, w_new, &inc);
 98 | 
 99 | 		gs = ddot_(&n, g, &inc, s, &inc);
100 | 		prered = -0.5*(gs-ddot_(&n, s, &inc, r, &inc));
101 |                 fnew = fun_obj->fun(w_new);
102 | 
103 | 		// Compute the actual reduction.
104 | 	        actred = f - fnew;
105 | 
106 | 		// On the first iteration, adjust the initial step bound.
107 | 		snorm = dnrm2_(&n, s, &inc);
108 | 		if (iter == 1)
109 | 			delta = min(delta, snorm);
110 | 
111 | 		// Compute prediction alpha*snorm of the step.
112 | 		if (fnew - f - gs <= 0)
113 | 			alpha = sigma3;
114 | 		else
115 | 			alpha = max(sigma1, -0.5*(gs/(fnew - f - gs)));
116 | 
117 | 		// Update the trust region bound according to the ratio of actual to predicted reduction.
118 | 		if (actred < eta0*prered)
119 | 			delta = min(max(alpha, sigma1)*snorm, sigma2*delta);
120 | 		else if (actred < eta1*prered)
121 | 			delta = max(sigma1*delta, min(alpha*snorm, sigma2*delta));
122 | 		else if (actred < eta2*prered)
123 | 			delta = max(sigma1*delta, min(alpha*snorm, sigma3*delta));
124 | 		else
125 | 			delta = max(delta, min(alpha*snorm, sigma3*delta));
126 | 
127 | 		info("iter %2d act %5.3e pre %5.3e delta %5.3e f %5.3e |g| %5.3e CG %3d\n", iter, actred, prered, delta, f, gnorm, cg_iter);
128 | 
129 | 		if (actred > eta0*prered)
130 | 		{
131 | 			iter++;
132 | 			memcpy(w, w_new, sizeof(double)*n);
133 | 			f = fnew;
134 | 		        fun_obj->grad(w, g);
135 | 
136 | 			gnorm = dnrm2_(&n, g, &inc);
137 | 			if (gnorm <= eps*gnorm1)
138 | 				break;
139 | 		}
140 | 		if (f < -1.0e+32)
141 | 		{
142 | 			info("WARNING: f < -1.0e+32\n");
143 | 			break;
144 | 		}
145 | 		if (fabs(actred) <= 0 && prered <= 0)
146 | 		{
147 | 			info("WARNING: actred and prered <= 0\n");
148 | 			break;
149 | 		}
150 | 		if (fabs(actred) <= 1.0e-12*fabs(f) &&
151 | 		    fabs(prered) <= 1.0e-12*fabs(f))
152 | 		{
153 | 			info("WARNING: actred and prered too small\n");
154 | 			break;
155 | 		}
156 | 	}
157 | 
158 | 	delete[] g;
159 | 	delete[] r;
160 | 	delete[] w_new;
161 | 	delete[] s;
162 | }
163 | 
// Conjugate gradient solver for the trust region subproblem.
//
// delta: trust region radius; g: current gradient (input).
// s: output step, started at zero; r: output residual, started at -g and
// reduced by alpha*Hd after every update of s.
// The iteration stops when |r| <= 0.1*|g| or when the step would leave the
// trust region, in which case it is cut back to the boundary.
// Returns the number of CG iterations (Hessian-vector products) performed.
INT64 TRON::trcg(double delta, double *g, double *s, double *r)
{
	INT64 i, inc = 1;
	INT64 n = fun_obj->get_nr_variable();
	double one = 1;
	double *d = new double[n];	// search direction
	double *Hd = new double[n];	// result of fun_obj->Hv(d, Hd)
	double rTr, rnewTrnew, alpha, beta, cgtol;

	for (i=0; i<n; i++)
	{
		s[i] = 0;
		r[i] = -g[i];
		d[i] = r[i];
	}
	// stopping tolerance relative to the gradient norm
	cgtol = 0.1*dnrm2_(&n, g, &inc);

	INT64 cg_iter = 0;
	rTr = ddot_(&n, r, &inc, r, &inc);
	while (1)
	{
		if (dnrm2_(&n, r, &inc) <= cgtol)
			break;
		cg_iter++;
		fun_obj->Hv(d, Hd);

		// step length along d, then s += alpha*d
		alpha = rTr/ddot_(&n, d, &inc, Hd, &inc);
		daxpy_(&n, &alpha, d, &inc, s, &inc);
		if (dnrm2_(&n, s, &inc) > delta)
		{
			info("cg reaches trust region boundary\n");
			// undo the step that left the trust region
			alpha = -alpha;
			daxpy_(&n, &alpha, d, &inc, s, &inc);

			// choose alpha >= 0 with |s + alpha*d| = delta, i.e. the positive root of
			// dtd*alpha^2 + 2*std*alpha + (sts - dsq) = 0; the two branches are
			// algebraically equal forms of that root
			double std = ddot_(&n, s, &inc, d, &inc);
			double sts = ddot_(&n, s, &inc, s, &inc);
			double dtd = ddot_(&n, d, &inc, d, &inc);
			double dsq = delta*delta;
			double rad = sqrt(std*std + dtd*(dsq-sts));
			if (std >= 0)
				alpha = (dsq - sts)/(std + rad);
			else
				alpha = (rad - std)/dtd;
			daxpy_(&n, &alpha, d, &inc, s, &inc);
			// keep the residual consistent with the shortened step: r -= alpha*Hd
			alpha = -alpha;
			daxpy_(&n, &alpha, Hd, &inc, r, &inc);
			break;
		}
		// r -= alpha*Hd
		alpha = -alpha;
		daxpy_(&n, &alpha, Hd, &inc, r, &inc);
		// new direction: d = r + beta*d
		rnewTrnew = ddot_(&n, r, &inc, r, &inc);
		beta = rnewTrnew/rTr;
		dscal_(&n, &beta, d, &inc);
		daxpy_(&n, &one, r, &inc, d, &inc);
		rTr = rnewTrnew;
	}

	delete[] d;
	delete[] Hd;

	return(cg_iter);
}
226 | 
227 | double TRON::norm_inf(INT64 n, double *x)
228 | {
229 | 	double dmax = fabs(x[0]);
230 | 	for (INT64 i=1; i<n; i++)
231 | 		if (fabs(x[i]) >= dmax)
232 | 			dmax = fabs(x[i]);
233 | 	return(dmax);
234 | }
235 | 
// Install the callback that receives every message produced by info().
// The pointer is stored as given and later called without a null check.
void TRON::set_print_string(void (*print_string) (const char *buf))
{
	tron_print_string = print_string;
}
240 | 


--------------------------------------------------------------------------------
/libshorttext/classifier/learner/liblinear/tron.h:
--------------------------------------------------------------------------------
 1 | #include <stdint.h>
 2 | #ifndef INT64_DEFINED
 3 | typedef int64_t INT64;
 4 | #define INT64_DEFINED
 5 | #endif
 6 | 
 7 | #ifndef _TRON_H
 8 | #define _TRON_H
 9 | 
// Abstract objective function handed to the TRON optimizer.
// NOTE(review): the meaning of each method is inferred from how TRON calls
// it in tron.cpp -- confirm against the concrete implementations.
class function
{
public:
	// Objective value at w.
	virtual double fun(double *w) = 0 ;
	// Writes the gradient at w into g.
	virtual void grad(double *w, double *g) = 0 ;
	// Writes into Hs the product used by the CG solver for direction s
	// (presumably Hessian times s).
	virtual void Hv(double *s, double *Hs) = 0 ;

	// Number of variables; TRON sizes all of its work arrays with it.
	virtual INT64 get_nr_variable(void) = 0 ;
	virtual ~function(void){}
};
20 | 
// Trust region Newton optimizer for an objective given as a `function`.
class TRON
{
public:
	// fun_obj is stored, not copied. eps is the relative gradient-norm
	// tolerance and max_iter the iteration limit used by tron().
	TRON(const function *fun_obj, double eps = 0.1, INT64 max_iter = 1000);
	~TRON();

	// Runs the optimization; w is reset to zero and receives the result.
	void tron(double *w);
	// Replaces the callback used for progress and warning messages.
	void set_print_string(void (*i_print) (const char *buf));

private:
	// Conjugate gradient solver for the trust region subproblem.
	INT64 trcg(double delta, double *g, double *s, double *r);
	// Largest absolute value among the n entries of x.
	double norm_inf(INT64 n, double *x);

	double eps;
	INT64 max_iter;
	function *fun_obj;
	// printf-style logging routed through tron_print_string.
	void info(const char *fmt,...);
	void (*tron_print_string)(const char *buf);
};
40 | #endif
41 | 


--------------------------------------------------------------------------------
/libshorttext/classifier/learner/test:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/2shou/python-libshorttext/460773dbbefe7a82a9b544ca419242b68a1a0533/libshorttext/classifier/learner/test


--------------------------------------------------------------------------------
/libshorttext/classifier/learner/test.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | #include "util.c"
 3 | 
 4 | int main(int argc, const char* argv[]){
 5 | 	INT64 offsets[1000];
 6 | 	INT64 error_code = 0;
 7 | 	merge_problems(&argv[1], argc-2, &offsets[0], argv[argc-1], 1, &error_code);
 8 | 
 9 | 	for(int i = 0; i < argc-1; i++) 
10 | 		printf("%ld ", offsets[i]);
11 | 	puts("");
12 | 	return 0;
13 | }
14 | 
15 | 


--------------------------------------------------------------------------------
/libshorttext/classifier/learner/util.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <math.h>
  3 | #include <stdlib.h>
  4 | #include <string.h>
  5 | #include <ctype.h>
  6 | #include <errno.h>
  7 | #include "linear.h"
  8 | #define Malloc(type,n) (type *)malloc((n)*sizeof(type))
  9 | 
 10 | 
 11 | static char *line = NULL;
 12 | static INT64 max_line_len;
 13 | 
 14 | static char* readline(FILE *input)
 15 | {
 16 | 	INT64 len;
 17 | 	
 18 | 	if(fgets(line,max_line_len,input) == NULL)
 19 | 		return NULL;
 20 | 
 21 | 	while(strrchr(line,'\n') == NULL)
 22 | 	{
 23 | 		max_line_len *= 2;
 24 | 		line = (char *) realloc(line,max_line_len);
 25 | 		len = (INT64) strlen(line);
 26 | 		if(fgets(line+len,max_line_len-len,input) == NULL)
 27 | 			break;
 28 | 	}
 29 | 	return line;
 30 | }
 31 | 
// A problem as produced by read_problem(), bundled with the single feature
// array that the rows of prob.x point into.
typedef struct {
	struct problem prob;		// l, n, y, x and bias as filled in by read_problem()
	struct feature_node* x_space;	// backing storage for every row in prob.x
	INT64 len_x_space;		// number of feature_node entries written to x_space
} SVMProblem;
 37 | 
 38 | void freeSVMProblem(SVMProblem svmprob) {
 39 | 	struct problem *prob = &(svmprob.prob);
 40 | 	if (prob->x!=NULL) free(prob->x);
 41 | 	if (prob->y!=NULL) free(prob->y);
 42 | 	if (svmprob.x_space!=NULL) free(svmprob.x_space);
 43 | }
 44 | 
 45 | 
 46 | // read in a problem (in libsvm format)
 47 | SVMProblem read_problem(const char *filename, double bias, INT64 *error_code)
 48 | {
 49 | 	INT64 max_index, inst_max_index, i;
 50 | 	INT64 elements, j;
 51 | 	FILE *fp = fopen(filename,"r");
 52 | 	char *endptr;
 53 | 	char *idx, *val, *label;
 54 | 	struct problem prob;
 55 | 	SVMProblem svmprob;
 56 | 
 57 | 	/**
 58 | 	 * error_code:
 59 | 	 * 0	no error
 60 | 	 * > 0	input format error. The error_code value
 61 | 	 * 	indicates the line number.
 62 | 	 * -1	can not open file
 63 | 	 * -2	memory exhausted
 64 | 	 */
 65 | 	*error_code = 0;
 66 | 
 67 | 	if(fp == NULL)
 68 | 	{
 69 | 		*error_code = -1;
 70 | 		return svmprob;
 71 | 	}
 72 | 
 73 | 	prob.l = 0;
 74 | 	elements = 0;
 75 | 	max_line_len = 1024;
 76 | 	line = Malloc(char,max_line_len);
 77 | 	while(readline(fp)!=NULL)
 78 | 	{
 79 | 		char *p = strtok(line," \t"); // label
 80 | 
 81 | 		// features
 82 | 		while(1)
 83 | 		{
 84 | 			p = strtok(NULL," \t");
 85 | 			if(p == NULL || *p == '\n') // check '\n' as ' ' may be after the last feature
 86 | 				break;
 87 | 			elements++;
 88 | 		}
 89 | 		prob.l++;
 90 | 	}
 91 | 	rewind(fp);
 92 | 
 93 | 	prob.bias=bias;
 94 | 	if(prob.bias >= 0) elements += prob.l;
 95 | 
 96 | 	errno = 0;
 97 | 	prob.y = Malloc(double,prob.l);
 98 | 	prob.x = Malloc(struct feature_node *,prob.l);
 99 | 	struct feature_node* x_space = Malloc(struct feature_node,elements+prob.l);
100 | 	
101 | 	if(errno == ENOMEM)
102 | 	{
103 | 		free(line);
104 | 		fclose(fp);
105 | 		*error_code = -2;
106 | 		return svmprob;
107 | 	}
108 | 
109 | 	max_index = 0;
110 | 	j=0;
111 | 	for(i=0;i<prob.l;i++)
112 | 	{
113 | 		inst_max_index = 0; // strtol gives 0 if wrong format
114 | 		readline(fp);
115 | 		prob.x[i] = &x_space[j];
116 | 		label = strtok(line," \t\n");
117 | 		if(label == NULL) // empty line
118 | 		{	
119 | 			free(line);
120 | 			fclose(fp);
121 | 			*error_code = i+1;
122 | 			return svmprob;
123 | 		}
124 | 
125 | 		prob.y[i] = strtod(label,&endptr);
126 | 		if(endptr == label || *endptr != '\0')
127 | 		{
128 | 			free(line);
129 | 			fclose(fp);
130 | 			*error_code = i+1;
131 | 			return svmprob;
132 | 		}
133 | 
134 | 		while(1)
135 | 		{
136 | 			idx = strtok(NULL,":");
137 | 			val = strtok(NULL," \t");
138 | 
139 | 			if(val == NULL)
140 | 				break;
141 | 
142 | 			errno = 0;
143 | 			x_space[j].index = (INT64)strtoll(idx,&endptr,10);
144 | 			if(endptr == idx || errno != 0 || *endptr != '\0' || x_space[j].index <= inst_max_index)
145 | 			{	
146 | 				free(line);
147 | 				fclose(fp);
148 | 				*error_code = i+1;
149 | 				return svmprob;
150 | 			}
151 | 			else
152 | 				inst_max_index = x_space[j].index;
153 | 
154 | 			errno = 0;
155 | 			x_space[j].value = strtod(val,&endptr);
156 | 			//if(binary) x_space[j].value = x_space[j].value != 0;
157 | 			if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr)))
158 | 			{	
159 | 				free(line);
160 | 				fclose(fp);
161 | 				*error_code = i+1;
162 | 				return svmprob;
163 | 			}
164 | 
165 | 			++j;
166 | 		}
167 | 
168 | 		if(inst_max_index > max_index)
169 | 			max_index = inst_max_index;
170 | 
171 | 		if(prob.bias >= 0)
172 | 			x_space[j++].value = prob.bias;
173 | 
174 | 		x_space[j++].index = -1;
175 | 	}
176 | 
177 | 	if(prob.bias >= 0)
178 | 	{
179 | 		prob.n=max_index+1;
180 | 		for(i=1;i<prob.l;i++)
181 | 			(prob.x[i]-2)->index = prob.n; 
182 | 		x_space[j-2].index = prob.n;
183 | 	}
184 | 	else
185 | 		prob.n=max_index;
186 | 
187 | 	fclose(fp);
188 | 	free(line);
189 | 
190 | 	svmprob.prob = prob;
191 | 	svmprob.x_space = x_space;
192 | 	svmprob.len_x_space = j;
193 | 
194 | 	return svmprob;
195 | }
196 | 
197 | 
198 | double* compute_idf(const struct problem *prob, double *idf_val)
199 | {
200 | 	INT64 i, j;
201 | 	//double* idf_val = Malloc(double, prob.n);
202 | 	memset(idf_val, 0, sizeof(double) * prob->n);
203 | 
204 | 	for(i = 0; i < prob->l; ++i)
205 | 	{
206 | 		struct feature_node* xi = prob->x[i];
207 | 		while(xi->index != -1)
208 | 		{
209 | 			++idf_val[xi->index-1];
210 | 			++xi;
211 | 		}
212 | 	}
213 | 
214 | 	for(j = 0; j < prob->n; ++j)
215 | 	{
216 | 		if(idf_val[j] > 0)
217 | 			idf_val[j] = log(prob->l / idf_val[j]);
218 | 		else
219 | 			idf_val[j] = 0;
220 | 	}
221 | 
222 | 	return idf_val;
223 | }
224 | 
225 | void normalize(struct problem *prob, int binary, int norm, int tf, int idf, double* idf_val)
226 | {
227 | 	INT64 i;
228 | 
229 | 	for(i = 0; i < prob->l; ++i)
230 | 	{
231 | 		struct feature_node* xi;
232 | 
233 | 		if(binary)
234 | 		{
235 | 			xi = prob->x[i];
236 | 			while(xi->index != -1)
237 | 			{
238 | 				xi->value = xi->value != 0;
239 | 				++xi;
240 | 			}
241 | 		}
242 | 
243 | 		if(tf)
244 | 		{
245 | 			double norm = 0;
246 | 			xi = prob->x[i];
247 | 			while(xi->index != -1)
248 | 			{
249 | 				norm += xi->value;
250 | 				++xi;
251 | 			}
252 | 
253 | 			xi = prob->x[i];
254 | 			if(norm != 0)
255 | 				while(xi->index != -1)
256 | 				{
257 | 					xi->value /= norm;
258 | 					++xi;
259 | 				}
260 | 		}
261 | 
262 | 		if(idf)
263 | 		{
264 | 			xi = prob->x[i];
265 | 			while(xi->index != -1)
266 | 			{
267 | 				xi->value *= idf_val[xi->index-1];
268 | 				++xi;
269 | 			}
270 | 		}
271 | 
272 | 		if(norm)
273 | 		{
274 | 			double norm = 0;
275 | 			xi = prob->x[i];
276 | 			while(xi->index != -1)
277 | 			{
278 | 				norm += xi->value * xi->value;
279 | 				++xi;
280 | 			}
281 | 
282 | 			norm = sqrt(norm);
283 | 
284 | 			xi = prob->x[i];
285 | 			if(norm != 0)
286 | 				while(xi->index != -1)
287 | 				{
288 | 					xi->value /= norm;
289 | 					++xi;
290 | 				}
291 | 		}
292 | 	}
293 | }
294 | 
295 | 
296 | void merge_problems(const char *srcs[], const int num_srcs, INT64* offsets, const char *output_filename, char training, INT64 *error_code) { 
297 | 	int i, j;
298 | 	const double bias = -1;
299 | 	SVMProblem *svmproblems = Malloc(SVMProblem, num_srcs);
300 | 	FILE *fp = NULL;
301 | 
302 | 	/**
303 | 	 * error_code:
304 | 	 * 0	no error
305 | 	 * > 0	input format error. The error_code value
306 | 	 * 	indicates the line number.
307 | 	 * -1	can not open file
308 | 	 * -2	memory exhausted
309 | 	 * -3	input files contain different numbsers of instances
310 | 	 * -4   no file given
311 | 	 */
312 | 
313 | 	if(num_srcs <= 0) {
314 | 		*error_code = -4;
315 | 		return;
316 | 	}
317 | 
318 | 	for(i=0; i < num_srcs; i++) 
319 | 	{
320 | 		svmproblems[i] = read_problem(srcs[i], bias, error_code);
321 | 		if(*error_code != 0) {
322 | 			switch (*error_code) {
323 | 				case -1:
324 | 					fprintf(stderr,"ERROR: Cannot open input file: %s\n", srcs[i]);
325 | 					break;
326 | 				case -2:
327 | 					fprintf(stderr,"ERROR: Memory exhausted when reading %s\n", srcs[i]);
328 | 					break;
329 | 				default: /* error_code  > 0 input format error*/
330 | 					fprintf(stderr,"ERROR: input format error at line %ld in %s\n", (long)*error_code, srcs[i]);
331 | 					break;
332 | 			}
333 | 			return;
334 | 		}
335 | 	}
336 | 
337 | 
338 | 	// Overwrite offsets
339 | 	if(training) {
340 | 		offsets[0] = svmproblems[0].prob.n;
341 | 		for(i = 1; i < num_srcs; i++) 
342 | 			offsets[i] = offsets[i-1] + svmproblems[i].prob.n;
343 | 	}
344 | 	
345 | 	// Make sure # of instances are all equal.
346 | 	for(i = 1; i < num_srcs; i++) 
347 | 	{
348 | 		if(svmproblems[i].prob.l != svmproblems[i-1].prob.l) 
349 | 		{
350 | 			*error_code = -3;
351 | 			fprintf(stderr,"ERROR: #insts in %s = %ld, but #insts in %s = %ld\n",
352 | 					srcs[i], (long)svmproblems[i].prob.l, srcs[i-1], (long)svmproblems[i-1].prob.l);
353 | 			return;
354 | 		}
355 | 	}
356 | 
357 | 	fp = fopen(output_filename, "w");
358 | 	if(fp == NULL) 
359 | 	{
360 | 		*error_code = -1;
361 | 		fprintf(stderr,"ERROR: Cannot open output file: %s \n", srcs[i]);
362 | 		return;
363 | 	}
364 | 
365 | 	for(j = 0; j < svmproblems[0].prob.l; j++) 
366 | 	{
367 | 		INT64 base = 0;
368 | 		
369 | 		fprintf(fp, "%g", svmproblems[0].prob.y[j]);
370 | 		for(i = 0; i < num_srcs; i++)
371 | 		{
372 | 			struct feature_node* node;
373 | 
374 | 			for(node = svmproblems[i].prob.x[j]; node->index != -1; node++) 
375 | 			{
376 | 				INT64 index = base+node->index;
377 | 				if(index <= offsets[i])
378 |  					fprintf(fp, " %ld:%.17g", (long)index, node->value);
379 | 				else 
380 | 					break;
381 | 			}
382 | 			base = offsets[i];
383 | 		}
384 | 		fprintf(fp,"\n");
385 | 	}
386 | 	fclose(fp);
387 | 
388 | 	for(i = 0; i < num_srcs; i++)
389 | 		freeSVMProblem(svmproblems[i]);
390 | }
391 | 
392 | 
393 | 


--------------------------------------------------------------------------------
/libshorttext/converter/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | :mod:`converter` module is used to convert a text data set to a numerical data set.
 3 | More specifically, it converts a text file to a LIBSVM-format data. Refer to 
 4 | :ref:`dataset` for the format of texts.
 5 | 
 6 | The utilities of :mod:`converter` are wrapped in :class:`Text2svmConverter`.
 7 | :class:`Text2svmConverter` consists of three components: 
 8 | :class:`TextPreprocessor`, :class:`FeatureGenerator`, and :class:`ClassMapping`.
 9 | For users who only need the most basic usage, they can use the utility function
10 | :func:`convert_text` without understanding :mod:`converter`.
11 | 
12 | """
13 | 
14 | 
# Re-export the names provided by converter_impl at package level, then remove
# the `converter_impl` submodule attribute that the relative import bound on
# this package, so only the re-exported names remain visible here.
from .converter_impl import *
del converter_impl
17 | 


--------------------------------------------------------------------------------
/libshorttext/converter/stemmer/Makefile:
--------------------------------------------------------------------------------
 1 | all = lib
 2 | OS = $(shell uname)
 3 | 
 4 | lib: porter.o
 5 | 	if [ "$(OS)" = "Darwin" ]; then \
 6 | 		SHARED_LIB_FLAG="-dynamiclib -Wl,-install_name,porter.so.$(SHVER)"; \
 7 | 	else \
 8 | 		SHARED_LIB_FLAG="-shared -Wl,-soname,porter.so.$(SHVER)"; \
 9 | 	fi; \
10 | 	gcc $${SHARED_LIB_FLAG} porter.o -o porter.so.1
11 | 
12 | porter.o: porter.c
13 | 	gcc -fPIC -O3 -c -o porter.o porter.c
14 | 
15 | clean:
16 | 	rm -rf porter.o porter.so.1 *pyc
17 | 


--------------------------------------------------------------------------------
/libshorttext/converter/stemmer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/2shou/python-libshorttext/460773dbbefe7a82a9b544ca419242b68a1a0533/libshorttext/converter/stemmer/__init__.py


--------------------------------------------------------------------------------
/libshorttext/converter/stemmer/porter.c:
--------------------------------------------------------------------------------
  1 | /* This is the Porter stemming algorithm, coded up as thread-safe ANSI C
  2 |    by the author.
  3 | 
  4 |    It may be regarded as canonical, in that it follows the algorithm
  5 |    presented in
  6 | 
  7 |    Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
  8 |    no. 3, pp 130-137,
  9 | 
 10 |    only differing from it at the points marked --DEPARTURE-- below.
 11 | 
 12 |    See also http://www.tartarus.org/~martin/PorterStemmer
 13 | 
 14 |    The algorithm as described in the paper could be exactly replicated
 15 |    by adjusting the points of DEPARTURE, but this is barely necessary,
 16 |    because (a) the points of DEPARTURE are definitely improvements, and
 17 |    (b) no encoding of the Porter stemmer I have seen is anything like
 18 |    as exact as this version, even with the points of DEPARTURE!
 19 | 
 20 |    You can compile it on Unix with 'gcc -O3 -o stem stem.c' after which
 21 |    'stem' takes a list of inputs and sends the stemmed equivalent to
 22 |    stdout.
 23 | 
 24 |    The algorithm as encoded here is particularly fast.
 25 | 
 26 |    Release 2 (the more old-fashioned, non-thread-safe version may be
 27 |    regarded as release 1.)
 28 | */
 29 | 
 30 | #include <stdlib.h>  /* for malloc, free */
 31 | #include <string.h>  /* for memcmp, memmove */
 32 | 
 33 | /* You will probably want to move the following declarations to a central
 34 |    header file.
 35 | */
 36 | 
 37 | struct stemmer;
 38 | 
 39 | extern struct stemmer * create_stemmer(void);
 40 | extern void free_stemmer(struct stemmer * z);
 41 | 
 42 | extern int stem(struct stemmer * z, char * b, int k);
 43 | 
 44 | 
 45 | 
 46 | /* The main part of the stemming algorithm starts here.
 47 | */
 48 | 
 49 | #define TRUE 1
 50 | #define FALSE 0
 51 | 
 52 | /* stemmer is a structure for a few local bits of data,
 53 | */
 54 | 
/* Working state shared by the step functions. b and k are filled in by
   stem(); j is written by ends() on a successful match and by step5(). */
struct stemmer {
   char * b;       /* buffer for word to be stemmed */
   int k;          /* offset to the end of the string */
   int j;          /* a general offset into the string */
};
 60 | 
 61 | 
 62 | /* Member b is a buffer holding a word to be stemmed. The letters are in
 63 |    b[0], b[1] ... ending at b[z->k]. Member k is readjusted downwards as
 64 |    the stemming progresses. Zero termination is not in fact used in the
 65 |    algorithm.
 66 | 
 67 |    Note that only lower case sequences are stemmed. Forcing to lower case
 68 |    should be done before stem(...) is called.
 69 | 
 70 | 
 71 |    Typical usage is:
 72 | 
 73 |        struct stemmer * z = create_stemmer();
 74 |        char b[] = "pencils";
 75 |        int res = stem(z, b, 6);
 76 |            /- stem the 7 characters of b[0] to b[6]. The result, res,
 77 |               will be 5 (the 's' is removed). -/
 78 |        free_stemmer(z);
 79 | */
 80 | 
 81 | 
 82 | extern struct stemmer * create_stemmer(void)
 83 | {
 84 |     return (struct stemmer *) malloc(sizeof(struct stemmer));
 85 |     /* assume malloc succeeds */
 86 | }
 87 | 
/* Release a stemmer obtained from create_stemmer(). Only the struct itself
   is freed; the word buffer z->b is not touched. */
extern void free_stemmer(struct stemmer * z)
{
    free(z);
}
 92 | 
 93 | 
 94 | /* cons(z, i) is TRUE <=> b[i] is a consonant. ('b' means 'z->b', but here
 95 |    and below we drop 'z->' in comments.
 96 | */
 97 | 
 98 | static int cons(struct stemmer * z, int i)
 99 | {  switch (z->b[i])
100 |    {  case 'a': case 'e': case 'i': case 'o': case 'u': return FALSE;
101 |       case 'y': return (i == 0) ? TRUE : !cons(z, i - 1);
102 |       default: return TRUE;
103 |    }
104 | }
105 | 
106 | /* m(z) measures the number of consonant sequences between 0 and j. if c is
107 |    a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
108 |    presence,
109 | 
110 |       <c><v>       gives 0
111 |       <c>vc<v>     gives 1
112 |       <c>vcvc<v>   gives 2
113 |       <c>vcvcvc<v> gives 3
114 |       ....
115 | */
116 | 
117 | static int m(struct stemmer * z)
118 | {  int n = 0;
119 |    int i = 0;
120 |    int j = z->j;
121 |    while(TRUE)
122 |    {  if (i > j) return n;
123 |       if (! cons(z, i)) break; i++;
124 |    }
125 |    i++;
126 |    while(TRUE)
127 |    {  while(TRUE)
128 |       {  if (i > j) return n;
129 |             if (cons(z, i)) break;
130 |             i++;
131 |       }
132 |       i++;
133 |       n++;
134 |       while(TRUE)
135 |       {  if (i > j) return n;
136 |          if (! cons(z, i)) break;
137 |          i++;
138 |       }
139 |       i++;
140 |    }
141 | }
142 | 
143 | /* vowelinstem(z) is TRUE <=> 0,...j contains a vowel */
144 | 
145 | static int vowelinstem(struct stemmer * z)
146 | {
147 |    int j = z->j;
148 |    int i; for (i = 0; i <= j; i++) if (! cons(z, i)) return TRUE;
149 |    return FALSE;
150 | }
151 | 
152 | /* doublec(z, j) is TRUE <=> j,(j-1) contain a double consonant. */
153 | 
154 | static int doublec(struct stemmer * z, int j)
155 | {
156 |    char * b = z->b;
157 |    if (j < 1) return FALSE;
158 |    if (b[j] != b[j - 1]) return FALSE;
159 |    return cons(z, j);
160 | }
161 | 
162 | /* cvc(z, i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant
163 |    and also if the second c is not w,x or y. this is used when trying to
164 |    restore an e at the end of a short word. e.g.
165 | 
166 |       cav(e), lov(e), hop(e), crim(e), but
167 |       snow, box, tray.
168 | 
169 | */
170 | 
171 | static int cvc(struct stemmer * z, int i)
172 | {  if (i < 2 || !cons(z, i) || cons(z, i - 1) || !cons(z, i - 2)) return FALSE;
173 |    {  int ch = z->b[i];
174 |       if (ch  == 'w' || ch == 'x' || ch == 'y') return FALSE;
175 |    }
176 |    return TRUE;
177 | }
178 | 
179 | /* ends(z, s) is TRUE <=> 0,...k ends with the string s. */
180 | 
181 | static int ends(struct stemmer * z, char * s)
182 | {  int length = s[0];
183 |    char * b = z->b;
184 |    int k = z->k;
185 |    if (s[length] != b[k]) return FALSE; /* tiny speed-up */
186 |    if (length > k + 1) return FALSE;
187 |    if (memcmp(b + k - length + 1, s + 1, length) != 0) return FALSE;
188 |    z->j = k-length;
189 |    return TRUE;
190 | }
191 | 
192 | /* setto(z, s) sets (j+1),...k to the characters in the string s, readjusting
193 |    k. */
194 | 
195 | static void setto(struct stemmer * z, char * s)
196 | {  int length = s[0];
197 |    int j = z->j;
198 |    memmove(z->b + j + 1, s + 1, length);
199 |    z->k = j+length;
200 | }
201 | 
202 | /* r(z, s) is used further down. */
203 | 
/* Replace the current suffix with s, but only when the stem before it has a
   measure greater than zero. */
static void r(struct stemmer * z, char * s)
{
   if (m(z) <= 0) return;
   setto(z, s);
}
205 | 
206 | /* step1ab(z) gets rid of plurals and -ed or -ing. e.g.
207 | 
208 |        caresses  ->  caress
209 |        ponies    ->  poni
210 |        ties      ->  ti
211 |        caress    ->  caress
212 |        cats      ->  cat
213 | 
214 |        feed      ->  feed
215 |        agreed    ->  agree
216 |        disabled  ->  disable
217 | 
218 |        matting   ->  mat
219 |        mating    ->  mate
220 |        meeting   ->  meet
221 |        milling   ->  mill
222 |        messing   ->  mess
223 | 
224 |        meetings  ->  meet
225 | 
226 | */
227 | 
228 | static void step1ab(struct stemmer * z)
229 | {
230 |    char * b = z->b;
231 |    if (b[z->k] == 's')
232 |    {  if (ends(z, "\04" "sses")) z->k -= 2; else
233 |       if (ends(z, "\03" "ies")) setto(z, "\01" "i"); else
234 |       if (b[z->k - 1] != 's') z->k--;
235 |    }
236 |    if (ends(z, "\03" "eed")) { if (m(z) > 0) z->k--; } else
237 |    if ((ends(z, "\02" "ed") || ends(z, "\03" "ing")) && vowelinstem(z))
238 |    {  z->k = z->j;
239 |       if (ends(z, "\02" "at")) setto(z, "\03" "ate"); else
240 |       if (ends(z, "\02" "bl")) setto(z, "\03" "ble"); else
241 |       if (ends(z, "\02" "iz")) setto(z, "\03" "ize"); else
242 |       if (doublec(z, z->k))
243 |       {  z->k--;
244 |          {  int ch = b[z->k];
245 |             if (ch == 'l' || ch == 's' || ch == 'z') z->k++;
246 |          }
247 |       }
248 |       else if (m(z) == 1 && cvc(z, z->k)) setto(z, "\01" "e");
249 |    }
250 | }
251 | 
252 | /* step1c(z) turns terminal y to i when there is another vowel in the stem. */
253 | 
254 | static void step1c(struct stemmer * z)
255 | {
256 |    if (ends(z, "\01" "y") && vowelinstem(z)) z->b[z->k] = 'i';
257 | }
258 | 
259 | 
260 | /* step2(z) maps double suffices to single ones. so -ization ( = -ize plus
261 |    -ation) maps to -ize etc. note that the string before the suffix must give
262 |    m(z) > 0. */
263 | 
264 | static void step2(struct stemmer * z) { switch (z->b[z->k-1])
265 | {
266 |    case 'a': if (ends(z, "\07" "ational")) { r(z, "\03" "ate"); break; }
267 |              if (ends(z, "\06" "tional")) { r(z, "\04" "tion"); break; }
268 |              break;
269 |    case 'c': if (ends(z, "\04" "enci")) { r(z, "\04" "ence"); break; }
270 |              if (ends(z, "\04" "anci")) { r(z, "\04" "ance"); break; }
271 |              break;
272 |    case 'e': if (ends(z, "\04" "izer")) { r(z, "\03" "ize"); break; }
273 |              break;
274 |    case 'l': if (ends(z, "\03" "bli")) { r(z, "\03" "ble"); break; } /*-DEPARTURE-*/
275 | 
276 |  /* To match the published algorithm, replace this line with
277 |     case 'l': if (ends(z, "\04" "abli")) { r(z, "\04" "able"); break; } */
278 | 
279 |              if (ends(z, "\04" "alli")) { r(z, "\02" "al"); break; }
280 |              if (ends(z, "\05" "entli")) { r(z, "\03" "ent"); break; }
281 |              if (ends(z, "\03" "eli")) { r(z, "\01" "e"); break; }
282 |              if (ends(z, "\05" "ousli")) { r(z, "\03" "ous"); break; }
283 |              break;
284 |    case 'o': if (ends(z, "\07" "ization")) { r(z, "\03" "ize"); break; }
285 |              if (ends(z, "\05" "ation")) { r(z, "\03" "ate"); break; }
286 |              if (ends(z, "\04" "ator")) { r(z, "\03" "ate"); break; }
287 |              break;
288 |    case 's': if (ends(z, "\05" "alism")) { r(z, "\02" "al"); break; }
289 |              if (ends(z, "\07" "iveness")) { r(z, "\03" "ive"); break; }
290 |              if (ends(z, "\07" "fulness")) { r(z, "\03" "ful"); break; }
291 |              if (ends(z, "\07" "ousness")) { r(z, "\03" "ous"); break; }
292 |              break;
293 |    case 't': if (ends(z, "\05" "aliti")) { r(z, "\02" "al"); break; }
294 |              if (ends(z, "\05" "iviti")) { r(z, "\03" "ive"); break; }
295 |              if (ends(z, "\06" "biliti")) { r(z, "\03" "ble"); break; }
296 |              break;
297 |    case 'g': if (ends(z, "\04" "logi")) { r(z, "\03" "log"); break; } /*-DEPARTURE-*/
298 | 
299 |  /* To match the published algorithm, delete this line */
300 | 
301 | } }
302 | 
303 | /* step3(z) deals with -ic-, -full, -ness etc. similar strategy to step2. */
304 | 
/* Dispatch on the last character b[k]; each matching suffix is replaced via
   r(), i.e. only when the stem before it has measure > 0. A replacement of
   "\00" "" removes the suffix altogether. */
static void step3(struct stemmer * z) { switch (z->b[z->k])
{
   case 'e': if (ends(z, "\05" "icate")) { r(z, "\02" "ic"); break; }
             if (ends(z, "\05" "ative")) { r(z, "\00" ""); break; }
             if (ends(z, "\05" "alize")) { r(z, "\02" "al"); break; }
             break;
   case 'i': if (ends(z, "\05" "iciti")) { r(z, "\02" "ic"); break; }
             break;
   case 'l': if (ends(z, "\04" "ical")) { r(z, "\02" "ic"); break; }
             if (ends(z, "\03" "ful")) { r(z, "\00" ""); break; }
             break;
   case 's': if (ends(z, "\04" "ness")) { r(z, "\00" ""); break; }
             break;
} }
319 | 
320 | /* step4(z) takes off -ant, -ence etc., in context <c>vcvc<v>. */
321 | 
322 | static void step4(struct stemmer * z)
323 | {  switch (z->b[z->k-1])
324 |    {  case 'a': if (ends(z, "\02" "al")) break; return;
325 |       case 'c': if (ends(z, "\04" "ance")) break;
326 |                 if (ends(z, "\04" "ence")) break; return;
327 |       case 'e': if (ends(z, "\02" "er")) break; return;
328 |       case 'i': if (ends(z, "\02" "ic")) break; return;
329 |       case 'l': if (ends(z, "\04" "able")) break;
330 |                 if (ends(z, "\04" "ible")) break; return;
331 |       case 'n': if (ends(z, "\03" "ant")) break;
332 |                 if (ends(z, "\05" "ement")) break;
333 |                 if (ends(z, "\04" "ment")) break;
334 |                 if (ends(z, "\03" "ent")) break; return;
335 |       case 'o': if (ends(z, "\03" "ion") && (z->b[z->j] == 's' || z->b[z->j] == 't')) break;
336 |                 if (ends(z, "\02" "ou")) break; return;
337 |                 /* takes care of -ous */
338 |       case 's': if (ends(z, "\03" "ism")) break; return;
339 |       case 't': if (ends(z, "\03" "ate")) break;
340 |                 if (ends(z, "\03" "iti")) break; return;
341 |       case 'u': if (ends(z, "\03" "ous")) break; return;
342 |       case 'v': if (ends(z, "\03" "ive")) break; return;
343 |       case 'z': if (ends(z, "\03" "ize")) break; return;
344 |       default: return;
345 |    }
346 |    if (m(z) > 1) z->k = z->j;
347 | }
348 | 
349 | /* step5(z) removes a final -e if m(z) > 1, and changes -ll to -l if
350 |    m(z) > 1. */
351 | 
352 | static void step5(struct stemmer * z)
353 | {
354 |    char * b = z->b;
355 |    z->j = z->k;
356 |    if (b[z->k] == 'e')
357 |    {  int a = m(z);
358 |       if (a > 1 || a == 1 && !cvc(z, z->k - 1)) z->k--;
359 |    }
360 |    if (b[z->k] == 'l' && doublec(z, z->k) && m(z) > 1) z->k--;
361 | }
362 | 
363 | /* In stem(z, b, k), b is a char pointer, and the string to be stemmed is
364 |    from b[0] to b[k] inclusive.  Possibly b[k+1] == '\0', but it is not
365 |    important. The stemmer adjusts the characters b[0] ... b[k] and returns
366 |    the new end-point of the string, k'. Stemming never increases word
367 |    length, so 0 <= k' <= k.
368 | */
369 | 
370 | extern int stem(struct stemmer * z, char * b, int k)
371 | {
372 |    if (k <= 1) return k; /*-DEPARTURE-*/
373 |    z->b = b; z->k = k; /* copy the parameters into z */
374 | 
375 |    /* With this line, strings of length 1 or 2 don't go through the
376 |       stemming process, although no mention is made of this in the
377 |       published algorithm. Remove the line to match the published
378 |       algorithm. */
379 | 
380 |    step1ab(z); step1c(z); step2(z); step3(z); step4(z); step5(z);
381 |    return z->k;
382 | }
383 | 
384 | /*--------------------stemmer definition ends here------------------------*/
385 | 
386 | #include <stdio.h>
387 | #include <stdlib.h>      /* for malloc, free */
388 | #include <ctype.h>       /* for isupper, islower, tolower */
389 | 
390 | static char * s;         /* buffer for words tobe stemmed */
391 | 
392 | #define INC 50           /* size units in which s is increased */
393 | static int i_max = INC;  /* maximum offset in s */
394 | 
395 | #define LETTER(ch) (isupper(ch) || islower(ch))
396 | 
397 | void stemfile(struct stemmer * z, FILE * f)
398 | {  while(TRUE)
399 |    {  int ch = getc(f);
400 |       if (ch == EOF) return;
401 |       if (LETTER(ch))
402 |       {  int i = 0;
403 |          while(TRUE)
404 |          {  if (i == i_max)
405 |             {  i_max += INC;
406 |                s = realloc(s, i_max + 1);
407 |             }
408 |             ch = tolower(ch); /* forces lower case */
409 | 
410 |             s[i] = ch; i++;
411 |             ch = getc(f);
412 |             if (!LETTER(ch)) { ungetc(ch,f); break; }
413 |          }
414 |          s[stem(z, s, i - 1) + 1] = 0;
415 |          /* the previous line calls the stemmer and uses its result to
416 |             zero-terminate the string in s */
417 |          printf("%s",s);
418 |       }
419 |       else putchar(ch);
420 |    }
421 | }
422 | 
423 | 
424 | int trim(char* src){
425 | 	
426 | 	struct stemmer z;
427 | 	int len = strlen(src);
428 | 	return stem(&z,src,len-1) + 1;
429 | }
430 | /*
431 | int main(int argc, char * argv[])
432 | {  int i;
433 | 
434 |    struct stemmer * z = create_stemmer();
435 | 
436 |    s = (char *) malloc(i_max + 1);
437 |    for (i = 1; i < argc; i++)
438 |    {  FILE * f = fopen(argv[i],"r");
439 |       if (f == 0) { fprintf(stderr,"File %s not found\n",argv[i]); exit(1); }
440 |       stemfile(z, f);
441 |    }
442 |    free(s);
443 | 
444 |    free_stemmer(z);
445 | 
446 |    return 0;
447 | }
448 | */
449 | 
450 | 
451 | 


--------------------------------------------------------------------------------
/libshorttext/converter/stemmer/porter.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from ctypes import *
 4 | from ctypes.util import find_library
 5 | import sys
 6 | import os
 7 | 
# Load the compiled stemmer library (porter.so.1) from this module's directory.
stemmer = CDLL(os.path.join(os.path.abspath(os.path.dirname(__file__)), './porter.so.1'))
 9 | 
def fillprototype(f, restype, argtypes):
	"""Set the ``restype`` and ``argtypes`` attributes of *f* to the given values."""
	f.restype, f.argtypes = restype, argtypes
13 | 
14 | fillprototype(stemmer.trim, c_int, [c_char_p])
15 | 
def stem(word):
	"""Return the Porter stem of *word*.

	*word* is a ``str``; it is expected to be lower case already, because the
	C stemmer only handles lower-case letters. The result is a new ``str``.
	"""
	# trim() rewrites its argument in place and returns the stem's length in
	# bytes. Give it a private, mutable copy: an immutable bytes object must
	# not be modified, and slicing the original str would lose rewritten
	# letters (e.g. a final "y" turned into "i").
	buf = create_string_buffer(word.encode('utf-8'))
	length = stemmer.trim(buf)
	# The stemmer works on raw bytes and could in principle cut inside a
	# multi-byte character; drop such a fragment instead of raising.
	return buf.raw[:length].decode('utf-8', 'ignore')
18 | 


--------------------------------------------------------------------------------
/libshorttext/converter/stop-words/stoplist-nsp.regex:
--------------------------------------------------------------------------------
  1 | @stop.mode=OR
  2 | /\b[a-zA-Z]\b/
  3 | /\b[aA]board\b/
  4 | /\b[aA]bout\b/
  5 | /\b[aA]bove\b/
  6 | /\b[aA]cross\b/
  7 | /\b[aA]fter\b/
  8 | /\b[aA]gain\b/
  9 | /\b[aA]gainst\b/
 10 | /\b[aA]ll\b/
 11 | /\b[aA]long\b/
 12 | /\b[aA]longside\b/
 13 | /\b[aA]lready\b/
 14 | /\b[aA]lso\b/
 15 | /\b[aA]lthough\b/
 16 | /\b[aA]lways\b/
 17 | /\b[aA]m\b/
 18 | /\b[aA]mid\b/
 19 | /\b[aA]midst\b/
 20 | /\b[aA]mong\b/
 21 | /\b[aA]mongst\b/
 22 | /\b[aA]n\b/
 23 | /\b[aA]nd\b/
 24 | /\b[aA]nother\b/
 25 | /\b[aA]nti\b/
 26 | /\b[aA]ny\b/
 27 | /\b[aA]nybody\b/
 28 | /\b[aA]nyone\b/
 29 | /\b[aA]nything\b/
 30 | /\b[aA]re\b/
 31 | /\b[aA]round\b/
 32 | /\b[aA]s\b/
 33 | /\b[aA]stride\b/
 34 | /\b[aA]t\b/
 35 | /\b[aA]ught\b/
 36 | /\b[aA]way\b/
 37 | /\b[bB]ack\b/
 38 | /\b[bB]ar\b/
 39 | /\b[bB]arring\b/
 40 | /\b[bB]e\b/
 41 | /\b[bB]ecause\b/
 42 | /\b[bB]ecome\b/
 43 | /\b[bB]ecomes\b/
 44 | /\b[bB]ecoming\b/
 45 | /\b[bB]een\b/
 46 | /\b[bB]efore\b/
 47 | /\b[bB]ehind\b/
 48 | /\b[bB]eing\b/
 49 | /\b[bB]elow\b/
 50 | /\b[bB]eneath\b/
 51 | /\b[bB]eside\b/
 52 | /\b[bB]esides\b/
 53 | /\b[bB]etween\b/
 54 | /\b[bB]eyond\b/
 55 | /\b[bB]oth\b/
 56 | /\b[bB]ut\b/
 57 | /\b[bB]y\b/
 58 | /\b[cC]alled\b/
 59 | /\b[cC]an\b/
 60 | /\b[cC]annot\b/
 61 | /\b[cC]ant\b/
 62 | /\b[cC]ertain\b/
 63 | /\b[cC]irca\b/
 64 | /\b[cC]m\b/
 65 | /\b[cC]oncerning\b/
 66 | /\b[cC]onsidering\b/
 67 | /\b[cC]ontain\b/
 68 | /\b[cC]ould\b/
 69 | /\b[dD]e\b/
 70 | /\b[dD]espite\b/
 71 | /\b[dD]id\b/
 72 | /\b[dD]o\b/
 73 | /\b[dD]oe\b/
 74 | /\b[dD]oes\b/
 75 | /\b[dD]oing\b/
 76 | /\b[dD]one\b/
 77 | /\b[dD]ont\b/
 78 | /\b[dD]own\b/
 79 | /\b[dD]r\b/
 80 | /\b[dD]uring\b/
 81 | /\b[eE]ach\b/
 82 | /\b[eE]c\b/
 83 | /\b[eE]e\b/
 84 | /\b[eE]ighteen\b/
 85 | /\b[eE]ighth\b/
 86 | /\b[eE]ither\b/
 87 | /\b[eE]leven\b/
 88 | /\b[eE]lse\b/
 89 | /\b[eE]nd\b/
 90 | /\b[eE]nough\b/
 91 | /\b[eE]specially\b/
 92 | /\b[eE]tc\b/
 93 | /\b[eE]ven\b/
 94 | /\b[eE]ver\b/
 95 | /\b[eE]very\b/
 96 | /\b[eE]verybody\b/
 97 | /\b[eE]veryone\b/
 98 | /\b[eE]xcept\b/
 99 | /\b[eE]xcepting\b/
100 | /\b[eE]xcluding\b/
101 | /\b[fF]ew\b/
102 | /\b[fF]ewer\b/
103 | /\b[fF]ifteen\b/
104 | /\b[fF]ifth\b/
105 | /\b[fF]irst\b/
106 | /\b[fF]ollowing\b/
107 | /\b[fF]or\b/
108 | /\b[fF]ourteen\b/
109 | /\b[fF]ourth\b/
110 | /\b[fF]rom\b/
111 | /\b[gG]et\b/
112 | /\b[gG]ive\b/
113 | /\b[gG]o\b/
114 | /\b[gG]oing\b/
115 | /\b[gG]ood\b/
116 | /\b[gG]ot\b/
117 | /\b[hH]a\b/
118 | /\b[hH]ad\b/
119 | /\b[hH]ardly\b/
120 | /\b[hH]as\b/
121 | /\b[hH]ave\b/
122 | /\b[hH]e\b/
123 | /\b[hH]eld\b/
124 | /\b[hH]er\b/
125 | /\b[hH]ere\b/
126 | /\b[hH]ers\b/
127 | /\b[hH]erself\b/
128 | /\b[hH]es\b/
129 | /\b[hH]im\b/
130 | /\b[hH]imself\b/
131 | /\b[hH]is\b/
132 | /\b[hH]isself\b/
133 | /\b[hH]m\b/
134 | /\b[hH]ow\b/
135 | /\b[iI]dem\b/
136 | /\b[iI]f\b/
137 | /\b[iI]i\b/
138 | /\b[iI]ii\b/
139 | /\b[iI]lk\b/
140 | /\b[iI]n\b/
141 | /\b[iI]nclude\b/
142 | /\b[iI]ncluded\b/
143 | /\b[iI]ncluding\b/
144 | /\b[iI]ndeed\b/
145 | /\b[iI]nside\b/
146 | /\b[iI]nstead\b/
147 | /\b[iI]nto\b/
148 | /\b[iI]s\b/
149 | /\b[iI]t\b/
150 | /\b[iI]ts\b/
151 | /\b[iI]tself\b/
152 | /\b[iI]v\b/
153 | /\b[jJ]r\b/
154 | /\b[jJ]ust\b/
155 | /\b[kK]ept\b/
156 | /\b[kK]now\b/
157 | /\b[lL]ast\b/
158 | /\b[lL]ate\b/
159 | /\b[lL]ater\b/
160 | /\b[lL]ess\b/
161 | /\b[lL]et\b/
162 | /\b[lL]ike\b/
163 | /\b[lL]ittle\b/
164 | /\b[mM]ade\b/
165 | /\b[mM]ake\b/
166 | /\b[mM]aking\b/
167 | /\b[mM]any\b/
168 | /\b[mM]ay\b/
169 | /\b[mM]e\b/
170 | /\b[mM]ight\b/
171 | /\b[mM]ine\b/
172 | /\b[mM]inus\b/
173 | /\b[mM]m\b/
174 | /\b[mM]ore\b/
175 | /\b[mM]ost\b/
176 | /\b[mM]ostly\b/
177 | /\b[mM]r\b/
178 | /\b[mM]rs\b/
179 | /\b[mM]uch\b/
180 | /\b[mM]ust\b/
181 | /\b[mM]y\b/
182 | /\b[mM]yself\b/
183 | /\b[nN]aught\b/
184 | /\b[nN]ear\b/
185 | /\b[nN]eeded\b/
186 | /\b[nN]eeds\b/
187 | /\b[nN]either\b/
188 | /\b[nN]ever\b/
189 | /\b[nN]ew\b/
190 | /\b[nN]ext\b/
191 | /\b[nN]hs\b/
192 | /\b[nN]ine\b/
193 | /\b[nN]ineteen\b/
194 | /\b[nN]inth\b/
195 | /\b[nN]o\b/
196 | /\b[nN]obody\b/
197 | /\b[nN]on\b/
198 | /\b[nN]one\b/
199 | /\b[nN]or\b/
200 | /\b[nN]ot\b/
201 | /\b[nN]othing\b/
202 | /\b[nN]otwithstanding\b/
203 | /\b[nN]ow\b/
204 | /\b[nN]s\b/
205 | /\b[nN]t\b/
206 | /\b[oO]f\b/
207 | /\b[oO]ff\b/
208 | /\b[oO]ften\b/
209 | /\b[oO]n\b/
210 | /\b[oO]nce\b/
211 | /\b[oO]ne\b/
212 | /\b[oO]neself\b/
213 | /\b[oO]nly\b/
214 | /\b[oO]nto\b/
215 | /\b[oO]pposite\b/
216 | /\b[oO]r\b/
217 | /\b[oO]ther\b/
218 | /\b[oO]thers\b/
219 | /\b[oO]therwise\b/
220 | /\b[oO]ught\b/
221 | /\b[oO]ur\b/
222 | /\b[oO]urself\b/
223 | /\b[oO]urselves\b/
224 | /\b[oO]ut\b/
225 | /\b[oO]utside\b/
226 | /\b[oO]ver\b/
227 | /\b[oO]wn\b/
228 | /\b[pP]art\b/
229 | /\b[pP]articular\b/
230 | /\b[pP]ast\b/
231 | /\b[pP]e\b/
232 | /\b[pP]ending\b/
233 | /\b[pP]er\b/
234 | /\b[pP]erhaps\b/
235 | /\b[pP]lenty\b/
236 | /\b[pP]lus\b/
237 | /\b[pP]robably\b/
238 | /\b[pP]uts\b/
239 | /\b[qQ]uite\b/
240 | /\b[rR]ather\b/
241 | /\b[rR]eally\b/
242 | /\b[rR]egarding\b/
243 | /\b[rR]elate\b/
244 | /\b[rR]ound\b/
245 | /\b[sS]aid\b/
246 | /\b[sS]ave\b/
247 | /\b[sS]aw\b/
248 | /\b[sS]ay\b/
249 | /\b[sS]ays\b/
250 | /\b[sS]econd\b/
251 | /\b[sS]ee\b/
252 | /\b[sS]eem\b/
253 | /\b[sS]eems\b/
254 | /\b[sS]een\b/
255 | /\b[sS]elf\b/
256 | /\b[sS]eventeen\b/
257 | /\b[sS]eventh\b/
258 | /\b[sS]everal\b/
259 | /\b[sS]hall\b/
260 | /\b[sS]he\b/
261 | /\b[sS]hort\b/
262 | /\b[sS]hould\b/
263 | /\b[sS]ince\b/
264 | /\b[sS]ix\b/
265 | /\b[sS]ixteen\b/
266 | /\b[sS]ixth\b/
267 | /\b[sS]o\b/
268 | /\b[sS]ome\b/
269 | /\b[sS]omebody\b/
270 | /\b[sS]omeone\b/
271 | /\b[sS]omething\b/
272 | /\b[sS]ometimes\b/
273 | /\b[sS]omewhat\b/
274 | /\b[sS]oon\b/
275 | /\b[sS]ooner\b/
276 | /\b[sS]r\b/
277 | /\b[sS]uch\b/
278 | /\b[sS]uchlike\b/
279 | /\b[sS]uddenly\b/
280 | /\b[sS]undry\b/
281 | /\b[tT]ake\b/
282 | /\b[tT]en\b/
283 | /\b[tT]enth\b/
284 | /\b[tT]han\b/
285 | /\b[tT]hat\b/
286 | /\b[tT]he\b/
287 | /\b[tT]hee\b/
288 | /\b[tT]heir\b/
289 | /\b[tT]heirs\b/
290 | /\b[tT]hem\b/
291 | /\b[tT]hemselves\b/
292 | /\b[tT]hen\b/
293 | /\b[tT]hen\b/
294 | /\b[tT]here\b/
295 | /\b[tT]hey\b/
296 | /\b[tT]hine\b/
297 | /\b[tT]hird\b/
298 | /\b[tT]hirteen\b/
299 | /\b[tT]his\b/
300 | /\b[tT]hose\b/
301 | /\b[tT]hou\b/
302 | /\b[tT]hough\b/
303 | /\b[tT]hree\b/
304 | /\b[tT]hrice\b/
305 | /\b[tT]hrough\b/
306 | /\b[tT]hroughout\b/
307 | /\b[tT]hus\b/
308 | /\b[tT]hyself\b/
309 | /\b[tT]ill\b/
310 | /\b[tT]o\b/
311 | /\b[tT]oo\b/
312 | /\b[tT]otally\b/
313 | /\b[tT]other\b/
314 | /\b[tT]oward\b/
315 | /\b[tT]owards\b/
316 | /\b[tT]wain\b/
317 | /\b[tT]welve\b/
318 | /\b[tT]wenty\b/
319 | /\b[tT]wice\b/
320 | /\b[tT]wo\b/
321 | /\b[uU]nder\b/
322 | /\b[uU]nderneath\b/
323 | /\b[uU]nless\b/
324 | /\b[uU]nlike\b/
325 | /\b[uU]ntil\b/
326 | /\b[uU]p\b/
327 | /\b[uU]pon\b/
328 | /\b[uU]pper\b/
329 | /\b[uU]s\b/
330 | /\b[uU]se\b/
331 | /\b[uU]sed\b/
332 | /\b[uU]sually\b/
333 | /\b[uU]x\b/
334 | /\b[vV]arious\b/
335 | /\b[vV]ersus\b/
336 | /\b[vV]ery\b/
337 | /\b[vV]i\b/
338 | /\b[vV]ia\b/
339 | /\b[vV]ii\b/
340 | /\b[vV]iii\b/
341 | /\b[vV]iiii\b/
342 | /\b[vV]is-a-vis\b/
343 | /\b[wW]a\b/
344 | /\b[wW]ant\b/
345 | /\b[wW]anted\b/
346 | /\b[wW]ants\b/
347 | /\b[wW]as\b/
348 | /\b[wW]e\b/
349 | /\b[wW]ell\b/
350 | /\b[wW]ent\b/
351 | /\b[wW]ere\b/
352 | /\b[wW]hat\b/
353 | /\b[wW]hatall\b/
354 | /\b[wW]hatever\b/
355 | /\b[wW]hatsoever\b/
356 | /\b[wW]hen\b/
357 | /\b[wW]here\b/
358 | /\b[wW]hereas\b/
359 | /\b[wW]hereby\b/
360 | /\b[wW]herewith\b/
361 | /\b[wW]herewithal\b/
362 | /\b[wW]hich\b/
363 | /\b[wW]hichever\b/
364 | /\b[wW]hichsoever\b/
365 | /\b[wW]hile\b/
366 | /\b[wW]ho\b/
367 | /\b[wW]hoever\b/
368 | /\b[wW]hole\b/
369 | /\b[wW]hom\b/
370 | /\b[wW]homever\b/
371 | /\b[wW]homso\b/
372 | /\b[wW]homsoever\b/
373 | /\b[wW]hose\b/
374 | /\b[wW]hosoever\b/
375 | /\b[wW]ill\b/
376 | /\b[wW]ith\b/
377 | /\b[wW]ithin\b/
378 | /\b[wW]ithout\b/
379 | /\b[wW]ont\b/
380 | /\b[wW]orth\b/
381 | /\b[wW]ould\b/
382 | /\b[yY]e\b/
383 | /\b[yY]ear\b/
384 | /\b[yY]ears\b/
385 | /\b[yY]es\b/
386 | /\b[yY]et\b/
387 | /\b[yY]on\b/
388 | /\b[yY]onder\b/
389 | /\b[yY]ou\b/
390 | /\b[yY]ou-all\b/
391 | /\b[yY]our\b/
392 | /\b[yY]ours\b/
393 | /\b[yY]ourself\b/
394 | /\b[yY]ourselves\b/
395 | 


--------------------------------------------------------------------------------
/libshorttext/converter/stop-words/stoplist-nsp.regex.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/2shou/python-libshorttext/460773dbbefe7a82a9b544ca419242b68a1a0533/libshorttext/converter/stop-words/stoplist-nsp.regex.pickle


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from distutils.core import setup
 2 | from distutils.command.install import install as DistutilsInstall
 3 | from distutils.command.clean import clean as Clean
 4 | import shutil
 5 | import os
 6 | from os.path import join
 7 | 
 8 | 
 9 | class MakeCommand(DistutilsInstall):
10 |     def run(self):
11 |         os.system('make')
12 |         common_dir = 'libshorttext/converter/stemmer'
13 |         target_dir = '%s/%s' % (self.build_lib, common_dir)
14 |         self.mkpath(target_dir)
15 |         os.system('cp %s/porter.so.1 %s' % (common_dir, target_dir))
16 |         common_dir = 'libshorttext/classifier/learner'
17 |         target_dir = '%s/%s' % (self.build_lib, common_dir)
18 |         self.mkpath(target_dir)
19 |         os.system('cp %s/util.so.1 %s' % (common_dir, target_dir))
20 |         common_dir = 'libshorttext/classifier/learner/liblinear'
21 |         target_dir = '%s/%s' % (self.build_lib, common_dir)
22 |         self.mkpath(target_dir)
23 |         os.system('cp %s/liblinear.so.1 %s' % (common_dir, target_dir))
24 |         DistutilsInstall.run(self)
25 | 
26 | 
class CleanCommand(Clean):
    """``clean`` command that also purges build artifacts from the source tree.

    After the standard distutils clean it removes the ``build`` directory
    and, below ``libshorttext``, every file whose name ends with one of
    ``artifact_suffixes`` as well as every ``__pycache__`` directory.
    With ``--dry-run`` the paths are only announced, nothing is deleted.
    """

    description = "Remove build artifacts from the source tree"

    # File-name suffixes of generated files that are deleted.
    artifact_suffixes = ('.o', '.a', '.so.1', '.pyd', '.dll', '.pyc')

    def _discard(self, remover, path):
        """Announce *path* and delete it with *remover* unless in dry-run mode."""
        self.announce("removing '%s'" % path)
        if not getattr(self, 'dry_run', False):
            remover(path)

    def run(self):
        """Run the base clean, then remove ``build`` and in-tree artifacts."""
        Clean.run(self)
        if os.path.exists('build'):
            self._discard(shutil.rmtree, 'build')
        for dirpath, dirnames, filenames in os.walk('libshorttext'):
            for filename in filenames:
                if filename.endswith(self.artifact_suffixes):
                    self._discard(os.unlink, os.path.join(dirpath, filename))
            for dirname in dirnames:
                if dirname == '__pycache__':
                    self._discard(shutil.rmtree, os.path.join(dirpath, dirname))
            # Prune in place so os.walk does not try to descend into the
            # cache directories that were just handled.
            dirnames[:] = [d for d in dirnames if d != '__pycache__']
44 | 
45 | 
# Register the distribution.  'install' is routed through MakeCommand and
# 'clean' through CleanCommand; everything under converter/stop-words is
# shipped as package data of the libshorttext package.
setup(
    name='libshorttext',
    version='1.1',
    packages=[
        '',
        'libshorttext',
        'libshorttext.analyzer',
        'libshorttext.converter',
        'libshorttext.converter.stemmer',
        'libshorttext.classifier',
        'libshorttext.classifier.learner',
        'libshorttext.classifier.learner.liblinear',
        'libshorttext.classifier.learner.liblinear.python',
    ],
    package_data={
        'libshorttext': [join('converter', 'stop-words', '*')],
    },
    url='',
    license='',
    author='',
    author_email='',
    description='',
    cmdclass={
        'install': MakeCommand,
        'clean': CleanCommand,
    },
)
63 | 


--------------------------------------------------------------------------------