├── .gitignore
├── LICENSE
├── README.md
├── bin
│   └── fxml.py
├── fastxml
│   ├── __init__.py
│   ├── fastxml.py
│   ├── inferencer.pyx
│   ├── metrics.py
│   ├── proc.py
│   ├── splitter.pyx
│   ├── trainer.py
│   └── weights.py
├── requirements.txt
└── setup.py

/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | dist/
3 | venv/
4 | *.egg-info/
5 | *.pyc
6 | *.so
7 | *.c
8 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright 2016 Andrew Stanton
2 | 
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 | 
7 |     http://www.apache.org/licenses/LICENSE-2.0
8 | 
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | FastXML / PFastXML / PFastreXML - Fast and Accurate Tree Extreme Multi-label Classifier
2 | ===
3 | 
4 | This is a fast implementation of FastXML, PFastXML, and PFastreXML based on the following papers:
5 | 
6 | - "FastXML: A Fast, Accurate and Stable Tree-classifier for eXtreme Multi-label Learning" [Paper](https://manikvarma.github.io/pubs/prabhu14.pdf)
7 | - "Extreme Multi-label Loss Functions for Recommendation, Tagging, Ranking & Other Missing Label Applications" [Paper](https://manikvarma.github.io/pubs/jain16.pdf)
8 | - "DiSMEC - Distributed Sparse Machines for Extreme Multi-label Classification" [Paper](https://arxiv.org/abs/1609.02521) [Code](https://sites.google.com/site/rohitbabbar/code/dismec)
9 | 
10 | DiSMEC makes its appearance via an L2 penalty rather than an L1; when set with a high alpha and a sparsity eps of 0.01-0.05, it can also produce sparse linear classifiers.
11 | 
12 | It's implemented in the quasi-familiar scikit-learn clf format.
13 | 
14 | Release Notes
15 | ===
16 | 2.0
17 | ---
18 | - Version 2.0 is _not_ backward compatible with 1.x
19 | - Use model.save(path) to save models instead of cPickle
20 | - Rewrites the data storage layer
21 | - Uses 50% of the memory, loads 30% faster, and runs inference 40% faster
22 | 
23 | Binary
24 | ===
25 | 
26 | Along with the library, this repo provides a simple script, fxml.py, which allows easy training and testing of simple datasets.
27 | 
28 | It accepts two formats: a simple JSON format and the standard extreme multi-label dataset format.
29 | 
30 | Standard Benchmark Datasets
31 | ---
32 | 
33 | As an example, to train a standalone classifier against the Delicious-200K dataset:
34 | 
35 |     fxml.py delicious.model deliciousLarge_train.txt --standard-dataset --verbose train --iters 5 --trees 20 --label-weight propensity --alpha 1e-4 --leaf-classifiers --no-remap-labels
36 | 
37 | To test:
38 | 
39 |     fxml.py delicious.model deliciousLarge_test.txt --standard-dataset inference
40 | 
41 | JSON File
42 | ---
43 | 
44 | As fxml.py is intended as an easy-to-understand example of setting up a FastXML classifier, the JSON format
45 | is very simple. It is a newline-delimited format.
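
Each line is a standalone JSON object. As a rough sketch (not the actual implementation), fxml.py's JsonQuantizer consumes a record like this, hashing the "title" text into sparse features and using "tags" as the label set:

    import json

    with open("train.json", "rt") as f:
        for line in f:
            record = json.loads(line)    # one JSON object per line
            text = record["title"]       # free text, hashed into sparse features
            labels = record["tags"]      # list of string labels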
46 | 
47 | train.json:
48 | 
49 |     {"title": "red dresses", "tags": ["clothing", "women", "dresses"]}
50 |     {"title": "yellow dresses for sweet 16", "tags": ["yellow", "summer dresses", "occasionwear"]}
51 |     ...
52 | 
53 | It can then be trained:
54 | 
55 |     fxml.py my_json.model train.json --verbose train --iters 5 --trees 20 --label-weight propensity --alpha 1e-4 --leaf-classifiers
56 | 
57 | Note the omission of the flags "--standard-dataset" and "--no-remap-labels". Since the tags/classes provided are strings, fxml.py will remap them to an integer label space for training. During inference, it will map the label indices back to the original strings.
58 | 
59 | Simple Python Usage
60 | ===
61 | 
62 |     from fastxml import Trainer, Inferencer
63 | 
64 |     X = [Sparse or numpy arrays]
65 |     y = [[1, 3]] # Currently requires list[list[int]]
66 | 
67 |     trainer = Trainer(n_trees=32, n_jobs=-1)
68 | 
69 |     trainer.fit(X, y)
70 | 
71 |     trainer.save(path)
72 | 
73 |     clf = Inferencer(path)
74 | 
75 |     clf.predict(X)
76 |     # or
77 |     clf.predict(X, fmt='dict')
78 | 
79 |     #############
80 |     # PFastXML
81 |     #############
82 | 
83 |     from fastxml.weights import propensity
84 | 
85 |     weights = propensity(y)
86 |     trainer.fit(X, y, weights)
87 | 
88 |     ###############
89 |     # PFastreXML
90 |     ###############
91 |     trainer = Trainer(n_trees=32, n_jobs=-1, leaf_classifiers=True)
92 |     trainer.fit(X, y, weights)
93 | 
94 | TODO
95 | ===
96 | 
97 | 1. Run all the standard benchmark datasets against it.
98 | 
99 | 2. Refactor. Most of the effort has been spent on speed and it needs to be cleaned up.
100 | 
--------------------------------------------------------------------------------
/bin/fxml.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from __future__ import print_function
3 | from __future__ import division
4 | from builtins import zip
5 | from builtins import input
6 | from builtins import map
7 | from builtins import next
8 | from builtins import str
9 | from builtins import range
10 | from past.utils import old_div
11 | from builtins import object
12 | import sys
13 | import json
14 | import pprint
15 | import os
16 | import threading
17 | from collections import defaultdict, Counter
18 | import multiprocessing
19 | from itertools import islice, chain, count
20 | 
21 | import argparse
22 | 
23 | import numpy as np
24 | from sklearn.feature_extraction import FeatureHasher
25 | import scipy.sparse as sp
26 | 
27 | from fastxml import Inferencer, Trainer, metric_cluster
28 | from fastxml.weights import uniform, nnllog, propensity, logexp
29 | from fastxml.metrics import ndcg, precision, pSndcg
30 | 
31 | def build_arg_parser():
32 |     parser = argparse.ArgumentParser(description='FastXML trainer and tester',
33 |         formatter_class=argparse.ArgumentDefaultsHelpFormatter
34 |     )
35 |     parser.add_argument("model",
36 |         help="Model to use for dataset file")
37 | 
38 |     parser.add_argument("input_file",
39 |         help="Input file to use")
40 | 
41 |     group = parser.add_mutually_exclusive_group()
42 |     group.add_argument("--standard-dataset", dest="standardDataset", action="store_true",
43 |         help="Input is standard dataset sparse format")
44 | 
45 |     group.add_argument("--pre-gen", dest="preGen", type=int,
46 |         help="Input is in pregenerated sparse format")
47 | 
48 |     parser.add_argument("--verbose", action="store_true",
49 |         help="Verbose"
50 |     )
51 | 
52 |     subparsers = parser.add_subparsers(dest="command")
53 | 
54 |     trainer = subparsers.add_parser('train', help="Set up for trainer")
55 |     build_train_parser(trainer)
56 | 
57 | 
inference = subparsers.add_parser('inference', help="Runs a model against a dataset") 58 | build_repl_parser(inference) 59 | build_inference_parser(inference) 60 | 61 | cluster = subparsers.add_parser('cluster', help="Clusters labels into NDCG classes") 62 | build_cluster_parser(cluster) 63 | 64 | repl = subparsers.add_parser('repl', help="Interactive mode for a model") 65 | build_repl_parser(repl) 66 | 67 | return parser 68 | 69 | def build_cluster_parser(parser): 70 | parser.add_argument("--trees", dest="trees", type=int, default=1, 71 | help="Number of random trees to cluster on" 72 | ) 73 | parser.add_argument("--label-weight", dest="label_weight", 74 | choices=('uniform', 'nnllog', 'propensity', 'logexp'), default='propensity', 75 | help="Metric for computing label weighting." 76 | ) 77 | parser.add_argument("--max_leaf_size", dest="max_leaf_size", type=int, 78 | default=10, 79 | help="Maximumum number of examples allowed per leaf" 80 | ) 81 | parser.add_argument("--label-weight-hp", dest="label_weight_hp", 82 | metavar="P", nargs=2, type=float, default = (None, None), 83 | help="Hyper parameters for label weight tuning" 84 | ) 85 | 86 | def build_repl_parser(parser): 87 | parser.add_argument("--max-predict", dest="max_predict", type=int, 88 | default=10, 89 | help="Maximum number of classes to predict" 90 | ) 91 | parser.add_argument("--gamma", type=float, 92 | help="Overrides default gamma value for leaf classifiers" 93 | ) 94 | parser.add_argument("--blend_factor", type=float, 95 | help="Overrides default blend factor" 96 | ) 97 | parser.add_argument("--leaf-probs", dest="leafProbs", type=lambda x: x.lower() == "true", 98 | help="Overrides whether to show log vs P(Y|X)" 99 | ) 100 | 101 | def build_inference_parser(parser): 102 | parser.add_argument("--dict", dest="dict", action="store_true", 103 | help="Store predict as dict" 104 | ) 105 | parser.add_argument("--score", action="store_true", 106 | help="Scores results according to ndcg and precision" 107 | ) 108 | parser.add_argument("--score-only", dest="scoreOnly", action="store_true", 109 | help="Scores the dataset and returns the average NDCG scores" 110 | ) 111 | 112 | def build_train_parser(parser): 113 | parser.add_argument("--engine", dest="engine", default="auto", 114 | choices=('auto', 'sgd', 'liblinear'), 115 | help="Which engine to use." 116 | ) 117 | parser.add_argument("--auto-weight", dest="auto_weight", default=32, type=int, 118 | help="When engine is 'auto', number of classes * max_leaf_size remaining to revert to SGD" 119 | ) 120 | parser.add_argument("--no-remap-labels", dest="noRemap", action="store_true", 121 | help="Whether to remap labels to an internal format. Needed for string labels" 122 | ) 123 | parser.add_argument("--trees", dest="trees", type=int, 124 | default=50, 125 | help="Number of trees to use" 126 | ) 127 | parser.add_argument("--max_leaf_size", dest="max_leaf_size", type=int, 128 | default=10, 129 | help="Maximumum number of examples allowed per leaf" 130 | ) 131 | parser.add_argument("--max_labels_per_leaf", dest="max_labels_per_leaf", type=int, 132 | default=50, 133 | help="Maximum number of classes to retaion for probability distribution per leaf" 134 | ) 135 | parser.add_argument("--re_split", dest="re_split", type=int, 136 | default=1, 137 | help="After fitting a classifier, re-splits the data according to fitted "\ 138 | "classifier. If greater than 1, it will re-fit and re-train a classifier "\ 139 | "the data if after splitting, it all ends in a leaf. Will retry N times." 
140 | ) 141 | parser.add_argument("--alpha", dest="alpha", type=float, 142 | default=1e-3, 143 | help="L1 coefficient. Too high and it won't learn a split, too low and "\ 144 | "it won't be sparse (larger file size, slower inference)." 145 | ) 146 | parser.add_argument("--C", dest="C", type=float, 147 | default=1, 148 | help="C value for when using auto, penalizing accuracy over fit" 149 | ) 150 | parser.add_argument("--iters", dest="iters", 151 | type=lambda x: int(x) if x != 'auto' else x, 152 | default=2, 153 | help="Number of iterations to run over the dataset when fitting classifier" 154 | ) 155 | parser.add_argument("--n_updates", dest="n_updates", 156 | type=int, 157 | default=100, 158 | help="If iters is 'auto', makes it use iters = n_update / N" 159 | ) 160 | parser.add_argument("--no_bias", dest="bias", action="store_false", 161 | help="Fits a bias for the classifier. Not needed if data has E[X] = 0" 162 | ) 163 | parser.add_argument("--subsample", dest="subsample", type=float, 164 | default=1.0, 165 | help="Subsample data per tree. if less than 1, interpretted as a "\ 166 | "percentage. If greater than one, taken as number of data " \ 167 | "points per tree." 168 | ) 169 | parser.add_argument("--loss", dest="loss", choices=('log', 'hinge'), 170 | default='log', 171 | help="Loss to minimize." 172 | ) 173 | parser.add_argument("--threads", dest="threads", type=int, 174 | default=multiprocessing.cpu_count(), 175 | help="Number of threads to use. Will use min(threads, trees)" 176 | ) 177 | parser.add_argument("--label-weight", dest="label_weight", 178 | choices=('uniform', 'nnllog', 'propensity', 'logexp'), default='propensity', 179 | help="Metric for computing label weighting." 180 | ) 181 | parser.add_argument("--label-weight-hp", dest="label_weight_hp", 182 | metavar="P", nargs=2, type=float, default = (None, None), 183 | help="Hyper parameters for label weight tuning" 184 | ) 185 | parser.add_argument("--optimization", dest="optimization", 186 | choices=('fastxml', 'dsimec'), default='fastxml', 187 | help="optimization strategy to use for linear classifier" 188 | ) 189 | parser.add_argument("--eps", dest="eps", type=float, 190 | help="Sparsity epsilon. 
Weights lower than eps will suppress to zero" 191 | ) 192 | parser.add_argument("--leaf-classifiers", dest="leaf_class", 193 | action="store_true", 194 | help="Whether to use and compute leaf classifiers" 195 | ) 196 | parser.add_argument("--gamma", type=int, default=30, 197 | help="Gamma coefficient for hyper-sphere weighting" 198 | ) 199 | parser.add_argument("--blend-factor", dest="blend_factor", 200 | type=float, default=0.5, 201 | help="blend * tree-probs + (1 - blend) * tail-classifiers" 202 | ) 203 | parser.add_argument("--min-label-count", dest="mlc", 204 | type=int, default=5, 205 | help="Filter out labels with count < min-label-count" 206 | ) 207 | parser.add_argument("--leaf-probs", dest="leafProbs", 208 | action="store_true", 209 | help="Computes probability: TP(X) * LP(X)" 210 | ) 211 | return parser 212 | 213 | def sliding(it, window): 214 | x = list(islice(it, window)) 215 | try: 216 | if len(x) == window: 217 | while True: 218 | yield x 219 | x2 = x[1:] 220 | x2.append(next(it)) 221 | x = x2 222 | 223 | except StopIteration: 224 | pass 225 | 226 | class Quantizer(object): 227 | def stream(self, fn): 228 | raise NotImplementedError() 229 | 230 | class JsonQuantizer(Quantizer): 231 | def __init__(self, verbose, min_label_count=1, inference=False): 232 | self.fh = FeatureHasher(dtype='float32') 233 | self.verbose = verbose 234 | self.inference = inference 235 | self.min_label_count = min_label_count 236 | 237 | def quantize(self, text): 238 | text = text.lower().replace(',', '') 239 | unigrams = text.split() 240 | bigrams = (' '.join(xs) for xs in sliding(iter(unigrams), 2)) 241 | trigrams = (' '.join(xs) for xs in sliding(iter(unigrams), 3)) 242 | 243 | d = {f: 1.0 for f in chain(unigrams, bigrams, trigrams)} 244 | return self.fh.transform([d]) 245 | 246 | def yieldJson(self, fname): 247 | with open(fname, 'rt') as f: 248 | for i, line in enumerate(f): 249 | if self.verbose and i % 10000 == 0: 250 | print("%s docs encoded" % i) 251 | 252 | yield json.loads(line) 253 | 254 | def count_labels(self, fname): 255 | c = Counter() 256 | for data in self.yieldJson(fname): 257 | c.update(data['tags']) 258 | 259 | return (lambda t: c[t] >= self.min_label_count) 260 | 261 | def stream(self, fname, no_features=False): 262 | if self.min_label_count > 1: 263 | f = self.count_labels(fname) 264 | else: 265 | f = lambda x: True 266 | 267 | for data in self.yieldJson(fname): 268 | y = [yi for yi in set(data.get('tags', [])) if f(yi)] 269 | if no_features: 270 | yield data, y 271 | else: 272 | X = self.quantize(data['title']) 273 | yield data, X, y 274 | 275 | class PregenQuantizer(JsonQuantizer): 276 | 277 | def __init__(self, verbose, min_label_count, dims, inference=False): 278 | super(PregenQuantizer, self).__init__(verbose, min_label_count, inference) 279 | self.dims = dims 280 | 281 | def quantize(self, text): 282 | data = [] 283 | row_ind = [] 284 | col_ind = [] 285 | for p in text.split(): 286 | rIndex, rValue = p.split(':') 287 | row_ind.append(0) 288 | col_ind.append(int(rIndex)) 289 | data.append(float(rValue)) 290 | 291 | return sp.csr_matrix((data, (row_ind, col_ind)), (1, self.dims)).astype('float32') 292 | 293 | class StandardDatasetQuantizer(Quantizer): 294 | 295 | def __init__(self, verbose): 296 | self.verbose = verbose 297 | 298 | def quantize(self, line, no_features): 299 | if " " not in line: 300 | classes, sparse = line.strip(), "" 301 | elif line.startswith(' '): 302 | classes, sparse = '', line.strip() 303 | else: 304 | classes, sparse = line.strip().split(None, 1) 305 | 
306 | if classes: 307 | y = list(map(int, classes.split(','))) 308 | else: 309 | y = [] 310 | 311 | if no_features: 312 | return y 313 | 314 | c, d = [], [] 315 | for v in sparse.split(): 316 | loc, v = v.split(":") 317 | c.append(int(loc)) 318 | d.append(float(v)) 319 | 320 | return (c, d), y 321 | 322 | def stream(self, fn, no_features=False): 323 | with open(fn, 'rt') as f: 324 | n_samples, n_feats, n_classes = list(map(int, f.readline().split())) 325 | for i, line in enumerate(f): 326 | if i == 0: 327 | continue 328 | 329 | if self.verbose and i % 10000 == 0: 330 | print("%s docs encoded" % i) 331 | 332 | res = self.quantize(line, no_features) 333 | if no_features: 334 | yield {"labels": res}, res 335 | else: 336 | 337 | (c, d), y = res 338 | yield {"labels": y}, sp.csr_matrix((d, ([0] * len(d), c)), 339 | shape=(1, n_feats), dtype='float32'), y 340 | 341 | class Dataset(object): 342 | def __init__(self, dataset): 343 | self.dataset = dataset 344 | 345 | @property 346 | def model(self): 347 | return os.path.join(self.dataset, 'model') 348 | 349 | @property 350 | def classes(self): 351 | return os.path.join(self.dataset, 'counts') 352 | 353 | @property 354 | def weights(self): 355 | return os.path.join(self.dataset, 'weights') 356 | 357 | 358 | class ClusterDataset(object): 359 | def __init__(self, dataset): 360 | self.dataset = dataset 361 | 362 | def probs(self, i): 363 | return os.path.join(self.dataset, 'probs.%s' % i) 364 | 365 | @property 366 | def clusters(self): 367 | return os.path.join(self.dataset, 'cluster') 368 | 369 | 370 | def quantize(args, quantizer, classes): 371 | cnt = count() 372 | for _, X, ys in quantizer.stream(args.input_file): 373 | nys = [] 374 | for y in ys: 375 | if y not in classes: 376 | classes[y] = y if getattr(args, 'noRemap', False) else next(cnt) 377 | 378 | nys.append(classes[y]) 379 | 380 | yield X, nys 381 | 382 | def quantize_y(args, quantizer, classes): 383 | cnt = count() 384 | for _, ys in quantizer.stream(args.input_file, no_features=True): 385 | nys = [] 386 | for y in ys: 387 | if y not in classes: 388 | classes[y] = y if getattr(args, 'noRemap', False) else next(cnt) 389 | 390 | nys.append(classes[y]) 391 | 392 | yield nys 393 | 394 | def train(args, quantizer): 395 | cnt = count() 396 | classes, X_train, y_train = {}, [], [] 397 | for i, (X, y) in enumerate(quantize(args, quantizer, classes)): 398 | if y: 399 | X_train.append(X) 400 | y_train.append(y) 401 | 402 | elif args.verbose: 403 | print("Skipping example %s since it has no classes matching threshold" % i) 404 | 405 | # Save the mapping 406 | dataset = Dataset(args.model) 407 | if not os.path.isdir(args.model): 408 | os.makedirs(args.model) 409 | 410 | with open(dataset.classes, 'wt') as out: 411 | json.dump(list(classes.items()), out) 412 | 413 | weights = compute_weights(y_train, args.label_weight, args.label_weight_hp) 414 | with open(dataset.weights, 'wt') as out: 415 | for i, w in enumerate(weights): 416 | out.write("%s,%s\n" % (i, w)) 417 | 418 | # Train 419 | clf = Trainer( 420 | n_trees=args.trees, 421 | max_leaf_size=args.max_leaf_size, 422 | max_labels_per_leaf=args.max_labels_per_leaf, 423 | re_split=args.re_split, 424 | alpha=args.alpha, 425 | n_epochs=args.iters, 426 | n_updates=args.n_updates, 427 | bias=args.bias, 428 | subsample=args.subsample, 429 | loss=args.loss, 430 | leaf_classifiers=args.leaf_class, 431 | n_jobs=args.threads, 432 | optimization=args.optimization, 433 | eps=args.eps, 434 | C=args.C, 435 | engine=args.engine, 436 | auto_weight=args.auto_weight, 
437 | verbose=args.verbose 438 | ) 439 | 440 | clf.fit(X_train, y_train, weights=weights) 441 | 442 | clf.save(dataset.model) 443 | 444 | sys.exit(0) 445 | 446 | def compute_weights(y_train, label_weight, hps): 447 | 448 | args = (y_train,) 449 | if hps[0] is not None: 450 | args += tuple(hps) 451 | 452 | if label_weight == 'nnllog': 453 | return nnllog(*args) 454 | elif label_weight == 'uniform': 455 | return uniform(y_train) 456 | elif label_weight == 'propensity': 457 | return propensity(*args) 458 | elif label_weight == 'logexp': 459 | return logexp(*args) 460 | else: 461 | raise NotImplementedError(label_weight) 462 | 463 | def print_metrics(ndcgs, precs, pndcgs, toStderr): 464 | fout = sys.stderr if toStderr else sys.stdout 465 | ndcgT = list(zip(*ndcgs)) 466 | precsT = list(zip(*precs)) 467 | pndcgT = list(zip(*pndcgs)) 468 | for i in range(3): 469 | print('P@{}: {}'.format(2 * i + 1, np.mean(precsT[i])), file=fout) 470 | 471 | for i in range(3): 472 | print('NDCG@{}: {}'.format(2 * i + 1, np.mean(ndcgT[i])), file=fout) 473 | 474 | for i in range(3): 475 | print('pNDCG@{}: {}'.format(2 * i + 1, np.mean(pndcgT[i])), file=fout) 476 | 477 | print(file=fout) 478 | 479 | def loadClasses(dataset): 480 | # Load reverse map 481 | with open(dataset.classes, 'rt') as f: 482 | data = json.load(f) 483 | return {v: k for k, v in data} 484 | 485 | def loadPropensities(dataset): 486 | props = [] 487 | with open(dataset.weights) as f: 488 | for line in f: 489 | props.append(old_div(1, float(line.strip().split(',')[1]))) 490 | 491 | return props 492 | 493 | def inference(args, quantizer): 494 | dataset = Dataset(args.model) 495 | 496 | clf = load_clf(dataset, args) 497 | 498 | classes = loadClasses(dataset) 499 | propensities = loadPropensities(dataset) 500 | 501 | ndcgs = [] 502 | precs = [] 503 | pndcgs = [] 504 | for data, X, y in quantizer.stream(args.input_file): 505 | y_hat = clf.predict(X, 'dict')[0] 506 | yi = islice(iter(y_hat.items()), args.max_predict) 507 | nvals = [[str(classes[k]), float(v)] for k, v in yi] 508 | data['predict'] = dict(nvals) if args.dict else nvals 509 | 510 | if args.score: 511 | ys = set(y) 512 | scores = [] 513 | props = [] 514 | for yii in y_hat.keys(): 515 | props.append(propensities[yii]) 516 | if classes[yii] in ys: 517 | ys.remove(classes[yii]) 518 | scores.append(1) 519 | else: 520 | scores.append(0) 521 | 522 | scores.extend([1] * len(ys)) 523 | 524 | ndcgs.append([ndcg(scores, i) for i in (1, 3, 5)]) 525 | pndcgs.append([pSndcg(scores, props, i) for i in (1, 3, 5)]) 526 | precs.append([precision(scores, i) for i in (1, 3, 5)]) 527 | data['ndcg'] = ndcgs[-1] 528 | data['precision'] = precs[-1] 529 | data['pSndcg'] = pndcgs[-1] 530 | 531 | if len(ndcgs) % 100 == 0: 532 | print("Seen:", len(ndcgs), file=sys.stderr) 533 | print_metrics(ndcgs, precs, pndcgs, not args.scoreOnly) 534 | 535 | if not args.scoreOnly: 536 | print(json.dumps(data)) 537 | 538 | if args.score: 539 | print_metrics(ndcgs, precs, pndcgs, not args.scoreOnly) 540 | 541 | def cluster(args, quantizer): 542 | 543 | cluster_dataset = ClusterDataset(args.model) 544 | if not os.path.exists(args.model): 545 | os.makedirs(args.model) 546 | 547 | # We only need classes for the clustering 548 | classes, y_train = {}, [] 549 | for y in quantize_y(args, quantizer, classes): 550 | y_train.append(y) 551 | 552 | classes = {v: k for k, v in classes.items()} 553 | 554 | weights = compute_weights(y_train, args.label_weight, args.label_weight_hp) 555 | trees = [] 556 | for i in range(args.trees): 557 | tree = 
metric_cluster(y_train, weights=weights, 558 | max_leaf_size=args.max_leaf_size, 559 | seed=2016 + i, verbose=args.verbose) 560 | 561 | d = tree.build_discrete() 562 | p = tree.build_probs(y_train) 563 | with open(cluster_dataset.probs(i), 'wt') as out: 564 | for i, pi in enumerate(p): 565 | x = {classes[l]: round(ps, 3) for l, ps in pi.items()} 566 | print(json.dumps(x), file=out) 567 | 568 | td = {idx: tn for tn, idxs in d for idx in idxs} 569 | trees.append(td) 570 | 571 | with open(cluster_dataset.clusters, 'wt') as out: 572 | for i in range(len(y_train)): 573 | cluster = [t[i] for t in trees] 574 | print(json.dumps(cluster), file=out) 575 | 576 | def load_clf(dataset, args): 577 | 578 | clf = Inferencer(dataset.model) 579 | 580 | if args.blend_factor is not None: 581 | clf.blend = args.blend_factor 582 | 583 | if args.gamma is not None: 584 | clf.gamma = args.gamma 585 | 586 | if args.leafProbs is not None: 587 | clf.leaf_probs = args.leafProbs 588 | 589 | return clf 590 | 591 | def repl(args, quantizer): 592 | dataset = Dataset(args.model) 593 | 594 | clf = load_clf(dataset, args) 595 | 596 | classes = loadClasses(dataset) 597 | 598 | try: 599 | while True: 600 | title = input("> ") 601 | X = quantizer.quantize(title) 602 | y_hat = clf.predict(X, 'dict')[0] 603 | yi = islice(iter(y_hat.items()), args.max_predict) 604 | nvals = [[str(classes[k]), v] for k, v in yi] 605 | pprint.pprint(nvals) 606 | 607 | except KeyboardInterrupt: 608 | pass 609 | 610 | if __name__ == '__main__': 611 | args = build_arg_parser().parse_args() 612 | # Quantize 613 | if args.standardDataset: 614 | quantizer = StandardDatasetQuantizer(args.verbose) 615 | elif args.preGen is not None: 616 | mlc = args.mlc if args.command == 'train' else 1 617 | quantizer = PregenQuantizer(args.verbose, mlc, args.preGen, args.command == 'inference') 618 | else: 619 | mlc = args.mlc if args.command == 'train' else 1 620 | quantizer = JsonQuantizer(args.verbose, mlc, args.command == 'inference') 621 | 622 | if args.command == 'train': 623 | train(args, quantizer) 624 | elif args.command == 'inference': 625 | inference(args, quantizer) 626 | elif args.command == 'repl': 627 | repl(args, quantizer) 628 | elif args.command == 'cluster': 629 | cluster(args, quantizer) 630 | -------------------------------------------------------------------------------- /fastxml/__init__.py: -------------------------------------------------------------------------------- 1 | from .fastxml import Inferencer 2 | from .trainer import Trainer, metric_cluster 3 | -------------------------------------------------------------------------------- /fastxml/fastxml.py: -------------------------------------------------------------------------------- 1 | from builtins import range 2 | from builtins import object 3 | import os 4 | import json 5 | from collections import OrderedDict 6 | 7 | import scipy.sparse as sp 8 | 9 | from .inferencer import IForest, LeafComputer, Blender, IForestBlender 10 | 11 | class Inferencer(object): 12 | """ 13 | Loads up a model for inferencing 14 | """ 15 | def __init__(self, dname, gamma=30, blend=0.8, leaf_probs=False): 16 | with open(os.path.join(dname, 'settings'), 'rt') as f: 17 | self.__dict__.update(json.load(f)) 18 | 19 | self.gamma = gamma 20 | self.blend = blend 21 | self.leaf_probs = leaf_probs 22 | 23 | forest = IForest(dname, self.n_trees, self.n_labels) 24 | if self.leaf_classifiers: 25 | lc = LeafComputer(dname) 26 | predictor = Blender(forest, lc) 27 | else: 28 | predictor = IForestBlender(forest) 29 | 30 | 
self.predictor = predictor 31 | 32 | def predict(self, X, fmt='sparse'): 33 | assert fmt in ('sparse', 'dict') 34 | s = [] 35 | num = X.shape[0] if isinstance(X, sp.csr_matrix) else len(X) 36 | for i in range(num): 37 | Xi = X[i] 38 | mean = self.predictor.predict(Xi.data, Xi.indices, 39 | self.blend, self.gamma, self.leaf_probs) 40 | 41 | if fmt == 'sparse': 42 | s.append(mean) 43 | 44 | else: 45 | od = OrderedDict() 46 | for idx in reversed(mean.data.argsort()): 47 | od[mean.indices[idx]] = mean.data[idx] 48 | 49 | s.append(od) 50 | 51 | if fmt == 'sparse': 52 | return sp.vstack(s) 53 | 54 | return s 55 | 56 | -------------------------------------------------------------------------------- /fastxml/inferencer.pyx: -------------------------------------------------------------------------------- 1 | #cython: boundscheck=False, wraparound=False 2 | 3 | import numpy as np 4 | import scipy.sparse as sp 5 | import struct 6 | 7 | cimport cython 8 | cimport numpy as np 9 | 10 | from libc.math cimport log, abs, exp, pow 11 | from cython.operator cimport dereference as deref, preincrement as inc 12 | from libcpp.algorithm cimport sort as stdsort 13 | from libcpp.unordered_map cimport unordered_map 14 | from libcpp cimport bool 15 | from libcpp.vector cimport vector 16 | from libcpp.pair cimport pair 17 | 18 | ctypedef pair[int,float] DP 19 | ctypedef vector[DP] SR 20 | ctypedef vector[SR] CSR 21 | ctypedef vector[vector[float]] DENSE 22 | 23 | cdef object sr_to_sparse(const SR& sr, const int size): 24 | cdef int count = sr.size() 25 | 26 | cdef np.ndarray[np.int32_t] ip = np.zeros(2, dtype=np.int32) 27 | cdef np.ndarray[np.int32_t] c = np.zeros(count, dtype=np.int32) 28 | cdef np.ndarray[np.float32_t] d = np.zeros(count, dtype=np.float32) 29 | 30 | cdef int [:] cv = c 31 | cdef float [:] dv = d 32 | cdef pair[int,float] p 33 | for i in range(count): 34 | p = sr[i] 35 | cv[i] = p.first 36 | dv[i] = p.second 37 | 38 | cv = ip 39 | cv[1] = count 40 | 41 | csr = sp.csr_matrix((1, size), dtype='float32') 42 | csr.indptr = ip 43 | csr.indices = c 44 | csr.data = d 45 | 46 | return csr 47 | 48 | cdef float dot(const SR& x, const SR& w, const float bias): 49 | cdef int xidx = 0, widx = 0, xi, wi 50 | cdef int x_s = x.size(), w_s = w.size() 51 | cdef float tally = 0.0 52 | 53 | while xidx < x_s and widx < w_s: 54 | xi = x[xidx].first 55 | wi = w[widx].first 56 | if xi < wi: 57 | xidx += 1 58 | elif xi > wi: 59 | widx += 1 60 | else: 61 | tally += x[xidx].second * w[widx].second 62 | xidx += 1 63 | widx += 1 64 | 65 | return tally + bias 66 | 67 | cdef SR convert_to_sr(const int [:] indices, const float [:] data, const int size): 68 | cdef SR sparse 69 | cdef pair[int,float] p 70 | cdef int i 71 | 72 | for i in range(size): 73 | p = pair[int,float]() 74 | p.first = indices[i] 75 | p.second = data[i] 76 | sparse.push_back(p) 77 | 78 | return sparse 79 | 80 | cdef SR sparse_sr_mean(const vector[SR*] probs, SR& averaged): 81 | cdef unordered_map[int,float] summer 82 | cdef SR* sr 83 | cdef int i,k 84 | 85 | # Copy srs into vector 86 | for i in range(probs.size()): 87 | sr = probs[i] 88 | for k in range(deref(sr).size()): 89 | summer[deref(sr)[k].first] += deref(sr)[k].second 90 | 91 | # Copy it into a new vector, averaging the values 92 | cdef unordered_map[int,float].iterator b = summer.begin() 93 | cdef unordered_map[int,float].iterator e = summer.end() 94 | 95 | cdef DP val 96 | while b != e: 97 | val = deref(b) 98 | val.second = val.second / probs.size() 99 | averaged.push_back(val) 100 | inc(b) 101 | 102 
| stdsort(averaged.begin(), averaged.end()) 103 | 104 | return averaged 105 | 106 | cdef object read_row(object f, str type): 107 | d = f.read(struct.calcsize('I')) 108 | if not d: 109 | return None 110 | 111 | # Get size of row, and unpack entire pair set 112 | num, = struct.unpack("I", d) 113 | d2 = f.read(num * struct.calcsize(type)) 114 | 115 | return struct.unpack(type * num, d2) 116 | 117 | cdef void load_sparse(str fname, CSR& csr): 118 | cdef SR row 119 | cdef DP p 120 | cdef int i 121 | with open(fname, 'rb') as f: 122 | while True: 123 | values = read_row(f, 'If') 124 | if values is None: 125 | break 126 | 127 | row = vector[DP]() 128 | for i in range(0, len(values), 2): 129 | p.first = values[i] 130 | p.second = values[i+1] 131 | row.push_back(p) 132 | 133 | csr.push_back(row) 134 | 135 | cdef load_dense_f32(str fname, DENSE& dense): 136 | cdef vector[float] row 137 | cdef int i 138 | with open(fname, 'rb') as f: 139 | while True: 140 | values = read_row(f, 'f') 141 | if values is None: 142 | break 143 | 144 | row = vector[float]() 145 | 146 | # Get size of row, and unpack floats 147 | for i in range(0, len(values)): 148 | row.push_back(values[i]) 149 | 150 | dense.push_back(row) 151 | 152 | cdef load_dense_int(str fname, vector[vector[int]]& dense): 153 | cdef vector[int] row 154 | cdef int i 155 | with open(fname, 'rb') as f: 156 | 157 | while True: 158 | values = read_row(f, 'I') 159 | if values is None: 160 | break 161 | 162 | row = vector[int]() 163 | 164 | for i in range(0, len(values)): 165 | row.push_back(values[i]) 166 | 167 | dense.push_back(row) 168 | 169 | cdef class Blender: 170 | cdef IForest forest 171 | cdef LeafComputer lc 172 | 173 | def __init__(self, IForest forest, LeafComputer lc): 174 | self.forest = forest 175 | self.lc = lc 176 | 177 | def predict(self, np.ndarray[np.float32_t] data, 178 | np.ndarray[np.int32_t] indices, 179 | const float blend, 180 | const float gamma, 181 | const bool keep_probs=False): 182 | cdef SR sr = convert_to_sr(indices, data, data.shape[0]) 183 | cdef SR tree_probs 184 | 185 | # Get tree probs 186 | self.forest._predict(sr, tree_probs) 187 | 188 | # If blend == 1.0, we're done 189 | if blend == 1.0: 190 | return sr_to_sparse(tree_probs, self.forest.n_labels) 191 | 192 | cdef SR leaf_probs 193 | cdef vector[int] labels 194 | cdef int i 195 | 196 | # Build the indices 197 | for i in range(tree_probs.size()): 198 | labels.push_back(tree_probs[i].first) 199 | 200 | # Compute leaf classifier 201 | self.lc.predict(sr, labels, gamma, leaf_probs) 202 | 203 | cdef SR res 204 | self._blend(tree_probs, leaf_probs, blend, keep_probs, res) 205 | return sr_to_sparse(res, self.forest.n_labels) 206 | 207 | cdef void _blend(self, const SR& tree_probs, 208 | const SR& leaf_probs, 209 | const float blend, 210 | const bool keep_probs, 211 | SR& out): 212 | cdef int i 213 | cdef DP tp, lp, t 214 | for i in range(tree_probs.size()): 215 | tp = tree_probs[i] 216 | lp = leaf_probs[i] 217 | if keep_probs: 218 | tp.second *= blend 219 | lp.second *= (1 - blend) 220 | else: 221 | tp.second = log(tp.second) * blend 222 | lp.second = log(lp.second) * (1 - blend) 223 | 224 | t.first = tp.first 225 | t.second = tp.second + lp.second 226 | out.push_back(t) 227 | 228 | cdef class IForestBlender: 229 | cdef IForest forest 230 | 231 | def __init__(self, IForest forest): 232 | self.forest = forest 233 | 234 | def predict(self, np.ndarray[np.float32_t] data, 235 | np.ndarray[np.int32_t] indices, 236 | const float blend, 237 | const float gamma, 238 | const bool 
keep_probs=False): 239 | 240 | return self.forest.predict(data, indices) 241 | 242 | cdef class IForest: 243 | cdef list trees 244 | cdef int n_labels 245 | 246 | def __init__(self, str dname, int trees, int n_labels): 247 | self.n_labels = n_labels 248 | self.trees = [] 249 | cdef int i 250 | for i in range(trees): 251 | self.trees.append(ITree(dname, i)) 252 | 253 | cdef SR* _predict_tree(self, const SR& sr, ITree t): 254 | return t.predict_payload(sr) 255 | 256 | cdef void _predict(self, const SR& sr, SR& payload): 257 | cdef vector[SR*] prob_set 258 | cdef SR* res 259 | for t in self.trees: 260 | res = self._predict_tree(sr, t) 261 | prob_set.push_back(res) 262 | 263 | sparse_sr_mean(prob_set, payload) 264 | 265 | def predict(self, np.ndarray[np.float32_t] data, np.ndarray[np.int32_t] indices): 266 | cdef SR sr = convert_to_sr(indices, data, data.shape[0]) 267 | cdef SR payload 268 | 269 | self._predict(sr, payload) 270 | return sr_to_sparse(payload, self.n_labels) 271 | 272 | cdef class ITree: 273 | cdef int rootIdx 274 | cdef vector[vector[int]] tree 275 | cdef CSR payloads 276 | 277 | cdef CSR W 278 | 279 | cdef vector[float] bias 280 | 281 | def __init__(self, str dname, int tree_idx) : 282 | 283 | p = dname.rstrip('/') + '/tree.%s' % tree_idx 284 | 285 | # Load Sparse into W points 286 | load_sparse(p + '.weights', self.W) 287 | 288 | # Load bias 289 | cdef DENSE tmp 290 | load_dense_f32(p + '.bias', tmp) 291 | self.bias.swap(tmp[0]) 292 | 293 | # Load Tree 294 | load_dense_int(p + '.tree', self.tree) 295 | 296 | # Load Payloads 297 | load_sparse(p + '.probs', self.payloads) 298 | 299 | self.rootIdx = self.tree.size() - 1 300 | 301 | cdef SR* predict_payload(self, const SR& sr): 302 | cdef int idx = self.predict_sr(sr) 303 | return &self.payloads[idx] 304 | 305 | cdef inline int index(self, const vector[int]& node): 306 | return node[0] 307 | 308 | cdef inline int left(self, const vector[int]& node): 309 | return node[1] 310 | 311 | cdef inline int right(self, const vector[int]& node): 312 | return node[2] 313 | 314 | cdef inline bool is_leaf(self, const vector[int]& node): 315 | return node[3] == 1 316 | 317 | cdef int predict_sr(self, const SR& data): 318 | cdef unsigned int index, nIndex 319 | cdef vector[int] node 320 | cdef float d 321 | cdef SR* W 322 | 323 | node = self.tree[self.rootIdx] 324 | while not self.is_leaf(node): 325 | index = self.index(node) 326 | W = &self.W[index] 327 | d = dot(data, deref(W), self.bias[index]) 328 | if d < 0: 329 | nIndex = self.left(node) 330 | else: 331 | nIndex = self.right(node) 332 | 333 | node = self.tree[nIndex] 334 | 335 | return self.index(node) 336 | 337 | cdef class LeafComputer: 338 | cdef vector[float] norms 339 | cdef vector[float] radii 340 | cdef CSR means 341 | 342 | def __init__(self, str dname): 343 | p = dname.rstrip('/') + '/lc' 344 | 345 | # Load norms 346 | cdef DENSE tmp 347 | load_dense_f32(p + '.norms', tmp) 348 | self.norms.swap(tmp[0]) 349 | 350 | # Load bias 351 | cdef DENSE tmp2 352 | load_dense_f32(p + '.radii', tmp2) 353 | self.radii.swap(tmp2[0]) 354 | 355 | # Load means 356 | load_sparse(p + '.means', self.means) 357 | 358 | cdef void predict(self, const SR& X, const vector[int] ys, const float gamma, SR& out): 359 | cdef SR normed 360 | cdef int yi, i 361 | cdef DP p 362 | cdef float dist 363 | cdef SR* mean 364 | 365 | # Norm the vector 366 | norm(self.norms, X, normed) 367 | 368 | # Loop over each class, determining the leaf classifier vlaues 369 | for i in range(ys.size()): 370 | yi = ys[i] 371 | mean = 
&self.means[yi] 372 | dist = radius_sr(normed, deref(mean)) 373 | k = exp(gamma * (dist - self.radii[yi])) 374 | p.first = i 375 | p.second = 1. / (1. + k) 376 | out.push_back(p) 377 | 378 | cdef void norm(const vector[float]& norms, const SR& X, SR& normed): 379 | cdef int i 380 | cdef float l2 = 0 381 | cdef DP p 382 | 383 | # Column norm and compute l2 norm 384 | for i in range(X.size()): 385 | p = X[i] 386 | p.second /= norms[p.first] 387 | l2 += p.second * p.second 388 | normed.push_back(p) 389 | 390 | # Divide out the l2 norm 391 | l2 = pow(l2, .5) 392 | for i in range(normed.size()): 393 | normed[i].second = normed[i].second / l2 394 | 395 | 396 | cdef float radius_sr(const SR& xi, const SR& ui): 397 | """ 398 | Computes the Sum((Xi - Ux) ** 2) 399 | """ 400 | 401 | cdef int xidx = 0, uidx = 0 402 | cdef int s1 = xi.size(), s2 = ui.size() 403 | cdef double tally = 0.0, diff 404 | cdef DP xp, up 405 | 406 | while xidx < s1 and uidx < s2: 407 | xp = xi[xidx] 408 | up = ui[uidx] 409 | if xp.first < up.first: 410 | tally += pow(xp.second, 2) 411 | xidx += 1 412 | elif xp.first > up.first: 413 | tally += pow(up.second, 2) 414 | uidx += 1 415 | else: 416 | diff = xp.second - up.second 417 | tally += pow(diff, 2) 418 | xidx += 1 419 | uidx += 1 420 | 421 | # Get the remainder 422 | while xidx < s1 or uidx < s2: 423 | if xidx < s1: 424 | tally += pow(xi[xidx].second, 2) 425 | xidx += 1 426 | else: 427 | tally += pow(ui[uidx].second, 2) 428 | uidx += 1 429 | 430 | return tally 431 | 432 | -------------------------------------------------------------------------------- /fastxml/metrics.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from builtins import range 3 | from past.utils import old_div 4 | import math 5 | 6 | def precision(scores, k): 7 | return old_div(sum(scores[:k]), float(k)) 8 | 9 | def dcg(scores, k=None): 10 | if k is not None: 11 | scores = scores[:k] 12 | 13 | return sum(old_div(rl, math.log(i + 2)) for i, rl in enumerate(scores)) 14 | 15 | def ndcg(scores, k=None, eps=1e-6): 16 | idcgs = dcg(sorted(scores, reverse=True), k) 17 | if idcgs < eps: 18 | return 0.0 19 | 20 | dcgs = dcg(scores, k) 21 | 22 | return old_div(dcgs, idcgs) 23 | 24 | def pSdcg(scores, props, k=None): 25 | if k is not None: 26 | scores = scores[:k] 27 | 28 | k = 0 29 | for i, rl in enumerate(scores): 30 | p = props[i] if i < len(props) else 1 31 | k += old_div(rl, (p * math.log(i + 2))) 32 | 33 | return k 34 | 35 | def pSndcg(scores, props, k=None): 36 | dcgs = pSdcg(scores, props, k) 37 | 38 | denom = sum(old_div(1., math.log(i + 2)) for i in range(k or len(scores))) 39 | 40 | return old_div(dcgs, denom) 41 | 42 | 43 | -------------------------------------------------------------------------------- /fastxml/proc.py: -------------------------------------------------------------------------------- 1 | from builtins import object 2 | import multiprocessing 3 | 4 | class Result(object): 5 | 6 | def ready(self): 7 | raise NotImplementedError() 8 | 9 | def get(self): 10 | raise NotImplementedError() 11 | 12 | class ForkResult(Result): 13 | def __init__(self, queue, p): 14 | self.queue = queue 15 | self.p = p 16 | 17 | def ready(self): 18 | return self.p.is_alive() 19 | 20 | def get(self): 21 | result = self.queue.get() 22 | self.p.join() 23 | self.queue.close() 24 | return result 25 | 26 | class SingleResult(Result): 27 | def __init__(self, res): 28 | self.res = res 29 | 30 | def ready(self): 31 | return True 32 | 33 | def get(self): 
34 | return self.res 35 | 36 | def _remote_call(q, f, args): 37 | results = f(*args) 38 | q.put(results) 39 | 40 | def faux_fork_call(f): 41 | def f2(*args): 42 | return SingleResult(f(*args)) 43 | 44 | return f2 45 | 46 | def fork_call(f): 47 | def f2(*args): 48 | queue = multiprocessing.Queue(1) 49 | p = multiprocessing.Process(target=_remote_call, args=(queue, f, args)) 50 | p.start() 51 | return ForkResult(queue, p) 52 | 53 | return f2 54 | 55 | -------------------------------------------------------------------------------- /fastxml/splitter.pyx: -------------------------------------------------------------------------------- 1 | #cython: boundscheck=False, wraparound=False, initializedcheck=False 2 | 3 | from collections import defaultdict 4 | import numpy as np 5 | import scipy.sparse as sp 6 | 7 | cimport cython 8 | cimport numpy as np 9 | 10 | from libc.math cimport log, abs, exp, pow 11 | from cython.operator cimport dereference as deref, preincrement as inc 12 | from libcpp.unordered_map cimport unordered_map 13 | from libcpp cimport bool 14 | from libcpp.algorithm cimport sort as stdsort 15 | from libcpp.vector cimport vector 16 | from libcpp.pair cimport pair 17 | 18 | ctypedef pair[vector[int],vector[int]] LR_SET 19 | ctypedef pair[int,int] I_PAIR 20 | ctypedef vector[I_PAIR] COUNTER 21 | ctypedef vector[vector[int]] YSET 22 | ctypedef vector[pair[int,float]] SR 23 | ctypedef vector[SR] CSR 24 | 25 | @cython.profile(False) 26 | cdef bool sort_pairs(const I_PAIR& l, const I_PAIR& r): 27 | if l.second > r.second: 28 | return True 29 | 30 | if l.second < r.second: 31 | return False 32 | 33 | return l.first > r.first 34 | 35 | cdef void copy_into(vector[int]& dest, vector[int]& src1): 36 | for i in range(src1.size()): 37 | dest.push_back(src1[i]) 38 | 39 | cdef void replace_vecs(vector[int]& dest, vector[int]& src1, vector[int]& src2): 40 | dest.swap(src1) 41 | copy_into(dest, src2) 42 | 43 | cdef inline void dcg(const vector[float]& ord_left, const vector[float]& ord_right, const vector[int]& ls, pair[float,float]& p): 44 | """ 45 | We only need to use DCG since we're only using it to determine which partition 46 | bucket the label set in 47 | """ 48 | cdef int i, l 49 | cdef float sl = 0, sr = 0 50 | for i in range(ls.size()): 51 | l = ls[i] 52 | sl += ord_left[l] 53 | sr += ord_right[l] 54 | 55 | p.first = sl 56 | p.second = sr 57 | 58 | cdef class NDCGSplitter: 59 | cdef void order_labels(self, const vector[int]& idxs, const YSET& yset, 60 | const vector[float]& weights, vector[float]& p_logs, vector[float]& logs): 61 | return 62 | 63 | cdef class DenseNDCGSplitter(NDCGSplitter): 64 | cdef int n_labels 65 | cdef vector[I_PAIR] counter 66 | 67 | def __init__(self, const int n_labels): 68 | 69 | cdef pair[int,int] p 70 | p.first = p.second = 0 71 | 72 | # Variable for NDCG sorting 73 | self.counter = vector[I_PAIR](n_labels, p) 74 | 75 | cdef void count_labels(self, const vector[int]& idxs, const YSET& yset): 76 | 77 | cdef int offset, yi, i, label 78 | cdef vector[int] ys 79 | 80 | # Clear the counter 81 | for i in range(self.counter.size()): 82 | self.counter[i].first = i 83 | self.counter[i].second = 0 84 | 85 | for i in range(idxs.size()): 86 | offset = idxs[i] 87 | ys = yset[offset] 88 | for yi in range(ys.size()): 89 | label = ys[yi] 90 | self.counter[label].second += 1 91 | 92 | return 93 | 94 | cdef void sort_counter(self): 95 | # Since this is potentially very sparse, we do a single pass moving non-empty 96 | # pairs to the front of counter 97 | cdef pair[int,int] tmp 
98 | cdef size_t i = 0, j = self.counter.size() - 1 99 | while i < j: 100 | if self.counter[i].second > 0: 101 | i += 1 102 | elif self.counter[j].second == 0: 103 | j -= 1 104 | else: 105 | # swap 106 | tmp = self.counter[i] 107 | self.counter[i] = self.counter[j] 108 | self.counter[j] = tmp 109 | i += 1 110 | j -= 1 111 | 112 | # Partial sort only up to i 113 | stdsort(self.counter.begin(), 114 | self.counter.begin() + i + 1, 115 | &sort_pairs) 116 | 117 | cdef void order_labels(self, const vector[int]& idxs, const YSET& yset, 118 | const vector[float]& weights, vector[float]& p_logs, vector[float]& logs): 119 | 120 | cdef int i, label 121 | cdef float w 122 | cdef pair[int,int] ord 123 | 124 | # Clean and copy 125 | self.count_labels(idxs, yset) 126 | 127 | # Sort the results 128 | self.sort_counter() 129 | 130 | for i in range(self.counter.size()): 131 | ord = self.counter[i] 132 | label = ord.first 133 | if ord.second == 0: 134 | break 135 | else: 136 | w = weights[label] 137 | logs[label] = p_logs[i] * w 138 | 139 | for l in range(i, self.counter.size()): 140 | logs[self.counter[l].first] = 0.0 141 | 142 | return 143 | 144 | cdef class SparseNDCGSplitter(NDCGSplitter): 145 | cdef int n_labels 146 | cdef vector[I_PAIR] sorter 147 | cdef unordered_map[int,int] counter 148 | 149 | def __init__(self, const int n_labels): 150 | pass 151 | 152 | cdef void count_labels(self, const vector[int]& idxs, const YSET& yset): 153 | cdef int offset, yi, i, label 154 | cdef vector[int] ys 155 | 156 | self.counter.clear() 157 | for i in range(idxs.size()): 158 | offset = idxs[i] 159 | ys = yset[offset] 160 | for yi in range(ys.size()): 161 | label = ys[yi] 162 | inc(self.counter[label]) 163 | 164 | cdef void sort_counter(self): 165 | 166 | # Copy it into a new vector 167 | cdef unordered_map[int,int].iterator b = self.counter.begin() 168 | cdef unordered_map[int,int].iterator e = self.counter.end() 169 | 170 | self.sorter.clear() 171 | 172 | while b != e: 173 | self.sorter.push_back(deref(b)) 174 | inc(b) 175 | 176 | stdsort(self.sorter.begin(), self.sorter.end(), &sort_pairs) 177 | 178 | cdef void fill(self, vector[float]& k): 179 | cdef int size = k.size() 180 | for i in range(size): 181 | k[i] = 0.0 182 | 183 | cdef void order_labels(self, const vector[int]& idxs, const YSET& yset, 184 | const vector[float]& weights, vector[float]& p_logs, vector[float]& logs): 185 | cdef int i, label 186 | cdef float w 187 | cdef pair[int,int] ord 188 | 189 | # Clean and copy 190 | self.count_labels(idxs, yset) 191 | 192 | # No access to std::fill, so write it yourself 193 | self.fill(logs) 194 | 195 | # Sort the results 196 | self.sort_counter() 197 | 198 | for i in range(self.sorter.size()): 199 | ord = self.sorter[i] 200 | label = ord.first 201 | w = weights[label] 202 | logs[label] = p_logs[i] * w 203 | 204 | return 205 | 206 | cdef class Splitter: 207 | cdef vector[int] left, right 208 | cdef LR_SET newLeft, newRight 209 | 210 | cdef int n_labels, max_iters 211 | cdef float sparse_multiple 212 | 213 | cdef NDCGSplitter dense 214 | cdef NDCGSplitter sparse 215 | 216 | cdef vector[float] lOrder, rOrder, weights, logs 217 | cdef vector[vector[int]] yset 218 | 219 | def __init__(self, list y, 220 | np.ndarray[np.float32_t] ws, 221 | const float sparse_multiple, 222 | const int max_iters=50): 223 | 224 | # Initialize counters 225 | cdef pair[int,int] p 226 | p.first = p.second = 0 227 | 228 | cdef int n_labels = ws.shape[0] 229 | self.n_labels = n_labels 230 | 231 | # Variable for NDCG sorting 232 | 
self.sparse_multiple = sparse_multiple 233 | self.dense = DenseNDCGSplitter(n_labels) 234 | self.sparse = SparseNDCGSplitter(n_labels) 235 | 236 | # ndcg cache 237 | self.lOrder = vector[float](n_labels, 0.0) 238 | self.rOrder = vector[float](n_labels, 0.0) 239 | 240 | self.max_iters = max_iters 241 | 242 | self._init_ys(y, n_labels) 243 | self._init_weights(ws, n_labels) 244 | 245 | cdef void _init_ys(self, list y, const int n_labels): 246 | cdef list ys 247 | cdef int yi 248 | cdef vector[int] y_set 249 | 250 | for ys in y: 251 | y_set = vector[int]() 252 | for yi in ys: 253 | if yi > n_labels - 1: 254 | raise Exception("Y label out of bounds") 255 | 256 | y_set.push_back(yi) 257 | 258 | self.yset.push_back(y_set) 259 | 260 | cdef void _init_weights(self, const float [:] ws, const int size): 261 | cdef int i 262 | 263 | self.weights.reserve(size) 264 | self.logs.reserve(size) 265 | 266 | prev = 0.0 267 | for i in range(size): 268 | self.weights.push_back(ws[i]) 269 | self.logs.push_back(1 / (i + 2.0) ) 270 | 271 | @property 272 | def max_label(self): 273 | return self.n_labels 274 | 275 | cdef bool use_sparse(self, const float ratio): 276 | """ 277 | Sparse and Dense use different methods for computing ndcg scores: 278 | 279 | Dense writes a pair vector for label,count and then sorts that vector. This 280 | can be very expensive if the total number of labels is high but the expected 281 | number of labels is low. A big part of this cost comes from zeroing out the 282 | counts array every pass. 283 | 284 | Sparse uses a hashmap to keep the counts. Its speed up comes from not 285 | having to preallocate the count vector or sort the entire vector set. 286 | 287 | """ 288 | cdef int k = (ratio * self.n_labels) 289 | cdef float klogk = k * log(k) / log(2) 290 | return (klogk + self.n_labels) > (self.sparse_multiple * klogk) 291 | 292 | cdef void resevoir_split(self, list idxs, vector[int]& left, vector[int]& right, object rs): 293 | """ 294 | We use sampling to guarantee both left and right sides have exactly half the 295 | items, with the P(left|X) == 0.5 296 | """ 297 | cdef int i = 0 298 | cdef int idx 299 | cdef int size = len(idxs) 300 | cdef int half = size / 2 301 | 302 | if half < 2: 303 | for i in range(len(idxs)): 304 | left.push_back(idxs[i]) 305 | return 306 | 307 | # Initialize counters 308 | for i in range(half): 309 | left.push_back(idxs[i]) 310 | 311 | for i in range(half, size): 312 | idx = rs.randint(0, i) 313 | if idx < half: 314 | right.push_back(left[idx]) 315 | left[idx] = idxs[i] 316 | else: 317 | right.push_back(idxs[i]) 318 | 319 | def split_node(self, list idxs, rs): 320 | cdef vector[int] left, right 321 | cdef LR_SET newLeft, newRight 322 | cdef NDCGSplitter splitter 323 | cdef int i 324 | 325 | # Initialize counters 326 | self.resevoir_split(idxs, left, right, rs) 327 | 328 | cdef float ratio = (left.size() + right.size()) / self.yset.size() 329 | if self.use_sparse(ratio): 330 | splitter = self.sparse 331 | else: 332 | splitter = self.dense 333 | 334 | for idx in range(self.max_iters): 335 | 336 | # Build ndcg for the sides 337 | splitter.order_labels(left, self.yset, self.weights, self.logs, self.lOrder) 338 | splitter.order_labels(right, self.yset, self.weights, self.logs, self.rOrder) 339 | 340 | # Divide out the sides 341 | newLeft = self.divide(left, True) 342 | newRight = self.divide(right, False) 343 | if newLeft.second.empty() and newRight.first.empty(): 344 | # Done! 
345 | break 346 | 347 | replace_vecs(left, newLeft.first, newRight.first) 348 | replace_vecs(right, newLeft.second, newRight.second) 349 | 350 | return left, right 351 | 352 | cdef LR_SET divide(self, const vector[int]& idxs, const bool is_left): 353 | cdef vector[int] newLeft, newRight 354 | cdef int i, idx 355 | cdef float lNdcg, rNdcg 356 | cdef LR_SET empty 357 | cdef vector[int] ys 358 | cdef pair[float,float] dcg_out 359 | 360 | for i in range(idxs.size()): 361 | idx = idxs[i] 362 | ys = self.yset[idx] 363 | dcg(self.lOrder, self.rOrder, ys, dcg_out) 364 | if dcg_out.first > dcg_out.second: 365 | newLeft.push_back(idx) 366 | elif dcg_out.first < dcg_out.second: 367 | newRight.push_back(idx) 368 | elif is_left: 369 | newLeft.push_back(idx) 370 | else: 371 | newRight.push_back(idx) 372 | 373 | lNdcg += dcg_out.first 374 | rNdcg += dcg_out.second 375 | 376 | empty.first = newLeft 377 | empty.second = newRight 378 | return empty 379 | 380 | def sparsify(np.ndarray[np.float64_t, ndim=2] dense, float eps=1e-6): 381 | """ 382 | More work speeding up common operations that at large N add up to real time 383 | """ 384 | cdef double [:, :] npv = dense 385 | cdef int i, count = 0 386 | cdef double n 387 | cdef vector[int] col 388 | cdef vector[float] data 389 | 390 | for i in range(npv.shape[1]): 391 | n = npv[0,i] 392 | if abs(n) > eps: 393 | count += 1 394 | data.push_back(n) 395 | col.push_back(i) 396 | 397 | cdef np.ndarray[np.int32_t] ip = np.zeros(2, dtype=np.int32) 398 | cdef np.ndarray[np.int32_t] c = np.zeros(count, dtype=np.int32) 399 | cdef np.ndarray[np.float32_t] d = np.zeros(count, dtype=np.float32) 400 | 401 | cdef int [:] cv = c 402 | cdef float [:] dv = d 403 | for i in range(count): 404 | cv[i] = col[i] 405 | dv[i] = data[i] 406 | 407 | cv = ip 408 | cv[1] = count 409 | 410 | csr = sp.csr_matrix((1, npv.shape[1]), dtype='float32') 411 | csr.indptr = ip 412 | csr.indices = c 413 | csr.data = d 414 | 415 | return csr 416 | 417 | cdef double radius2(const int [:] xi, const double [:] xd, 418 | const int [:] ui, const double [:] ud, 419 | const int s1, const int s2): 420 | """ 421 | Computes the Sum((Xi - Ux) ** 2) 422 | """ 423 | 424 | cdef int xidx = 0, uidx = 0 425 | cdef int xcol, ucol 426 | cdef double tally = 0.0, diff 427 | 428 | while xidx < s1 and uidx < s2: 429 | xcol = xi[xidx] 430 | ucol = ui[uidx] 431 | if xcol < ucol: 432 | tally += pow(xd[xidx], 2) 433 | xidx += 1 434 | elif xcol > ucol: 435 | tally += pow(ud[uidx], 2) 436 | uidx += 1 437 | else: 438 | diff = (xd[xidx] - ud[uidx]) 439 | tally += pow(diff, 2) 440 | xidx += 1 441 | uidx += 1 442 | 443 | # Get the remainder 444 | while xidx < s1 or uidx < s2: 445 | if xidx < s1: 446 | tally += xd[xidx] * xd[xidx] 447 | xidx += 1 448 | else: 449 | tally += ud[uidx] * ud[uidx] 450 | uidx += 1 451 | 452 | return tally 453 | 454 | def radius(np.ndarray[np.double_t] Xid, np.ndarray[np.int32_t] Xii, 455 | np.ndarray[np.double_t] uid, np.ndarray[np.int32_t] uii): 456 | 457 | return radius2(Xii, Xid, uii, uid, Xii.shape[0], uii.shape[0]) 458 | 459 | def sparse_mean_64(list xs, np.ndarray[np.double_t] ret): 460 | cdef int i, k 461 | cdef int [:] indices 462 | cdef double [:] data 463 | cdef double [:] r = ret 464 | for i in range(len(xs)): 465 | x = xs[i] 466 | indices = x.indices 467 | data = x.data 468 | for k in range(data.shape[0]): 469 | r[indices[k]] += data[k] 470 | 471 | cdef int size = len(xs) 472 | for i in range(r.shape[0]): 473 | r[i] /= size 474 | 475 | 
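# The block below is an illustrative, pure-Python sketch (not part of the
# library) of the alternating split performed by Splitter.split_node above:
# examples start in two random halves, each side ranks its labels by weighted
# frequency, and every example is reassigned to the side that scores its own
# labels higher, repeating until the partition stops changing. A side scores
# label l roughly as weights[l] / (rank_of_l + 2), mirroring what
# _init_weights and order_labels compute. Names (reference_split, rng) are
# illustrative only.
#
#   from collections import Counter
#   import random
#
#   def reference_split(idxs, y, weights, max_iters=50, rng=random):
#       idxs = list(idxs)
#       rng.shuffle(idxs)
#       half = len(idxs) // 2
#       left, right = idxs[:half], idxs[half:]            # random 50/50 start
#       for _ in range(max_iters):
#           def label_scores(side):
#               counts = Counter(l for i in side for l in y[i])
#               ranked = [l for l, _ in counts.most_common()]
#               return {l: weights[l] / (r + 2.0) for r, l in enumerate(ranked)}
#           ls, rs = label_scores(left), label_scores(right)
#           new_left, new_right = [], []
#           for i in left + right:
#               sl = sum(ls.get(l, 0.0) for l in y[i])     # affinity to left
#               sr = sum(rs.get(l, 0.0) for l in y[i])     # affinity to right
#               (new_left if sl >= sr else new_right).append(i)
#           if set(new_left) == set(left):                 # partition is stable
#               break
#           left, right = new_left, new_right
#       return left, right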
-------------------------------------------------------------------------------- /fastxml/trainer.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from builtins import next 4 | from builtins import range 5 | from past.utils import old_div 6 | from builtins import object 7 | import os 8 | import multiprocessing 9 | import time 10 | import json 11 | import struct 12 | from math import ceil 13 | from itertools import repeat 14 | from contextlib import closing 15 | from collections import Counter, defaultdict 16 | 17 | import numpy as np 18 | import scipy.sparse as sp 19 | 20 | import scipy.sparse as sp 21 | from sklearn.linear_model import SGDClassifier, LogisticRegression 22 | from sklearn.svm import LinearSVC 23 | from sklearn.utils import shuffle 24 | 25 | from .splitter import Splitter, sparsify, sparse_mean_64, radius 26 | from .proc import faux_fork_call, fork_call 27 | 28 | class Node(object): 29 | is_leaf = False 30 | def __init__(self, left, right, w, b): 31 | self.left = left 32 | self.right = right 33 | self.w = w 34 | self.b = b 35 | 36 | class Leaf(object): 37 | is_leaf = True 38 | def __init__(self, probs): 39 | self.probs = probs 40 | 41 | class CLF(object): 42 | __slots__ = ('w', 'b') 43 | def __init__(self, w, bias): 44 | self.w = w 45 | self.b = bias 46 | 47 | def stack(X): 48 | stacker = np.vstack if isinstance(X[0], np.ndarray) else sp.vstack 49 | return stacker(X) 50 | 51 | class Tree(object): 52 | def __init__(self, rootIdx, W, b, tree, probs): 53 | self.rootIdx = rootIdx 54 | self.W = W 55 | self.b = b 56 | self.tree = tree 57 | self.probs = probs 58 | 59 | def sparse_rows_iter(sparse): 60 | indptr, indices, data = sparse.indptr, sparse.indices, sparse.data 61 | for startIdx in range(indptr.shape[0] - 1): 62 | start, stop = indptr[startIdx], indptr[startIdx+1] 63 | 64 | sparse_lines = [] 65 | for i in range(start, stop): 66 | sparse_lines.append(indices[i]) 67 | sparse_lines.append(data[i]) 68 | 69 | # Pack into struct 70 | n = stop - start 71 | size = struct.pack('I', n) 72 | rest = struct.pack('If' * n, *sparse_lines) 73 | 74 | yield size + rest 75 | 76 | def dense_rows_iter(dense, dtype='f'): 77 | n = dense.shape[1] 78 | size = struct.pack('I', n) 79 | for i in range(dense.shape[0]): 80 | rest = struct.pack(dtype * n, *dense[i]) 81 | yield size + rest 82 | 83 | class Trainer(object): 84 | 85 | def __init__(self, n_trees=1, max_leaf_size=10, max_labels_per_leaf=20, 86 | re_split=0, n_jobs=1, alpha=1e-4, n_epochs=2, n_updates=100, bias=True, 87 | subsample=1, loss='log', sparse_multiple=25, leaf_classifiers=False, 88 | gamma=30, blend=0.8, leaf_eps=1e-5, optimization="fastxml", engine='auto', 89 | auto_weight=2**5, C=1, eps=None, leaf_probs=False, verbose=False, seed=2016): 90 | 91 | self.n_trees = n_trees 92 | self.max_leaf_size = max_leaf_size 93 | self.max_labels_per_leaf = max_labels_per_leaf 94 | self.re_split = re_split 95 | self.n_jobs = n_jobs if n_jobs > 0 else (multiprocessing.cpu_count() + 1 + n_jobs) 96 | self.alpha = alpha 97 | 98 | if isinstance(seed, np.random.RandomState): 99 | seed = np.randint(0, np.iinfo(np.int32).max) 100 | 101 | self.seed = seed 102 | assert isinstance(n_epochs, int) or n_epochs == 'auto' 103 | self.n_epochs = n_epochs 104 | self.n_updates = float(n_updates) 105 | self.verbose = verbose 106 | self.bias = bias 107 | self.subsample = subsample 108 | assert loss in ('log', 'hinge') 109 | self.loss = loss 110 | 
self.sparse_multiple = sparse_multiple 111 | self.leaf_classifiers = leaf_classifiers 112 | self.gamma = gamma 113 | self.blend = blend 114 | self.leaf_eps = leaf_eps 115 | assert optimization in ('fastxml', 'dsimec') 116 | self.optimization = optimization 117 | assert engine in ('auto', 'sgd', 'liblinear') 118 | self.engine = engine 119 | if eps is None: 120 | eps = 1e-6 if optimization == 'fastxml' else 1e-2 121 | 122 | self.auto_weight = auto_weight 123 | self.eps = eps 124 | self.C = C 125 | self.leaf_probs = leaf_probs 126 | 127 | self.roots = [] 128 | 129 | def split_node(self, idxs, splitter, rs): 130 | if self.verbose and len(idxs) > 1000: 131 | print("Splitting {}".format(len(idxs))) 132 | 133 | return splitter.split_node(idxs, rs) 134 | 135 | def compute_probs(self, y, idxs, ml): 136 | counter = Counter(yi for i in idxs for yi in y[i]) 137 | total = float(len(idxs)) 138 | i, j, v = [], [], [] 139 | for l, val in counter.most_common(self.max_labels_per_leaf): 140 | i.append(0) 141 | j.append(l) 142 | v.append(old_div(val, total)) 143 | 144 | return sp.csr_matrix((v, (i, j)), shape=(1, ml)).astype('float32') 145 | 146 | def build_X(self, X, idxs): 147 | if isinstance(X, np.ndarray): 148 | return self.build_X_dense(X, idxs) 149 | 150 | return self.build_X_sparse(X, idxs) 151 | 152 | def build_X_dense(self, X, idxs): 153 | return X[idxs] 154 | 155 | def build_X_sparse(self, X, idxs): 156 | indptr = [0] 157 | indices = [] 158 | data = [] 159 | for idx in idxs: 160 | s = X[idx] 161 | indices.append(s.indices) 162 | data.append(s.data) 163 | indptr.append(indptr[-1] + s.indices.shape[0]) 164 | 165 | X_train = sp.csr_matrix((len(data), X[0].shape[1]), dtype=X[0].dtype.name) 166 | X_train.indptr = np.array(indptr, dtype=np.int32) 167 | X_train.indices = np.concatenate(indices) 168 | X_train.data = np.concatenate(data) 169 | return X_train 170 | 171 | def build_XY(self, X, idxss, rs): 172 | """ 173 | Faster sparse building 174 | """ 175 | y_train = [] 176 | idxes = [] 177 | for i, idxs in enumerate(idxss): 178 | idxes.extend(idxs) 179 | y_train.extend([i] * len(idxs)) 180 | 181 | # Shuffle the flattened data 182 | idxes, y_train = shuffle(idxes, y_train, random_state=rs) 183 | 184 | X_train = self.build_X(X, idxes) 185 | return X_train, y_train 186 | 187 | def compute_epochs(self, N): 188 | if isinstance(self.n_epochs, int): 189 | return self.n_epochs 190 | 191 | # Rules of Thumb state that SGD needs ~1mm updates to converge 192 | # That would take _forever_, so we set it 100 by default 193 | n_epochs = int(ceil(old_div(self.n_updates, N))) 194 | assert n_epochs > 0 195 | return n_epochs 196 | 197 | def train_clf(self, X, idxss, rs): 198 | N = sum(len(idx) for idx in idxss) 199 | n_epochs = self.compute_epochs(N) 200 | 201 | if self.optimization == 'fastxml': 202 | penalty = 'l1' 203 | else: 204 | penalty = 'l2' 205 | 206 | X_train, y_train = self.build_XY(X, idxss, rs) 207 | 208 | in_liblinear = X_train.shape[0] > (self.auto_weight * self.max_leaf_size) 209 | if self.engine == 'liblinear' or (self.engine == 'auto' and in_liblinear): 210 | if self.loss == 'log': 211 | # No control over penalty 212 | clf = LogisticRegression(solver='liblinear', random_state=rs, tol=1, 213 | C=self.C, penalty=penalty) 214 | else: 215 | clf = LinearSVC(C=self.C, fit_intercept=self.bias, 216 | max_iter=n_epochs, class_weight='balanced', 217 | penalty=penalty, random_state=rs) 218 | 219 | else: 220 | clf = SGDClassifier(loss=self.loss, penalty=penalty, max_iter=n_epochs, 221 | alpha=self.alpha, 
fit_intercept=self.bias, class_weight='balanced', 222 | random_state=rs) 223 | 224 | clf.fit(X_train, y_train) 225 | 226 | # Halves the memory requirement 227 | clf.coef_ = sparsify(clf.coef_, self.eps) 228 | if self.bias: 229 | clf.intercept_ = clf.intercept_.astype('float32') 230 | 231 | return clf, CLF(clf.coef_, clf.intercept_) 232 | 233 | def _save_trees(self, dname): 234 | for i, tree in enumerate(self.roots): 235 | fname = lambda x: os.path.join(dname, 'tree.%s.%s' % (i, x)) 236 | 237 | # Write out dense tree 238 | with open(fname('tree'), 'wb') as out: 239 | for line in dense_rows_iter(tree.tree, 'I'): 240 | out.write(line) 241 | 242 | # Write out weights 243 | with open(fname('weights'), 'wb') as out: 244 | for line in sparse_rows_iter(tree.W): 245 | out.write(line) 246 | 247 | # Write bias 248 | with open(fname('bias'), 'wb') as out: 249 | for line in dense_rows_iter(tree.b.reshape((1,-1))): 250 | out.write(line) 251 | 252 | # Write Probabilities 253 | with open(fname('probs'), 'wb') as out: 254 | for p in tree.probs: 255 | for line in sparse_rows_iter(p): 256 | out.write(line) 257 | 258 | def _save_leaf_classifiers(self, dname): 259 | fname = lambda x: os.path.join(dname, 'lc.%s' % x) 260 | # Save l2 norms 261 | with open(fname('norms'), 'wb') as out: 262 | for line in dense_rows_iter(self.norms_.reshape((1,-1))): 263 | out.write(line) 264 | 265 | # Save Radii 266 | with open(fname('radii'), 'wb') as out: 267 | for line in dense_rows_iter(self.xr_.reshape((1,-1))): 268 | out.write(line) 269 | 270 | # Save means 271 | with open(fname('means'), 'wb') as out: 272 | for line in sparse_rows_iter(self.uxs_): 273 | out.write(line) 274 | 275 | def _save_settings(self, dname): 276 | settings = {} 277 | for k, v in self.__dict__.items(): 278 | if k == 'roots' or k.endswith('_'): 279 | continue 280 | 281 | settings[k] = v 282 | 283 | with open(os.path.join(dname, 'settings'), 'wt') as out: 284 | json.dump(settings, out) 285 | 286 | def save(self, dname): 287 | if not os.path.exists(dname): 288 | os.mkdir(dname) 289 | 290 | # Save settings 291 | self._save_settings(dname) 292 | 293 | # Save trees 294 | self._save_trees(dname) 295 | 296 | # Save leaf classifiers 297 | if self.leaf_classifiers: 298 | self._save_leaf_classifiers(dname) 299 | 300 | def resplit_data(self, X, idxs, clf, classes): 301 | X_train = self.build_X(X, idxs) 302 | new_idxs = [[] for _ in range(classes)] 303 | for i, k in enumerate(clf.predict(X_train)): 304 | new_idxs[k].append(idxs[i]) 305 | 306 | return new_idxs 307 | 308 | def split_train(self, X, idxs, splitter, rs): 309 | l_idx, r_idx = self.split_node(idxs, splitter, rs) 310 | 311 | clf = clf_fast = None 312 | if l_idx and r_idx: 313 | # Train the classifier 314 | if self.verbose and len(idxs) > 1000: 315 | print("Training classifier") 316 | 317 | clf, clf_fast = self.train_clf(X, [l_idx, r_idx], rs) 318 | 319 | return l_idx, r_idx, (clf, clf_fast) 320 | 321 | def grow_root(self, X, y, idxs, rs, splitter): 322 | node = self.grow_tree(X, y, idxs, rs, splitter) 323 | 324 | if isinstance(X, np.ndarray): 325 | cols = X.shape[1] 326 | else: 327 | cols = X[0].shape[1] 328 | 329 | return self.compact(node, cols) 330 | 331 | def grow_tree(self, X, y, idxs, rs, splitter): 332 | 333 | if len(idxs) <= self.max_leaf_size: 334 | return Leaf(self.compute_probs(y, idxs, splitter.max_label)) 335 | 336 | l_idx, r_idx, (clf, clff) = self.split_train(X, idxs, splitter, rs) 337 | 338 | if not l_idx or not r_idx: 339 | return Leaf(self.compute_probs(y, idxs, splitter.max_label)) 340 | 
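        # With re_split > 0, the examples are re-partitioned below using the trained
        # classifier's own predictions, so the children match how inference will route
        # points; if either side comes back empty, the node is split and trained again
        # (up to re_split attempts) before falling back to a Leaf.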
341 | # Resplit the data 342 | for tries in range(self.re_split): 343 | 344 | if clf is not None: 345 | l_idx, r_idx = self.resplit_data(X, idxs, clf, 2) 346 | 347 | if l_idx and r_idx: break 348 | 349 | if self.verbose and len(idxs) > 1000: 350 | print("Re-splitting {}".format(len(idxs))) 351 | 352 | l_idx, r_idx, (clf, clff) = self.split_train( 353 | X, idxs, splitter, rs) 354 | 355 | if not l_idx or not r_idx: 356 | return Leaf(self.compute_probs(y, idxs, splitter.max_label)) 357 | 358 | lNode = self.grow_tree(X, y, l_idx, rs, splitter) 359 | rNode = self.grow_tree(X, y, r_idx, rs, splitter) 360 | 361 | return Node(lNode, rNode, clff.w, clff.b) 362 | 363 | def generate_idxs(self, dataset_len): 364 | if self.subsample == 1: 365 | return repeat(list(range(dataset_len))) 366 | 367 | batch_size = int(dataset_len * self.subsample) \ 368 | if self.subsample < 1 else self.subsample 369 | 370 | if batch_size > dataset_len: 371 | raise Exception("dataset subset is larger than dataset") 372 | 373 | def gen(bs): 374 | rs = np.random.RandomState(seed=self.seed + 1000) 375 | idxs = list(range(dataset_len)) 376 | while True: 377 | rs.shuffle(idxs) 378 | yield idxs[:bs] 379 | 380 | return gen(batch_size) 381 | 382 | def _build_roots(self, X, y, weights): 383 | assert isinstance(X, list) and isinstance(X[0], sp.csr_matrix), "Requires list of csr_matrix" 384 | if self.n_jobs > 1: 385 | f = fork_call(self.grow_root) 386 | else: 387 | f = faux_fork_call(self.grow_root) 388 | 389 | nl = max(yi for ys in y for yi in ys) + 1 390 | if weights is None: 391 | weights = np.ones(nl, dtype='float32') 392 | else: 393 | assert weights.shape[0] == nl, "Weights need to be same as largest y class" 394 | 395 | self.n_labels = nl 396 | 397 | # Initialize cython splitter 398 | splitter = Splitter(y, weights, self.sparse_multiple) 399 | 400 | procs = [] 401 | finished = [] 402 | counter = iter(range(self.n_trees)) 403 | idxs = self.generate_idxs(len(X)) 404 | while len(finished) < self.n_trees: 405 | if len(procs) < self.n_jobs and (len(procs) + len(finished)) < self.n_trees : 406 | rs = np.random.RandomState(seed=self.seed + next(counter)) 407 | procs.append(f(X, y, next(idxs), rs, splitter)) 408 | else: 409 | # Check 410 | _procs = [] 411 | for p in procs: 412 | if p.ready(): 413 | finished.append(p.get()) 414 | else: 415 | _procs.append(p) 416 | 417 | # No change in readyness, just sleep 418 | if len(procs) == len(_procs): 419 | time.sleep(0.1) 420 | 421 | procs = _procs 422 | 423 | return finished 424 | 425 | def compact(self, root, dims): 426 | #CLS 427 | Ws = [] 428 | bs = [] 429 | 430 | # Tree: index, left, right, isLeaf 431 | tree = [] 432 | 433 | # Payload 434 | probs = [] 435 | 436 | def f(node): 437 | if node.is_leaf: 438 | treeIdx = len(probs) 439 | probs.append(node.probs) 440 | tree.append([treeIdx, 0, 0, 1]) 441 | else: 442 | leftIndex = f(node.left) 443 | rightIndex = f(node.right) 444 | 445 | clfIdx = len(Ws) 446 | Ws.append(node.w) 447 | bs.append(node.b[0]) 448 | tree.append([clfIdx, leftIndex, rightIndex, 0]) 449 | 450 | curIdx = len(tree) - 1 451 | return curIdx 452 | 453 | rootIdx = f(root) 454 | 455 | if Ws: 456 | W_stack = sp.vstack(Ws) 457 | else: 458 | W_stack = sp.csr_matrix(([], ([], [])), shape=(0, dims)).astype('float32') 459 | 460 | b = np.array(bs, dtype='float32') 461 | t = np.array(tree, dtype='uint32') 462 | return Tree(rootIdx, W_stack, b, t, probs) 463 | 464 | def fit(self, X, y, weights=None): 465 | self.roots = self._build_roots(X, y, weights) 466 | if self.leaf_classifiers: 467 | 
self.norms_, self.uxs_, self.xr_ = self._compute_leaf_probs(X, y) 468 | 469 | def _compute_leaf_probs(self, X, y): 470 | dd = defaultdict(list) 471 | norms = compute_unit_norms(X) 472 | ml = 0 473 | for Xi, yis in zip(X, y): 474 | Xin = norm(norms, Xi) 475 | for yi in yis: 476 | dd[yi].append(Xin) 477 | ml = max(yi, ml) 478 | 479 | if self.verbose: 480 | print("Computing means and radius for hard margin") 481 | 482 | xmeans = [] 483 | xrs = [] 484 | with closing(multiprocessing.Pool(processes=self.n_jobs)) as p: 485 | it = ((i, dd[i], self.leaf_eps) for i in range(ml + 1)) 486 | for k, ux, r in p.imap(compute_leaf_metrics, it, 100): 487 | if self.verbose and k % 100 == 0: 488 | print("Training leaf classifier: %s of %s" % (k, ml)) 489 | 490 | if ux is None: 491 | ux = sp.csr_matrix((1, X[0].shape[1])).astype('float64') 492 | 493 | xmeans.append(ux) 494 | xrs.append(r) 495 | 496 | return norms, sp.vstack(xmeans), np.array(xrs, dtype=np.float32) 497 | 498 | def norm(norms, Xi): 499 | Xi = Xi.astype('float64') 500 | for i, ind in enumerate(Xi.indices): 501 | Xi.data[i] /= norms[ind] 502 | 503 | Xi.data /= np.linalg.norm(Xi.data) 504 | return Xi 505 | 506 | def compute_leaf_metrics(data): 507 | i, Xs, eps = data 508 | if len(Xs) > 100: 509 | v = np.zeros(Xs[0].shape[1], dtype='float64') 510 | sparse_mean_64(Xs, v) 511 | ux = sparsify(v.reshape((1, -1)), eps=eps).astype('float64') 512 | 513 | elif len(Xs) > 1: 514 | ux = old_div(sum(Xs), len(Xs)) 515 | 516 | else: 517 | return i, None, 0.0 518 | 519 | rad = max(radius(ux.data, ux.indices, Xi.data, Xi.indices) for Xi in Xs) 520 | return i, ux, rad 521 | 522 | def compute_unit_norms(X): 523 | norms = np.zeros(X[0].shape[1]) 524 | for Xi in X: 525 | for i, ind in enumerate(Xi.indices): 526 | norms[ind] += Xi.data[i] ** 2 527 | 528 | norms = norms ** .5 529 | norms[np.where(norms == 0)] = 1.0 530 | return norms.astype('float32') 531 | 532 | 533 | class MetricNode(object): 534 | __slots__ = ('left', 'right') 535 | is_leaf = False 536 | 537 | def __init__(self, left, right): 538 | self.left = left 539 | self.right = right 540 | 541 | @property 542 | def idxs(self): 543 | return self.left.idxs + self.right.idxs 544 | 545 | def build_discrete(self): 546 | _, res = self._build_discrete(0) 547 | return res 548 | 549 | def _build_discrete(self, n=0): 550 | n2, left = self.left._build_discrete(n) 551 | n3, right = self.right._build_discrete(n2 + 1) 552 | return n3, left + right 553 | 554 | def build_probs(self, w): 555 | _, probs = self._build_probs(w) 556 | return [p for lidx, p in probs] 557 | 558 | def _build_probs(self, w, n=0): 559 | n2, left = self.left._build_probs(w, n) 560 | n3, right = self.right._build_probs(w, n2 + 1) 561 | return n3, left + right 562 | 563 | class MetricLeaf(object): 564 | __slots__ = ('idxs') 565 | is_leaf = True 566 | 567 | def __init__(self, idxs): 568 | self.idxs = idxs 569 | 570 | def build_discrete(self): 571 | return self._build_discrete(0)[1] 572 | 573 | def _build_discrete(self, n=0): 574 | return n, [(n, self.idxs)] 575 | 576 | def _build_probs(self, w, n=0): 577 | ys = Counter(y for idx in self.idxs for y in w[idx]) 578 | total = len(self.idxs) 579 | return n, [(n, {k: old_div(v, float(total)) for k, v in ys.items()})] 580 | 581 | def metric_cluster(y, weights=None, max_leaf_size=10, 582 | sparse_multiple=25, seed=2016, verbose=False): 583 | 584 | rs = np.random.RandomState(seed=seed) 585 | n_labels = max(yi for ys in y for yi in ys) + 1 586 | if weights is None: 587 | weights = np.ones(n_labels, dtype='float32') 
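    # metric_cluster reuses the NDCG-driven Splitter to recursively partition examples
    # by label similarity alone (no classifiers are trained); the result is a tree of
    # MetricNode/MetricLeaf objects whose leaves hold example indices.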
588 | 589 | # Initialize splitter 590 | splitter = Splitter(y, weights, sparse_multiple) 591 | 592 | def _metric_cluster(idxs): 593 | if verbose and len(idxs) > 1000: 594 | print("Splitting:", len(idxs)) 595 | 596 | if len(idxs) < max_leaf_size: 597 | return MetricLeaf(idxs) 598 | 599 | left, right = splitter.split_node(idxs, rs) 600 | if not left or not right: 601 | return MetricLeaf(idxs) 602 | 603 | return MetricNode(_metric_cluster(left), _metric_cluster(right)) 604 | 605 | return _metric_cluster(list(range(len(y)))) 606 | -------------------------------------------------------------------------------- /fastxml/weights.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from builtins import range 3 | from past.utils import old_div 4 | from collections import Counter 5 | import numpy as np 6 | 7 | def metrics(y): 8 | Nl = Counter(yi for ys in y for yi in ys) 9 | N = len(y) 10 | return N, Nl, max(Nl) + 1 11 | 12 | def uniform(y): 13 | N, Nl, ml = metrics(y) 14 | return np.ones(ml, dtype='float32') 15 | 16 | def propensity(y, A=0.55, B=1.5): 17 | """ 18 | Computes propensity scores based on ys 19 | """ 20 | N, Nl, ml = metrics(y) 21 | C = (np.log(N) - 1) * (B + 1) ** A 22 | weights = [] 23 | for i in range(ml): 24 | weights.append(1 + C * (Nl.get(i, 0) + B) ** -A) 25 | 26 | return np.array(weights, dtype='float32') 27 | 28 | def nnllog(y, a=1, b=0): 29 | N, Nl, ml = metrics(y) 30 | N = float(N) 31 | 32 | weights = [] 33 | for i in range(ml): 34 | if i in Nl: 35 | weights.append(a * np.log(old_div(N, Nl[i])) + b) 36 | else: 37 | weights.append(0) 38 | 39 | return np.array(weights, dtype='float32') 40 | 41 | def logexp(y, a=1, b=1): 42 | N, Nl, ml = metrics(y) 43 | weights = [] 44 | for i in range(ml): 45 | if i in Nl: 46 | weights.append(a * np.log(1 + Nl[i]) ** -b) 47 | else: 48 | weights.append(0) 49 | 50 | return np.array(weights, dtype='float32') 51 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.8.1 2 | scipy>=0.13.3 3 | scikit-learn>=0.17 4 | Cython>=0.23.4 5 | future>=0.16.0 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | from setuptools import setup 5 | from setuptools.command.build_ext import build_ext as _build_ext 6 | 7 | from distutils.extension import Extension 8 | 9 | class build_ext(_build_ext): 10 | """ 11 | This class is necessary because numpy won't be installed at import time. 
12 | """ 13 | def finalize_options(self): 14 | _build_ext.finalize_options(self) 15 | # Prevent numpy from thinking it is still in its setup process: 16 | __builtins__.__NUMPY_SETUP__ = False 17 | import numpy 18 | self.include_dirs.append(numpy.get_include()) 19 | 20 | compile_args = ['-O3', '-std=c++11', '-stdlib=libc++', '-mmacosx-version-min=10.8'] if sys.platform == 'darwin' else ['-O3', '-std=c++11'] 21 | 22 | extensions = [ 23 | Extension("fastxml.splitter", ["fastxml/splitter.pyx"], 24 | language='c++', 25 | extra_compile_args=compile_args), 26 | Extension("fastxml.inferencer", ["fastxml/inferencer.pyx"], 27 | language='c++', 28 | extra_compile_args=compile_args) 29 | ] 30 | 31 | setup(name='fastxml', 32 | version="2.0.0", 33 | description='FastXML Extreme Multi-label Classification Algorithm', 34 | url="https://github.com/refefer/fastxml", 35 | author_email="refefer@gmail.com", 36 | packages=["fastxml"], 37 | license="LICENSE", 38 | cmdclass = {'build_ext': build_ext}, 39 | ext_modules=extensions, 40 | scripts=[ 41 | "bin/fxml.py" 42 | ], 43 | install_requires=[ 44 | "numpy>=1.8.1", 45 | "scipy>=0.13.3", 46 | "scikit-learn>=0.17", 47 | "Cython>=0.23.4", 48 | "future>=0.16.0" 49 | ], 50 | classifiers=[ 51 | "License :: OSI Approved :: Apache Software License", 52 | "Programming Language :: Python :: 2.7", 53 | "Operating System :: OS Independent" 54 | ], 55 | author='Andrew Stanton') 56 | --------------------------------------------------------------------------------
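A note on the label weighting defined above: the propensity() helper in fastxml/weights.py computes weights of the form w_l = 1 + C * (N_l + B)^-A with C = (ln N - 1) * (B + 1)^A, where N is the number of training points and N_l the count of label l. This matches the inverse-propensity form used by the PFastreXML paper, so rarer labels receive larger weights. A minimal sketch of the arithmetic (the toy dataset below is illustrative, not from the repo):

    import numpy as np
    from fastxml.weights import propensity

    # Toy dataset: label 0 is common, labels 1 and 2 are rare
    y = [[0], [0, 1], [0], [0, 2]]
    w = propensity(y)                                  # defaults: A=0.55, B=1.5

    N, A, B = len(y), 0.55, 1.5
    C = (np.log(N) - 1) * (B + 1) ** A
    assert np.allclose(w[0], 1 + C * (4 + B) ** -A)    # label 0 appears 4 times
    assert np.allclose(w[1], 1 + C * (1 + B) ** -A)    # label 1 appears once
    assert w[1] > w[0]                                 # rarer labels get larger weights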