├── CONTRIBUTING.md ├── data ├── .gitignore ├── susy │ └── SUSY.csv.download ├── covtype │ ├── covtype-test-1.csv.download │ └── covtype-train-1.csv.download └── higgs │ └── HIGGS.csv.download ├── woody ├── tests │ └── __init__.py ├── models │ ├── subset │ │ ├── __init__.py │ │ ├── regression.py │ │ └── classification.py │ ├── huge │ │ ├── __init__.py │ │ ├── util.py │ │ ├── regression.py │ │ ├── classification.py │ │ └── predict.py │ ├── forest │ │ ├── __init__.py │ │ ├── src │ │ │ ├── include │ │ │ │ ├── qsort.h │ │ │ │ ├── float.h │ │ │ │ ├── util.h │ │ │ │ ├── pqueue.h │ │ │ │ └── timing.h │ │ │ ├── tree │ │ │ │ ├── cpu │ │ │ │ │ ├── include │ │ │ │ │ │ ├── fastsort.h │ │ │ │ │ │ ├── standard.h │ │ │ │ │ │ └── criteria.h │ │ │ │ │ └── fastsort.c │ │ │ │ └── include │ │ │ │ │ ├── global.h │ │ │ │ │ ├── types.h │ │ │ │ │ ├── cpu.h │ │ │ │ │ └── tree.h │ │ │ ├── timing.c │ │ │ ├── util.c │ │ │ ├── .cproject │ │ │ ├── qsort.c │ │ │ └── pqueue.c │ │ ├── swig │ │ │ ├── cpu_float.i │ │ │ ├── gpu_float.i │ │ │ ├── gpu_double.i │ │ │ └── cpu_double.i │ │ ├── classification.py │ │ ├── regression.py │ │ ├── util.py │ │ ├── setup.py │ │ └── .cproject │ ├── __init__.py │ ├── util.py │ ├── base.py │ └── sampler.py ├── util │ ├── array │ │ ├── __init__.py │ │ ├── src │ │ │ ├── include │ │ │ │ ├── util.h │ │ │ │ ├── global.h │ │ │ │ └── array.h │ │ │ ├── util.c │ │ │ └── array.c │ │ ├── swig │ │ │ ├── cpu_float.i │ │ │ └── cpu_double.i │ │ ├── setup.py │ │ └── base.py │ ├── __init__.py │ ├── timer.py │ ├── url.py │ ├── base.py │ ├── draw.py │ └── parallel.py ├── io │ ├── __init__.py │ ├── split.py │ ├── reader.py │ ├── store.py │ └── csv.py ├── data │ ├── __init__.py │ ├── landsat.py │ ├── artificial.py │ ├── covtype.py │ ├── util.py │ ├── susy.py │ └── higgs.py ├── __init__.py └── setup.py ├── setup.cfg ├── MANIFEST.in ├── experiments ├── landsat │ ├── util.py │ ├── launch.py │ ├── params.py │ └── sk.py ├── large_data │ ├── util.py │ ├── launch.py │ └── params.py ├── small_data │ ├── util.py │ ├── launch.py │ ├── params.py │ ├── sk.py │ ├── wood.py │ ├── subsetwood.py │ ├── h2.py │ └── hugewood_lam.py ├── influence_lamda │ ├── util.py │ ├── launch.py │ ├── params.py │ └── wood.py └── influence_n_bottom │ ├── util.py │ ├── launch.py │ ├── params.py │ ├── hugewood_10K.py │ ├── hugewood_1K.py │ └── hugewood_75K.py ├── requirements.txt ├── .gitignore ├── README.rst └── setup.py /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | covtype/*.csv 2 | higgs/*.csv 3 | susy/*.csv 4 | -------------------------------------------------------------------------------- /data/susy/SUSY.csv.download: -------------------------------------------------------------------------------- 1 | http://archive.ics.uci.edu/ml/machine-learning-databases/00279/SUSY.csv.gz 2 | -------------------------------------------------------------------------------- /data/covtype/covtype-test-1.csv.download: -------------------------------------------------------------------------------- 1 | https://sid.erda.dk/share_redirect/bx3kbiD08L/covtype-test-1.csv 2 | -------------------------------------------------------------------------------- /data/covtype/covtype-train-1.csv.download: -------------------------------------------------------------------------------- 1 | 
https://sid.erda.dk/share_redirect/bx3kbiD08L/covtype-train-1.csv 2 | -------------------------------------------------------------------------------- /data/higgs/HIGGS.csv.download: -------------------------------------------------------------------------------- 1 | https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz 2 | -------------------------------------------------------------------------------- /woody/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # -------------------------------------------------------------------------------- /woody/models/subset/__init__.py: -------------------------------------------------------------------------------- 1 | from .classification import SubsetWoodClassifier 2 | from .regression import SubsetWoodRegressor -------------------------------------------------------------------------------- /woody/util/array/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | from .base import split_array, transpose_array -------------------------------------------------------------------------------- /woody/io/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | from .base import DataGenerator 7 | from .store import DiskStore, MemoryStore -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [aliases] 2 | test = nosetests 3 | 4 | [nosetests] 5 | no-path-adjustment=1 6 | exe = 1 7 | detailed-errors = 1 8 | cover-html = 1 9 | cover-html-dir = coverage 10 | cover-package = woody 11 | -------------------------------------------------------------------------------- /woody/models/huge/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | from .classification import HugeWoodClassifier 7 | from .regression import HugeWoodRegressor 8 | -------------------------------------------------------------------------------- /woody/models/forest/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | from .classification import WoodClassifier 7 | from .regression import WoodRegressor 8 | from .base import Wood 9 | -------------------------------------------------------------------------------- /woody/models/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | from .forest import WoodClassifier, WoodRegressor, Wood 7 | from .huge import HugeWoodClassifier, HugeWoodRegressor 8 | from .subset import SubsetWoodClassifier, SubsetWoodRegressor -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.rst 2 | include CONTRIBUTING.md 3 | include LICENSE 4 | include requirements.txt 5 | 6 | recursive-include examples *.py 7 | recursive-include woody *.c *.h *.i 8 | 9 | 
#include docs/conf.py 10 | #include docs/Makefile 11 | #recursive-include docs *.rst 12 | #include docs/_static/bibtex/* 13 | #include docs/_static/images/* 14 | 15 | exclude MANIFEST.in 16 | -------------------------------------------------------------------------------- /woody/models/forest/src/include/qsort.h: -------------------------------------------------------------------------------- 1 | /* 2 | * qsort.h 3 | * 4 | * Created on: 12.11.2014 5 | * Author: fgieseke 6 | */ 7 | 8 | #ifndef INCLUDE_QSORT_H_ 9 | #define INCLUDE_QSORT_H_ 10 | 11 | void woody_qsort(void *base, unsigned num, unsigned width, 12 | int (*comp)(const void *, const void *, const void *), 13 | const void* comp_param); 14 | 15 | #endif /* INCLUDE_QSORT_H_ */ 16 | -------------------------------------------------------------------------------- /experiments/landsat/util.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import accuracy_score 2 | 3 | metrics = {"accuracy": accuracy_score} 4 | 5 | def evaluate(preds, y, results, prefix, verbose=1): 6 | 7 | for key in metrics.keys(): 8 | res = metrics[key](y, preds) 9 | results[prefix + "_" + key] = res 10 | if verbose > 0: 11 | print(prefix + " " + key + ":\t" + str(res)) 12 | -------------------------------------------------------------------------------- /woody/util/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | from .base import makedirs, ensure_dir_for_file, convert_to_libsvm 7 | from .timer import Timer 8 | from .array import split_array 9 | from .url import download_from_url 10 | from .draw import draw_single_tree 11 | from .parallel import perform_task_in_parallel, start_via_single_process -------------------------------------------------------------------------------- /experiments/large_data/util.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import accuracy_score 2 | 3 | metrics = {"accuracy": accuracy_score} 4 | 5 | def evaluate(preds, y, results, prefix, verbose=1): 6 | 7 | for key in metrics.keys(): 8 | res = metrics[key](y, preds) 9 | results[prefix + "_" + key] = res 10 | if verbose > 0: 11 | print(prefix + " " + key + ":\t" + str(res)) 12 | -------------------------------------------------------------------------------- /experiments/small_data/util.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import accuracy_score 2 | 3 | metrics = {"accuracy": accuracy_score} 4 | 5 | def evaluate(preds, y, results, prefix, verbose=1): 6 | 7 | for key in metrics.keys(): 8 | res = metrics[key](y, preds) 9 | results[prefix + "_" + key] = res 10 | if verbose > 0: 11 | print(prefix + " " + key + ":\t" + str(res)) 12 | -------------------------------------------------------------------------------- /experiments/influence_lamda/util.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import accuracy_score 2 | 3 | metrics = {"accuracy": accuracy_score} 4 | 5 | def evaluate(preds, y, results, prefix, verbose=1): 6 | 7 | for key in metrics.keys(): 8 | res = metrics[key](y, preds) 9 | results[prefix + "_" + key] = res 10 | if verbose > 0: 11 | print(prefix + " " + key + ":\t" + str(res)) 12 | -------------------------------------------------------------------------------- 
/experiments/influence_n_bottom/util.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import accuracy_score 2 | 3 | metrics = {"accuracy": accuracy_score} 4 | 5 | def evaluate(preds, y, results, prefix, verbose=1): 6 | 7 | for key in metrics.keys(): 8 | res = metrics[key](y, preds) 9 | results[prefix + "_" + key] = res 10 | if verbose > 0: 11 | print(prefix + " " + key + ":\t" + str(res)) 12 | -------------------------------------------------------------------------------- /woody/io/split.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | import pandas 7 | 8 | def train_test_split_csv(fname, fname_train, fname_test, train_size=None, test_size=None, chunksize=500000): 9 | 10 | pandas.read_csv(fname, iterator=True, chunksize=chunksize) 11 | 12 | def train_test_split_h5pd(fname, fname_train, fname_test, train_size=None, test_size=None): 13 | 14 | pass -------------------------------------------------------------------------------- /woody/data/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | from woody.data.generate import covtype_files, covtype, covtype_generators 7 | from woody.data.generate import higgs_files, higgs, higgs_generators 8 | from woody.data.generate import susy_files, susy, susy_generators 9 | from woody.data.generate import landsat_files, landsat_generators 10 | from woody.data.generate import artificial, artificial_generators -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | h2o==3.10.4.8 2 | h5py==2.6.0 3 | matplotlib==1.5.3 4 | networkx==1.11 5 | nose==1.3.7 6 | numpy==1.11.2 7 | pandas==0.19.1 8 | pygraphviz==1.3.1 9 | pyparsing==2.1.10 10 | requests==2.18.4 11 | scikit-image==0.12.3 12 | scikit-learn==0.18.1 13 | scipy==0.18.1 14 | seaborn==0.8.1 15 | sklearn-evaluation==0.3 16 | tables==3.3.0 17 | tabulate==0.7.7 18 | urllib3==1.22 19 | yep==0.4 20 | Cython==0.26.1 21 | #skutil==0.1.6 # install manually from https://github.com/tgsmith61591/skutil 22 | 23 | -------------------------------------------------------------------------------- /woody/util/array/src/include/util.h: -------------------------------------------------------------------------------- 1 | /* 2 | * util.h 3 | */ 4 | #ifndef INCLUDE_UTIL_H_ 5 | #define INCLUDE_UTIL_H_ 6 | 7 | #include "global.h" 8 | 9 | #include 10 | #include 11 | 12 | /* -------------------------------------------------------------------------------- 13 | * Copies a single pattern 14 | * -------------------------------------------------------------------------------- 15 | */ 16 | inline void copy_pattern(FLOAT_TYPE *src, FLOAT_TYPE *dst, int dim); 17 | 18 | #endif 19 | -------------------------------------------------------------------------------- /woody/util/array/src/util.c: -------------------------------------------------------------------------------- 1 | #include "include/array.h" 2 | 3 | /* -------------------------------------------------------------------------------- 4 | * Copies a single pattern 5 | * -------------------------------------------------------------------------------- 6 | */ 7 | inline void copy_pattern(FLOAT_TYPE *src, FLOAT_TYPE *dst, int dim){ 8 | 9 | int j; 10 | 11 | 
// memcpy seems to be slower (function call) 12 | for (j=0; j 0 13 | #define FLOAT_TYPE double 14 | #define PARSE_FLOAT strtod 15 | #define MAX_FLOAT_TYPE 1.7976931348623158e+308 16 | #define MIN_FLOAT_TYPE -1.7976931348623158e+308 17 | #else 18 | #define FLOAT_TYPE float 19 | #define PARSE_FLOAT strtof 20 | #define MAX_FLOAT_TYPE 3.402823466e+38 21 | #define MIN_FLOAT_TYPE -3.402823466e+38 22 | #endif 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /woody/models/forest/src/tree/cpu/include/fastsort.h: -------------------------------------------------------------------------------- 1 | /* 2 | * fastsort.h 3 | * 4 | * Created on: 23.01.2017 5 | * Author: fgieseke 6 | */ 7 | 8 | #ifndef ENSEMBLE_CPU_INCLUDE_FASTSORT_H_ 9 | #define ENSEMBLE_CPU_INCLUDE_FASTSORT_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | #include "criteria.h" 16 | 17 | #include "../../include/global.h" 18 | #include "../../include/util.h" 19 | 20 | 21 | #define fast_size_threshold 64 22 | 23 | void combined_sort(FLOAT_TYPE *XF, int *samples, int n); 24 | 25 | #endif /* ENSEMBLE_CPU_INCLUDE_FASTSORT_H_ */ 26 | -------------------------------------------------------------------------------- /woody/util/array/src/include/global.h: -------------------------------------------------------------------------------- 1 | /* 2 | * global.h 3 | */ 4 | 5 | #ifndef INCLUDE_GLOBAL_H_ 6 | #define INCLUDE_GLOBAL_H_ 7 | 8 | #include 9 | 10 | #ifndef USE_DOUBLE 11 | #define USE_DOUBLE 0 12 | #endif 13 | 14 | #if USE_DOUBLE > 0 15 | #define FLOAT_TYPE double 16 | #define PARSE_FLOAT strtod 17 | #define MAX_FLOAT_TYPE 1.7976931348623158e+308 18 | #define MIN_FLOAT_TYPE -1.7976931348623158e+308 19 | #else 20 | #define FLOAT_TYPE float 21 | #define PARSE_FLOAT strtof 22 | #define MAX_FLOAT_TYPE 3.402823466e+38 23 | #define MIN_FLOAT_TYPE -3.402823466e+38 24 | #endif 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /experiments/large_data/launch.py: -------------------------------------------------------------------------------- 1 | import os 2 | import params 3 | 4 | seed = params.seed 5 | odir = params.odir 6 | methods = params.methods 7 | 8 | for method in methods: 9 | for dkey in params.datasets.keys(): 10 | for train_size in params.datasets[dkey]['train_sizes']: 11 | for key in params.parameters: 12 | print("Processing method %s with data set %s, train_size %s, and key %s ..." % (str(method), str(dkey), str(train_size), str(key))) 13 | cmd = "python " + method + ".py --dkey %s --train_size %i --key %s" % (dkey, train_size, key) 14 | print(cmd) 15 | os.system(cmd) -------------------------------------------------------------------------------- /woody/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | """ 7 | The woody package aims at large-scale implementations 8 | for random forests. It is based on an efficient C 9 | implementation and resorts to distributed computing 10 | strategies. 11 | """ 12 | 13 | import warnings 14 | 15 | try: 16 | from woody.models import WoodClassifier, WoodRegressor, HugeWoodClassifier, HugeWoodRegressor, SubsetWoodClassifier, SubsetWoodRegressor 17 | except Exception as e: 18 | warnings.warn("Swig models not compiled yet? 
Error message: %s" % str(e)) 19 | 20 | __version__ = "0.3.1" 21 | -------------------------------------------------------------------------------- /experiments/landsat/launch.py: -------------------------------------------------------------------------------- 1 | import os 2 | import params 3 | 4 | seeds = [0,1,2,3] 5 | odir = params.odir 6 | methods = params.methods 7 | 8 | for method in methods: 9 | for dkey in params.datasets.keys(): 10 | for train_size in params.datasets[dkey]['train_sizes']: 11 | for seed in seeds: 12 | for key in params.parameters: 13 | print("Processing method %s with data set %s, train_size %s, seed %s, and key %s ..." % (str(method), str(dkey), str(train_size), str(seed), str(key))) 14 | cmd = "python " + method + ".py --dkey %s --train_size %i --seed %i --key %s" % (dkey, train_size, seed, key) 15 | print(cmd) 16 | os.system(cmd) 17 | -------------------------------------------------------------------------------- /experiments/small_data/launch.py: -------------------------------------------------------------------------------- 1 | import os 2 | import params 3 | 4 | seeds = [0,1,2,3] 5 | odir = params.odir 6 | methods = params.methods 7 | 8 | for method in methods: 9 | for dkey in params.datasets.keys(): 10 | for train_size in params.datasets[dkey]['train_sizes']: 11 | for seed in seeds: 12 | for key in params.parameters: 13 | print("Processing method %s with data set %s, train_size %s, seed %s, and key %s ..." % (str(method), str(dkey), str(train_size), str(seed), str(key))) 14 | cmd = "python " + method + ".py --dkey %s --train_size %i --seed %i --key %s" % (dkey, train_size, seed, key) 15 | print(cmd) 16 | os.system(cmd) 17 | -------------------------------------------------------------------------------- /woody/util/timer.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | import time 7 | 8 | class Timer(object): 9 | 10 | def __init__(self): 11 | 12 | self._start_time = 0.0 13 | self._elapsed_time = 0.0 14 | 15 | def start(self): 16 | 17 | self._start_time = time.time() 18 | 19 | def stop(self): 20 | 21 | self._elapsed_time += time.time() - self._start_time 22 | self._start_time = 0.0 23 | 24 | def reset(self): 25 | 26 | self._start_time = 0.0 27 | self._elapsed_time = 0.0 28 | 29 | def get_elapsed_time(self): 30 | 31 | return self._elapsed_time 32 | -------------------------------------------------------------------------------- /woody/models/forest/src/tree/cpu/include/standard.h: -------------------------------------------------------------------------------- 1 | /* 2 | * standard.h 3 | * 4 | * Created on: 23.01.2017 5 | * Author: fgieseke 6 | */ 7 | 8 | #ifndef ENSEMBLE_CPU_INCLUDE_STANDARD_H_ 9 | #define ENSEMBLE_CPU_INCLUDE_STANDARD_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | #include "criteria.h" 16 | #include "fastsort.h" 17 | 18 | #include "../../include/global.h" 19 | #include "../../include/util.h" 20 | 21 | 22 | #define size_threshold 16 23 | 24 | void intro_sort(PATTERN_LABEL_WEIGHT *a, int n); 25 | 26 | FLOAT_TYPE compute_optimal_threshold(PATTERN_LABEL_WEIGHT *XF_Y_W, int n_XF_Y_W, PARAMETERS *params, TRAINING_DATA *train_data, SPLIT_RECORD *best_split); 27 | 28 | #endif /* ENSEMBLE_CPU_INCLUDE_STANDARD_H_ */ 29 | -------------------------------------------------------------------------------- /woody/setup.py: -------------------------------------------------------------------------------- 1 | # 2 | # 
Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | def configuration(parent_package='', top_path=None): 7 | 8 | from numpy.distutils.misc_util import Configuration 9 | 10 | config = Configuration('woody', parent_package, top_path) 11 | config.add_subpackage('models', subpackage_path='models') 12 | config.add_subpackage('models/forest', subpackage_path='models/forest') 13 | config.add_subpackage('tests') 14 | config.add_subpackage('util') 15 | config.add_subpackage('util/array', subpackage_path='util/array') 16 | 17 | return config 18 | 19 | if __name__ == '__main__': 20 | 21 | from numpy.distutils.core import setup 22 | setup(**configuration(top_path='').todict()) 23 | -------------------------------------------------------------------------------- /experiments/influence_lamda/launch.py: -------------------------------------------------------------------------------- 1 | import os 2 | import params 3 | 4 | seeds = [0,1,2,3] 5 | odir = params.odir 6 | methods = params.methods 7 | 8 | for method in methods: 9 | for dkey in params.datasets.keys(): 10 | for train_size in params.datasets[dkey]['train_sizes']: 11 | for lamcrit in params.lamcrits: 12 | for seed in seeds: 13 | for key in params.parameters: 14 | print("Processing method %s with data set %s, train_size %s, lamcrit %s, seed %s, and key %s ..." % (str(method), str(dkey), str(train_size), str(lamcrit), str(seed), str(key))) 15 | cmd = "python " + method + ".py --dkey %s --train_size %i --lamcrit %f --seed %i --key %s" % (dkey, train_size, lamcrit, seed, key) 16 | print(cmd) 17 | os.system(cmd) 18 | -------------------------------------------------------------------------------- /experiments/influence_n_bottom/launch.py: -------------------------------------------------------------------------------- 1 | import os 2 | import params 3 | 4 | seeds = [0,1,2,3] 5 | odir = params.odir 6 | methods = params.methods 7 | 8 | for method in methods: 9 | for dkey in params.datasets.keys(): 10 | for train_size in params.datasets[dkey]['train_sizes']: 11 | for n_bottom in params.n_estimators_bottoms: 12 | for seed in seeds: 13 | for key in params.parameters: 14 | print("Processing method %s with data set %s, train_size %s, n_bottom %s, seed %s, and key %s ..." 
% (str(method), str(dkey), str(train_size), str(n_bottom), str(seed), str(key))) 15 | cmd = "python " + method + ".py --dkey %s --train_size %i --n_bottom %f --seed %i --key %s" % (dkey, train_size, n_bottom, seed, key) 16 | print(cmd) 17 | os.system(cmd) 18 | -------------------------------------------------------------------------------- /experiments/large_data/params.py: -------------------------------------------------------------------------------- 1 | import collections 2 | 3 | seed = 0 4 | odir = "results" 5 | methods = ["hugewood"] 6 | 7 | datasets = collections.OrderedDict() 8 | datasets['landsat'] = {'train_sizes':[250000000, 500000000, 750000000, 1000000000]} 9 | 10 | parameters = collections.OrderedDict() 11 | #parameters['ert'] = {'n_estimators':4, 12 | # 'max_features':None, 13 | # 'bootstrap':False, 14 | # 'tree_type':'randomized', 15 | # 'n_jobs':4} 16 | parameters['rf'] = {'n_estimators':4, 17 | 'max_features':"sqrt", 18 | 'bootstrap':True, 19 | 'tree_type':'standard', 20 | 'n_jobs':4} 21 | 22 | parameters_hugewood = collections.OrderedDict() 23 | 24 | for key in parameters: 25 | 26 | param_hugewood = {} 27 | param_hugewood['param_wood'] = parameters[key] 28 | param_hugewood['n_estimators'] = 1 29 | param_hugewood['n_estimators_bottom'] = 4 30 | 31 | parameters_hugewood[key] = param_hugewood 32 | -------------------------------------------------------------------------------- /woody/util/array/swig/cpu_float.i: -------------------------------------------------------------------------------- 1 | %module wrapper_utils_cpu_float 2 | 3 | %{ 4 | #define SWIG_FILE_WITH_INIT 5 | #include "array.h" 6 | %} 7 | 8 | %include "numpy.i" 9 | 10 | %init %{ 11 | import_array(); 12 | %} 13 | 14 | %apply (float* INPLACE_ARRAY2, int DIM1, int DIM2) {(FLOAT_TYPE *X, int nX, int dX)} 15 | %apply (float* INPLACE_ARRAY2, int DIM1, int DIM2) {(FLOAT_TYPE *XT, int nXT, int dXT)} 16 | %apply (float* INPLACE_ARRAY2, int DIM1, int DIM2) {(FLOAT_TYPE *Xnew, int nXnew, int dXnew)} 17 | %apply (float* INPLACE_ARRAY1, int DIM1) {(FLOAT_TYPE *y, int ny)} 18 | %apply (float* INPLACE_ARRAY1, int DIM1) {(FLOAT_TYPE *ynew, int nynew)} 19 | %apply (int* INPLACE_ARRAY1, int DIM1) {(int *offsets, int noffsets)} 20 | %apply (int* INPLACE_ARRAY1, int DIM1) {(int *indicator, int nindicator)} 21 | %apply (int* INPLACE_ARRAY1, int DIM1) {(int *chunks, int nchunks)} 22 | %apply (int* INPLACE_ARRAY1, int DIM1) {(int *counts, int ncounts)} 23 | %apply (int* INPLACE_ARRAY1, int DIM1) {(int *cumsums_minus_counts, int ncumsums_minus_counts)} 24 | 25 | %include "array.h" 26 | -------------------------------------------------------------------------------- /experiments/landsat/params.py: -------------------------------------------------------------------------------- 1 | import collections 2 | 3 | odir = "results" 4 | 5 | methods = ["hugewood", "subsetwood", "sk", "h2"] 6 | 7 | datasets = collections.OrderedDict() 8 | datasets['landsat'] = {'train_sizes':[i*1000000 for i in [10,20,30,40,50]]} 9 | 10 | parameters = collections.OrderedDict() 11 | #parameters['ert'] = {'n_estimators':4, 12 | # 'max_features':None, 13 | # 'bootstrap':False, 14 | # 'tree_type':'randomized', 15 | # 'n_jobs':4} 16 | parameters['rf'] = {'n_estimators':12, 17 | 'max_features':"sqrt", 18 | 'bootstrap':True, 19 | 'tree_type':'standard', 20 | 'n_jobs':4} 21 | 22 | parameters_hugewood = collections.OrderedDict() 23 | 24 | for key in parameters: 25 | 26 | param_hugewood = {} 27 | param_hugewood['param_wood'] = parameters[key] 28 | 
param_hugewood['n_estimators'] = 3 29 | param_hugewood['n_estimators_bottom'] = 4 30 | 31 | parameters_hugewood[key] = param_hugewood 32 | -------------------------------------------------------------------------------- /woody/util/array/swig/cpu_double.i: -------------------------------------------------------------------------------- 1 | %module wrapper_utils_cpu_double 2 | 3 | %{ 4 | #define SWIG_FILE_WITH_INIT 5 | #include "array.h" 6 | %} 7 | 8 | %include "numpy.i" 9 | 10 | %init %{ 11 | import_array(); 12 | %} 13 | 14 | %apply (double* INPLACE_ARRAY2, int DIM1, int DIM2) {(FLOAT_TYPE *X, int nX, int dX)} 15 | %apply (double* INPLACE_ARRAY2, int DIM1, int DIM2) {(FLOAT_TYPE *XT, int nXT, int dXT)} 16 | %apply (double* INPLACE_ARRAY2, int DIM1, int DIM2) {(FLOAT_TYPE *Xnew, int nXnew, int dXnew)} 17 | %apply (double* INPLACE_ARRAY1, int DIM1) {(FLOAT_TYPE *y, int ny)} 18 | %apply (double* INPLACE_ARRAY1, int DIM1) {(FLOAT_TYPE *ynew, int nynew)} 19 | %apply (int* INPLACE_ARRAY1, int DIM1) {(int *offsets, int noffsets)} 20 | %apply (int* INPLACE_ARRAY1, int DIM1) {(int *indicator, int nindicator)} 21 | %apply (int* INPLACE_ARRAY1, int DIM1) {(int *chunks, int nchunks)} 22 | %apply (int* INPLACE_ARRAY1, int DIM1) {(int *counts, int ncounts)} 23 | %apply (int* INPLACE_ARRAY1, int DIM1) {(int *cumsums_minus_counts, int ncumsums_minus_counts)} 24 | 25 | %include "array.h" 26 | -------------------------------------------------------------------------------- /experiments/influence_lamda/params.py: -------------------------------------------------------------------------------- 1 | import collections 2 | 3 | odir = "results" 4 | methods = ["hugewood", "wood"] 5 | 6 | lamcrits = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0] 7 | 8 | datasets = collections.OrderedDict() 9 | datasets['covtype'] = {'train_sizes':[100000, 150000, 200000, 250000, 300000, 350000, 400000]} 10 | 11 | parameters = collections.OrderedDict() 12 | #parameters['ert'] = {'n_estimators':4, 13 | # 'max_features':None, 14 | # 'bootstrap':False, 15 | # 'tree_type':'randomized', 16 | # 'n_jobs':4} 17 | parameters['rf'] = {'n_estimators':24, 18 | 'max_features':"sqrt", 19 | 'bootstrap':True, 20 | 'tree_type':'standard', 21 | 'n_jobs':4} 22 | 23 | parameters_hugewood = collections.OrderedDict() 24 | 25 | for key in parameters: 26 | 27 | param_hugewood = {} 28 | param_hugewood['param_wood'] = parameters[key] 29 | param_hugewood['n_estimators'] = 6 30 | param_hugewood['n_estimators_bottom'] = 4 31 | 32 | parameters_hugewood[key] = param_hugewood 33 | -------------------------------------------------------------------------------- /woody/models/forest/src/timing.c: -------------------------------------------------------------------------------- 1 | #include "include/timing.h" 2 | 3 | /* -------------------------------------------------------------------------------- 4 | * Helper method for computing the current time (w.r.t to an offset). 
5 | * -------------------------------------------------------------------------------- 6 | */ 7 | long get_system_time_in_microseconds(void) { 8 | 9 | struct timeval tempo; 10 | gettimeofday(&tempo, NULL); 11 | 12 | return tempo.tv_sec * 1000000 + tempo.tv_usec; 13 | 14 | } 15 | 16 | void init_my_timer(TIMER *timer) { 17 | 18 | timer->start_time = 0; 19 | timer->elapsed_time = 0.0f; 20 | timer->elapsed_time_total = 0.0f; 21 | 22 | } 23 | 24 | void start_my_timer(TIMER *timer) { 25 | 26 | timer->start_time = get_system_time_in_microseconds(); 27 | 28 | } 29 | 30 | void stop_my_timer(TIMER *timer) { 31 | 32 | double current = (double) get_system_time_in_microseconds(); 33 | timer->elapsed_time = current - timer->start_time; 34 | timer->elapsed_time_total += timer->elapsed_time; 35 | 36 | } 37 | 38 | double get_my_timer(TIMER *timer) { 39 | 40 | return (double) (1.0 * timer->elapsed_time_total / 1000000.0); 41 | 42 | } 43 | -------------------------------------------------------------------------------- /experiments/influence_n_bottom/params.py: -------------------------------------------------------------------------------- 1 | import collections 2 | 3 | odir = "results" 4 | methods = ["hugewood_1K", "hugewood_10K", "hugewood_75K"] 5 | 6 | n_estimators_bottoms = [1,4,12,24] 7 | 8 | datasets = collections.OrderedDict() 9 | datasets['covtype'] = {'train_sizes':[100000, 150000, 200000, 250000, 300000, 350000, 400000]} 10 | 11 | parameters = collections.OrderedDict() 12 | #parameters['ert'] = {'n_estimators':4, 13 | # 'max_features':None, 14 | # 'bootstrap':False, 15 | # 'tree_type':'randomized', 16 | # 'n_jobs':4} 17 | parameters['rf'] = {'n_estimators':24, 18 | 'max_features':"sqrt", 19 | 'bootstrap':True, 20 | 'tree_type':'standard', 21 | 'n_jobs':4} 22 | 23 | parameters_hugewood = collections.OrderedDict() 24 | 25 | for key in parameters: 26 | 27 | param_hugewood = {} 28 | param_hugewood['param_wood'] = parameters[key] 29 | # set in hugewood*.py 30 | #param_hugewood['n_estimators'] = 6 31 | #param_hugewood['n_estimators_bottom'] = 4 32 | 33 | parameters_hugewood[key] = param_hugewood 34 | -------------------------------------------------------------------------------- /woody/models/forest/swig/cpu_float.i: -------------------------------------------------------------------------------- 1 | %module wrapper_cpu_float 2 | 3 | %{ 4 | #define SWIG_FILE_WITH_INIT 5 | #include "base.h" 6 | #include "types.h" 7 | %} 8 | 9 | %include "numpy.i" 10 | 11 | %init %{ 12 | import_array(); 13 | %} 14 | 15 | %apply (float* INPLACE_ARRAY2, int DIM1, int DIM2) {(FLOAT_TYPE* Xtrain, int nXtrain, int dXtrain)} 16 | %apply (float* INPLACE_ARRAY1, int DIM1) {(FLOAT_TYPE *Ytrain, int nYtrain)} 17 | 18 | %apply (float* INPLACE_ARRAY2, int DIM1, int DIM2) {(FLOAT_TYPE* Xtest, int nXtest, int dXtest)} 19 | %apply (float* INPLACE_ARRAY1, int DIM1) {(FLOAT_TYPE *predictions, int npredictions)} 20 | %apply (double* INPLACE_ARRAY2, int DIM1, int DIM2) {(FLOAT_TYPE* preds, int npreds, int dpreds)} 21 | 22 | %apply (int* INPLACE_ARRAY2, int DIM1, int DIM2) {(int *bootstrap_indices, int nbootstrap_indices, int dbootstrap_indices)} 23 | %apply (int* INPLACE_ARRAY2, int DIM1, int DIM2) {(int *bootstrap_indices_weights, int nbootstrap_indices_weights, int dbootstrap_indices_weights)} 24 | 25 | %apply (int* INPLACE_ARRAY2, int DIM1, int DIM2) {(int *indices, int nindices, int dindices)} 26 | 27 | %apply (int* INPLACE_ARRAY1, int DIM1) {(int *aforest, int naforest)} 28 | 29 | %include "base.h" 30 | %include "types.h" 31 | 
-------------------------------------------------------------------------------- /woody/models/forest/swig/gpu_float.i: -------------------------------------------------------------------------------- 1 | %module wrapper_gpu_float 2 | 3 | %{ 4 | #define SWIG_FILE_WITH_INIT 5 | #include "base.h" 6 | #include "types.h" 7 | %} 8 | 9 | %include "numpy.i" 10 | 11 | %init %{ 12 | import_array(); 13 | %} 14 | 15 | %apply (float* INPLACE_ARRAY2, int DIM1, int DIM2) {(FLOAT_TYPE* Xtrain, int nXtrain, int dXtrain)} 16 | %apply (float* INPLACE_ARRAY1, int DIM1) {(FLOAT_TYPE *Ytrain, int nYtrain)} 17 | 18 | %apply (float* INPLACE_ARRAY2, int DIM1, int DIM2) {(FLOAT_TYPE* Xtest, int nXtest, int dXtest)} 19 | %apply (float* INPLACE_ARRAY1, int DIM1) {(FLOAT_TYPE *predictions, int npredictions)} 20 | %apply (double* INPLACE_ARRAY2, int DIM1, int DIM2) {(FLOAT_TYPE* preds, int npreds, int dpreds)} 21 | 22 | %apply (int* INPLACE_ARRAY2, int DIM1, int DIM2) {(int *bootstrap_indices, int nbootstrap_indices, int dbootstrap_indices)} 23 | %apply (int* INPLACE_ARRAY2, int DIM1, int DIM2) {(int *bootstrap_indices_weights, int nbootstrap_indices_weights, int dbootstrap_indices_weights)} 24 | 25 | %apply (int* INPLACE_ARRAY2, int DIM1, int DIM2) {(int *indices, int nindices, int dindices)} 26 | 27 | %apply (int* INPLACE_ARRAY1, int DIM1) {(int *aforest, int naforest)} 28 | 29 | %include "base.h" 30 | %include "types.h" 31 | -------------------------------------------------------------------------------- /woody/models/forest/swig/gpu_double.i: -------------------------------------------------------------------------------- 1 | %module wrapper_gpu_double 2 | 3 | %{ 4 | #define SWIG_FILE_WITH_INIT 5 | #include "base.h" 6 | #include "types.h" 7 | %} 8 | 9 | %include "numpy.i" 10 | 11 | %init %{ 12 | import_array(); 13 | %} 14 | 15 | %apply (double* INPLACE_ARRAY2, int DIM1, int DIM2) {(FLOAT_TYPE* Xtrain, int nXtrain, int dXtrain)} 16 | %apply (double* INPLACE_ARRAY1, int DIM1) {(FLOAT_TYPE *Ytrain, int nYtrain)} 17 | 18 | %apply (double* INPLACE_ARRAY2, int DIM1, int DIM2) {(FLOAT_TYPE* Xtest, int nXtest, int dXtest)} 19 | %apply (double* INPLACE_ARRAY1, int DIM1) {(FLOAT_TYPE *predictions, int npredictions)} 20 | %apply (double* INPLACE_ARRAY2, int DIM1, int DIM2) {(FLOAT_TYPE* preds, int npreds, int dpreds)} 21 | 22 | %apply (int* INPLACE_ARRAY2, int DIM1, int DIM2) {(int *bootstrap_indices, int nbootstrap_indices, int dbootstrap_indices)} 23 | %apply (int* INPLACE_ARRAY2, int DIM1, int DIM2) {(int *bootstrap_indices_weights, int nbootstrap_indices_weights, int dbootstrap_indices_weights)} 24 | 25 | %apply (int* INPLACE_ARRAY2, int DIM1, int DIM2) {(int *indices, int nindices, int dindices)} 26 | 27 | %apply (int* INPLACE_ARRAY1, int DIM1) {(int *aforest, int naforest)} 28 | 29 | %include "base.h" 30 | %include "types.h" 31 | -------------------------------------------------------------------------------- /woody/models/forest/swig/cpu_double.i: -------------------------------------------------------------------------------- 1 | %module wrapper_cpu_double 2 | 3 | %{ 4 | #define SWIG_FILE_WITH_INIT 5 | #include "base.h" 6 | #include "types.h" 7 | %} 8 | 9 | %include "numpy.i" 10 | 11 | %init %{ 12 | import_array(); 13 | %} 14 | 15 | %apply (double* INPLACE_ARRAY2, int DIM1, int DIM2) {(FLOAT_TYPE* Xtrain, int nXtrain, int dXtrain)} 16 | %apply (double* INPLACE_ARRAY1, int DIM1) {(FLOAT_TYPE *Ytrain, int nYtrain)} 17 | 18 | %apply (double* INPLACE_ARRAY2, int DIM1, int DIM2) {(FLOAT_TYPE* Xtest, int nXtest, int 
dXtest)} 19 | %apply (double* INPLACE_ARRAY1, int DIM1) {(FLOAT_TYPE *predictions, int npredictions)} 20 | %apply (double* INPLACE_ARRAY2, int DIM1, int DIM2) {(FLOAT_TYPE* preds, int npreds, int dpreds)} 21 | 22 | %apply (int* INPLACE_ARRAY2, int DIM1, int DIM2) {(int *bootstrap_indices, int nbootstrap_indices, int dbootstrap_indices)} 23 | %apply (int* INPLACE_ARRAY2, int DIM1, int DIM2) {(int *bootstrap_indices_weights, int nbootstrap_indices_weights, int dbootstrap_indices_weights)} 24 | 25 | %apply (int* INPLACE_ARRAY2, int DIM1, int DIM2) {(int *indices, int nindices, int dindices)} 26 | 27 | %apply (int* INPLACE_ARRAY1, int DIM1) {(int *aforest, int naforest)} 28 | 29 | 30 | 31 | %include "base.h" 32 | %include "types.h" 33 | 34 | -------------------------------------------------------------------------------- /experiments/small_data/params.py: -------------------------------------------------------------------------------- 1 | import collections 2 | 3 | odir = "results" 4 | methods = ["hugewood_lam", "subsetwood", "sk", "h2"] 5 | 6 | datasets = collections.OrderedDict() 7 | datasets['covtype'] = {'train_sizes':[100000, 150000, 200000, 250000, 300000, 350000, 400000]} 8 | datasets["susy"] = {'train_sizes':[1000000, 1500000, 2000000, 2500000, 3000000, 3500000, 4000000]} 9 | datasets["higgs"] = {'train_sizes':[1000000, 2000000, 3000000, 4000000, 5000000, 6000000, 7000000, 8000000]} 10 | 11 | parameters = collections.OrderedDict() 12 | #parameters['ert'] = {'n_estimators':4, 13 | # 'max_features':None, 14 | # 'bootstrap':False, 15 | # 'tree_type':'randomized', 16 | # 'n_jobs':4} 17 | parameters['rf'] = {'n_estimators':24, 18 | 'max_features':"sqrt", 19 | 'bootstrap':True, 20 | 'tree_type':'standard', 21 | 'n_jobs':4} 22 | 23 | parameters_hugewood = collections.OrderedDict() 24 | 25 | for key in parameters: 26 | 27 | param_hugewood = {} 28 | param_hugewood['param_wood'] = parameters[key] 29 | param_hugewood['n_estimators'] = 6 30 | param_hugewood['n_estimators_bottom'] = 4 31 | 32 | parameters_hugewood[key] = param_hugewood 33 | -------------------------------------------------------------------------------- /woody/io/reader.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | import random 7 | 8 | class Reader(object): 9 | """ 10 | """ 11 | 12 | def __init__(self, 13 | fname=None, 14 | data=None, 15 | patterns=True, 16 | target=True, 17 | chunksize=32000, 18 | n_lines_max=None, 19 | seed=0, 20 | ): 21 | 22 | self.fname = fname 23 | self.data = data 24 | self.patterns = patterns 25 | self.target = target 26 | self.chunksize = chunksize 27 | self.n_lines_max = n_lines_max 28 | self.seed = seed 29 | 30 | self._randomgen = random.Random(self.seed) 31 | self._reader = None 32 | 33 | def __del__(self): 34 | 35 | self.close() 36 | 37 | def close(self): 38 | 39 | try: 40 | self._reader.close() 41 | except: 42 | pass 43 | 44 | def set_seed(self, s): 45 | 46 | self._randomgen.seed(s) 47 | 48 | def set_mode(self, patterns=True, target=True): 49 | 50 | self.patterns = patterns 51 | self.target = target 52 | -------------------------------------------------------------------------------- /woody/models/util.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | import os 7 | import logging 8 | from datetime import datetime 9 | 10 | from logging.handlers import 
RotatingFileHandler 11 | 12 | def init_logger(fname, log_name="Logger", log_level="INFO"): 13 | 14 | # create logging directory if needed 15 | d = os.path.dirname(fname) 16 | if not os.path.exists(d): 17 | os.makedirs(d) 18 | 19 | logger = logging.getLogger(log_name + "_" + str(datetime.now())) 20 | if log_level == 'INFO': 21 | logger.setLevel(logging.INFO) 22 | else: 23 | logger.setLevel(logging.DEBUG) 24 | 25 | # logging formatter 26 | #formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") 27 | formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") 28 | 29 | # store output if specified 30 | if fname is not None: 31 | log_handler = RotatingFileHandler(fname, 'a') 32 | log_handler.setFormatter(formatter) 33 | logger.addHandler(log_handler) 34 | 35 | # standard streaming handler 36 | ch = logging.StreamHandler() 37 | ch.setFormatter(formatter) 38 | logger.addHandler(ch) 39 | 40 | # avoid double outputs 41 | logger.propagate = 0 42 | 43 | return logger -------------------------------------------------------------------------------- /woody/util/array/src/include/array.h: -------------------------------------------------------------------------------- 1 | /* 2 | * util.h 3 | */ 4 | #ifndef INCLUDE_ARRAY_H_ 5 | #define INCLUDE_ARRAY_H_ 6 | 7 | #include "global.h" 8 | #include "util.h" 9 | 10 | #include 11 | #include 12 | 13 | /* -------------------------------------------------------------------------------- 14 | * Splits the array X according to the indices 15 | * -------------------------------------------------------------------------------- 16 | */ 17 | void split_array(FLOAT_TYPE *X, int nX, int dX, FLOAT_TYPE *Xnew, int nXnew, int dXnew, int *indicator, int nindicator, int *chunks, int nchunks, int *cumsums_minus_counts, int ncumsums_minus_counts); 18 | 19 | /* -------------------------------------------------------------------------------- 20 | * Computes split offsets 21 | * -------------------------------------------------------------------------------- 22 | */ 23 | void compute_split_offsets(int *offsets, int noffsets, 24 | int *indicator, int nindicator, 25 | int *chunks, int nchunks, 26 | int *cumsums_minus_counts, int ncumsums_minus_counts); 27 | 28 | /* -------------------------------------------------------------------------------- 29 | * Transposes an array 30 | * -------------------------------------------------------------------------------- 31 | */ 32 | void transpose_array(FLOAT_TYPE* X, int nX, int dX, FLOAT_TYPE* XT, int nXT, int dXT); 33 | 34 | #endif 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | #Ipython Notebook 62 | .ipynb_checkpoints 63 | 64 | .venv 65 | data/*/*.h5 66 | data/*/*.h5pd 67 | data/*/*.csv 68 | data/*/*.html 69 | data/*/*_files* 70 | data/landsat 71 | 72 | .project 73 | .pydevproject 74 | 75 | # swig related 76 | *_double.py 77 | *_float.py 78 | *_wrap.c 79 | 80 | experiments/landsat/data 81 | experiments/landsat/tmp 82 | experiments/large_data/data 83 | experiments/large_data/tmp 84 | -------------------------------------------------------------------------------- /woody/data/landsat.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | import os 7 | from woody.io import DataGenerator 8 | 9 | from .util import check_and_download 10 | 11 | def get_landsat_files(data_path, data_set="LC81950212016133LGN00", version="1_1", train_size=0): 12 | 13 | fname_train = os.path.join(data_path, "landsat", str(data_set) + "_" + version + ".train.csv") 14 | fname_test = os.path.join(data_path, "landsat", str(data_set) + "_" + version + ".test.csv") 15 | check_and_download(fname_train) 16 | check_and_download(fname_test) 17 | 18 | if train_size > 0: 19 | fname_train_size = os.path.join(data_path, "landsat", str(data_set) + "_" + version + ".train_%i.csv" % train_size) 20 | if not os.path.exists(fname_train_size): 21 | os.system("sed -n '%i,%ip;%iq' < %s > %s" % (1, train_size, train_size, fname_train, fname_train_size)) 22 | fname_train = fname_train_size 23 | 24 | return fname_train, fname_test 25 | 26 | def get_landsat_generator(data_path, train_size=10000000, data_set="LC81950212016133LGN00", version="1_1", seed=0, part="train", store=None, patterns=True, target=True, chunksize=5000000): 27 | 28 | assert version in ["1_1", "3_3", "pan_1_1", "pan_3_3"] 29 | 30 | if part=="train": 31 | fname = os.path.join(data_path, "landsat", str(data_set) + "_" + version + ".train.h5pd") 32 | elif part=="test": 33 | fname = os.path.join(data_path, "landsat", str(data_set) + "_" + version + ".test.h5pd") 34 | check_and_download(fname) 35 | 36 | return DataGenerator(fname=fname, seed=seed, patterns=patterns, target=target, chunksize=chunksize) -------------------------------------------------------------------------------- /woody/models/base.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | from .util import init_logger 7 | 8 | class NoLogger(): 9 | 10 | def __init__(self): 11 | pass 12 | 13 | def info(self, msg): 14 | pass 15 | 16 | def debug(self, msg): 17 | pass 18 | 19 | class BaseEstimator(object): 20 | 21 | def __init__(self, 22 | verbose=0, 23 | logging_name="BaseEstimator", 24 | logging_file=None, 25 | seed=0, 26 | ): 27 | 28 | self.verbose = verbose 29 | self.logging_name = logging_name 30 | self.seed = seed 31 | 32 | def fit(self, logging_file="estimator.log"): 33 | 34 | # instantiate logger 35 | if self.verbose > 0: 36 | self._logger = init_logger(fname=logging_file, 37 | 
log_name=self.logging_name, 38 | log_level="DEBUG") 39 | else: 40 | self._logger = NoLogger() 41 | 42 | def get_params(self): 43 | """ Returns the models's parameters 44 | """ 45 | 46 | return {"verbose": self.verbose, 47 | "logging_name" : self.logging_name, 48 | "seed": self.seed, 49 | } 50 | 51 | def set_params(self, **parameters): 52 | """ Sets local parameters (does not need 53 | to be overwritten). 54 | """ 55 | 56 | for parameter, value in parameters.items(): 57 | self.setattr(parameter, value) 58 | 59 | 60 | -------------------------------------------------------------------------------- /woody/models/forest/src/include/util.h: -------------------------------------------------------------------------------- 1 | /* 2 | * util.h 3 | */ 4 | #ifndef COMMON_INCLUDE_UTIL_H_ 5 | #define COMMON_INCLUDE_UTIL_H_ 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #include "float.h" 22 | 23 | #define max(a,b) \ 24 | ({ __typeof__ (a) _a = (a); \ 25 | __typeof__ (b) _b = (b); \ 26 | _a > _b ? _a : _b; }) 27 | 28 | #define min(a,b) \ 29 | ({ __typeof__ (a) _a = (a); \ 30 | __typeof__ (b) _b = (b); \ 31 | _a < _b ? _a : _b; }) 32 | 33 | #define ELEM_SWAP(a,b) { register FLOAT_TYPE t=(a);(a)=(b);(b)=t; } 34 | #define median(a,n) kth_smallest(a,n,((n)/2)) 35 | 36 | /* -------------------------------------------------------------------------------- 37 | * Transposes an array (float) 38 | * -------------------------------------------------------------------------------- 39 | */ 40 | void transpose_array_float(float* array, int n, int d, float* array_transposed); 41 | 42 | /* -------------------------------------------------------------------------------- 43 | * Transposes an array (double) 44 | * -------------------------------------------------------------------------------- 45 | */ 46 | void transpose_array_double(double* array, int n, int d, 47 | double* array_transposed); 48 | 49 | int compare_floats(const void *p1, const void *p2); 50 | 51 | int compare_ints(const void *p1, const void *p2); 52 | 53 | FLOAT_TYPE kth_smallest(FLOAT_TYPE a[], int n, int k); 54 | int kth_smallest_idx(FLOAT_TYPE a[], int n, int k); 55 | 56 | #endif 57 | -------------------------------------------------------------------------------- /woody/util/url.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2013-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | from __future__ import print_function 7 | 8 | import os 9 | import sys 10 | 11 | try: 12 | import urllib.request as urllib2 13 | except ImportError: 14 | import urllib2 15 | 16 | def download_from_url(url, fname): 17 | """ Downloads data from a given url. 18 | 19 | Parameters 20 | ---------- 21 | url : str 22 | The target url from which the data 23 | shall be downloaded 24 | fname : str 25 | The local filename; if the corresponding 26 | directory does not exists, it will be created 27 | """ 28 | 29 | # create directory if needed 30 | d = os.path.dirname(fname) 31 | if not os.path.exists(d): 32 | os.makedirs(d) 33 | 34 | # open local file 35 | f = open(fname, 'wb') 36 | 37 | # get data from url; based on 38 | # http://stackoverflow.com/questions/22676/how-do-i-download-a-file-over-http-using-python 39 | u = urllib2.urlopen(url) 40 | meta = u.info() 41 | fsize = int(meta.getheaders("Content-Length")[0]) 42 | print("Downloading from %s (%i bytes) ... 
\n" % (url, fsize)) 43 | 44 | fsize_current = 0 45 | block_size = 8192 46 | 47 | print("Progress") 48 | while True: 49 | 50 | buff = u.read(block_size) 51 | if not buff: 52 | break 53 | 54 | fsize_current += len(buff) 55 | f.write(buff) 56 | 57 | percent = fsize_current * 100. / fsize 58 | 59 | sys.stdout.flush() 60 | sys.stdout.write("\r%2d%%" % percent) 61 | 62 | sys.stdout.flush() 63 | 64 | print("\n") 65 | f.close() 66 | -------------------------------------------------------------------------------- /woody/models/forest/src/tree/cpu/include/criteria.h: -------------------------------------------------------------------------------- 1 | /* 2 | * criteria.h 3 | * 4 | * Created on: 08.01.2015 5 | * Author: fgieseke 6 | */ 7 | 8 | #ifndef ENSEMBLE_HUGE_FOREST_INCLUDE_CRITERIA_H_ 9 | #define ENSEMBLE_HUGE_FOREST_INCLUDE_CRITERIA_H_ 10 | 11 | #include "../../include/global.h" 12 | #include "../../include/util.h" 13 | 14 | void criterion_improvement_via_threshold(FLOAT_TYPE threshold, PATTERN_LABEL_WEIGHT *XF_Y_W, TRAINING_DATA *train_data, 15 | TRAVERSAL_RECORD *trecord, PARAMETERS *params, SPLIT_RECORD *current_split); 16 | 17 | /* -------------------------------------------------------------------------------- 18 | * Computes the impurity for samples[start:end] 19 | * Similar to RegressionCriterion(Criterion) of sklearn 20 | * -------------------------------------------------------------------------------- 21 | */ 22 | FLOAT_TYPE cpu_criterion_leaf(int start, int end, 23 | TRAINING_DATA *train_data, PARAMETERS *params); 24 | 25 | /* -------------------------------------------------------------------------------- 26 | * Initializes a splitting criterion (which can be updated). 27 | * -------------------------------------------------------------------------------- 28 | */ 29 | void init_criterion_cpu(CRITERION_RECORD *crit_record, PATTERN_LABEL_WEIGHT *XF_Y_W, 30 | int n_XF_Y_W, PARAMETERS *params, TRAINING_DATA *train_data); 31 | 32 | void free_criterion_cpu(CRITERION_RECORD *crit_record, PARAMETERS *params, TRAINING_DATA *train_data); 33 | 34 | /* -------------------------------------------------------------------------------- 35 | * Updates a criterion. 36 | * -------------------------------------------------------------------------------- 37 | */ 38 | void inline update_criterion_cpu(CRITERION_RECORD *crit_record, 39 | PATTERN_LABEL_WEIGHT *XF_Y_W, int n_XF_Y_W, int new_pos, PARAMETERS *params, TRAINING_DATA *train_data); 40 | 41 | #endif /* ENSEMBLE_HUGE_FOREST_INCLUDE_CRITERIA_H_ */ 42 | -------------------------------------------------------------------------------- /woody/models/sampler.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | import copy 7 | import numpy 8 | 9 | class Sampler(object): 10 | """ 11 | """ 12 | 13 | def __init__(self, model, seed=0, n_estimators=10, percentage=0.5): 14 | 15 | self.model = model 16 | self.seed = seed 17 | self.n_estimators = n_estimators 18 | self.percentage = percentage 19 | 20 | self.models = [] 21 | for i in xrange(self.n_estimators): 22 | self.models.append(copy.deepcopy(self.model)) 23 | 24 | def fit(self, X, y): 25 | 26 | for i in xrange(self.n_estimators): 27 | print("Fitting model %i ..." 
% i) 28 | partition = numpy.random.permutation(X.shape[0]) 29 | partition = partition[:int(self.percentage * len(partition))] 30 | Xsub = X[partition] 31 | ysub = y[partition] 32 | self.models[i].fit(Xsub, ysub) 33 | 34 | def predict(self, X, operator="max"): 35 | 36 | all_predictions = self._predict_all(X) 37 | 38 | preds = [] 39 | for j in xrange(all_predictions.shape[0]): 40 | p = all_predictions[j,:] 41 | values, counts = numpy.unique(p,return_counts=True) 42 | ind = numpy.argmax(counts) 43 | preds.append(values[ind]) 44 | preds = numpy.array(preds) 45 | 46 | return preds 47 | 48 | def _predict_all(self, X): 49 | 50 | predictions = [] 51 | for i in xrange(self.n_estimators): 52 | print("Computing predictions for model %i ..." % i) 53 | preds = self.models[i].predict(X) 54 | predictions.append(preds) 55 | predictions = numpy.array(predictions).T 56 | 57 | return predictions 58 | -------------------------------------------------------------------------------- /woody/models/forest/classification.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | from .base import Wood 7 | 8 | class WoodClassifier(Wood): 9 | """ Random forest classifier. 10 | """ 11 | 12 | def __init__(self, 13 | seed=0, 14 | n_estimators=10, 15 | min_samples_split=2, 16 | max_features=None, 17 | bootstrap=False, 18 | max_depth=None, 19 | min_samples_leaf=1, 20 | criterion="gini", 21 | tree_traversal_mode="dfs", 22 | leaf_stopping_mode="all", 23 | tree_type="randomized", 24 | float_type="double", 25 | patts_trans=True, 26 | do_patts_trans=True, 27 | lam_criterion=0.0, 28 | n_jobs=1, 29 | verbose=1, 30 | ): 31 | 32 | super(WoodClassifier, self).__init__( 33 | seed=seed, 34 | n_estimators=n_estimators, 35 | min_samples_split=min_samples_split, 36 | max_features=max_features, 37 | bootstrap=bootstrap, 38 | max_depth=max_depth, 39 | min_samples_leaf=min_samples_leaf, 40 | learning_type="classification", 41 | criterion=criterion, 42 | tree_traversal_mode=tree_traversal_mode, 43 | leaf_stopping_mode=leaf_stopping_mode, 44 | tree_type=tree_type, 45 | float_type=float_type, 46 | patts_trans=patts_trans, 47 | do_patts_trans=do_patts_trans, 48 | lam_criterion=lam_criterion, 49 | n_jobs=n_jobs, 50 | verbose=verbose) -------------------------------------------------------------------------------- /woody/models/forest/regression.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | from .base import Wood 7 | 8 | class WoodRegressor(Wood): 9 | """ Random forest regressor. 
10 | """ 11 | 12 | def __init__(self, 13 | seed=0, 14 | n_estimators=10, 15 | min_samples_split=2, 16 | max_features=None, 17 | bootstrap=False, 18 | max_depth=None, 19 | min_samples_leaf=1, 20 | criterion="mse", 21 | tree_traversal_mode="dfs", 22 | leaf_stopping_mode="all", 23 | tree_type="randomized", 24 | float_type="double", 25 | patts_trans=True, 26 | do_patts_trans=True, 27 | lam_criterion=0.0, 28 | n_jobs=1, 29 | verbose=1, 30 | ): 31 | 32 | super(WoodRegressor, self).__init__( 33 | seed=seed, 34 | n_estimators=n_estimators, 35 | min_samples_split=min_samples_split, 36 | max_features=max_features, 37 | bootstrap=bootstrap, 38 | max_depth=max_depth, 39 | min_samples_leaf=min_samples_leaf, 40 | learning_type="regression", 41 | criterion=criterion, 42 | tree_traversal_mode=tree_traversal_mode, 43 | leaf_stopping_mode=leaf_stopping_mode, 44 | tree_type=tree_type, 45 | float_type=float_type, 46 | patts_trans=patts_trans, 47 | do_patts_trans=do_patts_trans, 48 | lam_criterion=lam_criterion, 49 | n_jobs=n_jobs, 50 | verbose=verbose) 51 | -------------------------------------------------------------------------------- /woody/data/artificial.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | import os 7 | import shutil 8 | 9 | from woody.io import DataGenerator 10 | 11 | from .util import save_to_h5pd 12 | 13 | def get_artificial_data(size=1000, seed=0): 14 | 15 | from sklearn.datasets import make_classification 16 | 17 | X, y = make_classification(n_samples=size, n_features=2, n_redundant=0, 18 | n_informative=2, random_state=seed, 19 | n_clusters_per_class=1) 20 | n_train = len(X) / 2 21 | X_train, y_train, X_test, y_test = X[:n_train], y[:n_train], X[n_train:], y[n_train:] 22 | 23 | return X_train, y_train, X_test, y_test 24 | 25 | def _convert_datasets(data_path, size=1000, seed=0): 26 | 27 | X_train, y_train, X_test, y_test = get_artificial_data(size=size, seed=seed) 28 | 29 | fname_store_train = os.path.join(data_path, "artificial/train_" + str(size) + ".h5pd") 30 | fname_store_test = os.path.join(data_path, "artificial/test_" + str(size) + ".h5pd") 31 | 32 | save_to_h5pd(X_train, y_train, fname_store_train) 33 | save_to_h5pd(X_test, y_test, fname_store_test) 34 | 35 | def get_artificial_generator(data_path, size=1000, seed=0, part="train", store="h5", patterns=True, target=True): 36 | 37 | if part=="train": 38 | fname = os.path.join(data_path, "artificial/train_" + str(size) + ".h5pd") 39 | elif part=="test": 40 | fname = os.path.join(data_path, "artificial/test_" + str(size) + ".h5pd") 41 | 42 | 43 | try: 44 | shutil.rmtree(fname) 45 | except: 46 | pass 47 | 48 | if not os.path.exists(fname): 49 | print("Store for artificial data does not exist. 
Generating all stores ...") 50 | _convert_datasets(data_path, size=size, seed=seed) 51 | 52 | return DataGenerator(fname=fname, seed=seed, patterns=patterns, target=target, chunksize=200000) 53 | -------------------------------------------------------------------------------- /woody/util/base.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | import os 7 | 8 | def makedirs(d): 9 | """ 10 | """ 11 | 12 | if not os.path.exists(d): 13 | os.makedirs(d) 14 | 15 | def ensure_dir_for_file(f): 16 | """ 17 | """ 18 | 19 | d = os.path.dirname(f) 20 | makedirs(d) 21 | 22 | def convert_to_libsvm(ifile_name, ofile_name, counter_print=1000000, label_offset=None): 23 | 24 | orig_labels = [] 25 | new_labels = [] 26 | 27 | ifile = open(ifile_name, 'r') 28 | ofile = open(ofile_name, 'w') 29 | 30 | # process file line-by-line 31 | counter = 0 32 | 33 | for line in ifile: 34 | 35 | new_line = [] 36 | 37 | if counter % counter_print == 0: 38 | print("Processing line %i ..." % counter) 39 | print("orig_labels=" + str(orig_labels)) 40 | print("new_labels=" + str(new_labels)) 41 | 42 | line = line.split(',') 43 | 44 | # append label 45 | label = line[0] 46 | orig_labels = list(orig_labels) 47 | orig_labels.append(label) 48 | orig_labels = set(orig_labels) 49 | 50 | if label_offset is not None: 51 | label = int(label) + label_offset 52 | new_labels = list(new_labels) 53 | new_labels.append(label) 54 | new_labels = set(new_labels) 55 | 56 | new_line.append(str(label)) 57 | 58 | # append features 59 | for i, item in enumerate(line[1:]): 60 | new_item = "%s:%s" % (i+1, item.strip()) 61 | new_line.append(new_item) 62 | 63 | new_line = " ".join(new_line) 64 | new_line += "\n" 65 | 66 | ofile.write(new_line) 67 | 68 | counter += 1 69 | 70 | ifile.close() 71 | ofile.close() 72 | 73 | print("orig_labels=" + str(orig_labels)) 74 | print("new_labels=" + str(new_labels)) -------------------------------------------------------------------------------- /woody/models/subset/regression.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | from woody.io import DiskStore 7 | 8 | from .base import SubsetWood 9 | 10 | class SubsetWoodRegressor(SubsetWood): 11 | """ Random forest regressor. 
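    Each estimator is trained on a random subset of the training patterns; the
    subsampling itself is done by the SubsetWood base class (its fit method
    permutes the patterns and keeps a fixed fraction of them).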
12 | """ 13 | 14 | def __init__(self, 15 | seed=0, 16 | n_estimators=10, 17 | min_samples_split=2, 18 | max_features=None, 19 | bootstrap=False, 20 | max_depth=None, 21 | min_samples_leaf=1, 22 | criterion="mse", 23 | tree_traversal_mode="dfs", 24 | leaf_stopping_mode="all", 25 | tree_type="randomized", 26 | float_type="double", 27 | patts_trans=True, 28 | do_patts_trans=True, 29 | lam_criterion = 1.0, 30 | n_jobs=1, 31 | verbose=1, 32 | odir=".subsetwood", 33 | store=DiskStore(), 34 | ): 35 | 36 | super(SubsetWoodRegressor, self).__init__( 37 | seed=seed, 38 | n_estimators=n_estimators, 39 | min_samples_split=min_samples_split, 40 | max_features=max_features, 41 | bootstrap=bootstrap, 42 | max_depth=max_depth, 43 | min_samples_leaf=min_samples_leaf, 44 | learning_type="regression", 45 | criterion=criterion, 46 | tree_traversal_mode=tree_traversal_mode, 47 | leaf_stopping_mode=leaf_stopping_mode, 48 | tree_type=tree_type, 49 | float_type=float_type, 50 | patts_trans=patts_trans, 51 | do_patts_trans=do_patts_trans, 52 | lam_criterion=lam_criterion, 53 | n_jobs=n_jobs, 54 | verbose=verbose, 55 | odir=odir, 56 | store=store) 57 | -------------------------------------------------------------------------------- /woody/models/subset/classification.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | from woody.io import DiskStore 7 | 8 | from .base import SubsetWood 9 | 10 | class SubsetWoodClassifier(SubsetWood): 11 | """ Random forest classifier. 12 | """ 13 | 14 | def __init__(self, 15 | seed=0, 16 | n_estimators=10, 17 | min_samples_split=2, 18 | max_features=None, 19 | bootstrap=False, 20 | max_depth=None, 21 | min_samples_leaf=1, 22 | criterion="gini", 23 | tree_traversal_mode="dfs", 24 | leaf_stopping_mode="all", 25 | tree_type="randomized", 26 | float_type="double", 27 | patts_trans=True, 28 | do_patts_trans=True, 29 | lam_criterion = 1.0, 30 | n_jobs=1, 31 | verbose=1, 32 | odir=".subsetwood", 33 | store=DiskStore(), 34 | ): 35 | 36 | super(SubsetWoodClassifier, self).__init__( 37 | seed=seed, 38 | n_estimators=n_estimators, 39 | min_samples_split=min_samples_split, 40 | max_features=max_features, 41 | bootstrap=bootstrap, 42 | max_depth=max_depth, 43 | min_samples_leaf=min_samples_leaf, 44 | learning_type="classification", 45 | criterion=criterion, 46 | tree_traversal_mode=tree_traversal_mode, 47 | leaf_stopping_mode=leaf_stopping_mode, 48 | tree_type=tree_type, 49 | float_type=float_type, 50 | patts_trans=patts_trans, 51 | do_patts_trans=do_patts_trans, 52 | lam_criterion=lam_criterion, 53 | n_jobs=n_jobs, 54 | verbose=verbose, 55 | odir=odir, 56 | store=store) -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | woody 2 | ===== 3 | 4 | A Python library for constructing very large random forests. The basic idea is to use "top trees" built for a small random subset of the data and to use these top trees to distribute all the training instances to the top trees' leaves. For each leaf, one or more bottom trees are built. For the bottom trees, woody resorts to pure C code that follows the random forest construction scheme provided by the `Scikit-Learn `_. 5 | 6 | Dependencies 7 | ------------ 8 | 9 | The woody package is tested under Python 2.7. See the requirements.txt for the packages that need to be installed. 
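In particular, the code base imports numpy, scipy, pandas, h5py, and scikit-learn; networkx and matplotlib are only needed for the optional tree plotting utilities in woody/util/draw.py.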
10 | 11 | Further, `Swig `_, `setuptools `_, and a working C/C++ compiler need to be available. 12 | 13 | Quickstart 14 | ---------- 15 | 16 | To install the package from the sources, first get the current development release via:: 17 | 18 | git clone https://github.com/gieseke/woody.git 19 | 20 | Afterwards, install a virtual environment via virtualenv. Go to the root of the woody package and type:: 21 | 22 | mkdir .venv 23 | cd .venv 24 | virtualenv woody 25 | source woody/bin/activate 26 | cd .. 27 | pip install -r requirements 28 | 29 | Next, you can install the package locally (development) via:: 30 | 31 | python setup.py clean 32 | python setup.py develop 33 | 34 | To run all the experiments, you also need to manually install:: 35 | 36 | git clone https://github.com/tgsmith61591/skutil 37 | cd skutil 38 | python setup.py install 39 | 40 | Experiments 41 | ----------- 42 | 43 | To run the experiments, simply run the launch.py file in the corresponding subdirectory. The associated run files will automatically download the datasets needed (in case this phase is interrupted, please delete the incomplete data files in the corresponding directory under woody/data). For instance:: 44 | 45 | cd experiments/small_data 46 | python launch.py 47 | 48 | Disclaimer 49 | ---------- 50 | 51 | The source code is published under the GNU General Public License (GPLv3). The authors are not responsible for any implications that stem from the use of this software. 52 | 53 | -------------------------------------------------------------------------------- /woody/models/forest/src/include/pqueue.h: -------------------------------------------------------------------------------- 1 | /* 2 | * heap.h 3 | * 4 | * Created on: 21.10.2014 5 | * Author: fgieseke 6 | */ 7 | 8 | #ifndef COMMON_INCLUDE_PQUEUE_H_ 9 | #define COMMON_INCLUDE_PQUEUE_H_ 10 | 11 | #include 12 | #include 13 | 14 | #define PQUEUE_MIN_SIZE 64 15 | 16 | typedef struct { 17 | void * data; 18 | int pri; 19 | } PQUEUE_ITEM; 20 | 21 | typedef struct { 22 | PQUEUE_ITEM *buf; 23 | int n; 24 | int alloc; 25 | } PQUEUE; 26 | 27 | // macros 28 | #define pqueue_purge(q) (q)->n = 1 29 | #define pqueue_size(q) ((q)->n - 1) 30 | 31 | /* -------------------------------------------------------------------------------- 32 | * Instantiates a new queue 33 | * -------------------------------------------------------------------------------- 34 | */ 35 | PQUEUE *pqueue_new(int size); 36 | 37 | /* -------------------------------------------------------------------------------- 38 | * Tests if the queue is empty 39 | * -------------------------------------------------------------------------------- 40 | */ 41 | inline int pqueue_is_empty(PQUEUE *q); 42 | 43 | /* -------------------------------------------------------------------------------- 44 | * Pushes "data" with priority "pri" 45 | * -------------------------------------------------------------------------------- 46 | */ 47 | void pqueue_push(PQUEUE *q, void *data, int pri); 48 | 49 | /* -------------------------------------------------------------------------------- 50 | * Removes top item (or returns 0 if queue is empty); *pri can be NULL. 
51 | * -------------------------------------------------------------------------------- 52 | */ 53 | void *pqueue_pop(PQUEUE *q, int *pri); 54 | 55 | /* -------------------------------------------------------------------------------- 56 | * Returns the top of the queue 57 | * -------------------------------------------------------------------------------- 58 | */ 59 | inline void *pqueue_top(PQUEUE *q, int *pri); 60 | 61 | /* -------------------------------------------------------------------------------- 62 | * Combines/merges two queues 63 | * -------------------------------------------------------------------------------- 64 | */ 65 | void pqueue_combine(PQUEUE *q1, PQUEUE *q2); 66 | 67 | #endif /* COMMON_INCLUDE_PQUEUE_H_ */ 68 | -------------------------------------------------------------------------------- /woody/models/forest/src/include/timing.h: -------------------------------------------------------------------------------- 1 | /* 2 | * timing.h 3 | */ 4 | #ifndef INCLUDE_TIMING_H_ 5 | #define INCLUDE_TIMING_H_ 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | // don't use time if not specified 22 | #ifndef TIMING 23 | #define TIMING 0 24 | #endif 25 | 26 | // struct for input parameters 27 | typedef struct timer_struct { 28 | 29 | long start_time; 30 | double elapsed_time; 31 | double elapsed_time_total; 32 | 33 | } TIMER; 34 | 35 | #define INIT_MY_TIMER init_my_timer 36 | #define START_MY_TIMER start_my_timer 37 | #define RESUME_MY_TIMER start_my_timer 38 | #define STOP_MY_TIMER stop_my_timer 39 | #define GET_MY_TIMER get_my_timer 40 | 41 | void start_my_timer(TIMER *timer); 42 | void resume_my_timer(TIMER *timer); 43 | void stop_my_timer(TIMER *timer); 44 | double get_my_timer(TIMER *timer); 45 | void init_my_timer(TIMER *timer); 46 | 47 | // timing macros 48 | #if TIMING > 0 49 | #define DEFINE_TIMER(num) long start_time##num = 0; double elapsed_time##num = 0.0f; double elapsed_time_total##num = 0.0f; 50 | #define DECLARE_TIMER(num) extern long start_time##num; extern double elapsed_time##num; extern double elapsed_time_total##num; 51 | #define START_TIMER(num) start_time##num = get_system_time_in_microseconds(); 52 | #define STOP_TIMER(num) elapsed_time##num = (((double)get_system_time_in_microseconds())-((double)start_time##num)); elapsed_time_total##num+=elapsed_time##num; 53 | #define GET_TIME(num) (double)(1.0*elapsed_time_total##num / 1000000.0) 54 | #define RESET_TIMER(num) start_time##num = 0; elapsed_time##num = 0.0f; elapsed_time_total##num = 0.0f; 55 | #else 56 | #define DEFINE_TIMER(num) 57 | #define DECLARE_TIMER(num) 58 | #define START_TIMER(num) 59 | #define STOP_TIMER(num) 60 | #define GET_TIME(num) 61 | #define RESET_TIMER(num) 62 | #endif 63 | 64 | /* -------------------------------------------------------------------------------- 65 | * Helper method for computing the current time (w.r.t to an offset). 
66 | * -------------------------------------------------------------------------------- 67 | */ 68 | long get_system_time_in_microseconds(void); 69 | 70 | #endif /* INCLUDE_TIMING_H_ */ 71 | -------------------------------------------------------------------------------- /woody/util/array/src/array.c: -------------------------------------------------------------------------------- 1 | #include "include/array.h" 2 | 3 | /* -------------------------------------------------------------------------------- 4 | * Splits the array X according to the indices 5 | * -------------------------------------------------------------------------------- 6 | */ 7 | void split_array(FLOAT_TYPE *X, int nX, int dX, 8 | FLOAT_TYPE *Xnew, int nXnew, int dXnew, 9 | int *indicator, int nindicator, 10 | int *chunks, int nchunks, 11 | int *cumsums_minus_counts, int ncumsums_minus_counts){ 12 | 13 | int i; 14 | 15 | int *offsets = (int*) malloc(nX * sizeof(int)); 16 | 17 | compute_split_offsets(offsets, nX, indicator, nindicator, chunks, nchunks, cumsums_minus_counts, ncumsums_minus_counts); 18 | 19 | for(i=0; i= nX){ 22 | printf("Bad offset: %i [%i, %i]\n!", offsets[i], 0, nX); 23 | exit(-1); 24 | } 25 | copy_pattern(X + i * dX, Xnew + offsets[i] * dX, dX); 26 | } 27 | 28 | free(offsets); 29 | 30 | } 31 | 32 | 33 | void compute_split_offsets(int *offsets, int noffsets, 34 | int *indicator, int nindicator, 35 | int *chunks, int nchunks, 36 | int *cumsums_minus_counts, int ncumsums_minus_counts){ 37 | 38 | int i; 39 | int *chunks_counters = (int*) calloc(nchunks, sizeof(int)); 40 | 41 | for(i=0; i 0 75 | #define FLOAT_TYPE double 76 | #else 77 | #define FLOAT_TYPE float 78 | #endif 79 | 80 | #define FREE_RESOURCES cpu_free_resources 81 | #define COMPUTE_SPLITS cpu_compute_splits 82 | #define INIT_BINDICES cpu_init_bindices 83 | #define FREE_BINDICES cpu_free_bindices 84 | #define INIT_TRAINING_DATA cpu_init_training_data 85 | #define FREE_TRAINING_DATA cpu_free_training_data 86 | #define INIT cpu_init 87 | #define INIT_AFTER_FITTING cpu_init_after_fitting 88 | #define PREDICT cpu_predict 89 | 90 | #define PRINT(params) if ((params->verbosity_level) > 0) printf 91 | 92 | #endif /* FOREST_STANDARD_GLOBAL_INCLUDE_H_ */ 93 | -------------------------------------------------------------------------------- /woody/util/array/setup.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | import os 7 | import numpy 8 | 9 | FILES_TO_BE_COMPILED_CPU = ["array.c", "util.c"] 10 | DIRS_TO_BE_INCLUDED_CPU = ["include"] 11 | 12 | SOURCES_RELATIVE_PATH = "src/" 13 | current_path = os.path.dirname(os.path.abspath(__file__)) 14 | sources_abs_path = os.path.abspath(os.path.join(current_path, SOURCES_RELATIVE_PATH)) 15 | 16 | # source files 17 | source_files_cpu = [os.path.abspath(os.path.join(sources_abs_path, x)) for x in FILES_TO_BE_COMPILED_CPU] 18 | include_paths_cpu = [os.path.abspath(os.path.join(sources_abs_path, x)) for x in DIRS_TO_BE_INCLUDED_CPU] 19 | 20 | numpy_include = numpy.get_include() 21 | 22 | def configuration(parent_package='', top_path=None): 23 | 24 | from numpy.distutils.misc_util import Configuration 25 | config = Configuration('util/c', parent_package, top_path) 26 | 27 | # CPU + FLOAT 28 | config.add_extension("_wrapper_utils_cpu_float", \ 29 | sources = ["swig/cpu_float.i"] + source_files_cpu, 30 | swig_opts=['-modern'], 31 | include_dirs = [numpy_include] +[include_paths_cpu], 32 | define_macros = 
[ 33 | ('USE_DOUBLE', 0), 34 | ], 35 | libraries=['gomp'], 36 | extra_compile_args=["-std=gnu89", "-fopenmp", '-pthread', '-O3', '-Wall', '-Wno-unused-label'] + ['-I'+ipath for ipath in include_paths_cpu]) 37 | 38 | # CPU + DOUBLE 39 | config.add_extension("_wrapper_utils_cpu_double", \ 40 | sources = ["swig/cpu_double.i"] + source_files_cpu, 41 | swig_opts=['-modern'], 42 | include_dirs = [numpy_include] +[include_paths_cpu], 43 | define_macros = [ 44 | ('USE_DOUBLE', 1), 45 | ], 46 | libraries=['gomp'], 47 | extra_compile_args=["-std=gnu89", "-fopenmp", '-pthread', '-O3', '-Wall', '-Wno-unused-label'] + ['-I'+ipath for ipath in include_paths_cpu]) 48 | 49 | return config 50 | 51 | if __name__ == '__main__': 52 | 53 | from numpy.distutils.core import setup 54 | setup(**configuration(top_path='').todict()) 55 | 56 | -------------------------------------------------------------------------------- /woody/models/forest/util.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | import numpy 7 | 8 | def ensure_data_types(X, y, numpy_dtype_float): 9 | 10 | # ensure floats everywhere (e.g., for split array computations) 11 | if X.dtype != numpy_dtype_float: 12 | X = X.astype(numpy_dtype_float) 13 | if y.dtype != numpy_dtype_float: 14 | y = y.astype(numpy_dtype_float) 15 | 16 | return X, y 17 | 18 | class PickableWoodyRFWrapper(object): 19 | """ 20 | """ 21 | 22 | def __init__(self, *args): 23 | 24 | self.args = args 25 | 26 | self.float_type = args[0] 27 | 28 | self._params_swig = self.module.PARAMETERS() 29 | self._forest_swig = self.module.FOREST() 30 | 31 | @property 32 | def params(self): 33 | 34 | return self._params_swig 35 | 36 | @property 37 | def forest(self): 38 | 39 | return self._forest_swig 40 | 41 | @property 42 | def module(self): 43 | 44 | return self._get_wrapper_module() 45 | 46 | def _get_wrapper_module(self): 47 | 48 | if self.float_type == "float": 49 | import wrapper_cpu_float 50 | return wrapper_cpu_float 51 | elif self.float_type == "double": 52 | import wrapper_cpu_double 53 | return wrapper_cpu_double 54 | 55 | def __setstate__(self, state): 56 | """ Is called when object is unpickled 57 | """ 58 | 59 | self.__dict__.update(state) 60 | 61 | self._params_swig = self.module.PARAMETERS() 62 | self._forest_swig = self.module.FOREST() 63 | 64 | self._get_wrapper_module().restore_forest_from_array_extern(self.params, self.forest, self._aforest) 65 | 66 | def __getstate__(self): 67 | """ Is called when object is pickled 68 | 69 | https://docs.python.org/3/library/pickle.html#pickle-state 70 | 71 | """ 72 | 73 | n_bytes_forest = self._get_wrapper_module().get_num_bytes_forest_extern(self.params, self.forest); 74 | n_bytes_forest = int((float(n_bytes_forest) / 4.0) + 4) 75 | 76 | aforest = numpy.empty(n_bytes_forest, dtype=numpy.int32) 77 | self._get_wrapper_module().get_forest_as_array_extern(self.params, self.forest, aforest) 78 | 79 | state = self.__dict__.copy() 80 | state['_aforest'] = aforest 81 | 82 | del state['_params_swig'] 83 | del state['_forest_swig'] 84 | 85 | return state 86 | -------------------------------------------------------------------------------- /woody/models/huge/util.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | import numpy 7 | import multiprocessing 8 | 9 | from woody.util import split_array 10 | 11 | from .. 
import Wood 12 | 13 | def distribute_patterns(toptree, X, y, verbose=0, logger=None): 14 | 15 | if logger is not None: 16 | logger.debug("\tUsing top tree to distribute patterns to leaves ...") 17 | 18 | leaves_ids = toptree.get_leaves_ids(X) 19 | unique_leaves_ids, counts = numpy.unique(leaves_ids, return_counts=True) 20 | 21 | if logger is not None: 22 | logger.debug("\tPatterns are distributed to %i leaves of the top tree ..." % len(unique_leaves_ids)) 23 | 24 | chunks = -1 * numpy.ones(int(unique_leaves_ids[-1]) + 1, dtype=numpy.int32) 25 | for i in xrange(len(unique_leaves_ids)): 26 | leaf_id = int(unique_leaves_ids[i]) 27 | chunks[leaf_id] = i 28 | 29 | Xsubs, ysubs = {}, {} 30 | 31 | Xnew = split_array(X, leaves_ids, chunks, counts) 32 | ynew = split_array(y, leaves_ids, chunks, counts) 33 | 34 | current_count = 0 35 | for i in xrange(len(unique_leaves_ids)): 36 | leaf_id = unique_leaves_ids[i] 37 | cts = counts[i] 38 | Xsubs[leaf_id] = Xnew[current_count:current_count + cts, :] 39 | ysubs[leaf_id] = ynew[current_count:current_count + cts] 40 | current_count += cts 41 | 42 | return Xsubs, ysubs, unique_leaves_ids 43 | 44 | def get_XY_subsets_from_store(dset, heavy_leaf_domsize): 45 | 46 | pure = False 47 | 48 | ychunk = numpy.array(dset[:, -1]) 49 | counts = numpy.bincount(ychunk.astype(numpy.int32)) 50 | 51 | dominant = numpy.argmax(counts) 52 | if len(ychunk) > heavy_leaf_domsize: 53 | rsubset = numpy.random.choice(len(ychunk), heavy_leaf_domsize) 54 | else: 55 | rsubset = numpy.arange(len(ychunk)) 56 | subindices = ychunk != dominant 57 | subindices = numpy.union1d(rsubset, subindices) 58 | subindices.sort() 59 | 60 | # random access slow in h5py, process in chunks 61 | Xsub, ysub = numpy.array(dset[:, :-1]), numpy.array(dset[:, -1]) 62 | Xsub, ysub = Xsub[subindices,:], ysub[subindices] 63 | 64 | print "REMOVE UPWARDS, no XSUB needed" 65 | if (counts != 0).sum() == 1: 66 | pure = True 67 | 68 | return Xsub, ysub, pure 69 | 70 | def _load_single_tree(store, fname, wrapped_instance, typ=None): 71 | 72 | assert typ in ["top", "bottom"] 73 | 74 | if typ == "top": 75 | return store.load(fname, Wood) 76 | 77 | elif typ == "bottom": 78 | return store.load(fname, wrapped_instance) 79 | 80 | return None 81 | -------------------------------------------------------------------------------- /woody/models/huge/regression.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | from woody.io import DiskStore 7 | 8 | from .base import HugeWood 9 | 10 | from .. import WoodRegressor 11 | 12 | class HugeWoodRegressor(HugeWood): 13 | """ Large-scale contruction of a random forest on 14 | a single workstation (with limited memory resources). 15 | Each tree belonging to the ensemble is constructed 16 | in a multi-stage fashion and the intermediate data 17 | are stored on disk (e.g., via h5py). 
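    Illustrative construction (a sketch only; the keyword arguments below are
    the ones exposed by __init__, whereas the actual training call is provided
    by the HugeWood base class and is not shown here):

        from woody.models import HugeWoodRegressor, WoodRegressor
        from woody.io import DiskStore

        model = HugeWoodRegressor(n_estimators=4,
                                  n_estimators_bottom=4,
                                  n_jobs=2,
                                  odir=".hugewood",
                                  wrapped_instance=WoodRegressor(),
                                  store=DiskStore())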
18 | """ 19 | 20 | TKEY_ALL_FIT = 0 21 | TKEY_TOP_TREE = 1 22 | TKEY_DISTR_PATTS = 2 23 | TKEY_BOTTOM_TREES = 3 24 | 25 | MAX_RAND_INT = 10000000 26 | 27 | def __init__(self, 28 | n_top="auto", 29 | n_patterns_leaf="auto", 30 | balanced_top_tree=True, 31 | top_tree_lambda=0.0, 32 | top_tree_max_depth=None, 33 | top_tree_type="randomized", 34 | top_tree_leaf_stopping_mode="ignore_impurity", 35 | n_estimators=1, 36 | n_estimators_bottom=1, 37 | n_jobs=1, 38 | seed=0, 39 | odir=".hugewood", 40 | verbose=1, 41 | plot_intermediate={}, 42 | chunk_max_megabytes=256, 43 | wrapped_instance=WoodRegressor(), 44 | store=DiskStore(), 45 | ): 46 | 47 | super(HugeWoodRegressor, self).__init__(n_top=n_top, 48 | n_patterns_leaf=n_patterns_leaf, 49 | balanced_top_tree=balanced_top_tree, 50 | top_tree_lambda=top_tree_lambda, 51 | top_tree_max_depth=top_tree_max_depth, 52 | top_tree_type=top_tree_type, 53 | top_tree_leaf_stopping_mode=top_tree_leaf_stopping_mode, 54 | n_estimators=n_estimators, 55 | n_estimators_bottom=n_estimators_bottom, 56 | n_jobs=n_jobs, 57 | seed=seed, 58 | odir=odir, 59 | verbose=verbose, 60 | plot_intermediate=plot_intermediate, 61 | chunk_max_megabytes=chunk_max_megabytes, 62 | wrapped_instance=wrapped_instance, 63 | store=store) 64 | -------------------------------------------------------------------------------- /woody/models/huge/classification.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | from woody.io import DiskStore 7 | 8 | from .base import HugeWood 9 | 10 | from .. import WoodClassifier 11 | 12 | class HugeWoodClassifier(HugeWood): 13 | """ Large-scale contruction of a random forest on 14 | a single workstation (with limited memory resources). 15 | Each tree belonging to the ensemble is constructed 16 | in a multi-stage fashion and the intermediate data 17 | are stored on disk (e.g., via h5py). 
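    The constructor takes the same arguments as HugeWoodRegressor; by default,
    a WoodClassifier instance is used as the wrapped estimator for the bottom
    trees.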
18 |     """
19 | 
20 |     TKEY_ALL_FIT = 0
21 |     TKEY_TOP_TREE = 1
22 |     TKEY_DISTR_PATTS = 2
23 |     TKEY_BOTTOM_TREES = 3
24 | 
25 |     MAX_RAND_INT = 10000000
26 | 
27 |     def __init__(self,
28 |                  n_top="auto",
29 |                  n_patterns_leaf="auto",
30 |                  balanced_top_tree=True,
31 |                  top_tree_lambda=0.0,
32 |                  top_tree_max_depth=None,
33 |                  top_tree_type="randomized",
34 |                  top_tree_leaf_stopping_mode="ignore_impurity",
35 |                  n_estimators=1,
36 |                  n_estimators_bottom=1,
37 |                  n_jobs=1,
38 |                  seed=0,
39 |                  odir=".hugewood",
40 |                  verbose=1,
41 |                  plot_intermediate={},
42 |                  chunk_max_megabytes=256,
43 |                  wrapped_instance=WoodClassifier(),
44 |                  store=DiskStore(),
45 |                  ):
46 | 
47 |         super(HugeWoodClassifier, self).__init__(n_top=n_top,
48 |                                         n_patterns_leaf=n_patterns_leaf,
49 |                                         balanced_top_tree=balanced_top_tree,
50 |                                         top_tree_lambda=top_tree_lambda,
51 |                                         top_tree_max_depth=top_tree_max_depth,
52 |                                         top_tree_type=top_tree_type,
53 |                                         top_tree_leaf_stopping_mode=top_tree_leaf_stopping_mode,
54 |                                         n_estimators=n_estimators,
55 |                                         n_estimators_bottom=n_estimators_bottom,
56 |                                         n_jobs=n_jobs,
57 |                                         seed=seed,
58 |                                         odir=odir,
59 |                                         verbose=verbose,
60 |                                         plot_intermediate=plot_intermediate,
61 |                                         chunk_max_megabytes=chunk_max_megabytes,
62 |                                         wrapped_instance=wrapped_instance,
63 |                                         store=store)
64 | 
--------------------------------------------------------------------------------
/woody/models/forest/src/util.c:
--------------------------------------------------------------------------------
1 | #include "include/util.h"
2 | #include "include/float.h"
3 | 
4 | /* --------------------------------------------------------------------------------
5 |  * Transposes an array (float)
6 |  * --------------------------------------------------------------------------------
7 |  */
8 | void transpose_array_float(float* array, int n, int d, float* array_transposed) {
9 | 
10 | 	int i, j;
11 | 
12 | 	for (j = 0; j < d; j++) {
13 | 		for (i = 0; i < n; i++) {
14 | 			array_transposed[j * n + i] = array[i * d + j];
15 | 		}
16 | 	}
17 | 
18 | }
19 | 
20 | /* --------------------------------------------------------------------------------
21 |  * Transposes an array (double)
22 |  * --------------------------------------------------------------------------------
23 |  */
24 | void transpose_array_double(double* array, int n, int d,
25 | 		double* array_transposed) {
26 | 
27 | 	int i, j;
28 | 
29 | 	for (j = 0; j < d; j++) {
30 | 		for (i = 0; i < n; i++) {
31 | 			array_transposed[j * n + i] = array[i * d + j];
32 | 		}
33 | 	}
34 | 
35 | }
36 | 
37 | int compare_floats(const void *p1, const void *p2) {
38 | 
39 | 	// the index is stored at the end of each element...
40 | 	FLOAT_TYPE *p1_point, *p2_point;
41 | 	p1_point = (FLOAT_TYPE *) p1;
42 | 	p2_point = (FLOAT_TYPE *) p2;
43 | 
44 | 	if (*p1_point < *p2_point) {
45 | 		return -1;
46 | 	}
47 | 	if (*p1_point > *p2_point) {
48 | 		return +1;
49 | 	}
50 | 
51 | 	return 0;
52 | 
53 | }
54 | 
55 | int compare_ints(const void *p1, const void *p2) {
56 | 
57 | 	// the index is stored at the end of each element...
58 | 	int *p1_point, *p2_point;
59 | 	p1_point = (int *) p1;
60 | 	p2_point = (int *) p2;
61 | 
62 | 	if (*p1_point < *p2_point) {
63 | 		return -1;
64 | 	}
65 | 	if (*p1_point > *p2_point) {
66 | 		return +1;
67 | 	}
68 | 
69 | 	return 0;
70 | 
71 | }
72 | 
73 | /*---------------------------------------------------------------------------
74 |    Function :   kth_smallest()
75 |    In       :   array of elements, # of elements in the array, rank k
76 |    Out      :   one element
77 |    Job      :   find the kth smallest element in the array
78 |    Notice   :   use the median() macro defined below to get the median.
79 | 80 | Reference: 81 | 82 | Author: Wirth, Niklaus 83 | Title: Algorithms + data structures = programs 84 | Publisher: Englewood Cliffs: Prentice-Hall, 1976 85 | Physical description: 366 p. 86 | Series: Prentice-Hall Series in Automatic Computation 87 | 88 | ---------------------------------------------------------------------------*/ 89 | FLOAT_TYPE kth_smallest(FLOAT_TYPE a[], int n, int k) { 90 | 91 | return a[kth_smallest_idx(a, n, k)]; 92 | 93 | } 94 | 95 | int kth_smallest_idx(FLOAT_TYPE a[], int n, int k) { 96 | 97 | register unsigned int i, j, l, m; 98 | register FLOAT_TYPE x; 99 | 100 | l = 0; 101 | m = n - 1; 102 | 103 | while (l < m) { 104 | x = a[k]; 105 | i = l; 106 | j = m; 107 | do { 108 | while (a[i] < x) 109 | i++; 110 | while (x < a[j]) 111 | j--; 112 | if (i <= j) { 113 | ELEM_SWAP(a[i], a[j]); 114 | i++; 115 | j--; 116 | } 117 | } while (i <= j); 118 | if (j < k) 119 | l = i; 120 | if (k < i) 121 | m = j; 122 | } 123 | 124 | return k; 125 | 126 | } 127 | -------------------------------------------------------------------------------- /woody/models/huge/predict.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import gc 4 | import numpy 5 | from scipy.stats import mode 6 | 7 | from woody.io import DiskStore 8 | from woody.util import perform_task_in_parallel 9 | from .util import distribute_patterns 10 | from .util import _load_single_tree 11 | 12 | def predict_array(X, n_estimators, n_estimators_bottom, numpy_dtype_float, odir, store, wrapped_instance, n_jobs): 13 | """ Returns predictions for a given set of patterns. 14 | """ 15 | 16 | params_parallel = [] 17 | 18 | for b in xrange(n_estimators): 19 | 20 | odir_local = os.path.join(odir, str(int(b))) 21 | fname = os.path.join(odir_local, "toptree.tree") 22 | toptree = _load_single_tree(store, fname, wrapped_instance, typ="top") 23 | args = [n_estimators_bottom, toptree, X, odir_local, store, wrapped_instance, numpy_dtype_float] 24 | params_parallel.append(args) 25 | 26 | if type(store) == DiskStore: 27 | results = perform_task_in_parallel(predict_bottom, params_parallel, n_jobs=n_jobs, backend="multiprocessing") 28 | else: 29 | results = [] 30 | for param in params_parallel: 31 | res = predict_bottom(param) 32 | results.append(res) 33 | allpreds = numpy.zeros((len(X), n_estimators*n_estimators_bottom), dtype=numpy_dtype_float) 34 | for i in xrange(len(results)): 35 | allpreds[:,i*n_estimators_bottom:(i+1)*n_estimators_bottom] = results[i] 36 | allpreds = numpy.array(allpreds) 37 | 38 | preds = _combine_preds(allpreds, wrapped_instance.learning_type, numpy_dtype_float) 39 | 40 | return preds 41 | 42 | def predict_bottom(args): 43 | """ FIXME: This is by far the slowest part during prediction. 
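    For every leaf of the top tree, the corresponding bottom tree is loaded from
    the store, queried for the patterns assigned to that leaf, and released
    again; this per-leaf (de)serialization is what dominates the runtime.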
44 | """ 45 | 46 | n_estimators_bottom, toptree, X, odir, store, wrapped_instance, numpy_dtype_float = args 47 | 48 | preds = numpy.zeros((len(X), n_estimators_bottom), dtype=numpy_dtype_float) 49 | 50 | oindices = numpy.array(xrange(len(X)), dtype=numpy.float64) 51 | 52 | Xsubs, isubs, unique_leaves_ids = distribute_patterns(toptree, X, oindices) 53 | 54 | for leaf_id in unique_leaves_ids: 55 | isubs[leaf_id] = isubs[leaf_id].astype(numpy.int64) 56 | unique_leaves_ids = unique_leaves_ids.astype(numpy.int64) 57 | 58 | for leaf_id in unique_leaves_ids: 59 | fname = os.path.join(odir, str(int(leaf_id)) + ".tree") 60 | btree = _load_single_tree(store, fname, wrapped_instance, typ="bottom") 61 | pleaf = btree.predict_all(Xsubs[leaf_id]) 62 | preds[isubs[leaf_id], :] = pleaf 63 | 64 | del btree 65 | gc.collect() 66 | 67 | return preds 68 | 69 | def _combine_preds(allpreds, learning_type, numpy_dtype_float): 70 | 71 | if learning_type == "regression": 72 | 73 | preds = allpreds.mean(axis=1) 74 | 75 | elif learning_type == "classification": 76 | 77 | preds, _ = mode(allpreds, axis=1) 78 | preds = preds[:, 0] 79 | 80 | else: 81 | raise Exception("Unknown learning type for wrapped instance: %s" % learning_type) 82 | 83 | if preds.dtype != numpy_dtype_float: 84 | preds = preds.astype(numpy_dtype_float) 85 | 86 | return preds 87 | -------------------------------------------------------------------------------- /woody/models/forest/src/tree/include/types.h: -------------------------------------------------------------------------------- 1 | /* 2 | * types.h 3 | * 4 | * Created on: 15.02.2016 5 | * Author: fgieseke 6 | */ 7 | 8 | #ifndef ENSEMBLE_INCLUDE_TYPES_H_ 9 | #define ENSEMBLE_INCLUDE_TYPES_H_ 10 | 11 | #include "../../include/float.h" 12 | #include "../../include/timing.h" 13 | 14 | typedef struct parameters { 15 | 16 | int seed; 17 | int n_estimators; 18 | int min_samples_split; 19 | int max_features; 20 | int bootstrap; 21 | int max_depth; 22 | int min_samples_leaf; 23 | int num_threads; 24 | int verbosity_level; 25 | int tree_traversal_mode; 26 | int leaf_stopping_mode; 27 | int criterion; 28 | int learning_type; 29 | int tree_type; 30 | int prediction_type; 31 | int patterns_transposed; 32 | double lam_crit; 33 | int n_subset_check; 34 | 35 | // training 36 | FLOAT_TYPE *Xtrain; 37 | int nXtrain; 38 | int dXtrain; 39 | FLOAT_TYPE max_ytrain_value; 40 | 41 | TIMER timers[10]; 42 | 43 | } PARAMETERS; 44 | 45 | typedef struct bootstrap_indices { 46 | int n_indices; 47 | int *indices; 48 | int *indices_wmappings; 49 | } BINDICES; 50 | 51 | typedef struct training_data { 52 | 53 | FLOAT_TYPE *Xtrain; 54 | FLOAT_TYPE *Ytrain; 55 | FLOAT_TYPE *Ytrain_mapped; 56 | int nXtrain; 57 | int dXtrain; 58 | BINDICES *bindices; 59 | int n_classes; 60 | FLOAT_TYPE *classes; 61 | 62 | } TRAINING_DATA; 63 | 64 | typedef struct split_record { 65 | 66 | unsigned int feature; 67 | int pos; 68 | FLOAT_TYPE threshold; 69 | FLOAT_TYPE improvement; 70 | FLOAT_TYPE impurity; 71 | FLOAT_TYPE impurity_left; 72 | FLOAT_TYPE impurity_right; 73 | FLOAT_TYPE prob_left; 74 | FLOAT_TYPE prob_right; 75 | int leaf_detected; 76 | 77 | } SPLIT_RECORD; 78 | 79 | typedef struct traversal_record { 80 | 81 | int start; 82 | int end; 83 | int depth; 84 | int parent_id; 85 | int is_left_child; 86 | int is_leaf; 87 | 88 | int n_constant_features; 89 | int *const_features; 90 | 91 | SPLIT_RECORD *split_record; 92 | 93 | } TRAVERSAL_RECORD; 94 | 95 | typedef struct tree_node { 96 | 97 | unsigned int left_id; 98 | unsigned int right_id; 99 
| unsigned int feature; 100 | FLOAT_TYPE thres_or_leaf; 101 | unsigned int leaf_criterion; 102 | 103 | } TREE_NODE; 104 | 105 | typedef struct tree { 106 | 107 | TREE_NODE *root; 108 | int n_allocated; 109 | int node_counter; 110 | 111 | } TREE; 112 | 113 | typedef struct forest { 114 | 115 | TREE *trees; 116 | int n_trees; 117 | 118 | } FOREST; 119 | 120 | typedef struct pattern_label_weight PATTERN_LABEL_WEIGHT; 121 | 122 | struct pattern_label_weight { 123 | FLOAT_TYPE pattern; 124 | FLOAT_TYPE label; 125 | int weight; 126 | }; 127 | 128 | typedef struct criterion_record CRITERION_RECORD; 129 | 130 | struct criterion_record { 131 | 132 | int current_pos; 133 | FLOAT_TYPE impurity; 134 | FLOAT_TYPE impurity_left; 135 | FLOAT_TYPE impurity_right; 136 | FLOAT_TYPE improvement; 137 | 138 | int weight_left; 139 | int weight_right; 140 | 141 | // needed for regression (MSE) 142 | FLOAT_TYPE sum_left; 143 | FLOAT_TYPE sum_right; 144 | FLOAT_TYPE sq_sum_left; 145 | FLOAT_TYPE sq_sum_right; 146 | 147 | // needed for classification (GINI and ENTROPY) 148 | int *class_counts_left; 149 | int *class_counts_right; 150 | 151 | }; 152 | 153 | #endif /* ENSEMBLE_INCLUDE_TYPES_H_ */ 154 | -------------------------------------------------------------------------------- /woody/models/forest/setup.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | import os 7 | import numpy 8 | 9 | TIMING = 1 10 | 11 | FILES_TO_BE_COMPILED_CPU = ["tree/base.c", 12 | "tree/cpu.c", 13 | "tree/tree.c", 14 | "tree/util.c", 15 | "tree/cpu/base.c", 16 | "tree/cpu/criteria.c", 17 | "tree/cpu/standard.c", 18 | "tree/cpu/fastsort.c", 19 | "timing.c", 20 | "util.c", 21 | "pqueue.c", 22 | ] 23 | 24 | DIRS_TO_BE_INCLUDED_CPU = ["tree/include", "tree/cpu/include"] 25 | 26 | SOURCES_RELATIVE_PATH = "src/" 27 | current_path = os.path.dirname(os.path.abspath(__file__)) 28 | sources_abs_path = os.path.abspath(os.path.join(current_path, SOURCES_RELATIVE_PATH)) 29 | 30 | # source files 31 | source_files_cpu = [os.path.abspath(os.path.join(sources_abs_path, x)) for x in FILES_TO_BE_COMPILED_CPU] 32 | include_paths_cpu = [os.path.abspath(os.path.join(sources_abs_path, x)) for x in DIRS_TO_BE_INCLUDED_CPU] 33 | 34 | numpy_include = numpy.get_include() 35 | 36 | def configuration(parent_package='', top_path=None): 37 | 38 | from numpy.distutils.misc_util import Configuration 39 | config = Configuration('models/forest', parent_package, top_path) 40 | 41 | # CPU + FLOAT 42 | config.add_extension("_wrapper_cpu_float", \ 43 | sources = ["swig/cpu_float.i"] + source_files_cpu, 44 | swig_opts=['-modern', '-threads'], 45 | include_dirs = [numpy_include] +[include_paths_cpu], 46 | define_macros = [ 47 | ('ABSOLUTE_PATH', os.path.join(sources_abs_path, "ensemble")), 48 | ('USE_DOUBLE', 0), 49 | ('TIMING', TIMING) 50 | ], 51 | libraries=['gomp'], 52 | extra_compile_args=["-std=gnu89", "-fopenmp", '-O3', '-Wall', '-pthread', '-Wno-unused-label'] + ['-I'+ipath for ipath in include_paths_cpu]) 53 | 54 | # CPU + DOUBLE 55 | config.add_extension("_wrapper_cpu_double", \ 56 | sources = ["swig/cpu_double.i"] + source_files_cpu, 57 | swig_opts=['-modern', '-threads'], 58 | include_dirs = [numpy_include] +[include_paths_cpu], 59 | define_macros = [ 60 | ('ABSOLUTE_PATH', os.path.join(sources_abs_path, "ensemble")), 61 | ('USE_DOUBLE', 1), 62 | ('TIMING', TIMING) 63 | ], 64 | libraries=['gomp'], 65 | extra_compile_args=["-std=gnu89", "-fopenmp", 
'-O3', '-Wall', '-pthread', '-Wno-unused-label'] + ['-I'+ipath for ipath in include_paths_cpu]) 66 | 67 | return config 68 | 69 | if __name__ == '__main__': 70 | 71 | from numpy.distutils.core import setup 72 | setup(**configuration(top_path='').todict()) 73 | -------------------------------------------------------------------------------- /woody/models/forest/src/tree/include/cpu.h: -------------------------------------------------------------------------------- 1 | /* 2 | * cpu.h 3 | * 4 | * Created on: 17.04.2015 5 | * Author: fgieseke 6 | */ 7 | 8 | #ifndef FORESTS_STANDARD_INCLUDE_CPU_H_ 9 | #define FORESTS_STANDARD_INCLUDE_CPU_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include "global.h" 17 | #include "util.h" 18 | #include "../cpu/include/base.h" 19 | 20 | 21 | 22 | 23 | /* -------------------------------------------------------------------------------- 24 | * Fits a model given the training data (and parameters) 25 | * -------------------------------------------------------------------------------- 26 | */ 27 | void fit_forest(FLOAT_TYPE *Xtrain, int nXtrain, int dXtrain, 28 | FLOAT_TYPE *Ytrain, int *bootstrap_indices, int *bootstrap_indices_weights, 29 | int n_bootstrap_indices, int d_bootstrap_indices, int use_bindices, PARAMETERS *params, FOREST *forest); 30 | 31 | /* -------------------------------------------------------------------------------- 32 | * Builds a single tree. 33 | * -------------------------------------------------------------------------------- 34 | */ 35 | void build_single_tree(TREE *tree, TRAINING_DATA *train_data, 36 | PARAMETERS *params, unsigned int *rstate); 37 | 38 | 39 | 40 | /* -------------------------------------------------------------------------------- 41 | * Process huge nodes 42 | * -------------------------------------------------------------------------------- 43 | */ 44 | void process_all_nodes(TREE *tree, TRAINING_DATA *train_data, PQUEUE *huge_traversal_queue, 45 | PARAMETERS *params, unsigned int *rstate); 46 | 47 | 48 | /* -------------------------------------------------------------------------------- 49 | * Returns a chunk of traversal records. 
50 | * -------------------------------------------------------------------------------- 51 | */ 52 | TRAVERSAL_RECORD **get_chunk_trecords(PQUEUE *traversal_queue, int *n_trecords, int n_to_be_removed); 53 | 54 | /* -------------------------------------------------------------------------------- 55 | * Generate traversal records for the children of a given traversal record 56 | * -------------------------------------------------------------------------------- 57 | */ 58 | void generate_traversal_records_children(TREE *tree, TRAINING_DATA *train_data, PQUEUE *huge_traversal_queue, 59 | TRAVERSAL_RECORD *trecord, 60 | PARAMETERS *params, int node_id); 61 | 62 | void generate_next_leaf_node(int node_id, int start, int end, unsigned int leaf_criterion, int depth, int prio, int child_flag, TREE *tree, 63 | PQUEUE *huge_traversal_queue, TRAVERSAL_RECORD *trecord, TRAINING_DATA *train_data, PARAMETERS *params); 64 | 65 | /* -------------------------------------------------------------------------------- 66 | * Generates leaves and nodes 67 | * -------------------------------------------------------------------------------- 68 | */ 69 | void generate_leaves_nodes(TREE *tree, TRAVERSAL_RECORD **trecords, int n_trecords, 70 | TRAINING_DATA *train_data, PARAMETERS *params, PQUEUE *huge_traversal_queue); 71 | 72 | /* -------------------------------------------------------------------------------- 73 | * Generates a single leaf node 74 | * -------------------------------------------------------------------------------- 75 | */ 76 | void generate_leaf(TREE *tree, int start, int end, int parent_id, int is_left_child, 77 | unsigned int leaf_criterion, TRAINING_DATA *train_data, PARAMETERS *params); 78 | 79 | 80 | #endif /* FORESTS_STANDARD_INCLUDE_CPU_H_ */ 81 | -------------------------------------------------------------------------------- /woody/util/draw.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | import os 7 | import numpy 8 | 9 | def draw_single_tree(tree, 10 | node_stats=None, 11 | ax=None, 12 | figsize=(200,20), 13 | fname="tree.pdf", 14 | with_labels=True, 15 | arrows=False, 16 | edge_width=1.0, 17 | font_size=7, 18 | alpha=0.5, 19 | edges_alpha=1.0, 20 | node_size=1000): 21 | 22 | try: 23 | import networkx as nx 24 | from networkx.drawing.nx_agraph import graphviz_layout 25 | import matplotlib.pyplot as plt 26 | except Exception as e: 27 | raise Exception("Module 'networkx' is required to export the tree structure: %s" % str(e)) 28 | 29 | d = os.path.dirname(fname) 30 | if len(d) > 0: 31 | if not os.path.exists(d): 32 | os.makedirs(d) 33 | 34 | if ax is None: 35 | fig = plt.figure(figsize=figsize) 36 | ax = fig.add_subplot(111) 37 | 38 | pos = graphviz_layout(tree, prog='dot') 39 | 40 | if node_stats is not None: 41 | lmin = numpy.array([node_stats[i] for i in node_stats.keys()]).min() 42 | lmax = numpy.array([node_stats[i] for i in node_stats.keys()]).max() 43 | 44 | internal_nodes = {'labels':{}, 'sizes':[], 'node_list':[]} 45 | leaves = {'labels':{}, 'sizes':[], 'node_list':[]} 46 | 47 | for i in xrange(len(tree.nodes())): 48 | if tree.node[i]['is_leaf'] == True: 49 | leaves['node_list'].append(i) 50 | if node_stats is not None: 51 | leaves['labels'][i] = "#" + str(node_stats[i]) + "(" + str(tree.node[i]['leaf_criterion']) + ")" 52 | leaves['sizes'].append((0.00001 + ((float(node_stats[i] - lmin) / (lmax-lmin)))) * node_size) 53 | else: 54 | 
leaves['sizes'].append(node_size) 55 | else: 56 | internal_nodes['node_list'].append(i) 57 | internal_nodes['labels'][i] = "" #str(i) 58 | internal_nodes['sizes'].append(node_size) 59 | 60 | # internal nodes 61 | nx.draw_networkx_nodes(tree, 62 | pos, 63 | nodelist=internal_nodes['node_list'], 64 | ax=ax, 65 | node_color='#0000FF', 66 | with_labels=False, 67 | linewidths=edge_width, 68 | alpha=alpha, 69 | node_size=internal_nodes['sizes']) 70 | if with_labels == True: 71 | nx.draw_networkx_labels(tree, pos, internal_nodes['labels'], font_size=7) 72 | 73 | # leaves 74 | nx.draw_networkx_nodes(tree, 75 | pos, 76 | nodelist=leaves['node_list'], 77 | ax=ax, 78 | node_color='#FF0000', 79 | with_labels=False, 80 | linewidths=edge_width, 81 | alpha=alpha, 82 | node_size=leaves['sizes']) 83 | if with_labels == True: 84 | nx.draw_networkx_labels(tree, pos, leaves['labels'], font_size=font_size) 85 | 86 | # draw edges 87 | nx.draw_networkx_edges(tree, pos, edge_color='#000000', width=edge_width, alpha=edges_alpha) 88 | 89 | plt.axis('off') 90 | plt.savefig(fname, bbox_inches='tight') 91 | plt.close() 92 | plt.close() 93 | -------------------------------------------------------------------------------- /woody/models/forest/.cproject: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /woody/util/parallel.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | import multiprocessing 7 | from multiprocessing.pool import ThreadPool 8 | 9 | def pool_init(): 10 | import gc 11 | gc.collect() 12 | 13 | def wrapped_task(queue, task, args, kwargs): 14 | 15 | queue.put(task(*args, **kwargs)) 16 | 17 | from multiprocessing import Queue 18 | 19 | # https://github.com/joblib/joblib/issues/138 20 | def start_via_single_process(task, args, kwargs): 21 | 22 | queue = Queue() 23 | 24 | proc = multiprocessing.Process(target=wrapped_task, args=(queue, task, args, kwargs)) 25 | proc.start() 26 | 27 | result = queue.get() 28 | 29 | # joining might yield errors ... 
30 | # https://gist.github.com/schlamar/2311116 31 | # see https://docs.python.org/2/library/multiprocessing.html#all-platforms 32 | #proc.join() 33 | return result 34 | 35 | 36 | # def perform_task_in_parallel_in_place(task, params_parallel, n_jobs=1): 37 | # """ Performas a task in parallel (in place, not return results are generated 38 | # 39 | # Parameters 40 | # ---------- 41 | # task : callable 42 | # The function/procedure that shall be executed 43 | # params_parallel : list 44 | # The parallel parameters 45 | # n_jobs : int, default 1 46 | # The number of jobs that shall be used 47 | # """ 48 | # 49 | # 50 | # # https://docs.python.org/2/library/multiprocessing.html#module-multiprocessing.pool 51 | # pool = multiprocessing.Pool(n_jobs, maxtasksperchild=1) 52 | # results = pool.apply_async(task, params_parallel) 53 | # 54 | # pool.close() 55 | # pool.join() 56 | # 57 | # return results 58 | 59 | 60 | def perform_task_in_parallel(task, params_parallel, n_jobs=1, backend="multiprocessing"): 61 | """ Performas a task in parallel 62 | 63 | Parameters 64 | ---------- 65 | task : callable 66 | The function/procedure that shall be executed 67 | params_parallel : list 68 | The parallel parameters 69 | n_jobs : int, default 1 70 | The number of jobs that shall be used 71 | backend : str, default 'multiprocessing' 72 | """ 73 | 74 | if backend == 'multiprocessing': 75 | 76 | # https://docs.python.org/2/library/multiprocessing.html#module-multiprocessing.pool 77 | pool = multiprocessing.Pool(n_jobs, maxtasksperchild=1, initializer=pool_init) 78 | results = pool.map(task, params_parallel) 79 | 80 | pool.close() 81 | pool.join() 82 | 83 | return results 84 | 85 | elif backend == 'threading': 86 | 87 | pool = ThreadPool(n_jobs) 88 | results = pool.map(task, params_parallel) 89 | pool.close() 90 | pool.join() 91 | 92 | return results 93 | 94 | else: 95 | raise Exception("Unknown backend: %s" % str(backend)) 96 | 97 | 98 | if __name__ == "__main__": 99 | 100 | def foo(x): 101 | print x 102 | return x*x 103 | 104 | params_parallel = range(10000) 105 | #perform_task_in_parallel(foo, params_parallel, backend="multiprocessing", n_jobs=4) 106 | results = perform_task_in_parallel(foo, params_parallel, backend="multiprocessing", n_jobs=4) 107 | print "results=", results 108 | #results = perform_task_in_parallel(foo, params_parallel, backend="threading", n_jobs=4) 109 | #print "results=", results 110 | -------------------------------------------------------------------------------- /woody/io/store.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | import h5py 7 | import numpy 8 | from woody.util import ensure_dir_for_file 9 | 10 | class Store(object): 11 | 12 | def __init__(self): 13 | pass 14 | 15 | class MemoryStore(Store): 16 | 17 | def __init__(self): 18 | 19 | self._containers = {} 20 | self._objects = {} 21 | 22 | def create_dataset(self, container_key, dkey, data): 23 | 24 | if container_key not in self._containers.keys(): 25 | self._containers[container_key] = {} 26 | 27 | self._containers[container_key][dkey] = data 28 | 29 | def append_to_dataset(self, container_key, dkey, data): 30 | 31 | if container_key not in self._containers.keys(): 32 | self._containers[container_key] = {} 33 | 34 | if not dkey in self._containers[container_key].keys(): 35 | 36 | self._containers[container_key][dkey] = data 37 | 38 | else: 39 | 40 | newdata = 
numpy.concatenate([self._containers[container_key][dkey], data], axis=0) 41 | self._containers[container_key][dkey] = newdata 42 | 43 | def get_dataset(self, container_key, dkey): 44 | 45 | return numpy.ascontiguousarray(self._containers[container_key][dkey]) 46 | 47 | def get_keys(self, container_key): 48 | 49 | return self._containers[container_key].keys() 50 | 51 | def save(self, key, obj): 52 | 53 | self._objects[key] = obj 54 | 55 | def load(self, key, loader): 56 | 57 | return self._objects[key] 58 | 59 | class DiskStore(Store): 60 | 61 | def __init__(self): 62 | pass 63 | 64 | def create_dataset(self, container_key, dkey, data): 65 | 66 | ensure_dir_for_file(container_key) 67 | s = h5py.File(container_key, 'a', driver="sec2", libver='latest') 68 | 69 | dset = s.create_dataset(dkey, data.shape, maxshape=(None, data.shape[1]), compression="lzf") 70 | dset[:,:] = data 71 | 72 | s.close() 73 | 74 | def append_to_dataset(self, container_key, dkey, data): 75 | 76 | ensure_dir_for_file(container_key) 77 | s = h5py.File(container_key, 'a', driver="sec2", libver='latest') 78 | 79 | offset = 0 80 | 81 | if not dkey in s.keys(): 82 | 83 | dset = s.create_dataset(dkey, data.shape, maxshape=(None, data.shape[1]), compression="lzf") 84 | 85 | else: 86 | 87 | dset = s.get(dkey) 88 | offset += dset.shape[0] 89 | dset.resize(dset.shape[0] + data.shape[0], axis=0) 90 | 91 | dset[offset:, :] = data 92 | 93 | s.close() 94 | 95 | def get_dataset(self, container_key, dkey): 96 | 97 | with h5py.File(container_key, 'r') as container: 98 | dset = numpy.array(container.get(dkey)) 99 | 100 | return dset[:,:] 101 | 102 | def get_keys(self, container_key): 103 | 104 | s = h5py.File(container_key, 'r') 105 | keys = s.keys() 106 | s.close() 107 | 108 | return keys 109 | 110 | def save(self, key, obj): 111 | 112 | obj.save(key) 113 | 114 | def load(self, key, loader): 115 | 116 | return loader.load(key) 117 | -------------------------------------------------------------------------------- /woody/models/forest/src/tree/include/tree.h: -------------------------------------------------------------------------------- 1 | /* 2 | * tree.h 3 | * 4 | * Created on: 25.10.2014 5 | * Author: fgieseke 6 | */ 7 | 8 | #ifndef FOREST_STANDARD_INCLUDE_TREE_H_ 9 | #define FOREST_STANDARD_INCLUDE_TREE_H_ 10 | 11 | #include "global.h" 12 | 13 | /* -------------------------------------------------------------------------------- 14 | * Initializes a forest 15 | * -------------------------------------------------------------------------------- 16 | */ 17 | inline void init_forest(FOREST *forest, int n_trees); 18 | 19 | /* -------------------------------------------------------------------------------- 20 | * Frees memory allocated for a forest 21 | * -------------------------------------------------------------------------------- 22 | */ 23 | inline void free_forest(FOREST *forest, int free_trees, int free_forest); 24 | 25 | /* -------------------------------------------------------------------------------- 26 | * Initializes a single tree 27 | * -------------------------------------------------------------------------------- 28 | */ 29 | inline void init_tree(TREE *tree, int n_allocated); 30 | 31 | /* -------------------------------------------------------------------------------- 32 | * Frees memory allocated for a single tree 33 | * -------------------------------------------------------------------------------- 34 | */ 35 | inline void free_tree(TREE *tree); 36 | 37 | /* 
-------------------------------------------------------------------------------- 38 | * Returns a node based on given node id 39 | * -------------------------------------------------------------------------------- 40 | */ 41 | inline TREE_NODE* get_node(TREE *tree, int node_id); 42 | 43 | /* -------------------------------------------------------------------------------- 44 | * Adds a node to a given tree 45 | * -------------------------------------------------------------------------------- 46 | */ 47 | int add_node_to_tree(TREE *tree, int parent_id, int is_left_child); 48 | 49 | /* -------------------------------------------------------------------------------- 50 | * Attaches tree to a leaf of another tree 51 | * -------------------------------------------------------------------------------- 52 | */ 53 | int attach_tree(TREE *tree, TREE *subtree, int leaf_id); 54 | 55 | /* -------------------------------------------------------------------------------- 56 | * Generates an internal node 57 | * -------------------------------------------------------------------------------- 58 | */ 59 | inline int generate_internal_tree_node(TREE *tree, int parent_id, int is_left_child, 60 | int is_leaf, int feature, FLOAT_TYPE threshold, int node_samples); 61 | 62 | /* -------------------------------------------------------------------------------- 63 | * Generates a single leaf 64 | * -------------------------------------------------------------------------------- 65 | */ 66 | inline int generate_tree_leaf(TREE *tree, int parent_id, int is_left_child, 67 | FLOAT_TYPE leaf_value, unsigned int leaf_criterion); 68 | 69 | /* -------------------------------------------------------------------------------- 70 | * Initializes node entries for internal node 71 | * -------------------------------------------------------------------------------- 72 | */ 73 | void init_internal_tree_node(TREE_NODE *node, int parent_id, int feature, 74 | FLOAT_TYPE threshold, int node_samples); 75 | 76 | /* -------------------------------------------------------------------------------- 77 | * Initializes node entries for leaf node 78 | * -------------------------------------------------------------------------------- 79 | */ 80 | void init_tree_leaf(TREE_NODE *node, int parent_id, FLOAT_TYPE leaf_value, unsigned int leaf_criterion); 81 | 82 | #endif /* FOREST_STANDARD_INCLUDE_TREE_H_ */ 83 | -------------------------------------------------------------------------------- /woody/models/forest/src/.cproject: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /woody/models/forest/src/tree/cpu/fastsort.c: -------------------------------------------------------------------------------- 1 | /* 2 | * fastsort.c 3 | * 4 | * Created on: 23.01.2017 5 | * Author: fgieseke 6 | */ 7 | #include "include/fastsort.h" 8 | 9 | 10 | #define swap_fast(a1, a2, s1, s2) { \ 11 | register FLOAT_TYPE tmp = *(a1); \ 12 | *(a1) = *(a2); \ 13 | *(a2) = tmp; \ 14 | register int tmpint = *(s1); \ 15 | *(s1) = *(s2); \ 16 | *(s2) = tmpint; \ 17 | } 18 | 19 | inline static int fast_partition(FLOAT_TYPE *a, int *samples, int lo, int hi, FLOAT_TYPE x); 20 | inline static int fast_floor_lg(int a); 21 | static FLOAT_TYPE 
fast_medianof3(FLOAT_TYPE *a, int lo, int mid, int hi); 22 | static void fast_downheap(FLOAT_TYPE *a, int i, int n, int lo); 23 | static void fast_heapsort(FLOAT_TYPE *a, int *samples, int lo, int hi); 24 | static void fast_introsort_loop(FLOAT_TYPE *a, int *samples, int lo, int hi, int depth_limit); 25 | static void fast_insertionsort(FLOAT_TYPE *a, int *samples, int lo, int hi); 26 | 27 | void combined_sort(FLOAT_TYPE *XF, int *samples, int n) { 28 | 29 | fast_introsort_loop(XF, samples, 0, n, 2 * fast_floor_lg(n)); 30 | fast_insertionsort(XF, samples, 0, n); 31 | } 32 | 33 | static void fast_introsort_loop(FLOAT_TYPE *a, int *samples, int lo, int hi, int depth_limit) { 34 | int p = -1; 35 | 36 | while (hi - lo > fast_size_threshold) { 37 | 38 | if (depth_limit == 0) { 39 | fast_heapsort(a, samples, lo, hi); 40 | return; 41 | } 42 | depth_limit--; 43 | 44 | p = fast_partition(a, samples, lo, hi, fast_medianof3(a, lo, lo + ((hi - lo) / 2) + 1, hi - 1)); 45 | 46 | fast_introsort_loop(a, samples, p, hi, depth_limit); 47 | hi = p; 48 | } 49 | } 50 | 51 | inline static int fast_partition(FLOAT_TYPE *a, int *samples, int lo, int hi, FLOAT_TYPE x) { 52 | int i = lo, j = hi; 53 | while (1) { 54 | while (a[i] < x) 55 | i++; 56 | j--; 57 | while (x < a[j]) 58 | j--; 59 | if (i >= j) 60 | return i; 61 | swap_fast(&a[i], &a[j], &samples[i], &samples[j]); 62 | i++; 63 | } 64 | } 65 | 66 | inline static FLOAT_TYPE fast_medianof3(FLOAT_TYPE *a, int lo, int mid, int hi) { 67 | 68 | if (a[mid] < a[lo]) { 69 | 70 | if (a[hi] < a[mid]) 71 | return a[mid]; 72 | else { 73 | if (a[hi] < a[lo]) 74 | return a[hi]; 75 | else 76 | return a[lo]; 77 | } 78 | } else { 79 | if (a[hi] < a[mid]) { 80 | if (a[hi] < a[lo]) 81 | return a[lo]; 82 | else 83 | return a[hi]; 84 | } else 85 | return a[mid]; 86 | } 87 | } 88 | 89 | static void fast_heapsort(FLOAT_TYPE *a, int *samples, int lo, int hi) { 90 | int n = hi - lo; 91 | int i; 92 | for (i = n / 2; i >= 1; i--) { 93 | fast_downheap(a, i, n, lo); 94 | } 95 | for (i = n; i > 1; i--) { 96 | swap_fast(&a[lo], &a[lo + i - 1], &samples[lo], &samples[lo + i -1]); 97 | fast_downheap(a, 1, i - 1, lo); 98 | } 99 | } 100 | 101 | inline static void fast_downheap(FLOAT_TYPE *a, int i, int n, int lo) { 102 | FLOAT_TYPE d = a[lo + i - 1]; 103 | int child; 104 | int n2 = n / 2; 105 | while (i <= n2) { 106 | child = 2 * i; 107 | if (child < n && a[lo + child - 1] < a[lo + child]) 108 | child++; 109 | if (d >= a[lo + child - 1]) 110 | break; 111 | a[lo + i - 1] = a[lo + child - 1]; 112 | i = child; 113 | } 114 | a[lo + i - 1] = d; 115 | } 116 | 117 | 118 | static void fast_insertionsort(FLOAT_TYPE *a, int *samples, int lo, int hi) { 119 | int i, j; 120 | FLOAT_TYPE tfloat; 121 | int tint; 122 | for (i = lo; i < hi; i++) { 123 | j = i; 124 | tfloat = a[i]; 125 | tint = samples[i]; 126 | while (j != lo && tfloat < a[j - 1]) { 127 | a[j] = a[j - 1]; 128 | samples[j] = samples[j - 1]; 129 | j--; 130 | } 131 | a[j] = tfloat; 132 | samples[j] = tint; 133 | } 134 | } 135 | 136 | inline static int fast_floor_lg(int a) { 137 | return (int) floor(log(a) / log(2)); 138 | } 139 | -------------------------------------------------------------------------------- /woody/util/array/base.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | import numpy 7 | 8 | import wrapper_utils_cpu_float, wrapper_utils_cpu_double 9 | 10 | def split_array_chunk(a, indicator, chunks, counts): 11 | 12 | 
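    # Dispatches to the float or double SWIG wrapper depending on the dtype of
    # `a` and reorders the rows of `a` according to `indicator`/`chunks`; see
    # the docstring of split_array below for the meaning of the arguments.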
if type(a[0,0]) == numpy.float64: 13 | wrapper = wrapper_utils_cpu_double 14 | elif type(a[0,0]) == numpy.float32: 15 | wrapper = wrapper_utils_cpu_float 16 | else: 17 | raise Exception("Invalid dtype for array: %s" % str(type(a[0,0]))) 18 | 19 | anew = numpy.empty(a.shape, dtype=a.dtype) 20 | 21 | cumsums = numpy.cumsum(counts).astype(numpy.int32) 22 | 23 | wrapper.split_array(a, anew, indicator, chunks, cumsums) 24 | 25 | return anew 26 | 27 | def split_array(a, indicator, chunks, counts, n_jobs=1): 28 | """ Splits an array according to an indicator array. 29 | 30 | Parameters 31 | ---------- 32 | a : array, numpy-like 33 | The input array that is supposed to 34 | be split according to the indicator array. 35 | indicator: array, numpy-like 36 | The array that contains the indices 37 | according to which the array should be 38 | split up. Each index is also contained in 39 | the chunks array (see below). 40 | chunks: array, numpy-like 41 | This array contains all possible chunk indices 42 | that occur in the indicator array. E.g., 43 | chunks = [-1,-1,0,-1,-1,1] means that we 44 | have two chunks in total and an indicator 45 | index 2 is mapped to chunk 0 and an 46 | indicator index 5 to chunk 1. 47 | counts: array, numpy-like The number of rows that belong to each chunk. 48 | """ 49 | 50 | reshaped = False 51 | 52 | if len(a.shape) == 1: 53 | reshaped = True 54 | a = a.reshape((len(a), 1)) 55 | 56 | if type(a[0,0]) == numpy.float64: 57 | wrapper = wrapper_utils_cpu_double 58 | elif type(a[0,0]) == numpy.float32: 59 | wrapper = wrapper_utils_cpu_float 60 | else: 61 | raise Exception("Invalid dtype for array: %s" % str(type(a[0,0]))) 62 | 63 | indicator = indicator.astype(numpy.int32) 64 | chunks = chunks.astype(numpy.int32) 65 | counts = counts.astype(numpy.int32) 66 | 67 | # sanity_check = True 68 | # # sanity checks (to be removed) 69 | # if sanity_check: 70 | # for indi in indicator: 71 | # assert chunks[indi] != -1 72 | # 73 | # anew_check = numpy.empty(a.shape, dtype=a.dtype) 74 | # # compute splits 75 | # counter = 0 76 | # unique, unique_counts = numpy.unique(indicator, return_counts=True) 77 | # for i in xrange(len(unique)): 78 | # u = unique[i] 79 | # selector = indicator == u 80 | # 81 | # sub = a[selector,:] 82 | # anew_check[counter:counter+len(sub),:] = sub 83 | # counter += len(sub) 84 | 85 | # compute new array 86 | anew = numpy.empty(a.shape, dtype=a.dtype) 87 | cumsums = numpy.cumsum(counts).astype(numpy.int32) 88 | cumsums_minus_counts = cumsums - counts 89 | wrapper.split_array(a, anew, indicator, chunks, cumsums_minus_counts) 90 | #anew = anew_check 91 | 92 | # if sanity_check == True: 93 | # assert numpy.allclose(anew_check, anew) 94 | 95 | if reshaped == True: 96 | a = a.reshape(a.shape[0]) 97 | anew = anew.reshape(anew.shape[0]) 98 | 99 | return anew 100 | 101 | def transpose_array(a, a_trans): 102 | 103 | if type(a[0,0]) == numpy.float64: 104 | wrapper = wrapper_utils_cpu_double 105 | else: 106 | wrapper = wrapper_utils_cpu_float 107 | 108 | wrapper.transpose_array(a, a_trans) 109 | 110 | -------------------------------------------------------------------------------- /woody/data/covtype.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | import os 7 | import numpy 8 | import pandas 9 | 10 | from woody.io import DataGenerator 11 | 12 | from .util import check_and_download, save_to_h5pd 13 | 14 | def get_covtype_files(data_path, train_size=100000): 15 | 16 | fname_train = 
os.path.join(data_path, "covtype/covtype-train-1.csv") 17 | fname_test = os.path.join(data_path, "covtype/covtype-test-1.csv") 18 | check_and_download(fname_train) 19 | check_and_download(fname_test) 20 | 21 | fname_train_size = os.path.join(data_path, "covtype/covtype-train-1_%s.csv" % str(train_size)) 22 | 23 | if not os.path.exists(fname_train_size): 24 | os.system("sed -n '%i,%ip;%iq' < %s > %s" % (1, train_size, train_size, fname_train, fname_train_size)) 25 | 26 | return fname_train_size, fname_test 27 | 28 | def get_covtype_data(data_path, train_size=100000, shuffle_train=False, shuffle_test=False, seed=0): 29 | 30 | numpy.random.seed(seed) 31 | 32 | fname_train, fname_test = get_covtype_files(data_path, train_size) 33 | 34 | # training data 35 | outcome_col = 55 36 | features = 54 37 | data = pandas.read_csv(fname_train, dtype="int", header=None) 38 | ytrain = numpy.ascontiguousarray(data[(outcome_col-1)].values) 39 | xcols = set(range(features+1)).difference(set([outcome_col-1])) 40 | Xtrain = numpy.ascontiguousarray(data.ix[:,xcols].values) 41 | 42 | if shuffle_train == True: 43 | train_partition = numpy.random.permutation(Xtrain.shape[0]) 44 | Xtrain = Xtrain[train_partition] 45 | ytrain = ytrain[train_partition] 46 | 47 | # testing data 48 | data = pandas.read_csv(fname_test, dtype=int, header=None) 49 | ytest = numpy.ascontiguousarray(data[(outcome_col-1)].values) 50 | xcols = set(range(features+1)).difference(set([outcome_col-1])) 51 | Xtest = numpy.ascontiguousarray(data.ix[:,xcols].values) 52 | 53 | if shuffle_test == True: 54 | test_partition = numpy.random.permutation(Xtest.shape[0]) 55 | Xtest = Xtest[test_partition] 56 | ytest = ytest[test_partition] 57 | 58 | return Xtrain, ytrain, Xtest, ytest 59 | 60 | def _convert_datasets(data_path, train_size): 61 | 62 | X_train, y_train, X_test, y_test = get_covtype_data(data_path, train_size, shuffle_train=False, shuffle_test=False) 63 | 64 | fname_store_train = os.path.join(data_path, "covtype/covtype-train-1_%s.csv.h5pd" % str(train_size)) 65 | fname_store_test = os.path.join(data_path, "covtype/covtype-test-1.csv.h5pd") 66 | 67 | save_to_h5pd(X_train, y_train, fname_store_train) 68 | save_to_h5pd(X_test, y_test, fname_store_test) 69 | 70 | def get_covtype_generator(data_path, train_size=100000, store="h5", seed=0, part="train", patterns=True, target=True): 71 | 72 | 73 | if store == "h5": 74 | 75 | if part=="train": 76 | fname = os.path.join(data_path, "covtype/covtype-train-1_%s.csv.h5pd" % str(train_size)) 77 | elif part=="test": 78 | fname = os.path.join(data_path, "covtype/covtype-test-1.csv.h5pd") 79 | 80 | if not os.path.exists(fname): 81 | print("Store for covtype data does not exist. 
Generating all stores ...") 82 | _convert_datasets(data_path, train_size) 83 | 84 | return DataGenerator(fname=fname, seed=seed, patterns=patterns, target=target, chunksize=200000) 85 | 86 | elif store == "mem": 87 | 88 | X_train, y_train, X_test, y_test = get_covtype_data(data_path, train_size=train_size, shuffle_train=False, shuffle_test=False) 89 | 90 | data = {} 91 | if part == "train": 92 | data['X'] = X_train 93 | data['y'] = y_train 94 | else: 95 | data['X'] = X_test 96 | data['y'] = y_test 97 | 98 | return DataGenerator(data=data, seed=seed, patterns=patterns, target=target, chunksize=200000) 99 | -------------------------------------------------------------------------------- /experiments/small_data/sk.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append(".") 3 | 4 | import params 5 | from util import evaluate 6 | 7 | import os 8 | import time 9 | import json 10 | 11 | from woody.util import ensure_dir_for_file 12 | from woody.data import * 13 | 14 | def single_run(dkey, train_size, param, seed, profile=False): 15 | 16 | print("Processing data set %s with train_size %s, seed %s, and parameters %s ..." % (str(dkey), str(train_size), str(seed), str(param))) 17 | 18 | if dkey == "covtype": 19 | Xtrain, ytrain, Xtest, ytest = covtype(train_size=train_size, seed=seed) 20 | elif dkey == "higgs": 21 | Xtrain, ytrain, Xtest, ytest = higgs(train_size=train_size, seed=seed) 22 | elif dkey == "susy": 23 | Xtrain, ytrain, Xtest, ytest = susy(train_size=train_size, seed=seed) 24 | else: 25 | raise Exception("Unknown data set!") 26 | 27 | print("") 28 | print("Number of training patterns:\t%i" % Xtrain.shape[0]) 29 | print("Number of test patterns:\t%i" % Xtest.shape[0]) 30 | print("Dimensionality of the data:\t%i\n" % Xtrain.shape[1]) 31 | 32 | if param['tree_type'] == "randomized": 33 | from sklearn.ensemble import ExtraTreesClassifier as RF 34 | elif param['tree_type'] == "standard": 35 | from sklearn.ensemble import RandomForestClassifier as RF 36 | 37 | model = RF( 38 | n_estimators=param['n_estimators'], 39 | criterion="gini", 40 | max_features=param['max_features'], 41 | min_samples_split=2, 42 | n_jobs=param['n_jobs'], 43 | random_state=seed, 44 | bootstrap=param['bootstrap'], 45 | min_samples_leaf=1, 46 | max_depth=None, 47 | verbose=0) 48 | 49 | if profile == True: 50 | import yep 51 | assert param['n_jobs'] == 1 52 | yep.start("train.prof") 53 | 54 | # training 55 | fit_start_time = time.time() 56 | model.fit(Xtrain, ytrain) 57 | fit_end_time = time.time() 58 | if profile == True: 59 | yep.stop() 60 | ypreds_train = model.predict(Xtrain) 61 | 62 | # testing 63 | test_start_time = time.time() 64 | ypred_test = model.predict(Xtest) 65 | test_end_time = time.time() 66 | 67 | results = {} 68 | results['dataset'] = dkey 69 | results['param'] = param 70 | results['training_time'] = fit_end_time - fit_start_time 71 | results['testing_time'] = test_end_time - test_start_time 72 | print("Training time: %f" % results['training_time']) 73 | print("Testing time: %f" % results['testing_time']) 74 | 75 | evaluate(ypreds_train, ytrain, results, "training") 76 | evaluate(ypred_test, ytest, results, "testing") 77 | 78 | fname = '%s_%s_%s_%s_%s_%s.json' % (str(param['n_estimators']), 79 | str(param['max_features']), 80 | str(param['n_jobs']), 81 | str(param['bootstrap']), 82 | str(param['tree_type']), 83 | str(seed), 84 | ) 85 | fname = os.path.join(params.odir, str(dkey), str(train_size), "sk", fname) 86 | ensure_dir_for_file(fname) 
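    # Persist the collected metrics (parameters, timings and the entries added by
    # evaluate()) as JSON under <odir>/<dataset>/<train_size>/sk/.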
87 | with open(fname, 'w') as fp: 88 | json.dump(results, fp) 89 | 90 | ################################################################################### 91 | import argparse 92 | parser = argparse.ArgumentParser() 93 | parser.add_argument('--dkey', nargs='?', const="covtype", type=str, default="covtype") 94 | parser.add_argument('--train_size', nargs='?', const=0, type=int, default=0) 95 | parser.add_argument('--seed', nargs='?', const=0, type=int, default=0) 96 | parser.add_argument('--key', type=str) 97 | args = parser.parse_args() 98 | dkey, train_size, seed, key = args.dkey, args.train_size, args.seed, args.key 99 | ################################################################################### 100 | 101 | single_run(dkey, train_size, params.parameters[key], seed) -------------------------------------------------------------------------------- /experiments/small_data/wood.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append(".") 3 | 4 | import params 5 | from util import evaluate 6 | 7 | import os 8 | import time 9 | import json 10 | 11 | from woody import WoodClassifier 12 | from woody.util import ensure_dir_for_file 13 | from woody.data import * 14 | 15 | def single_run(dkey, train_size, param, seed, profile=False): 16 | 17 | print("Processing data set %s with train_size %s, seed %s, and parameters %s ..." % (str(dkey), str(train_size), str(seed), str(param))) 18 | 19 | if dkey == "covtype": 20 | Xtrain, ytrain, Xtest, ytest = covtype(train_size=train_size, seed=seed) 21 | elif dkey == "higgs": 22 | Xtrain, ytrain, Xtest, ytest = higgs(train_size=train_size, seed=seed) 23 | elif dkey == "susy": 24 | Xtrain, ytrain, Xtest, ytest = susy(train_size=train_size, seed=seed) 25 | else: 26 | raise Exception("Unknown data set!") 27 | 28 | print("") 29 | print("Number of training patterns:\t%i" % Xtrain.shape[0]) 30 | print("Number of test patterns:\t%i" % Xtest.shape[0]) 31 | print("Dimensionality of the data:\t%i\n" % Xtrain.shape[1]) 32 | 33 | model = WoodClassifier( 34 | n_estimators=param['n_estimators'], 35 | criterion="gini", 36 | max_features=param['max_features'], 37 | min_samples_split=2, 38 | n_jobs=param['n_jobs'], 39 | seed=seed, 40 | bootstrap=param['bootstrap'], 41 | tree_traversal_mode="dfs", 42 | tree_type=param['tree_type'], 43 | min_samples_leaf=1, 44 | float_type="double", 45 | max_depth=None, 46 | verbose=0) 47 | 48 | if profile == True: 49 | import yep 50 | assert param['n_jobs'] == 1 51 | yep.start("train.prof") 52 | 53 | # training 54 | fit_start_time = time.time() 55 | model.fit(Xtrain, ytrain) 56 | fit_end_time = time.time() 57 | if profile == True: 58 | yep.stop() 59 | ypreds_train = model.predict(Xtrain) 60 | 61 | # testing 62 | test_start_time = time.time() 63 | ypred_test = model.predict(Xtest) 64 | test_end_time = time.time() 65 | 66 | results = {} 67 | results['dataset'] = dkey 68 | results['param'] = param 69 | results['training_time'] = fit_end_time - fit_start_time 70 | results['testing_time'] = test_end_time - test_start_time 71 | print("Training time: %f" % results['training_time']) 72 | print("Testing time: %f" % results['testing_time']) 73 | 74 | evaluate(ypreds_train, ytrain, results, "training") 75 | evaluate(ypred_test, ytest, results, "testing") 76 | 77 | fname = '%s_%s_%s_%s_%s_%s.json' % (str(param['n_estimators']), 78 | str(param['max_features']), 79 | str(param['n_jobs']), 80 | str(param['bootstrap']), 81 | str(param['tree_type']), 82 | str(seed), 83 | ) 84 | fname = 
os.path.join(params.odir, str(dkey), str(train_size), "wood", fname) 85 | ensure_dir_for_file(fname) 86 | with open(fname, 'w') as fp: 87 | json.dump(results, fp) 88 | 89 | ################################################################################### 90 | import argparse 91 | parser = argparse.ArgumentParser() 92 | parser.add_argument('--dkey', nargs='?', const="covtype", type=str, default="covtype") 93 | parser.add_argument('--train_size', nargs='?', const=0, type=int, default=0) 94 | parser.add_argument('--seed', nargs='?', const=0, type=int, default=0) 95 | parser.add_argument('--key', type=str) 96 | args = parser.parse_args() 97 | dkey, train_size, seed, key = args.dkey, args.train_size, args.seed, args.key 98 | ################################################################################### 99 | 100 | single_run(dkey, train_size, params.parameters[key], seed) 101 | -------------------------------------------------------------------------------- /experiments/influence_lamda/wood.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append(".") 3 | 4 | import params 5 | from util import evaluate 6 | 7 | import os 8 | import time 9 | import json 10 | 11 | from woody import WoodClassifier 12 | from woody.util import ensure_dir_for_file 13 | from woody.data import * 14 | 15 | def single_run(dkey, train_size, lamcrit, param, seed, profile=False): 16 | 17 | print("Processing data set %s with train_size %s, seed %s, and parameters %s ..." % (str(dkey), str(train_size), str(seed), str(param))) 18 | 19 | 20 | if dkey == "covtype": 21 | Xtrain, ytrain, Xtest, ytest = covtype(train_size=train_size, seed=seed) 22 | elif dkey == "higgs": 23 | Xtrain, ytrain, Xtest, ytest = higgs(train_size=train_size, seed=seed) 24 | elif dkey == "susy": 25 | Xtrain, ytrain, Xtest, ytest = susy(train_size=train_size, seed=seed) 26 | else: 27 | raise Exception("Unknown data set!") 28 | 29 | print("") 30 | print("Number of training patterns:\t%i" % Xtrain.shape[0]) 31 | print("Number of test patterns:\t%i" % Xtest.shape[0]) 32 | print("Dimensionality of the data:\t%i\n" % Xtrain.shape[1]) 33 | 34 | model = WoodClassifier( 35 | n_estimators=param['n_estimators'], 36 | criterion="even_gini", 37 | max_features=param['max_features'], 38 | min_samples_split=2, 39 | n_jobs=param['n_jobs'], 40 | seed=seed, 41 | bootstrap=param['bootstrap'], 42 | tree_traversal_mode="dfs", 43 | tree_type=param['tree_type'], 44 | min_samples_leaf=1, 45 | float_type="double", 46 | max_depth=None, 47 | lam_criterion=lamcrit, 48 | verbose=0) 49 | 50 | if profile == True: 51 | import yep 52 | assert param['n_jobs'] == 1 53 | yep.start("train.prof") 54 | 55 | # training 56 | fit_start_time = time.time() 57 | model.fit(Xtrain, ytrain) 58 | fit_end_time = time.time() 59 | if profile == True: 60 | yep.stop() 61 | 62 | print("Number of nodes: %i" % model.get_n_nodes(0)) 63 | ypreds_train = model.predict(Xtrain) 64 | 65 | # testing 66 | test_start_time = time.time() 67 | ypred_test = model.predict(Xtest) 68 | test_end_time = time.time() 69 | 70 | results = {} 71 | results['dataset'] = dkey 72 | results['param'] = param 73 | results['training_time'] = fit_end_time - fit_start_time 74 | results['testing_time'] = test_end_time - test_start_time 75 | print("Training time: %f" % results['training_time']) 76 | print("Testing time: %f" % results['testing_time']) 77 | 78 | evaluate(ypreds_train, ytrain, results, "training") 79 | evaluate(ypred_test, ytest, results, "testing") 80 | 81 | fname = 
'%s_%s_%s_%s_%s_%s.json' % (str(param['n_estimators']), 82 | str(param['max_features']), 83 | str(param['n_jobs']), 84 | str(param['bootstrap']), 85 | str(param['tree_type']), 86 | str(seed), 87 | ) 88 | fname = os.path.join(params.odir, str(dkey), str(train_size), str(lamcrit), "wood", fname) 89 | ensure_dir_for_file(fname) 90 | with open(fname, 'w') as fp: 91 | json.dump(results, fp) 92 | 93 | ################################################################################### 94 | import argparse 95 | parser = argparse.ArgumentParser() 96 | parser.add_argument('--dkey', nargs='?', const="covtype", type=str, default="covtype") 97 | parser.add_argument('--train_size', nargs='?', const=0, type=int, default=0) 98 | parser.add_argument('--seed', nargs='?', const=0, type=int, default=0) 99 | parser.add_argument('--key', type=str) 100 | parser.add_argument('--lamcrit', nargs='?', const=0.0, type=float, default=0.0) 101 | args = parser.parse_args() 102 | dkey, train_size, seed, key, lamcrit = args.dkey, args.train_size, args.seed, args.key, args.lamcrit 103 | ################################################################################### 104 | 105 | single_run(dkey, train_size, lamcrit, params.parameters[key], seed) 106 | 107 | -------------------------------------------------------------------------------- /woody/models/forest/src/qsort.c: -------------------------------------------------------------------------------- 1 | /* 2 | * qsort.c 3 | * 4 | * Created on: 12.11.2014 5 | * Author: fgieseke 6 | */ 7 | 8 | #include "include/qsort.h" 9 | // 10 | // qsort.c 11 | // 12 | // Quick sort 13 | // 14 | // Copyright (C) 2002 Michael Ringgaard. All rights reserved. 15 | // 16 | // Redistribution and use in source and binary forms, with or without 17 | // modification, are permitted provided that the following conditions 18 | // are met: 19 | // 20 | // 1. Redistributions of source code must retain the above copyright 21 | // notice, this list of conditions and the following disclaimer. 22 | // 2. Redistributions in binary form must reproduce the above copyright 23 | // notice, this list of conditions and the following disclaimer in the 24 | // documentation and/or other materials provided with the distribution. 25 | // 3. Neither the name of the project nor the names of its contributors 26 | // may be used to endorse or promote products derived from this software 27 | // without specific prior written permission. 28 | // 29 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 30 | // ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 31 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 32 | // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE 33 | // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 34 | // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 35 | // OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 36 | // HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 37 | // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 38 | // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 39 | // SUCH DAMAGE. 
40 | // 41 | 42 | #define INT_CUTOFF 8 43 | 44 | static void shortsort(char *lo, char *hi, unsigned width, 45 | int (*comp)(const void *, const void *, const void*), 46 | const void *comp_param); 47 | 48 | static void woody_swap(char *p, char *q, unsigned int width); 49 | 50 | void woody_qsort(void *base, unsigned num, unsigned width, 51 | int (*comp)(const void *, const void *, const void *), 52 | const void *comp_param) { 53 | 54 | char *lo, *hi; 55 | char *mid; 56 | char *l, *h; 57 | unsigned size; 58 | char *lostk[30], *histk[30]; 59 | int stkptr; 60 | 61 | if (num < 2 || width == 0) 62 | return; 63 | 64 | stkptr = 0; 65 | 66 | lo = base; 67 | hi = (char *) base + width * (num - 1); 68 | 69 | recurse: size = (hi - lo) / width + 1; 70 | 71 | if (size <= INT_CUTOFF) { 72 | shortsort(lo, hi, width, comp, comp_param); 73 | } else { 74 | mid = lo + (size / 2) * width; 75 | woody_swap(mid, lo, width); 76 | 77 | l = lo; 78 | h = hi + width; 79 | 80 | for (;;) { 81 | do { 82 | l += width; 83 | } while (l <= hi && comp(l, lo, comp_param) <= 0); 84 | do { 85 | h -= width; 86 | } while (h > lo && comp(h, lo, comp_param) >= 0); 87 | if (h < l) 88 | break; 89 | woody_swap(l, h, width); 90 | } 91 | 92 | woody_swap(lo, h, width); 93 | 94 | if (h - 1 - lo >= hi - l) { 95 | if (lo + width < h) { 96 | lostk[stkptr] = lo; 97 | histk[stkptr] = h - width; 98 | ++stkptr; 99 | } 100 | 101 | if (l < hi) { 102 | lo = l; 103 | goto recurse; 104 | } 105 | } else { 106 | if (l < hi) { 107 | lostk[stkptr] = l; 108 | histk[stkptr] = hi; 109 | ++stkptr; 110 | } 111 | 112 | if (lo + width < h) { 113 | hi = h - width; 114 | goto recurse; 115 | } 116 | } 117 | } 118 | 119 | --stkptr; 120 | if (stkptr >= 0) { 121 | lo = lostk[stkptr]; 122 | hi = histk[stkptr]; 123 | goto recurse; 124 | } 125 | 126 | } 127 | 128 | static void shortsort(char *lo, char *hi, unsigned width, 129 | int (*comp)(const void *, const void *, const void *), 130 | const void* comp_param) { 131 | 132 | char *p, *max; 133 | 134 | while (hi > lo) { 135 | max = lo; 136 | for (p = lo + width; p <= hi; p += width) 137 | if (comp(p, max, comp_param) > 0) 138 | max = p; 139 | woody_swap(max, hi, width); 140 | hi -= width; 141 | } 142 | 143 | } 144 | 145 | static void woody_swap(char *a, char *b, unsigned width) { 146 | 147 | char tmp; 148 | 149 | if (a != b) { 150 | while (width--) { 151 | tmp = *a; 152 | *a++ = *b; 153 | *b++ = tmp; 154 | } 155 | } 156 | 157 | } 158 | 159 | -------------------------------------------------------------------------------- /woody/data/util.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | import os 7 | import gzip 8 | import urllib 9 | import shutil 10 | import h5py 11 | import pandas 12 | 13 | def check_and_download(fname, remoteurl="REMOTE_URL"): 14 | 15 | if os.path.isfile(fname) == False: 16 | 17 | if os.path.exists(os.path.join(os.path.dirname(fname), remoteurl)): 18 | urlfname = os.path.join(os.path.dirname(fname), remoteurl) 19 | try: 20 | with open(urlfname,"r") as f: 21 | url = f.readlines()[0].strip() 22 | url = os.path.join(url, os.path.basename(fname)) 23 | except Exception as e: 24 | print("Could not retrieve urlf from file %s" % urlfname) 25 | 26 | elif os.path.exists(fname + ".download"): 27 | urlfname = fname + ".download" 28 | try: 29 | with open(urlfname,"r") as f: 30 | url = f.readlines()[0] 31 | except Exception as e: 32 | print("Could not retrieve urlf from file %s" % urlfname) 33 | 
34 | else: 35 | raise Exception("File and download url do not exist!") 36 | 37 | url = url.strip() 38 | 39 | try: 40 | if url.endswith(".gz"): 41 | fname_download = fname + ".gz" 42 | else: 43 | fname_download = fname 44 | 45 | print("Downloading data from %s to %s ..." % (url, fname_download)) 46 | urllib.urlretrieve (url, fname_download) 47 | 48 | print("Successfully downloaded the data!") 49 | if url.endswith(".gz"): 50 | print("Extracting zipped file ...") 51 | inF = gzip.open(fname_download, 'rb') 52 | outF = open(fname, 'wb') 53 | outF.write(inF.read()) 54 | inF.close() 55 | outF.close() 56 | print("Done!") 57 | except Exception as e: 58 | print(str(e)) 59 | try: 60 | # remove incomplete data 61 | shutil.rmtree(fname) 62 | except: 63 | pass 64 | return False 65 | 66 | return True 67 | 68 | def save_to_h5(X, y, fname, compression="lzf"): 69 | 70 | d = os.path.dirname(fname) 71 | if not os.path.exists(d): 72 | os.makedirs(d) 73 | 74 | y = y.reshape((len(y), 1)) 75 | 76 | # create store and data sets 77 | store = h5py.File(fname, 'w') 78 | dsetX = store.create_dataset("X", X.shape, compression=compression) 79 | dsety = store.create_dataset("y", y.shape, compression=compression) 80 | 81 | dsetX[:,:] = X 82 | dsety[:,:] = y 83 | 84 | store.close() 85 | 86 | def save_to_h5pd(X, y, fname, compression="bzip2", complevel=3, delete_before=True): 87 | 88 | d = os.path.dirname(fname) 89 | if not os.path.exists(d): 90 | os.makedirs(d) 91 | 92 | y = y.reshape((len(y), 1)) 93 | 94 | if delete_before == True: 95 | if os.path.exists(fname): 96 | os.remove(fname) 97 | 98 | df_X = pandas.DataFrame(X, index=range(len(X))) 99 | df_y = pandas.DataFrame(y, index=range(len(y))) 100 | 101 | df_X.to_hdf(fname, 'X', append=True, complib=compression, complevel=complevel) 102 | df_y.to_hdf(fname, 'y', append=True, complib=compression, complevel=complevel) 103 | 104 | def convert_to_h5pd(reader, fname, transform, compression="bzip2", complevel=3, delete_before=True): 105 | 106 | d = os.path.dirname(fname) 107 | if not os.path.exists(d): 108 | os.makedirs(d) 109 | 110 | if delete_before == True: 111 | if os.path.exists(fname): 112 | os.remove(fname) 113 | 114 | for chunk in reader: 115 | 116 | X, y = transform(chunk) 117 | y = y.reshape((len(y), 1)) 118 | 119 | df_X = pandas.DataFrame(X, index=range(len(X))) 120 | df_y = pandas.DataFrame(y, index=range(len(y))) 121 | 122 | df_X.to_hdf(fname, 'X', append=True, complib=compression, complevel=complevel) 123 | df_y.to_hdf(fname, 'y', append=True, complib=compression, complevel=complevel) 124 | 125 | -------------------------------------------------------------------------------- /woody/data/susy.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | import os 7 | import time 8 | import numpy 9 | import pandas 10 | 11 | from woody.io import DataGenerator 12 | 13 | from .util import check_and_download, save_to_h5pd 14 | 15 | ALLOWED_TRAIN_SIZES = [500000, 1000000, 16 | 1500000, 2000000, 17 | 2500000, 3000000, 18 | 3500000, 4000000] 19 | 20 | def get_susy_files(data_path, train_size=1000000): 21 | 22 | assert train_size in ALLOWED_TRAIN_SIZES 23 | 24 | fname = os.path.join(data_path, "susy/SUSY.csv") 25 | check_and_download(fname) 26 | time.sleep(1) 27 | 28 | fname_train = os.path.join(data_path, "susy/SUSY.train_%s.csv" % str(train_size)) 29 | fname_test = os.path.join(data_path, "susy/SUSY.test.csv") 30 | 31 | if not os.path.exists(fname_train): 32 | 
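        # sed prints lines 1..train_size of SUSY.csv and then quits, so the first
        # train_size rows become the training split; the test split below uses
        # rows 4,500,001 to 5,000,000.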
os.system("sed -n '%i,%ip;%iq' < %s > %s" % (1, train_size, train_size, fname, fname_train)) 33 | if not os.path.exists(fname_test): 34 | os.system("sed -n '%i,%ip;%iq' < %s > %s" % (4500001, 5000000, 5000000, fname, fname_test)) 35 | 36 | return fname_train, fname_test 37 | 38 | def get_susy_data(data_path, train_size=1000000, shuffle_train=False, shuffle_test=False, seed=0): 39 | 40 | assert train_size in ALLOWED_TRAIN_SIZES 41 | 42 | numpy.random.seed(seed) 43 | fname_train, fname_test = get_susy_files(data_path, train_size) 44 | 45 | # training data 46 | label_col = 0 47 | features_cols = range(1,19) 48 | 49 | data = pandas.read_csv(fname_train, dtype="float", sep=",", header=None) 50 | ytrain = numpy.ascontiguousarray(data.ix[:,label_col].values) 51 | Xtrain = numpy.ascontiguousarray(data.ix[:,features_cols].values) 52 | 53 | data = pandas.read_csv(fname_test, dtype="float", sep=",", header=None) 54 | ytest = numpy.ascontiguousarray(data.ix[:,label_col].values) 55 | Xtest = numpy.ascontiguousarray(data.ix[:,features_cols].values) 56 | 57 | if shuffle_train == True: 58 | train_partition = numpy.random.permutation(Xtrain.shape[0]) 59 | Xtrain = Xtrain[train_partition] 60 | ytrain = ytrain[train_partition] 61 | 62 | if shuffle_test == True: 63 | test_partition = numpy.random.permutation(Xtest.shape[0]) 64 | Xtest = Xtest[test_partition] 65 | ytest = ytest[test_partition] 66 | 67 | return Xtrain, ytrain, Xtest, ytest 68 | 69 | def _convert_susy_data(data_path, train_size): 70 | 71 | X_train, y_train, X_test, y_test = get_susy_data(data_path, train_size=train_size, shuffle_train=False, shuffle_test=False) 72 | 73 | fname_store_train = os.path.join(data_path, "susy/SUSY.train_%s.h5pd" % str(train_size)) 74 | fname_store_test = os.path.join(data_path, "susy/SUSY.test.h5pd") 75 | 76 | save_to_h5pd(X_train, y_train, fname_store_train) 77 | save_to_h5pd(X_test, y_test, fname_store_test) 78 | 79 | def get_susy_generator(data_path, train_size=1000000, store="h5", seed=0, part="train", patterns=True, target=True): 80 | 81 | if store == "h5": 82 | 83 | if part=="train": 84 | fname = os.path.join(data_path, "susy/SUSY.train_%s.h5pd" % str(train_size)) 85 | elif part=="test": 86 | fname = os.path.join(data_path, "susy/SUSY.test.h5pd") 87 | 88 | if not os.path.exists(fname): 89 | print("Store for susy data does not exist. 
Generating all stores ...") 90 | _convert_susy_data(data_path, train_size) 91 | 92 | if part == "test": 93 | chunksize = 250000 94 | else: 95 | if train_size <= 2000000: 96 | chunksize = 500000 97 | else: 98 | chunksize = 2000000 99 | 100 | return DataGenerator(fname=fname, seed=seed, patterns=patterns, target=target, chunksize=chunksize) 101 | 102 | elif store == "mem": 103 | 104 | X_train, y_train, X_test, y_test = get_susy_data(data_path, train_size=train_size, shuffle_train=False, shuffle_test=False) 105 | 106 | data = {} 107 | if part == "train": 108 | data['X'] = X_train 109 | data['y'] = y_train 110 | else: 111 | data['X'] = X_test 112 | data['y'] = y_test 113 | 114 | return DataGenerator(data=data, seed=seed, patterns=patterns, target=target, chunksize=1000000) 115 | -------------------------------------------------------------------------------- /experiments/small_data/subsetwood.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append(".") 3 | 4 | import os 5 | import json 6 | from util import evaluate 7 | import params 8 | 9 | import time 10 | 11 | from woody import SubsetWoodClassifier 12 | 13 | from woody.io import MemoryStore, DiskStore 14 | from woody.util import ensure_dir_for_file 15 | from woody.data import * 16 | 17 | def single_run(dkey, train_size, param, seed, profile=False): 18 | 19 | print("Processing data set %s with train_size %s, seed %s, and parameters %s ..." % (str(dkey), str(train_size), str(seed), str(param))) 20 | 21 | if dkey == "covtype": 22 | traingen, testgen = covtype_generators(train_size=train_size, store="mem", seed=seed) 23 | n_subset = 50000 24 | elif dkey == "higgs": 25 | traingen, testgen = higgs_generators(train_size=train_size, store="mem", seed=seed) 26 | n_subset = 500000 27 | elif dkey == "susy": 28 | traingen, testgen = susy_generators(train_size=train_size, store="mem", seed=seed) 29 | n_subset = 500000 30 | else: 31 | raise Exception("Unknown data set!") 32 | 33 | print("") 34 | print("Number of training patterns:\t%i" % traingen.get_shapes()[0][0]) 35 | print("Number of test patterns:\t%i" % testgen.get_shapes()[0][0]) 36 | print("Dimensionality of the data:\t%i\n" % traingen.get_shapes()[0][1]) 37 | 38 | model = SubsetWoodClassifier( 39 | n_estimators=param['n_estimators'], 40 | criterion="gini", 41 | max_features=param['max_features'], 42 | min_samples_split=2, 43 | n_jobs=param['n_jobs'], 44 | seed=seed, 45 | bootstrap=param['bootstrap'], 46 | tree_traversal_mode="dfs", 47 | tree_type=param['tree_type'], 48 | min_samples_leaf=1, 49 | float_type="double", 50 | max_depth=None, 51 | verbose=1, 52 | store=MemoryStore()) 53 | 54 | # training 55 | if profile == True: 56 | import yep 57 | assert param['n_jobs'] == 1 58 | yep.start("train.prof") 59 | 60 | fit_start_time = time.time() 61 | model.fit(traingen, n_subset=n_subset) 62 | fit_end_time = time.time() 63 | if profile == True: 64 | yep.stop() 65 | ypreds_train = model.predict(generator=traingen) 66 | 67 | # testing 68 | test_start_time = time.time() 69 | ypred_test = model.predict(generator=testgen) 70 | test_end_time = time.time() 71 | 72 | results = {} 73 | results['dataset'] = dkey 74 | results['param'] = param 75 | results['training_time'] = fit_end_time - fit_start_time 76 | results['testing_time'] = test_end_time - test_start_time 77 | print("Training time:\t\t%f" % results['training_time']) 78 | print("Testing time:\t\t%f" % results['testing_time']) 79 | 80 | evaluate(ypreds_train, traingen.get_all_target(), results, 
"training") 81 | evaluate(ypred_test, testgen.get_all_target(), results, "testing") 82 | 83 | fname = '%s_%s_%s_%s_%s_%s.json' % (str(param['n_estimators']), 84 | str(param['max_features']), 85 | str(param['n_jobs']), 86 | str(param['bootstrap']), 87 | str(param['tree_type']), 88 | str(seed), 89 | ) 90 | fname = os.path.join(params.odir, str(dkey), str(train_size), "subsetwood", fname) 91 | ensure_dir_for_file(fname) 92 | with open(fname, 'w') as fp: 93 | json.dump(results, fp) 94 | 95 | del(testgen) 96 | del(traingen) 97 | model.cleanup() 98 | 99 | time.sleep(1) 100 | 101 | ################################################################################### 102 | import argparse 103 | parser = argparse.ArgumentParser() 104 | parser.add_argument('--dkey', nargs='?', const="covtype", type=str, default="covtype") 105 | parser.add_argument('--train_size', nargs='?', const=0, type=int, default=0) 106 | parser.add_argument('--seed', nargs='?', const=0, type=int, default=0) 107 | parser.add_argument('--key', type=str) 108 | args = parser.parse_args() 109 | dkey, train_size, seed, key = args.dkey, args.train_size, args.seed, args.key 110 | ################################################################################### 111 | 112 | single_run(dkey, train_size, params.parameters[key], seed) 113 | -------------------------------------------------------------------------------- /woody/io/csv.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | import numpy 7 | import pandas 8 | 9 | from .reader import Reader 10 | 11 | class CSVReader(Reader): 12 | """ 13 | """ 14 | 15 | def __init__(self, 16 | fname, 17 | patterns=True, 18 | target=True, 19 | chunksize=32000, 20 | target_column=None, 21 | patterns_columns=None, 22 | seed=0, 23 | parsing_args={} 24 | ): 25 | 26 | super(CSVReader, self).__init__(fname=fname, 27 | patterns=patterns, 28 | target=target, 29 | chunksize=chunksize, 30 | seed=seed) 31 | 32 | self.target_column = target_column 33 | self.patterns_columns = patterns_columns 34 | self.parsing_args = parsing_args 35 | 36 | def reset(self): 37 | 38 | self.close() 39 | 40 | self._reader = pandas.read_csv(self.fname, iterator=True, chunksize=self.chunksize, **self.parsing_args) 41 | 42 | def get_random_subset(self, size, chunk_percent=0.5, shuffle=True): 43 | """ 44 | NOTE: Seems to interfer with yep (multiprocessing, deadlock?) 
45 | 46 | """ 47 | 48 | data = None 49 | 50 | rand_per_chunk = int(self.chunksize * chunk_percent) 51 | 52 | while data is None or len(data) < size: 53 | 54 | self.reset() 55 | 56 | for chunk in self._reader: 57 | 58 | data_chunk = self._transform_csv(chunk) 59 | choice = sorted(self._randomgen.sample(xrange(len(data_chunk)), rand_per_chunk)) 60 | data_chunk = data_chunk[choice] 61 | 62 | if data is None: 63 | data = data_chunk 64 | else: 65 | data = numpy.concatenate((data, data_chunk), axis=0) 66 | 67 | if len(data) >= size: 68 | break 69 | 70 | self.close() 71 | 72 | if shuffle == True: 73 | partition = range(len(data)) 74 | self._randomgen.shuffle(partition) 75 | data = data[partition] 76 | data = data[:size] 77 | 78 | return self._get_patterns_labels(data) 79 | 80 | def get_chunk(self, extract=True): 81 | 82 | chunk = self._reader.get_chunk() 83 | data_chunk = self._transform_csv(chunk) 84 | 85 | if extract == True: 86 | data_chunk = self._get_patterns_labels(data_chunk) 87 | 88 | return data_chunk 89 | 90 | def transform(self, chunk): 91 | 92 | return chunk.ix[:,:].values 93 | 94 | def _get_patterns_labels(self, data): 95 | 96 | if self.patterns == True and self.target == True: 97 | 98 | X = numpy.ascontiguousarray(data[:, self.patterns_columns]) 99 | y = numpy.ascontiguousarray(data[:, self.target_column]) 100 | return X, y 101 | 102 | elif self.patterns == True: 103 | 104 | X = numpy.ascontiguousarray(data[:, self.patterns_columns]) 105 | return X 106 | 107 | elif self.target == True: 108 | 109 | y = numpy.ascontiguousarray(data[:, self.target_column]) 110 | return y 111 | 112 | raise Exception("Both patterns and target are set to False!") 113 | 114 | 115 | def get_all(self): 116 | 117 | self.reset() 118 | 119 | data = None 120 | 121 | while True: 122 | 123 | try: 124 | 125 | data_chunk = self.get_chunk(extract=False) 126 | 127 | except Exception as e: 128 | break 129 | 130 | if data is None: 131 | data = data_chunk 132 | else: 133 | data = numpy.concatenate((data, data_chunk), axis=0) 134 | 135 | return self._get_patterns_labels(data) 136 | -------------------------------------------------------------------------------- /woody/data/higgs.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | 7 | import os 8 | import numpy 9 | import pandas 10 | 11 | from woody.io import DataGenerator 12 | 13 | from .util import check_and_download, save_to_h5pd 14 | 15 | ALLOWED_TRAIN_SIZES = [500000, 1000000, 16 | 1500000, 2000000, 17 | 2500000, 3000000, 18 | 3500000, 4000000, 19 | 4500000, 5000000, 20 | 5500000, 6000000, 21 | 6500000, 7000000, 22 | 7500000, 8000000, 23 | 8500000, 9000000, 24 | 9500000, 10000000] 25 | 26 | def get_higgs_files(data_path, train_size=1000000): 27 | 28 | assert train_size <= 10000000 29 | 30 | fname = os.path.join(data_path, "higgs/HIGGS.csv") 31 | check_and_download(fname) 32 | 33 | fname_train = os.path.join(data_path, "higgs/HIGGS.train_%s.csv" % str(train_size)) 34 | fname_test = os.path.join(data_path, "higgs/HIGGS.test.csv") 35 | 36 | if not os.path.exists(fname_train): 37 | os.system("sed -n '%i,%ip;%iq' < %s > %s" % (1, train_size, train_size, fname, fname_train)) 38 | if not os.path.exists(fname_test): 39 | os.system("sed -n '%i,%ip;%iq' < %s > %s" % (10000001, 11000000, 11000000, fname, fname_test)) 40 | 41 | return fname_train, fname_test 42 | 43 | def get_higgs_data(data_path, train_size=1000000, shuffle_train=False, 
shuffle_test=False, seed=0): 44 | 45 | assert train_size in ALLOWED_TRAIN_SIZES 46 | 47 | numpy.random.seed(seed) 48 | fname_train, fname_test = get_higgs_files(data_path, train_size) 49 | 50 | # training data 51 | label_col = 0 52 | features_cols = range(1,29) 53 | 54 | data = pandas.read_csv(fname_train, dtype="float", header=None) 55 | ytrain = numpy.ascontiguousarray(data.ix[:,label_col].values) 56 | Xtrain = numpy.ascontiguousarray(data.ix[:,features_cols].values) 57 | 58 | data = pandas.read_csv(fname_test, dtype="float", header=None) 59 | ytest = numpy.ascontiguousarray(data.ix[:,label_col].values) 60 | Xtest = numpy.ascontiguousarray(data.ix[:,features_cols].values) 61 | 62 | if shuffle_train == True: 63 | train_partition = numpy.random.permutation(Xtrain.shape[0]) 64 | Xtrain = Xtrain[train_partition] 65 | ytrain = ytrain[train_partition] 66 | 67 | if shuffle_test == True: 68 | test_partition = numpy.random.permutation(Xtest.shape[0]) 69 | Xtest = Xtest[test_partition] 70 | ytest = ytest[test_partition] 71 | 72 | return Xtrain, ytrain, Xtest, ytest 73 | 74 | def _convert_higgs_data(data_path, train_size): 75 | 76 | X_train, y_train, X_test, y_test = get_higgs_data(data_path, train_size=train_size, shuffle_train=False, shuffle_test=False) 77 | 78 | fname_store_train = os.path.join(data_path, "higgs/HIGGS.train_%s.h5pd" % str(train_size)) 79 | fname_store_test = os.path.join(data_path, "higgs/HIGGS.test.h5pd") 80 | 81 | save_to_h5pd(X_train, y_train, fname_store_train) 82 | save_to_h5pd(X_test, y_test, fname_store_test) 83 | 84 | def get_higgs_generator(data_path, train_size=1000000, store="h5", seed=0, part="train", patterns=True, target=True): 85 | 86 | if store == "h5": 87 | 88 | if part=="train": 89 | fname = os.path.join(data_path, "higgs/HIGGS.train_%s.h5pd" % str(train_size)) 90 | elif part=="test": 91 | fname = os.path.join(data_path, "higgs/HIGGS.test.h5pd") 92 | 93 | if not os.path.exists(fname): 94 | print("Store for higgs data does not exist. Generating all stores ...") 95 | _convert_higgs_data(data_path, train_size) 96 | 97 | if part == "test": 98 | chunksize = 250000 99 | else: 100 | if train_size <= 2000000: 101 | chunksize = 500000 102 | else: 103 | chunksize = 2000000 104 | 105 | return DataGenerator(fname=fname, seed=seed, patterns=patterns, target=target, chunksize=chunksize) 106 | 107 | elif store == "mem": 108 | 109 | X_train, y_train, X_test, y_test = get_higgs_data(data_path, train_size=train_size, shuffle_train=False, shuffle_test=False) 110 | 111 | data = {} 112 | if part == "train": 113 | data['X'] = X_train 114 | data['y'] = y_train 115 | else: 116 | data['X'] = X_test 117 | data['y'] = y_test 118 | 119 | return DataGenerator(data=data, seed=seed, patterns=patterns, target=target, chunksize=10000000) 120 | -------------------------------------------------------------------------------- /experiments/landsat/sk.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append(".") 3 | 4 | import params 5 | from util import evaluate 6 | 7 | import os 8 | import time 9 | import json 10 | 11 | from woody.util import ensure_dir_for_file 12 | from woody.data import * 13 | from woody.io import DataGenerator 14 | 15 | def single_run(dkey, train_size, param, seed, profile=False): 16 | 17 | print("Processing data set %s with train_size %s and parameters %s ..." 
% (str(dkey), str(train_size), str(param))) 18 | 19 | if dkey == "landsat": 20 | 21 | # TODO: Download file manually if needed (9,7GB and 524MB): 22 | # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_train_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd 23 | # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd 24 | 25 | # TODO: Adapt paths accordingly 26 | fname_train = "data/landsat_train_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd" 27 | fname_test = "data/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd" 28 | 29 | traingen = DataGenerator(fname=fname_train, seed=seed, patterns=True, target=True, chunksize=1000000, n_lines_max=train_size) 30 | testgen = DataGenerator(fname=fname_test, seed=seed, patterns=True, target=True, chunksize=1000000, n_lines_max=20000000) 31 | 32 | else: 33 | raise Exception("Unknown data set!") 34 | 35 | Xtrain, ytrain = traingen.get_all() 36 | Xtest, ytest = testgen.get_all() 37 | 38 | print("") 39 | print("Number of training patterns:\t%i" % Xtrain.shape[0]) 40 | print("Number of test patterns:\t%i" % Xtest.shape[0]) 41 | print("Dimensionality of the data:\t%i\n" % Xtrain.shape[1]) 42 | 43 | if param['tree_type'] == "randomized": 44 | from sklearn.ensemble import ExtraTreesClassifier as RF 45 | elif param['tree_type'] == "standard": 46 | from sklearn.ensemble import RandomForestClassifier as RF 47 | 48 | model = RF( 49 | n_estimators=param['n_estimators'], 50 | criterion="gini", 51 | max_features=param['max_features'], 52 | min_samples_split=2, 53 | n_jobs=param['n_jobs'], 54 | random_state=seed, 55 | bootstrap=param['bootstrap'], 56 | min_samples_leaf=1, 57 | max_depth=None, 58 | verbose=0) 59 | 60 | if profile == True: 61 | import yep 62 | assert param['n_jobs'] == 1 63 | yep.start("train.prof") 64 | 65 | # training 66 | fit_start_time = time.time() 67 | model.fit(Xtrain, ytrain) 68 | fit_end_time = time.time() 69 | if profile == True: 70 | yep.stop() 71 | ypreds_train = model.predict(Xtrain) 72 | 73 | # testing 74 | test_start_time = time.time() 75 | ypred_test = model.predict(Xtest) 76 | test_end_time = time.time() 77 | 78 | results = {} 79 | results['dataset'] = dkey 80 | results['param'] = param 81 | results['training_time'] = fit_end_time - fit_start_time 82 | results['testing_time'] = test_end_time - test_start_time 83 | print("Training time: %f" % results['training_time']) 84 | print("Testing time: %f" % results['testing_time']) 85 | 86 | evaluate(ypreds_train, ytrain, results, "training") 87 | evaluate(ypred_test, ytest, results, "testing") 88 | 89 | fname = '%s_%s_%s_%s_%s_%s.json' % (str(param['n_estimators']), 90 | str(param['max_features']), 91 | str(param['n_jobs']), 92 | str(param['bootstrap']), 93 | str(param['tree_type']), 94 | str(seed), 95 | ) 96 | fname = os.path.join(params.odir, str(dkey), str(train_size), "sk", fname) 97 | ensure_dir_for_file(fname) 98 | with open(fname, 'w') as fp: 99 | json.dump(results, fp) 100 | 101 | ################################################################################### 102 | import argparse 103 | parser = argparse.ArgumentParser() 104 | parser.add_argument('--dkey', nargs='?', const="covtype", type=str, default="covtype") 105 | parser.add_argument('--train_size', nargs='?', const=0, type=int, default=0) 106 | parser.add_argument('--seed', nargs='?', const=0, type=int, default=0) 107 | parser.add_argument('--key', type=str) 
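# Example invocation (values are placeholders; --key must name an entry in
# params.parameters):
#   python sk.py --dkey landsat --train_size 1000000 --seed 0 --key <key>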
108 | args = parser.parse_args() 109 | dkey, train_size, seed, key = args.dkey, args.train_size, args.seed, args.key 110 | ################################################################################### 111 | 112 | single_run(dkey, train_size, params.parameters[key], seed) 113 | -------------------------------------------------------------------------------- /experiments/small_data/h2.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append(".") 3 | 4 | # test 5 | import params 6 | from util import evaluate 7 | 8 | import os 9 | import time 10 | import json 11 | import numpy 12 | import math 13 | 14 | from woody.util import ensure_dir_for_file 15 | from woody.data import * 16 | 17 | def single_run(dkey, train_size, param, seed, profile=False): 18 | 19 | print("Processing data set %s with train_size %s and parameters %s ..." % (str(dkey), str(train_size), str(param))) 20 | 21 | import h2o 22 | from skutil.h2o import h2o_col_to_numpy 23 | h2o.init(max_mem_size = "12G", nthreads=param['n_jobs']) 24 | h2o.remove_all() 25 | from h2o.estimators.random_forest import H2ORandomForestEstimator 26 | 27 | # get and convert data 28 | if dkey == "covtype": 29 | fname_train, fname_test = covtype_files(train_size=train_size) 30 | train_df = h2o.import_file(fname_train) 31 | test_df = h2o.import_file(fname_test) 32 | Xcols, ycol = train_df.col_names[:-1], train_df.col_names[-1] 33 | elif dkey == "higgs": 34 | fname_train, fname_test = higgs_files(train_size=train_size) 35 | train_df = h2o.import_file(fname_train) 36 | test_df = h2o.import_file(fname_test) 37 | Xcols, ycol = train_df.col_names[1:], train_df.col_names[0] 38 | elif dkey == "susy": 39 | fname_train, fname_test = susy_files(train_size=train_size) 40 | train_df = h2o.import_file(fname_train) 41 | test_df = h2o.import_file(fname_test) 42 | Xcols, ycol = train_df.col_names[1:], train_df.col_names[0] 43 | 44 | print("") 45 | print("Number of training patterns:\t%i" % train_df.shape[0]) 46 | print("Number of test patterns:\t%i" % test_df.shape[0]) 47 | print("Dimensionality of the data:\t%i\n" % train_df.shape[1]) 48 | 49 | if param['max_features'] is None: 50 | mtries = train_df.shape[1] - 2 51 | elif param['max_features'] == "sqrt": 52 | mtries = int(math.sqrt(train_df.shape[1] - 2)) 53 | 54 | if param['bootstrap'] == False: 55 | sample_rate = 1.0 56 | else: 57 | sample_rate = 0.632 58 | 59 | model = H2ORandomForestEstimator( 60 | mtries=mtries, 61 | sample_rate=sample_rate, 62 | #nbins=1000, #crash 63 | min_rows=1, 64 | build_tree_one_node=True, 65 | max_depth=20, 66 | balance_classes=False, 67 | ntrees=param['n_estimators'], 68 | seed=seed) 69 | 70 | # training 71 | fit_start_time = time.time() 72 | model.train(Xcols, ycol, training_frame=train_df) 73 | fit_end_time = time.time() 74 | ypreds_train = model.predict(train_df) 75 | 76 | # testing 77 | test_start_time = time.time() 78 | ypreds_test = model.predict(test_df) 79 | test_end_time = time.time() 80 | 81 | results = {} 82 | results['dataset'] = dkey 83 | results['param'] = param 84 | results['training_time'] = fit_end_time - fit_start_time 85 | results['testing_time'] = test_end_time - test_start_time 86 | print("Training time: %f" % results['training_time']) 87 | print("Testing time: %f" % results['testing_time']) 88 | 89 | evaluate(numpy.rint(ypreds_train.as_data_frame().values), train_df[ycol].as_data_frame().values, results, "training") 90 | evaluate(numpy.rint(ypreds_test.as_data_frame().values), 
test_df[ycol].as_data_frame().values, results, "testing") 91 | 92 | fname = '%s_%s_%s_%s_%s_%s.json' % (str(param['n_estimators']), 93 | str(param['max_features']), 94 | str(param['n_jobs']), 95 | str(param['bootstrap']), 96 | str(param['tree_type']), 97 | str(seed), 98 | ) 99 | 100 | fname = os.path.join(params.odir, str(dkey), str(train_size), "h2", fname) 101 | ensure_dir_for_file(fname) 102 | with open(fname, 'w') as fp: 103 | json.dump(results, fp) 104 | 105 | ################################################################################### 106 | import argparse 107 | parser = argparse.ArgumentParser() 108 | parser.add_argument('--dkey', nargs='?', const="covtype", type=str, default="covtype") 109 | parser.add_argument('--train_size', nargs='?', const=0, type=int, default=0) 110 | parser.add_argument('--seed', nargs='?', const=0, type=int, default=0) 111 | parser.add_argument('--key', type=str) 112 | args = parser.parse_args() 113 | dkey, train_size, seed, key = args.dkey, args.train_size, args.seed, args.key 114 | ################################################################################### 115 | 116 | single_run(dkey, train_size, params.parameters[key], seed) 117 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2018 Fabian Gieseke 3 | # License: GPL v3 4 | # 5 | 6 | import os 7 | import sys 8 | import shutil 9 | from distutils.command.clean import clean 10 | 11 | DISTNAME = 'woody' 12 | DESCRIPTION = 'A Python library for large-scale random forests.' 13 | LONG_DESCRIPTION = open('README.rst').read() 14 | MAINTAINER = 'Fabian Gieseke' 15 | MAINTAINER_EMAIL = 'fabian.gieseke@di.ku.dk' 16 | URL = 'https://github.com/gieseke/woody' 17 | LICENSE = 'GNU GENERAL PUBLIC LICENSE Version 3' 18 | DOWNLOAD_URL = 'https://github.com/gieseke/woody' 19 | 20 | import woody 21 | VERSION = woody.__version__ 22 | 23 | # adapted from scikit-learn 24 | if len(set(('develop', 'release')).intersection(sys.argv)) > 0: 25 | import setuptools 26 | extra_setuptools_args = dict(zip_safe=False) 27 | else: 28 | extra_setuptools_args = dict() 29 | 30 | def configuration(parent_package='', top_path=None): 31 | 32 | from numpy.distutils.misc_util import Configuration 33 | config = Configuration(None, parent_package, top_path) 34 | config.set_options(ignore_setup_xxx_py=True, 35 | assume_default_configuration=True, 36 | delegate_options_to_subpackages=True, 37 | quiet=True) 38 | config.add_subpackage('woody') 39 | 40 | return config 41 | 42 | class CleanCommand(clean): 43 | 44 | description = "Cleaning up code ..." 
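    # run() removes editor backup files, compiled artifacts (.pyc, .so, .pyd, .dll),
    # SWIG-generated wrappers, and the build/, dist/, docs/_build and egg-info
    # directories so that a subsequent build starts from a clean tree.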
45 | 46 | def run(self): 47 | 48 | clean.run(self) 49 | 50 | # remove hidden '~' files 51 | for dirpath, dirnames, filenames in os.walk('.'): 52 | for filename in filenames: 53 | if filename.endswith('~') or filename.endswith('.pyc'): 54 | os.unlink(os.path.join(dirpath, filename)) 55 | 56 | # build related files and directories 57 | if os.path.exists('build'): 58 | shutil.rmtree('build') 59 | if os.path.exists('woody.egg-info'): 60 | shutil.rmtree('woody.egg-info') 61 | if os.path.exists('docs/_build'): 62 | shutil.rmtree('docs/_build') 63 | 64 | # remaining files and directories in woody dir (recursively) 65 | for dirpath, dirnames, filenames in os.walk('woody'): 66 | 67 | for filename in filenames: 68 | if (filename.endswith('.so') or 69 | filename.endswith('.pyd') or 70 | filename.endswith('.dll') or 71 | filename.endswith('.pyc') or 72 | filename.endswith('_wrap.c') or 73 | filename.startswith('wrapper_') or 74 | filename.endswith('~')): 75 | os.unlink(os.path.join(dirpath, filename)) 76 | 77 | for dirname in dirnames: 78 | if dirname == '__pycache__' or dirname == 'build' or dirname == '_build': 79 | shutil.rmtree(os.path.join(dirpath, dirname)) 80 | 81 | try: 82 | shutil.rmtree("dist") 83 | except: 84 | pass 85 | 86 | def setup_package(): 87 | 88 | metadata = dict(name=DISTNAME, 89 | maintainer=MAINTAINER, 90 | maintainer_email=MAINTAINER_EMAIL, 91 | description=DESCRIPTION, 92 | license=LICENSE, 93 | url=URL, 94 | version=VERSION, 95 | download_url=DOWNLOAD_URL, 96 | long_description=LONG_DESCRIPTION, 97 | classifiers=[ 98 | 'Intended Audience :: Science/Research', 99 | 'Intended Audience :: Developers', 100 | 'License :: OSI Approved :: GNU General Public License v2 (GPLv2)', 101 | 'Programming Language :: C', 102 | 'Programming Language :: Python', 103 | 'Programming Language :: Python :: 2', 104 | 'Programming Language :: Python :: 2.6', 105 | 'Programming Language :: Python :: 2.7', 106 | ], 107 | cmdclass={'clean': CleanCommand}, 108 | install_requires=["numpy>=1.6.1"], 109 | include_package_data=True, 110 | package_data={'woody': []}, 111 | **extra_setuptools_args) 112 | 113 | if (len(sys.argv) >= 2 and ('--help' in sys.argv[1:] or sys.argv[1] in ('--version', 'clean'))): 114 | 115 | try: 116 | from setuptools import setup 117 | except ImportError: 118 | from distutils.core import setup 119 | metadata['version'] = VERSION 120 | 121 | else: 122 | 123 | try: 124 | from numpy.distutils.core import setup 125 | metadata['configuration'] = configuration 126 | except: 127 | print("woody requires numpy>=1.6.1") 128 | sys.exit(0) 129 | 130 | setup(**metadata) 131 | 132 | if __name__ == "__main__": 133 | 134 | setup_package() 135 | 136 | -------------------------------------------------------------------------------- /woody/models/forest/src/pqueue.c: -------------------------------------------------------------------------------- 1 | // Adapted from http://rosettacode.org/wiki/Priority_queue#C 2 | 3 | #include "include/pqueue.h" 4 | 5 | /* -------------------------------------------------------------------------------- 6 | * Tests if the queue is empty (first element in array not used to simplify indices) 7 | * -------------------------------------------------------------------------------- 8 | */ 9 | PQUEUE *pqueue_new(int size) { 10 | 11 | if (size < PQUEUE_MIN_SIZE) { 12 | size = PQUEUE_MIN_SIZE; 13 | } 14 | 15 | // allocate space for priority queue 16 | PQUEUE *q = (PQUEUE*) malloc(sizeof(PQUEUE)); 17 | 18 | // allocate space for size queue items 19 | q->buf = (PQUEUE_ITEM*) 
malloc(size * sizeof(PQUEUE_ITEM)); 20 | 21 | // set size and number of elements (first element is not used) 22 | q->alloc = size; 23 | q->n = 1; 24 | 25 | return q; 26 | } 27 | 28 | /* -------------------------------------------------------------------------------- 29 | * Tests if the queue is empty 30 | * -------------------------------------------------------------------------------- 31 | */ 32 | inline int pqueue_is_empty(PQUEUE *q) { 33 | 34 | if (q->n == 1) { 35 | return 1; 36 | } else { 37 | return 0; 38 | } 39 | 40 | } 41 | 42 | /* -------------------------------------------------------------------------------- 43 | * Pushes "data" with priority "pri" 44 | * -------------------------------------------------------------------------------- 45 | */ 46 | void pqueue_push(PQUEUE *q, void *data, int pri) { 47 | 48 | // pointer for queue item 49 | PQUEUE_ITEM *b; 50 | int n, m; 51 | 52 | // allocate more memory if needed 53 | if (q->n >= q->alloc) { 54 | q->alloc *= 2; 55 | b = q->buf = (PQUEUE_ITEM*) realloc(q->buf, 56 | sizeof(PQUEUE_ITEM) * q->alloc); 57 | } else { 58 | b = q->buf; 59 | } 60 | 61 | // append at end and perform an up-heap operation 62 | // (move up in case parent has a larger priority) 63 | n = q->n++; 64 | while ((m = n / 2) && pri < b[m].pri) { 65 | b[n] = b[m]; 66 | n = m; 67 | } 68 | 69 | b[n].data = data; 70 | b[n].pri = pri; 71 | 72 | } 73 | 74 | /* -------------------------------------------------------------------------------- 75 | * Removes top item (or returns 0 if queue is empty); *pri can be NULL. 76 | * -------------------------------------------------------------------------------- 77 | */ 78 | void *pqueue_pop(PQUEUE *q, int *pri) { 79 | 80 | void *out; 81 | if (q->n == 1) { 82 | return 0; 83 | } 84 | 85 | PQUEUE_ITEM *b = q->buf; 86 | 87 | // get item from the root and store priority in *pri if pri!=NULL 88 | out = b[1].data; 89 | if (pri) { 90 | *pri = b[1].pri; 91 | } 92 | 93 | // reduce size by one 94 | --q->n; 95 | 96 | int n = 1, m; 97 | while ((m = n * 2) < q->n) { 98 | 99 | if (m + 1 < q->n && b[m].pri > b[m + 1].pri) { 100 | m++; 101 | } 102 | 103 | if (b[q->n].pri <= b[m].pri) { 104 | break; 105 | } 106 | 107 | b[n] = b[m]; 108 | n = m; 109 | } 110 | b[n] = b[q->n]; 111 | 112 | // reduce size if needed 113 | if (q->n < q->alloc / 2 && q->n >= PQUEUE_MIN_SIZE) { 114 | q->buf = (PQUEUE_ITEM*) realloc(q->buf, (q->alloc /= 2) * sizeof(b[0])); 115 | } 116 | 117 | // return data 118 | return out; 119 | 120 | } 121 | 122 | /* -------------------------------------------------------------------------------- 123 | * Returns the top of the queue 124 | * -------------------------------------------------------------------------------- 125 | */ 126 | inline void* pqueue_top(PQUEUE *q, int *pri) { 127 | if (q->n == 1) { 128 | return NULL; 129 | } 130 | if (pri) { 131 | *pri = q->buf[1].pri; 132 | } 133 | return q->buf[1].data; 134 | } 135 | 136 | /* -------------------------------------------------------------------------------- 137 | * Combines/merges two queues 138 | * -------------------------------------------------------------------------------- 139 | */ 140 | void pqueue_combine(PQUEUE *q1, PQUEUE *q2) { 141 | int i; 142 | PQUEUE_ITEM *e = q2->buf + 1; 143 | 144 | for (i = q2->n - 1; i >= 1; i--, e++) { 145 | pqueue_push(q1, e->data, e->pri); 146 | } 147 | 148 | pqueue_purge(q2); 149 | 150 | } 151 | 152 | /*int main() { 153 | int i, p; 154 | char *c, *tasks[] = { "Clear drains", "Feed cat", "Make tea", "Solve RC tasks", "Tax return" }; 155 | int pri[] = 
{ 3, 4, 5, 1, 2 }; 156 | 157 | //make two queues 158 | PQUEUE *q = pqueue_new(0); 159 | PQUEUE *q2 = pqueue_new(0); 160 | 161 | //push all 5 tasks into q 162 | for (i = 0; i < 5; i++) 163 | pqueue_push(q, tasks[i], pri[i]); 164 | 165 | //pop them and print one by one 166 | while ((c = pqueue_pop(q, &p))) 167 | printf("%d: %s\n", p, c); 168 | 169 | //put a million random tasks in each queue 170 | for (i = 0; i < 1 << 20; i++) { 171 | p = rand() / ( RAND_MAX / 5); 172 | pqueue_push(q, tasks[p], pri[p]); 173 | 174 | p = rand() / ( RAND_MAX / 5); 175 | pqueue_push(q2, tasks[p], pri[p]); 176 | } 177 | 178 | printf("\nq has %d items, q2 has %d items\n", pqueue_size(q), pqueue_size(q2)); 179 | 180 | // merge q2 into q; q2 is empty 181 | pqueue_combine(q, q2); 182 | printf("After merge, q has %d items, q2 has %d items\n", pqueue_size(q), 183 | pqueue_size(q2)); 184 | 185 | // pop q until it's empty 186 | for (i = 0; (c = pqueue_pop(q, 0)); i++) 187 | ; 188 | printf("Popped %d items out of q\n", i); 189 | 190 | return 0; 191 | }*/ 192 | -------------------------------------------------------------------------------- /experiments/small_data/hugewood_lam.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append(".") 3 | 4 | import os 5 | import json 6 | from util import evaluate 7 | import params 8 | 9 | import time 10 | 11 | from woody import HugeWoodClassifier, WoodClassifier 12 | 13 | from woody.io import MemoryStore, DiskStore 14 | from woody.util import ensure_dir_for_file 15 | from woody.data import * 16 | 17 | def single_run(dkey, train_size, param, seed, profile=False): 18 | 19 | print("Processing data set %s with train_size %s, seed %s, and parameters %s ..." % (str(dkey), str(train_size), str(seed), str(param))) 20 | 21 | if dkey == "covtype": 22 | traingen, testgen = covtype_generators(train_size=train_size, store="mem", seed=seed) 23 | elif dkey == "higgs": 24 | traingen, testgen = higgs_generators(train_size=train_size, store="mem", seed=seed) 25 | elif dkey == "susy": 26 | traingen, testgen = susy_generators(train_size=train_size, store="mem", seed=seed) 27 | else: 28 | raise Exception("Unknown data set!") 29 | 30 | print("") 31 | print("Number of training patterns:\t%i" % traingen.get_shapes()[0][0]) 32 | print("Number of test patterns:\t%i" % testgen.get_shapes()[0][0]) 33 | print("Dimensionality of the data:\t%i\n" % traingen.get_shapes()[0][1]) 34 | 35 | param_wood = param['param_wood'] 36 | 37 | wood = WoodClassifier( 38 | n_estimators=1, 39 | criterion="gini", 40 | max_features=param_wood['max_features'], 41 | min_samples_split=2, 42 | n_jobs=param_wood['n_jobs'], 43 | seed=seed, 44 | bootstrap=param_wood['bootstrap'], 45 | tree_traversal_mode="dfs", 46 | tree_type=param_wood['tree_type'], 47 | min_samples_leaf=1, 48 | float_type="double", 49 | max_depth=None, 50 | verbose=0) 51 | top_tree_lambda = 0.1 52 | model = HugeWoodClassifier( 53 | n_estimators=param['n_estimators'], 54 | n_estimators_bottom=param['n_estimators_bottom'], 55 | n_top="auto", 56 | n_patterns_leaf="auto", 57 | balanced_top_tree=True, 58 | top_tree_lambda=top_tree_lambda, 59 | top_tree_max_depth=None, 60 | top_tree_type="standard", 61 | top_tree_leaf_stopping_mode="ignore_impurity", 62 | n_jobs=param_wood['n_jobs'], 63 | seed=seed, 64 | verbose=1, 65 | plot_intermediate={}, 66 | chunk_max_megabytes=2048, 67 | wrapped_instance=wood, 68 | store=MemoryStore(), 69 | ) 70 | 71 | # training 72 | if profile == True: 73 | import yep 74 | assert 
param_wood['n_jobs'] == 1 75 | yep.start("train.prof") 76 | 77 | fit_start_time = time.time() 78 | model.fit(traingen) 79 | fit_end_time = time.time() 80 | if profile == True: 81 | yep.stop() 82 | ypreds_train = model.predict(generator=traingen) 83 | 84 | # testing 85 | test_start_time = time.time() 86 | ypred_test = model.predict(generator=testgen) 87 | test_end_time = time.time() 88 | 89 | results = {} 90 | results['dataset'] = dkey 91 | results['param'] = param 92 | results['training_time'] = fit_end_time - fit_start_time 93 | results['testing_time'] = test_end_time - test_start_time 94 | print("Training time:\t\t%f" % results['training_time']) 95 | print("Testing time:\t\t%f" % results['testing_time']) 96 | 97 | evaluate(ypreds_train, traingen.get_all_target(), results, "training") 98 | evaluate(ypred_test, testgen.get_all_target(), results, "testing") 99 | 100 | fname = '%s_%s_%s_%s_%s_%s.json' % (str(param_wood['n_estimators']), 101 | str(param_wood['max_features']), 102 | str(param_wood['n_jobs']), 103 | str(param_wood['bootstrap']), 104 | str(param_wood['tree_type']), 105 | str(seed), 106 | ) 107 | fname = os.path.join(params.odir, str(dkey), str(train_size), "hugewood_" + str(top_tree_lambda), fname) 108 | ensure_dir_for_file(fname) 109 | with open(fname, 'w') as fp: 110 | json.dump(results, fp) 111 | 112 | del(testgen) 113 | del(traingen) 114 | model.cleanup() 115 | 116 | time.sleep(1) 117 | 118 | ################################################################################### 119 | import argparse 120 | parser = argparse.ArgumentParser() 121 | parser.add_argument('--dkey', nargs='?', const="covtype", type=str, default="covtype") 122 | parser.add_argument('--train_size', nargs='?', const=0, type=int, default=0) 123 | parser.add_argument('--seed', nargs='?', const=0, type=int, default=0) 124 | parser.add_argument('--key', type=str) 125 | args = parser.parse_args() 126 | dkey, train_size, seed, key = args.dkey, args.train_size, args.seed, args.key 127 | ################################################################################### 128 | 129 | single_run(dkey, train_size, params.parameters_hugewood[key], seed) 130 | -------------------------------------------------------------------------------- /experiments/influence_n_bottom/hugewood_10K.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append(".") 3 | 4 | import os 5 | import json 6 | from util import evaluate 7 | import params 8 | 9 | import time 10 | 11 | from woody import HugeWoodClassifier, WoodClassifier 12 | 13 | from woody.io import MemoryStore, DiskStore 14 | from woody.util import ensure_dir_for_file 15 | from woody.data import * 16 | 17 | def single_run(dkey, train_size, n_bottom, param, seed, profile=False): 18 | 19 | print("Processing data set %s with train_size %s, n_bottom %s, seed %s, and parameters %s ..." 
% (str(dkey), str(train_size), str(n_bottom), str(seed), str(param))) 20 | 21 | if dkey == "covtype": 22 | traingen, testgen = covtype_generators(train_size=train_size, store="mem", seed=seed) 23 | elif dkey == "higgs": 24 | traingen, testgen = higgs_generators(train_size=train_size, store="mem", seed=seed) 25 | elif dkey == "susy": 26 | traingen, testgen = susy_generators(train_size=train_size, store="mem", seed=seed) 27 | else: 28 | raise Exception("Unknown data set!") 29 | 30 | print("") 31 | print("Number of training patterns:\t%i" % traingen.get_shapes()[0][0]) 32 | print("Number of test patterns:\t%i" % testgen.get_shapes()[0][0]) 33 | print("Dimensionality of the data:\t%i\n" % traingen.get_shapes()[0][1]) 34 | 35 | param_wood = param['param_wood'] 36 | 37 | wood = WoodClassifier( 38 | n_estimators=1, 39 | criterion="gini", 40 | max_features=param_wood['max_features'], 41 | min_samples_split=2, 42 | n_jobs=param_wood['n_jobs'], 43 | seed=seed, 44 | bootstrap=param_wood['bootstrap'], 45 | tree_traversal_mode="dfs", 46 | tree_type=param_wood['tree_type'], 47 | min_samples_leaf=1, 48 | float_type="double", 49 | max_depth=None, 50 | verbose=0) 51 | 52 | model = HugeWoodClassifier( 53 | n_estimators=int(24 / n_bottom), 54 | n_estimators_bottom=int(n_bottom), 55 | n_top="auto", 56 | n_patterns_leaf=10000, 57 | balanced_top_tree=True, 58 | top_tree_lambda=1.0, 59 | top_tree_max_depth=None, 60 | top_tree_type="standard", 61 | top_tree_leaf_stopping_mode="ignore_impurity", 62 | n_jobs=param_wood['n_jobs'], 63 | seed=seed, 64 | verbose=1, 65 | plot_intermediate={}, 66 | chunk_max_megabytes=2048, 67 | wrapped_instance=wood, 68 | store=MemoryStore(), 69 | ) 70 | 71 | # training 72 | if profile == True: 73 | import yep 74 | assert param_wood['n_jobs'] == 1 75 | yep.start("train.prof") 76 | 77 | fit_start_time = time.time() 78 | model.fit(traingen) 79 | fit_end_time = time.time() 80 | if profile == True: 81 | yep.stop() 82 | ypreds_train = model.predict(generator=traingen) 83 | 84 | # testing 85 | test_start_time = time.time() 86 | ypred_test = model.predict(generator=testgen) 87 | test_end_time = time.time() 88 | 89 | results = {} 90 | results['dataset'] = dkey 91 | results['param'] = param 92 | results['training_time'] = fit_end_time - fit_start_time 93 | results['testing_time'] = test_end_time - test_start_time 94 | print("Training time:\t\t%f" % results['training_time']) 95 | print("Testing time:\t\t%f" % results['testing_time']) 96 | 97 | evaluate(ypreds_train, traingen.get_all_target(), results, "training") 98 | evaluate(ypred_test, testgen.get_all_target(), results, "testing") 99 | 100 | fname = '%s_%s_%s_%s_%s_%s.json' % (str(param_wood['n_estimators']), 101 | str(param_wood['max_features']), 102 | str(param_wood['n_jobs']), 103 | str(param_wood['bootstrap']), 104 | str(param_wood['tree_type']), 105 | str(seed), 106 | ) 107 | fname = os.path.join(params.odir, str(dkey), str(train_size), str(n_bottom), "hugewood_10K", fname) 108 | ensure_dir_for_file(fname) 109 | with open(fname, 'w') as fp: 110 | json.dump(results, fp) 111 | 112 | del(testgen) 113 | del(traingen) 114 | model.cleanup() 115 | 116 | time.sleep(1) 117 | 118 | ################################################################################### 119 | import argparse 120 | parser = argparse.ArgumentParser() 121 | parser.add_argument('--dkey', nargs='?', const="covtype", type=str, default="covtype") 122 | parser.add_argument('--train_size', nargs='?', const=0, type=int, default=0) 123 | parser.add_argument('--seed', nargs='?', 
const=0, type=int, default=0) 124 | parser.add_argument('--key', type=str) 125 | parser.add_argument('--n_bottom', nargs='?', const=0.0, type=float, default=0.0) 126 | args = parser.parse_args() 127 | dkey, train_size, seed, key, n_bottom = args.dkey, args.train_size, args.seed, args.key, args.n_bottom 128 | ################################################################################### 129 | 130 | single_run(dkey, train_size, n_bottom, params.parameters_hugewood[key], seed) 131 | -------------------------------------------------------------------------------- /experiments/influence_n_bottom/hugewood_1K.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append(".") 3 | 4 | import os 5 | import json 6 | from util import evaluate 7 | import params 8 | 9 | import time 10 | 11 | from woody import HugeWoodClassifier, WoodClassifier 12 | 13 | from woody.io import MemoryStore, DiskStore 14 | from woody.util import ensure_dir_for_file 15 | from woody.data import * 16 | 17 | def single_run(dkey, train_size, n_bottom, param, seed, profile=False): 18 | 19 | print("Processing data set %s with train_size %s, n_bottom %s, seed %s, and parameters %s ..." % (str(dkey), str(train_size), str(n_bottom), str(seed), str(param))) 20 | 21 | if dkey == "covtype": 22 | traingen, testgen = covtype_generators(train_size=train_size, store="mem", seed=seed) 23 | elif dkey == "higgs": 24 | traingen, testgen = higgs_generators(train_size=train_size, store="mem", seed=seed) 25 | elif dkey == "susy": 26 | traingen, testgen = susy_generators(train_size=train_size, store="mem", seed=seed) 27 | else: 28 | raise Exception("Unknown data set!") 29 | 30 | print("") 31 | print("Number of training patterns:\t%i" % traingen.get_shapes()[0][0]) 32 | print("Number of test patterns:\t%i" % testgen.get_shapes()[0][0]) 33 | print("Dimensionality of the data:\t%i\n" % traingen.get_shapes()[0][1]) 34 | 35 | param_wood = param['param_wood'] 36 | 37 | wood = WoodClassifier( 38 | n_estimators=1, 39 | criterion="gini", 40 | max_features=param_wood['max_features'], 41 | min_samples_split=2, 42 | n_jobs=param_wood['n_jobs'], 43 | seed=seed, 44 | bootstrap=param_wood['bootstrap'], 45 | tree_traversal_mode="dfs", 46 | tree_type=param_wood['tree_type'], 47 | min_samples_leaf=1, 48 | float_type="double", 49 | max_depth=None, 50 | verbose=0) 51 | 52 | model = HugeWoodClassifier( 53 | n_estimators=int(24 / n_bottom), 54 | n_estimators_bottom=int(n_bottom), 55 | n_top="auto", 56 | n_patterns_leaf=1000, 57 | balanced_top_tree=True, 58 | top_tree_lambda=1.0, 59 | top_tree_max_depth=None, 60 | top_tree_type="standard", 61 | top_tree_leaf_stopping_mode="ignore_impurity", 62 | n_jobs=param_wood['n_jobs'], 63 | seed=seed, 64 | verbose=1, 65 | plot_intermediate={}, 66 | chunk_max_megabytes=2048, 67 | wrapped_instance=wood, 68 | store=MemoryStore(), 69 | ) 70 | 71 | # training 72 | if profile == True: 73 | import yep 74 | assert param_wood['n_jobs'] == 1 75 | yep.start("train.prof") 76 | 77 | fit_start_time = time.time() 78 | model.fit(traingen) 79 | fit_end_time = time.time() 80 | if profile == True: 81 | yep.stop() 82 | ypreds_train = model.predict(generator=traingen) 83 | 84 | # testing 85 | test_start_time = time.time() 86 | ypred_test = model.predict(generator=testgen) 87 | test_end_time = time.time() 88 | 89 | results = {} 90 | results['dataset'] = dkey 91 | results['param'] = param 92 | results['training_time'] = fit_end_time - fit_start_time 93 | results['testing_time'] = test_end_time 
- test_start_time 94 | print("Training time:\t\t%f" % results['training_time']) 95 | print("Testing time:\t\t%f" % results['testing_time']) 96 | 97 | evaluate(ypreds_train, traingen.get_all_target(), results, "training") 98 | evaluate(ypred_test, testgen.get_all_target(), results, "testing") 99 | 100 | fname = '%s_%s_%s_%s_%s_%s.json' % (str(param_wood['n_estimators']), 101 | str(param_wood['max_features']), 102 | str(param_wood['n_jobs']), 103 | str(param_wood['bootstrap']), 104 | str(param_wood['tree_type']), 105 | str(seed), 106 | ) 107 | fname = os.path.join(params.odir, str(dkey), str(train_size), str(n_bottom), "hugewood_1K", fname) 108 | ensure_dir_for_file(fname) 109 | with open(fname, 'w') as fp: 110 | json.dump(results, fp) 111 | 112 | del(testgen) 113 | del(traingen) 114 | model.cleanup() 115 | 116 | time.sleep(1) 117 | 118 | ################################################################################### 119 | import argparse 120 | parser = argparse.ArgumentParser() 121 | parser.add_argument('--dkey', nargs='?', const="covtype", type=str, default="covtype") 122 | parser.add_argument('--train_size', nargs='?', const=0, type=int, default=0) 123 | parser.add_argument('--seed', nargs='?', const=0, type=int, default=0) 124 | parser.add_argument('--key', type=str) 125 | parser.add_argument('--n_bottom', nargs='?', const=0.0, type=float, default=0.0) 126 | args = parser.parse_args() 127 | dkey, train_size, seed, key, n_bottom = args.dkey, args.train_size, args.seed, args.key, args.n_bottom 128 | ################################################################################### 129 | 130 | single_run(dkey, train_size, n_bottom, params.parameters_hugewood[key], seed) 131 | -------------------------------------------------------------------------------- /experiments/influence_n_bottom/hugewood_75K.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append(".") 3 | 4 | import os 5 | import json 6 | from util import evaluate 7 | import params 8 | 9 | import time 10 | 11 | from woody import HugeWoodClassifier, WoodClassifier 12 | 13 | from woody.io import MemoryStore, DiskStore 14 | from woody.util import ensure_dir_for_file 15 | from woody.data import * 16 | 17 | def single_run(dkey, train_size, n_bottom, param, seed, profile=False): 18 | 19 | print("Processing data set %s with train_size %s, n_bottom %s, seed %s, and parameters %s ..." 
% (str(dkey), str(train_size), str(n_bottom), str(seed), str(param))) 20 | 21 | if dkey == "covtype": 22 | traingen, testgen = covtype_generators(train_size=train_size, store="mem", seed=seed) 23 | elif dkey == "higgs": 24 | traingen, testgen = higgs_generators(train_size=train_size, store="mem", seed=seed) 25 | elif dkey == "susy": 26 | traingen, testgen = susy_generators(train_size=train_size, store="mem", seed=seed) 27 | else: 28 | raise Exception("Unknown data set!") 29 | 30 | print("") 31 | print("Number of training patterns:\t%i" % traingen.get_shapes()[0][0]) 32 | print("Number of test patterns:\t%i" % testgen.get_shapes()[0][0]) 33 | print("Dimensionality of the data:\t%i\n" % traingen.get_shapes()[0][1]) 34 | 35 | param_wood = param['param_wood'] 36 | 37 | wood = WoodClassifier( 38 | n_estimators=1, 39 | criterion="gini", 40 | max_features=param_wood['max_features'], 41 | min_samples_split=2, 42 | n_jobs=param_wood['n_jobs'], 43 | seed=seed, 44 | bootstrap=param_wood['bootstrap'], 45 | tree_traversal_mode="dfs", 46 | tree_type=param_wood['tree_type'], 47 | min_samples_leaf=1, 48 | float_type="double", 49 | max_depth=None, 50 | verbose=0) 51 | 52 | model = HugeWoodClassifier( 53 | n_estimators=int(24 / n_bottom), 54 | n_estimators_bottom=int(n_bottom), 55 | n_top="auto", 56 | n_patterns_leaf=75000, 57 | balanced_top_tree=True, 58 | top_tree_lambda=1.0, 59 | top_tree_max_depth=None, 60 | top_tree_type="standard", 61 | top_tree_leaf_stopping_mode="ignore_impurity", 62 | n_jobs=param_wood['n_jobs'], 63 | seed=seed, 64 | verbose=1, 65 | plot_intermediate={}, 66 | chunk_max_megabytes=2048, 67 | wrapped_instance=wood, 68 | store=MemoryStore(), 69 | ) 70 | 71 | # training 72 | if profile == True: 73 | import yep 74 | assert param_wood['n_jobs'] == 1 75 | yep.start("train.prof") 76 | 77 | fit_start_time = time.time() 78 | model.fit(traingen) 79 | fit_end_time = time.time() 80 | if profile == True: 81 | yep.stop() 82 | ypreds_train = model.predict(generator=traingen) 83 | 84 | # testing 85 | test_start_time = time.time() 86 | ypred_test = model.predict(generator=testgen) 87 | test_end_time = time.time() 88 | 89 | results = {} 90 | results['dataset'] = dkey 91 | results['param'] = param 92 | results['training_time'] = fit_end_time - fit_start_time 93 | results['testing_time'] = test_end_time - test_start_time 94 | print("Training time:\t\t%f" % results['training_time']) 95 | print("Testing time:\t\t%f" % results['testing_time']) 96 | 97 | evaluate(ypreds_train, traingen.get_all_target(), results, "training") 98 | evaluate(ypred_test, testgen.get_all_target(), results, "testing") 99 | 100 | fname = '%s_%s_%s_%s_%s_%s.json' % (str(param_wood['n_estimators']), 101 | str(param_wood['max_features']), 102 | str(param_wood['n_jobs']), 103 | str(param_wood['bootstrap']), 104 | str(param_wood['tree_type']), 105 | str(seed), 106 | ) 107 | fname = os.path.join(params.odir, str(dkey), str(train_size), str(n_bottom), "hugewood_75K", fname) 108 | ensure_dir_for_file(fname) 109 | with open(fname, 'w') as fp: 110 | json.dump(results, fp) 111 | 112 | del(testgen) 113 | del(traingen) 114 | model.cleanup() 115 | 116 | time.sleep(1) 117 | 118 | ################################################################################### 119 | import argparse 120 | parser = argparse.ArgumentParser() 121 | parser.add_argument('--dkey', nargs='?', const="covtype", type=str, default="covtype") 122 | parser.add_argument('--train_size', nargs='?', const=0, type=int, default=0) 123 | parser.add_argument('--seed', nargs='?', 
const=0, type=int, default=0) 124 | parser.add_argument('--key', type=str) 125 | parser.add_argument('--n_bottom', nargs='?', const=0.0, type=float, default=0.0) 126 | args = parser.parse_args() 127 | dkey, train_size, seed, key, n_bottom = args.dkey, args.train_size, args.seed, args.key, args.n_bottom 128 | ################################################################################### 129 | 130 | single_run(dkey, train_size, n_bottom, params.parameters_hugewood[key], seed) 131 | --------------------------------------------------------------------------------
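Note (illustrative sketch, not part of the repository): the three influence_n_bottom scripts above (hugewood_1K.py, hugewood_10K.py, hugewood_75K.py) differ only in the n_patterns_leaf value handed to HugeWoodClassifier; each one is driven through the same argparse interface (--dkey, --train_size, --seed, --key, --n_bottom) and writes its results as a JSON file under params.odir. The snippet below is a minimal sketch of how such a sweep could be launched from the repository root. The concrete values used here ("covtype", the training size 500000, the parameter key "default", and the n_bottom values 2, 4, 6) are assumptions chosen for illustration; the actual sweep and its parameter grid live in experiments/influence_n_bottom/launch.py and params.py.

    # sketch_launch_n_bottom.py -- hypothetical helper, not shipped with woody
    import subprocess

    for n_bottom in (2, 4, 6):  # assumed example values for the number of bottom estimators
        subprocess.run([
            "python", "experiments/influence_n_bottom/hugewood_10K.py",
            "--dkey", "covtype",        # data set key dispatched inside single_run()
            "--train_size", "500000",   # assumed training set size
            "--seed", "0",
            "--key", "default",         # assumed key into params.parameters_hugewood
            "--n_bottom", str(n_bottom),
        ], check=True)                  # stop the sweep if any run fails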