├── CONTRIBUTING.md ├── data ├── .gitignore ├── susy │ └── SUSY.csv.download ├── covtype │ ├── covtype-test-1.csv.download │ └── covtype-train-1.csv.download └── higgs │ └── HIGGS.csv.download ├── woody ├── tests │ └── __init__.py ├── models │ ├── subset │ │ ├── __init__.py │ │ ├── regression.py │ │ └── classification.py │ ├── huge │ │ ├── __init__.py │ │ ├── util.py │ │ ├── regression.py │ │ ├── classification.py │ │ └── predict.py │ ├── forest │ │ ├── __init__.py │ │ ├── src │ │ │ ├── include │ │ │ │ ├── qsort.h │ │ │ │ ├── float.h │ │ │ │ ├── util.h │ │ │ │ ├── pqueue.h │ │ │ │ └── timing.h │ │ │ ├── tree │ │ │ │ ├── cpu │ │ │ │ │ ├── include │ │ │ │ │ │ ├── fastsort.h │ │ │ │ │ │ ├── standard.h │ │ │ │ │ │ └── criteria.h │ │ │ │ │ └── fastsort.c │ │ │ │ └── include │ │ │ │ │ ├── global.h │ │ │ │ │ ├── types.h │ │ │ │ │ ├── cpu.h │ │ │ │ │ └── tree.h │ │ │ ├── timing.c │ │ │ ├── util.c │ │ │ ├── .cproject │ │ │ ├── qsort.c │ │ │ └── pqueue.c │ │ ├── swig │ │ │ ├── cpu_float.i │ │ │ ├── gpu_float.i │ │ │ ├── gpu_double.i │ │ │ └── cpu_double.i │ │ ├── classification.py │ │ ├── regression.py │ │ ├── util.py │ │ ├── setup.py │ │ └── .cproject │ ├── __init__.py │ ├── util.py │ ├── base.py │ └── sampler.py ├── util │ ├── array │ │ ├── __init__.py │ │ ├── src │ │ │ ├── include │ │ │ │ ├── util.h │ │ │ │ ├── global.h │ │ │ │ └── array.h │ │ │ ├── util.c │ │ │ └── array.c │ │ ├── swig │ │ │ ├── cpu_float.i │ │ │ └── cpu_double.i │ │ ├── setup.py │ │ └── base.py │ ├── __init__.py │ ├── timer.py │ ├── url.py │ ├── base.py │ ├── draw.py │ └── parallel.py ├── io │ ├── __init__.py │ ├── split.py │ ├── reader.py │ ├── store.py │ └── csv.py ├── data │ ├── __init__.py │ ├── landsat.py │ ├── artificial.py │ ├── covtype.py │ ├── util.py │ ├── susy.py │ └── higgs.py ├── __init__.py └── setup.py ├── setup.cfg ├── MANIFEST.in ├── experiments ├── landsat │ ├── util.py │ ├── launch.py │ ├── params.py │ └── sk.py ├── large_data │ ├── util.py │ ├── launch.py │ └── params.py ├── small_data │ ├── util.py │ ├── launch.py │ ├── params.py │ ├── sk.py │ ├── wood.py │ ├── subsetwood.py │ ├── h2.py │ └── hugewood_lam.py ├── influence_lamda │ ├── util.py │ ├── launch.py │ ├── params.py │ └── wood.py └── influence_n_bottom │ ├── util.py │ ├── launch.py │ ├── params.py │ ├── hugewood_10K.py │ ├── hugewood_1K.py │ └── hugewood_75K.py ├── requirements.txt ├── .gitignore ├── README.rst └── setup.py /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | covtype/*.csv 2 | higgs/*.csv 3 | susy/*.csv 4 | -------------------------------------------------------------------------------- /data/susy/SUSY.csv.download: -------------------------------------------------------------------------------- 1 | http://archive.ics.uci.edu/ml/machine-learning-databases/00279/SUSY.csv.gz 2 | -------------------------------------------------------------------------------- /data/covtype/covtype-test-1.csv.download: -------------------------------------------------------------------------------- 1 | https://sid.erda.dk/share_redirect/bx3kbiD08L/covtype-test-1.csv 2 | -------------------------------------------------------------------------------- /data/covtype/covtype-train-1.csv.download: -------------------------------------------------------------------------------- 1 | 
https://sid.erda.dk/share_redirect/bx3kbiD08L/covtype-train-1.csv 2 | -------------------------------------------------------------------------------- /data/higgs/HIGGS.csv.download: -------------------------------------------------------------------------------- 1 | https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz 2 | -------------------------------------------------------------------------------- /woody/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # -------------------------------------------------------------------------------- /woody/models/subset/__init__.py: -------------------------------------------------------------------------------- 1 | from .classification import SubsetWoodClassifier 2 | from .regression import SubsetWoodRegressor -------------------------------------------------------------------------------- /woody/util/array/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | from .base import split_array, transpose_array -------------------------------------------------------------------------------- /woody/io/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | from .base import DataGenerator 7 | from .store import DiskStore, MemoryStore -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [aliases] 2 | test = nosetests 3 | 4 | [nosetests] 5 | no-path-adjustment=1 6 | exe = 1 7 | detailed-errors = 1 8 | cover-html = 1 9 | cover-html-dir = coverage 10 | cover-package = woody 11 | -------------------------------------------------------------------------------- /woody/models/huge/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | from .classification import HugeWoodClassifier 7 | from .regression import HugeWoodRegressor 8 | -------------------------------------------------------------------------------- /woody/models/forest/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | from .classification import WoodClassifier 7 | from .regression import WoodRegressor 8 | from .base import Wood 9 | -------------------------------------------------------------------------------- /woody/models/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | from .forest import WoodClassifier, WoodRegressor, Wood 7 | from .huge import HugeWoodClassifier, HugeWoodRegressor 8 | from .subset import SubsetWoodClassifier, SubsetWoodRegressor -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.rst 2 | include CONTRIBUTING.md 3 | include LICENSE 4 | include requirements.txt 5 | 6 | recursive-include examples *.py 7 | recursive-include woody *.c *.h *.i 8 | 9 | 
#include docs/conf.py 10 | #include docs/Makefile 11 | #recursive-include docs *.rst 12 | #include docs/_static/bibtex/* 13 | #include docs/_static/images/* 14 | 15 | exclude MANIFEST.in 16 | -------------------------------------------------------------------------------- /woody/models/forest/src/include/qsort.h: -------------------------------------------------------------------------------- 1 | /* 2 | * qsort.h 3 | * 4 | * Created on: 12.11.2014 5 | * Author: fgieseke 6 | */ 7 | 8 | #ifndef INCLUDE_QSORT_H_ 9 | #define INCLUDE_QSORT_H_ 10 | 11 | void woody_qsort(void *base, unsigned num, unsigned width, 12 | int (*comp)(const void *, const void *, const void *), 13 | const void* comp_param); 14 | 15 | #endif /* INCLUDE_QSORT_H_ */ 16 | -------------------------------------------------------------------------------- /experiments/landsat/util.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import accuracy_score 2 | 3 | metrics = {"accuracy": accuracy_score} 4 | 5 | def evaluate(preds, y, results, prefix, verbose=1): 6 | 7 | for key in metrics.keys(): 8 | res = metrics[key](y, preds) 9 | results[prefix + "_" + key] = res 10 | if verbose > 0: 11 | print(prefix + " " + key + ":\t" + str(res)) 12 | -------------------------------------------------------------------------------- /woody/util/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | from .base import makedirs, ensure_dir_for_file, convert_to_libsvm 7 | from .timer import Timer 8 | from .array import split_array 9 | from .url import download_from_url 10 | from .draw import draw_single_tree 11 | from .parallel import perform_task_in_parallel, start_via_single_process -------------------------------------------------------------------------------- /experiments/large_data/util.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import accuracy_score 2 | 3 | metrics = {"accuracy": accuracy_score} 4 | 5 | def evaluate(preds, y, results, prefix, verbose=1): 6 | 7 | for key in metrics.keys(): 8 | res = metrics[key](y, preds) 9 | results[prefix + "_" + key] = res 10 | if verbose > 0: 11 | print(prefix + " " + key + ":\t" + str(res)) 12 | -------------------------------------------------------------------------------- /experiments/small_data/util.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import accuracy_score 2 | 3 | metrics = {"accuracy": accuracy_score} 4 | 5 | def evaluate(preds, y, results, prefix, verbose=1): 6 | 7 | for key in metrics.keys(): 8 | res = metrics[key](y, preds) 9 | results[prefix + "_" + key] = res 10 | if verbose > 0: 11 | print(prefix + " " + key + ":\t" + str(res)) 12 | -------------------------------------------------------------------------------- /experiments/influence_lamda/util.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import accuracy_score 2 | 3 | metrics = {"accuracy": accuracy_score} 4 | 5 | def evaluate(preds, y, results, prefix, verbose=1): 6 | 7 | for key in metrics.keys(): 8 | res = metrics[key](y, preds) 9 | results[prefix + "_" + key] = res 10 | if verbose > 0: 11 | print(prefix + " " + key + ":\t" + str(res)) 12 | -------------------------------------------------------------------------------- 
/experiments/influence_n_bottom/util.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import accuracy_score 2 | 3 | metrics = {"accuracy": accuracy_score} 4 | 5 | def evaluate(preds, y, results, prefix, verbose=1): 6 | 7 | for key in metrics.keys(): 8 | res = metrics[key](y, preds) 9 | results[prefix + "_" + key] = res 10 | if verbose > 0: 11 | print(prefix + " " + key + ":\t" + str(res)) 12 | -------------------------------------------------------------------------------- /woody/io/split.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | import pandas 7 | 8 | def train_test_split_csv(fname, fname_train, fname_test, train_size=None, test_size=None, chunksize=500000): 9 | 10 | pandas.read_csv(fname, iterator=True, chunksize=chunksize) 11 | 12 | def train_test_split_h5pd(fname, fname_train, fname_test, train_size=None, test_size=None): 13 | 14 | pass -------------------------------------------------------------------------------- /woody/data/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | from woody.data.generate import covtype_files, covtype, covtype_generators 7 | from woody.data.generate import higgs_files, higgs, higgs_generators 8 | from woody.data.generate import susy_files, susy, susy_generators 9 | from woody.data.generate import landsat_files, landsat_generators 10 | from woody.data.generate import artificial, artificial_generators -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | h2o==3.10.4.8 2 | h5py==2.6.0 3 | matplotlib==1.5.3 4 | networkx==1.11 5 | nose==1.3.7 6 | numpy==1.11.2 7 | pandas==0.19.1 8 | pygraphviz==1.3.1 9 | pyparsing==2.1.10 10 | requests==2.18.4 11 | scikit-image==0.12.3 12 | scikit-learn==0.18.1 13 | scipy==0.18.1 14 | seaborn==0.8.1 15 | sklearn-evaluation==0.3 16 | tables==3.3.0 17 | tabulate==0.7.7 18 | urllib3==1.22 19 | yep==0.4 20 | Cython==0.26.1 21 | #skutil==0.1.6 # install manually from https://github.com/tgsmith61591/skutil 22 | 23 | -------------------------------------------------------------------------------- /woody/util/array/src/include/util.h: -------------------------------------------------------------------------------- 1 | /* 2 | * util.h 3 | */ 4 | #ifndef INCLUDE_UTIL_H_ 5 | #define INCLUDE_UTIL_H_ 6 | 7 | #include "global.h" 8 | 9 | #include 10 | #include 11 | 12 | /* -------------------------------------------------------------------------------- 13 | * Copies a single pattern 14 | * -------------------------------------------------------------------------------- 15 | */ 16 | inline void copy_pattern(FLOAT_TYPE *src, FLOAT_TYPE *dst, int dim); 17 | 18 | #endif 19 | -------------------------------------------------------------------------------- /woody/util/array/src/util.c: -------------------------------------------------------------------------------- 1 | #include "include/array.h" 2 | 3 | /* -------------------------------------------------------------------------------- 4 | * Copies a single pattern 5 | * -------------------------------------------------------------------------------- 6 | */ 7 | inline void copy_pattern(FLOAT_TYPE *src, FLOAT_TYPE *dst, int dim){ 8 | 9 | int j; 10 | 11 | 
// memcpy seems to be slower (function call) 12 | for (j=0; j 0 13 | #define FLOAT_TYPE double 14 | #define PARSE_FLOAT strtod 15 | #define MAX_FLOAT_TYPE 1.7976931348623158e+308 16 | #define MIN_FLOAT_TYPE -1.7976931348623158e+308 17 | #else 18 | #define FLOAT_TYPE float 19 | #define PARSE_FLOAT strtof 20 | #define MAX_FLOAT_TYPE 3.402823466e+38 21 | #define MIN_FLOAT_TYPE -3.402823466e+38 22 | #endif 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /woody/models/forest/src/tree/cpu/include/fastsort.h: -------------------------------------------------------------------------------- 1 | /* 2 | * fastsort.h 3 | * 4 | * Created on: 23.01.2017 5 | * Author: fgieseke 6 | */ 7 | 8 | #ifndef ENSEMBLE_CPU_INCLUDE_FASTSORT_H_ 9 | #define ENSEMBLE_CPU_INCLUDE_FASTSORT_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | #include "criteria.h" 16 | 17 | #include "../../include/global.h" 18 | #include "../../include/util.h" 19 | 20 | 21 | #define fast_size_threshold 64 22 | 23 | void combined_sort(FLOAT_TYPE *XF, int *samples, int n); 24 | 25 | #endif /* ENSEMBLE_CPU_INCLUDE_FASTSORT_H_ */ 26 | -------------------------------------------------------------------------------- /woody/util/array/src/include/global.h: -------------------------------------------------------------------------------- 1 | /* 2 | * global.h 3 | */ 4 | 5 | #ifndef INCLUDE_GLOBAL_H_ 6 | #define INCLUDE_GLOBAL_H_ 7 | 8 | #include 9 | 10 | #ifndef USE_DOUBLE 11 | #define USE_DOUBLE 0 12 | #endif 13 | 14 | #if USE_DOUBLE > 0 15 | #define FLOAT_TYPE double 16 | #define PARSE_FLOAT strtod 17 | #define MAX_FLOAT_TYPE 1.7976931348623158e+308 18 | #define MIN_FLOAT_TYPE -1.7976931348623158e+308 19 | #else 20 | #define FLOAT_TYPE float 21 | #define PARSE_FLOAT strtof 22 | #define MAX_FLOAT_TYPE 3.402823466e+38 23 | #define MIN_FLOAT_TYPE -3.402823466e+38 24 | #endif 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /experiments/large_data/launch.py: -------------------------------------------------------------------------------- 1 | import os 2 | import params 3 | 4 | seed = params.seed 5 | odir = params.odir 6 | methods = params.methods 7 | 8 | for method in methods: 9 | for dkey in params.datasets.keys(): 10 | for train_size in params.datasets[dkey]['train_sizes']: 11 | for key in params.parameters: 12 | print("Processing method %s with data set %s, train_size %s, and key %s ..." % (str(method), str(dkey), str(train_size), str(key))) 13 | cmd = "python " + method + ".py --dkey %s --train_size %i --key %s" % (dkey, train_size, key) 14 | print(cmd) 15 | os.system(cmd) -------------------------------------------------------------------------------- /woody/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | """ 7 | The woody package aims at large-scale implementations 8 | for random forests. It is based on an efficient C 9 | implementation and resorts to distributed computing 10 | strategies. 11 | """ 12 | 13 | import warnings 14 | 15 | try: 16 | from woody.models import WoodClassifier, WoodRegressor, HugeWoodClassifier, HugeWoodRegressor, SubsetWoodClassifier, SubsetWoodRegressor 17 | except Exception as e: 18 | warnings.warn("Swig models not compiled yet? 
Error message: %s" % str(e)) 19 | 20 | __version__ = "0.3.1" 21 | -------------------------------------------------------------------------------- /experiments/landsat/launch.py: -------------------------------------------------------------------------------- 1 | import os 2 | import params 3 | 4 | seeds = [0,1,2,3] 5 | odir = params.odir 6 | methods = params.methods 7 | 8 | for method in methods: 9 | for dkey in params.datasets.keys(): 10 | for train_size in params.datasets[dkey]['train_sizes']: 11 | for seed in seeds: 12 | for key in params.parameters: 13 | print("Processing method %s with data set %s, train_size %s, seed %s, and key %s ..." % (str(method), str(dkey), str(train_size), str(seed), str(key))) 14 | cmd = "python " + method + ".py --dkey %s --train_size %i --seed %i --key %s" % (dkey, train_size, seed, key) 15 | print(cmd) 16 | os.system(cmd) 17 | -------------------------------------------------------------------------------- /experiments/small_data/launch.py: -------------------------------------------------------------------------------- 1 | import os 2 | import params 3 | 4 | seeds = [0,1,2,3] 5 | odir = params.odir 6 | methods = params.methods 7 | 8 | for method in methods: 9 | for dkey in params.datasets.keys(): 10 | for train_size in params.datasets[dkey]['train_sizes']: 11 | for seed in seeds: 12 | for key in params.parameters: 13 | print("Processing method %s with data set %s, train_size %s, seed %s, and key %s ..." % (str(method), str(dkey), str(train_size), str(seed), str(key))) 14 | cmd = "python " + method + ".py --dkey %s --train_size %i --seed %i --key %s" % (dkey, train_size, seed, key) 15 | print(cmd) 16 | os.system(cmd) 17 | -------------------------------------------------------------------------------- /woody/util/timer.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | import time 7 | 8 | class Timer(object): 9 | 10 | def __init__(self): 11 | 12 | self._start_time = 0.0 13 | self._elapsed_time = 0.0 14 | 15 | def start(self): 16 | 17 | self._start_time = time.time() 18 | 19 | def stop(self): 20 | 21 | self._elapsed_time += time.time() - self._start_time 22 | self._start_time = 0.0 23 | 24 | def reset(self): 25 | 26 | self._start_time = 0.0 27 | self._elapsed_time = 0.0 28 | 29 | def get_elapsed_time(self): 30 | 31 | return self._elapsed_time 32 | -------------------------------------------------------------------------------- /woody/models/forest/src/tree/cpu/include/standard.h: -------------------------------------------------------------------------------- 1 | /* 2 | * standard.h 3 | * 4 | * Created on: 23.01.2017 5 | * Author: fgieseke 6 | */ 7 | 8 | #ifndef ENSEMBLE_CPU_INCLUDE_STANDARD_H_ 9 | #define ENSEMBLE_CPU_INCLUDE_STANDARD_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | #include "criteria.h" 16 | #include "fastsort.h" 17 | 18 | #include "../../include/global.h" 19 | #include "../../include/util.h" 20 | 21 | 22 | #define size_threshold 16 23 | 24 | void intro_sort(PATTERN_LABEL_WEIGHT *a, int n); 25 | 26 | FLOAT_TYPE compute_optimal_threshold(PATTERN_LABEL_WEIGHT *XF_Y_W, int n_XF_Y_W, PARAMETERS *params, TRAINING_DATA *train_data, SPLIT_RECORD *best_split); 27 | 28 | #endif /* ENSEMBLE_CPU_INCLUDE_STANDARD_H_ */ 29 | -------------------------------------------------------------------------------- /woody/setup.py: -------------------------------------------------------------------------------- 1 | # 2 | # 
Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | def configuration(parent_package='', top_path=None): 7 | 8 | from numpy.distutils.misc_util import Configuration 9 | 10 | config = Configuration('woody', parent_package, top_path) 11 | config.add_subpackage('models', subpackage_path='models') 12 | config.add_subpackage('models/forest', subpackage_path='models/forest') 13 | config.add_subpackage('tests') 14 | config.add_subpackage('util') 15 | config.add_subpackage('util/array', subpackage_path='util/array') 16 | 17 | return config 18 | 19 | if __name__ == '__main__': 20 | 21 | from numpy.distutils.core import setup 22 | setup(**configuration(top_path='').todict()) 23 | -------------------------------------------------------------------------------- /experiments/influence_lamda/launch.py: -------------------------------------------------------------------------------- 1 | import os 2 | import params 3 | 4 | seeds = [0,1,2,3] 5 | odir = params.odir 6 | methods = params.methods 7 | 8 | for method in methods: 9 | for dkey in params.datasets.keys(): 10 | for train_size in params.datasets[dkey]['train_sizes']: 11 | for lamcrit in params.lamcrits: 12 | for seed in seeds: 13 | for key in params.parameters: 14 | print("Processing method %s with data set %s, train_size %s, lamcrit %s, seed %s, and key %s ..." % (str(method), str(dkey), str(train_size), str(lamcrit), str(seed), str(key))) 15 | cmd = "python " + method + ".py --dkey %s --train_size %i --lamcrit %f --seed %i --key %s" % (dkey, train_size, lamcrit, seed, key) 16 | print(cmd) 17 | os.system(cmd) 18 | -------------------------------------------------------------------------------- /experiments/influence_n_bottom/launch.py: -------------------------------------------------------------------------------- 1 | import os 2 | import params 3 | 4 | seeds = [0,1,2,3] 5 | odir = params.odir 6 | methods = params.methods 7 | 8 | for method in methods: 9 | for dkey in params.datasets.keys(): 10 | for train_size in params.datasets[dkey]['train_sizes']: 11 | for n_bottom in params.n_estimators_bottoms: 12 | for seed in seeds: 13 | for key in params.parameters: 14 | print("Processing method %s with data set %s, train_size %s, n_bottom %s, seed %s, and key %s ..." 
% (str(method), str(dkey), str(train_size), str(n_bottom), str(seed), str(key))) 15 | cmd = "python " + method + ".py --dkey %s --train_size %i --n_bottom %f --seed %i --key %s" % (dkey, train_size, n_bottom, seed, key) 16 | print(cmd) 17 | os.system(cmd) 18 | -------------------------------------------------------------------------------- /experiments/large_data/params.py: -------------------------------------------------------------------------------- 1 | import collections 2 | 3 | seed = 0 4 | odir = "results" 5 | methods = ["hugewood"] 6 | 7 | datasets = collections.OrderedDict() 8 | datasets['landsat'] = {'train_sizes':[250000000, 500000000, 750000000, 1000000000]} 9 | 10 | parameters = collections.OrderedDict() 11 | #parameters['ert'] = {'n_estimators':4, 12 | # 'max_features':None, 13 | # 'bootstrap':False, 14 | # 'tree_type':'randomized', 15 | # 'n_jobs':4} 16 | parameters['rf'] = {'n_estimators':4, 17 | 'max_features':"sqrt", 18 | 'bootstrap':True, 19 | 'tree_type':'standard', 20 | 'n_jobs':4} 21 | 22 | parameters_hugewood = collections.OrderedDict() 23 | 24 | for key in parameters: 25 | 26 | param_hugewood = {} 27 | param_hugewood['param_wood'] = parameters[key] 28 | param_hugewood['n_estimators'] = 1 29 | param_hugewood['n_estimators_bottom'] = 4 30 | 31 | parameters_hugewood[key] = param_hugewood 32 | -------------------------------------------------------------------------------- /woody/util/array/swig/cpu_float.i: -------------------------------------------------------------------------------- 1 | %module wrapper_utils_cpu_float 2 | 3 | %{ 4 | #define SWIG_FILE_WITH_INIT 5 | #include "array.h" 6 | %} 7 | 8 | %include "numpy.i" 9 | 10 | %init %{ 11 | import_array(); 12 | %} 13 | 14 | %apply (float* INPLACE_ARRAY2, int DIM1, int DIM2) {(FLOAT_TYPE *X, int nX, int dX)} 15 | %apply (float* INPLACE_ARRAY2, int DIM1, int DIM2) {(FLOAT_TYPE *XT, int nXT, int dXT)} 16 | %apply (float* INPLACE_ARRAY2, int DIM1, int DIM2) {(FLOAT_TYPE *Xnew, int nXnew, int dXnew)} 17 | %apply (float* INPLACE_ARRAY1, int DIM1) {(FLOAT_TYPE *y, int ny)} 18 | %apply (float* INPLACE_ARRAY1, int DIM1) {(FLOAT_TYPE *ynew, int nynew)} 19 | %apply (int* INPLACE_ARRAY1, int DIM1) {(int *offsets, int noffsets)} 20 | %apply (int* INPLACE_ARRAY1, int DIM1) {(int *indicator, int nindicator)} 21 | %apply (int* INPLACE_ARRAY1, int DIM1) {(int *chunks, int nchunks)} 22 | %apply (int* INPLACE_ARRAY1, int DIM1) {(int *counts, int ncounts)} 23 | %apply (int* INPLACE_ARRAY1, int DIM1) {(int *cumsums_minus_counts, int ncumsums_minus_counts)} 24 | 25 | %include "array.h" 26 | -------------------------------------------------------------------------------- /experiments/landsat/params.py: -------------------------------------------------------------------------------- 1 | import collections 2 | 3 | odir = "results" 4 | 5 | methods = ["hugewood", "subsetwood", "sk", "h2"] 6 | 7 | datasets = collections.OrderedDict() 8 | datasets['landsat'] = {'train_sizes':[i*1000000 for i in [10,20,30,40,50]]} 9 | 10 | parameters = collections.OrderedDict() 11 | #parameters['ert'] = {'n_estimators':4, 12 | # 'max_features':None, 13 | # 'bootstrap':False, 14 | # 'tree_type':'randomized', 15 | # 'n_jobs':4} 16 | parameters['rf'] = {'n_estimators':12, 17 | 'max_features':"sqrt", 18 | 'bootstrap':True, 19 | 'tree_type':'standard', 20 | 'n_jobs':4} 21 | 22 | parameters_hugewood = collections.OrderedDict() 23 | 24 | for key in parameters: 25 | 26 | param_hugewood = {} 27 | param_hugewood['param_wood'] = parameters[key] 28 | 
param_hugewood['n_estimators'] = 3 29 | param_hugewood['n_estimators_bottom'] = 4 30 | 31 | parameters_hugewood[key] = param_hugewood 32 | -------------------------------------------------------------------------------- /woody/util/array/swig/cpu_double.i: -------------------------------------------------------------------------------- 1 | %module wrapper_utils_cpu_double 2 | 3 | %{ 4 | #define SWIG_FILE_WITH_INIT 5 | #include "array.h" 6 | %} 7 | 8 | %include "numpy.i" 9 | 10 | %init %{ 11 | import_array(); 12 | %} 13 | 14 | %apply (double* INPLACE_ARRAY2, int DIM1, int DIM2) {(FLOAT_TYPE *X, int nX, int dX)} 15 | %apply (double* INPLACE_ARRAY2, int DIM1, int DIM2) {(FLOAT_TYPE *XT, int nXT, int dXT)} 16 | %apply (double* INPLACE_ARRAY2, int DIM1, int DIM2) {(FLOAT_TYPE *Xnew, int nXnew, int dXnew)} 17 | %apply (double* INPLACE_ARRAY1, int DIM1) {(FLOAT_TYPE *y, int ny)} 18 | %apply (double* INPLACE_ARRAY1, int DIM1) {(FLOAT_TYPE *ynew, int nynew)} 19 | %apply (int* INPLACE_ARRAY1, int DIM1) {(int *offsets, int noffsets)} 20 | %apply (int* INPLACE_ARRAY1, int DIM1) {(int *indicator, int nindicator)} 21 | %apply (int* INPLACE_ARRAY1, int DIM1) {(int *chunks, int nchunks)} 22 | %apply (int* INPLACE_ARRAY1, int DIM1) {(int *counts, int ncounts)} 23 | %apply (int* INPLACE_ARRAY1, int DIM1) {(int *cumsums_minus_counts, int ncumsums_minus_counts)} 24 | 25 | %include "array.h" 26 | -------------------------------------------------------------------------------- /experiments/influence_lamda/params.py: -------------------------------------------------------------------------------- 1 | import collections 2 | 3 | odir = "results" 4 | methods = ["hugewood", "wood"] 5 | 6 | lamcrits = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0] 7 | 8 | datasets = collections.OrderedDict() 9 | datasets['covtype'] = {'train_sizes':[100000, 150000, 200000, 250000, 300000, 350000, 400000]} 10 | 11 | parameters = collections.OrderedDict() 12 | #parameters['ert'] = {'n_estimators':4, 13 | # 'max_features':None, 14 | # 'bootstrap':False, 15 | # 'tree_type':'randomized', 16 | # 'n_jobs':4} 17 | parameters['rf'] = {'n_estimators':24, 18 | 'max_features':"sqrt", 19 | 'bootstrap':True, 20 | 'tree_type':'standard', 21 | 'n_jobs':4} 22 | 23 | parameters_hugewood = collections.OrderedDict() 24 | 25 | for key in parameters: 26 | 27 | param_hugewood = {} 28 | param_hugewood['param_wood'] = parameters[key] 29 | param_hugewood['n_estimators'] = 6 30 | param_hugewood['n_estimators_bottom'] = 4 31 | 32 | parameters_hugewood[key] = param_hugewood 33 | -------------------------------------------------------------------------------- /woody/models/forest/src/timing.c: -------------------------------------------------------------------------------- 1 | #include "include/timing.h" 2 | 3 | /* -------------------------------------------------------------------------------- 4 | * Helper method for computing the current time (w.r.t to an offset). 
5 | * -------------------------------------------------------------------------------- 6 | */ 7 | long get_system_time_in_microseconds(void) { 8 | 9 | struct timeval tempo; 10 | gettimeofday(&tempo, NULL); 11 | 12 | return tempo.tv_sec * 1000000 + tempo.tv_usec; 13 | 14 | } 15 | 16 | void init_my_timer(TIMER *timer) { 17 | 18 | timer->start_time = 0; 19 | timer->elapsed_time = 0.0f; 20 | timer->elapsed_time_total = 0.0f; 21 | 22 | } 23 | 24 | void start_my_timer(TIMER *timer) { 25 | 26 | timer->start_time = get_system_time_in_microseconds(); 27 | 28 | } 29 | 30 | void stop_my_timer(TIMER *timer) { 31 | 32 | double current = (double) get_system_time_in_microseconds(); 33 | timer->elapsed_time = current - timer->start_time; 34 | timer->elapsed_time_total += timer->elapsed_time; 35 | 36 | } 37 | 38 | double get_my_timer(TIMER *timer) { 39 | 40 | return (double) (1.0 * timer->elapsed_time_total / 1000000.0); 41 | 42 | } 43 | -------------------------------------------------------------------------------- /experiments/influence_n_bottom/params.py: -------------------------------------------------------------------------------- 1 | import collections 2 | 3 | odir = "results" 4 | methods = ["hugewood_1K", "hugewood_10K", "hugewood_75K"] 5 | 6 | n_estimators_bottoms = [1,4,12,24] 7 | 8 | datasets = collections.OrderedDict() 9 | datasets['covtype'] = {'train_sizes':[100000, 150000, 200000, 250000, 300000, 350000, 400000]} 10 | 11 | parameters = collections.OrderedDict() 12 | #parameters['ert'] = {'n_estimators':4, 13 | # 'max_features':None, 14 | # 'bootstrap':False, 15 | # 'tree_type':'randomized', 16 | # 'n_jobs':4} 17 | parameters['rf'] = {'n_estimators':24, 18 | 'max_features':"sqrt", 19 | 'bootstrap':True, 20 | 'tree_type':'standard', 21 | 'n_jobs':4} 22 | 23 | parameters_hugewood = collections.OrderedDict() 24 | 25 | for key in parameters: 26 | 27 | param_hugewood = {} 28 | param_hugewood['param_wood'] = parameters[key] 29 | # set in hugewood*.py 30 | #param_hugewood['n_estimators'] = 6 31 | #param_hugewood['n_estimators_bottom'] = 4 32 | 33 | parameters_hugewood[key] = param_hugewood 34 | -------------------------------------------------------------------------------- /woody/models/forest/swig/cpu_float.i: -------------------------------------------------------------------------------- 1 | %module wrapper_cpu_float 2 | 3 | %{ 4 | #define SWIG_FILE_WITH_INIT 5 | #include "base.h" 6 | #include "types.h" 7 | %} 8 | 9 | %include "numpy.i" 10 | 11 | %init %{ 12 | import_array(); 13 | %} 14 | 15 | %apply (float* INPLACE_ARRAY2, int DIM1, int DIM2) {(FLOAT_TYPE* Xtrain, int nXtrain, int dXtrain)} 16 | %apply (float* INPLACE_ARRAY1, int DIM1) {(FLOAT_TYPE *Ytrain, int nYtrain)} 17 | 18 | %apply (float* INPLACE_ARRAY2, int DIM1, int DIM2) {(FLOAT_TYPE* Xtest, int nXtest, int dXtest)} 19 | %apply (float* INPLACE_ARRAY1, int DIM1) {(FLOAT_TYPE *predictions, int npredictions)} 20 | %apply (double* INPLACE_ARRAY2, int DIM1, int DIM2) {(FLOAT_TYPE* preds, int npreds, int dpreds)} 21 | 22 | %apply (int* INPLACE_ARRAY2, int DIM1, int DIM2) {(int *bootstrap_indices, int nbootstrap_indices, int dbootstrap_indices)} 23 | %apply (int* INPLACE_ARRAY2, int DIM1, int DIM2) {(int *bootstrap_indices_weights, int nbootstrap_indices_weights, int dbootstrap_indices_weights)} 24 | 25 | %apply (int* INPLACE_ARRAY2, int DIM1, int DIM2) {(int *indices, int nindices, int dindices)} 26 | 27 | %apply (int* INPLACE_ARRAY1, int DIM1) {(int *aforest, int naforest)} 28 | 29 | %include "base.h" 30 | %include "types.h" 31 | 
-------------------------------------------------------------------------------- /woody/models/forest/swig/gpu_float.i: -------------------------------------------------------------------------------- 1 | %module wrapper_gpu_float 2 | 3 | %{ 4 | #define SWIG_FILE_WITH_INIT 5 | #include "base.h" 6 | #include "types.h" 7 | %} 8 | 9 | %include "numpy.i" 10 | 11 | %init %{ 12 | import_array(); 13 | %} 14 | 15 | %apply (float* INPLACE_ARRAY2, int DIM1, int DIM2) {(FLOAT_TYPE* Xtrain, int nXtrain, int dXtrain)} 16 | %apply (float* INPLACE_ARRAY1, int DIM1) {(FLOAT_TYPE *Ytrain, int nYtrain)} 17 | 18 | %apply (float* INPLACE_ARRAY2, int DIM1, int DIM2) {(FLOAT_TYPE* Xtest, int nXtest, int dXtest)} 19 | %apply (float* INPLACE_ARRAY1, int DIM1) {(FLOAT_TYPE *predictions, int npredictions)} 20 | %apply (double* INPLACE_ARRAY2, int DIM1, int DIM2) {(FLOAT_TYPE* preds, int npreds, int dpreds)} 21 | 22 | %apply (int* INPLACE_ARRAY2, int DIM1, int DIM2) {(int *bootstrap_indices, int nbootstrap_indices, int dbootstrap_indices)} 23 | %apply (int* INPLACE_ARRAY2, int DIM1, int DIM2) {(int *bootstrap_indices_weights, int nbootstrap_indices_weights, int dbootstrap_indices_weights)} 24 | 25 | %apply (int* INPLACE_ARRAY2, int DIM1, int DIM2) {(int *indices, int nindices, int dindices)} 26 | 27 | %apply (int* INPLACE_ARRAY1, int DIM1) {(int *aforest, int naforest)} 28 | 29 | %include "base.h" 30 | %include "types.h" 31 | -------------------------------------------------------------------------------- /woody/models/forest/swig/gpu_double.i: -------------------------------------------------------------------------------- 1 | %module wrapper_gpu_double 2 | 3 | %{ 4 | #define SWIG_FILE_WITH_INIT 5 | #include "base.h" 6 | #include "types.h" 7 | %} 8 | 9 | %include "numpy.i" 10 | 11 | %init %{ 12 | import_array(); 13 | %} 14 | 15 | %apply (double* INPLACE_ARRAY2, int DIM1, int DIM2) {(FLOAT_TYPE* Xtrain, int nXtrain, int dXtrain)} 16 | %apply (double* INPLACE_ARRAY1, int DIM1) {(FLOAT_TYPE *Ytrain, int nYtrain)} 17 | 18 | %apply (double* INPLACE_ARRAY2, int DIM1, int DIM2) {(FLOAT_TYPE* Xtest, int nXtest, int dXtest)} 19 | %apply (double* INPLACE_ARRAY1, int DIM1) {(FLOAT_TYPE *predictions, int npredictions)} 20 | %apply (double* INPLACE_ARRAY2, int DIM1, int DIM2) {(FLOAT_TYPE* preds, int npreds, int dpreds)} 21 | 22 | %apply (int* INPLACE_ARRAY2, int DIM1, int DIM2) {(int *bootstrap_indices, int nbootstrap_indices, int dbootstrap_indices)} 23 | %apply (int* INPLACE_ARRAY2, int DIM1, int DIM2) {(int *bootstrap_indices_weights, int nbootstrap_indices_weights, int dbootstrap_indices_weights)} 24 | 25 | %apply (int* INPLACE_ARRAY2, int DIM1, int DIM2) {(int *indices, int nindices, int dindices)} 26 | 27 | %apply (int* INPLACE_ARRAY1, int DIM1) {(int *aforest, int naforest)} 28 | 29 | %include "base.h" 30 | %include "types.h" 31 | -------------------------------------------------------------------------------- /woody/models/forest/swig/cpu_double.i: -------------------------------------------------------------------------------- 1 | %module wrapper_cpu_double 2 | 3 | %{ 4 | #define SWIG_FILE_WITH_INIT 5 | #include "base.h" 6 | #include "types.h" 7 | %} 8 | 9 | %include "numpy.i" 10 | 11 | %init %{ 12 | import_array(); 13 | %} 14 | 15 | %apply (double* INPLACE_ARRAY2, int DIM1, int DIM2) {(FLOAT_TYPE* Xtrain, int nXtrain, int dXtrain)} 16 | %apply (double* INPLACE_ARRAY1, int DIM1) {(FLOAT_TYPE *Ytrain, int nYtrain)} 17 | 18 | %apply (double* INPLACE_ARRAY2, int DIM1, int DIM2) {(FLOAT_TYPE* Xtest, int nXtest, int 
dXtest)} 19 | %apply (double* INPLACE_ARRAY1, int DIM1) {(FLOAT_TYPE *predictions, int npredictions)} 20 | %apply (double* INPLACE_ARRAY2, int DIM1, int DIM2) {(FLOAT_TYPE* preds, int npreds, int dpreds)} 21 | 22 | %apply (int* INPLACE_ARRAY2, int DIM1, int DIM2) {(int *bootstrap_indices, int nbootstrap_indices, int dbootstrap_indices)} 23 | %apply (int* INPLACE_ARRAY2, int DIM1, int DIM2) {(int *bootstrap_indices_weights, int nbootstrap_indices_weights, int dbootstrap_indices_weights)} 24 | 25 | %apply (int* INPLACE_ARRAY2, int DIM1, int DIM2) {(int *indices, int nindices, int dindices)} 26 | 27 | %apply (int* INPLACE_ARRAY1, int DIM1) {(int *aforest, int naforest)} 28 | 29 | 30 | 31 | %include "base.h" 32 | %include "types.h" 33 | 34 | -------------------------------------------------------------------------------- /experiments/small_data/params.py: -------------------------------------------------------------------------------- 1 | import collections 2 | 3 | odir = "results" 4 | methods = ["hugewood_lam", "subsetwood", "sk", "h2"] 5 | 6 | datasets = collections.OrderedDict() 7 | datasets['covtype'] = {'train_sizes':[100000, 150000, 200000, 250000, 300000, 350000, 400000]} 8 | datasets["susy"] = {'train_sizes':[1000000, 1500000, 2000000, 2500000, 3000000, 3500000, 4000000]} 9 | datasets["higgs"] = {'train_sizes':[1000000, 2000000, 3000000, 4000000, 5000000, 6000000, 7000000, 8000000]} 10 | 11 | parameters = collections.OrderedDict() 12 | #parameters['ert'] = {'n_estimators':4, 13 | # 'max_features':None, 14 | # 'bootstrap':False, 15 | # 'tree_type':'randomized', 16 | # 'n_jobs':4} 17 | parameters['rf'] = {'n_estimators':24, 18 | 'max_features':"sqrt", 19 | 'bootstrap':True, 20 | 'tree_type':'standard', 21 | 'n_jobs':4} 22 | 23 | parameters_hugewood = collections.OrderedDict() 24 | 25 | for key in parameters: 26 | 27 | param_hugewood = {} 28 | param_hugewood['param_wood'] = parameters[key] 29 | param_hugewood['n_estimators'] = 6 30 | param_hugewood['n_estimators_bottom'] = 4 31 | 32 | parameters_hugewood[key] = param_hugewood 33 | -------------------------------------------------------------------------------- /woody/io/reader.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | import random 7 | 8 | class Reader(object): 9 | """ 10 | """ 11 | 12 | def __init__(self, 13 | fname=None, 14 | data=None, 15 | patterns=True, 16 | target=True, 17 | chunksize=32000, 18 | n_lines_max=None, 19 | seed=0, 20 | ): 21 | 22 | self.fname = fname 23 | self.data = data 24 | self.patterns = patterns 25 | self.target = target 26 | self.chunksize = chunksize 27 | self.n_lines_max = n_lines_max 28 | self.seed = seed 29 | 30 | self._randomgen = random.Random(self.seed) 31 | self._reader = None 32 | 33 | def __del__(self): 34 | 35 | self.close() 36 | 37 | def close(self): 38 | 39 | try: 40 | self._reader.close() 41 | except: 42 | pass 43 | 44 | def set_seed(self, s): 45 | 46 | self._randomgen.seed(s) 47 | 48 | def set_mode(self, patterns=True, target=True): 49 | 50 | self.patterns = patterns 51 | self.target = target 52 | -------------------------------------------------------------------------------- /woody/models/util.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | import os 7 | import logging 8 | from datetime import datetime 9 | 10 | from logging.handlers import 
RotatingFileHandler 11 | 12 | def init_logger(fname, log_name="Logger", log_level="INFO"): 13 | 14 | # create logging directory if needed 15 | d = os.path.dirname(fname) 16 | if not os.path.exists(d): 17 | os.makedirs(d) 18 | 19 | logger = logging.getLogger(log_name + "_" + str(datetime.now())) 20 | if log_level == 'INFO': 21 | logger.setLevel(logging.INFO) 22 | else: 23 | logger.setLevel(logging.DEBUG) 24 | 25 | # logging formatter 26 | #formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") 27 | formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") 28 | 29 | # store output if specified 30 | if fname is not None: 31 | log_handler = RotatingFileHandler(fname, 'a') 32 | log_handler.setFormatter(formatter) 33 | logger.addHandler(log_handler) 34 | 35 | # standard streaming handler 36 | ch = logging.StreamHandler() 37 | ch.setFormatter(formatter) 38 | logger.addHandler(ch) 39 | 40 | # avoid double outputs 41 | logger.propagate = 0 42 | 43 | return logger -------------------------------------------------------------------------------- /woody/util/array/src/include/array.h: -------------------------------------------------------------------------------- 1 | /* 2 | * util.h 3 | */ 4 | #ifndef INCLUDE_ARRAY_H_ 5 | #define INCLUDE_ARRAY_H_ 6 | 7 | #include "global.h" 8 | #include "util.h" 9 | 10 | #include 11 | #include 12 | 13 | /* -------------------------------------------------------------------------------- 14 | * Splits the array X according to the indices 15 | * -------------------------------------------------------------------------------- 16 | */ 17 | void split_array(FLOAT_TYPE *X, int nX, int dX, FLOAT_TYPE *Xnew, int nXnew, int dXnew, int *indicator, int nindicator, int *chunks, int nchunks, int *cumsums_minus_counts, int ncumsums_minus_counts); 18 | 19 | /* -------------------------------------------------------------------------------- 20 | * Computes split offsets 21 | * -------------------------------------------------------------------------------- 22 | */ 23 | void compute_split_offsets(int *offsets, int noffsets, 24 | int *indicator, int nindicator, 25 | int *chunks, int nchunks, 26 | int *cumsums_minus_counts, int ncumsums_minus_counts); 27 | 28 | /* -------------------------------------------------------------------------------- 29 | * Transposes an array 30 | * -------------------------------------------------------------------------------- 31 | */ 32 | void transpose_array(FLOAT_TYPE* X, int nX, int dX, FLOAT_TYPE* XT, int nXT, int dXT); 33 | 34 | #endif 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | #Ipython Notebook 62 | .ipynb_checkpoints 63 | 64 | .venv 65 | data/*/*.h5 66 | data/*/*.h5pd 67 | data/*/*.csv 68 | data/*/*.html 69 | data/*/*_files* 70 | data/landsat 71 | 72 | .project 73 | .pydevproject 74 | 75 | # swig related 76 | *_double.py 77 | *_float.py 78 | *_wrap.c 79 | 80 | experiments/landsat/data 81 | experiments/landsat/tmp 82 | experiments/large_data/data 83 | experiments/large_data/tmp 84 | -------------------------------------------------------------------------------- /woody/data/landsat.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | import os 7 | from woody.io import DataGenerator 8 | 9 | from .util import check_and_download 10 | 11 | def get_landsat_files(data_path, data_set="LC81950212016133LGN00", version="1_1", train_size=0): 12 | 13 | fname_train = os.path.join(data_path, "landsat", str(data_set) + "_" + version + ".train.csv") 14 | fname_test = os.path.join(data_path, "landsat", str(data_set) + "_" + version + ".test.csv") 15 | check_and_download(fname_train) 16 | check_and_download(fname_test) 17 | 18 | if train_size > 0: 19 | fname_train_size = os.path.join(data_path, "landsat", str(data_set) + "_" + version + ".train_%i.csv" % train_size) 20 | if not os.path.exists(fname_train_size): 21 | os.system("sed -n '%i,%ip;%iq' < %s > %s" % (1, train_size, train_size, fname_train, fname_train_size)) 22 | fname_train = fname_train_size 23 | 24 | return fname_train, fname_test 25 | 26 | def get_landsat_generator(data_path, train_size=10000000, data_set="LC81950212016133LGN00", version="1_1", seed=0, part="train", store=None, patterns=True, target=True, chunksize=5000000): 27 | 28 | assert version in ["1_1", "3_3", "pan_1_1", "pan_3_3"] 29 | 30 | if part=="train": 31 | fname = os.path.join(data_path, "landsat", str(data_set) + "_" + version + ".train.h5pd") 32 | elif part=="test": 33 | fname = os.path.join(data_path, "landsat", str(data_set) + "_" + version + ".test.h5pd") 34 | check_and_download(fname) 35 | 36 | return DataGenerator(fname=fname, seed=seed, patterns=patterns, target=target, chunksize=chunksize) -------------------------------------------------------------------------------- /woody/models/base.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | from .util import init_logger 7 | 8 | class NoLogger(): 9 | 10 | def __init__(self): 11 | pass 12 | 13 | def info(self, msg): 14 | pass 15 | 16 | def debug(self, msg): 17 | pass 18 | 19 | class BaseEstimator(object): 20 | 21 | def __init__(self, 22 | verbose=0, 23 | logging_name="BaseEstimator", 24 | logging_file=None, 25 | seed=0, 26 | ): 27 | 28 | self.verbose = verbose 29 | self.logging_name = logging_name 30 | self.seed = seed 31 | 32 | def fit(self, logging_file="estimator.log"): 33 | 34 | # instantiate logger 35 | if self.verbose > 0: 36 | self._logger = init_logger(fname=logging_file, 37 | 
log_name=self.logging_name, 38 | log_level="DEBUG") 39 | else: 40 | self._logger = NoLogger() 41 | 42 | def get_params(self): 43 | """ Returns the models's parameters 44 | """ 45 | 46 | return {"verbose": self.verbose, 47 | "logging_name" : self.logging_name, 48 | "seed": self.seed, 49 | } 50 | 51 | def set_params(self, **parameters): 52 | """ Sets local parameters (does not need 53 | to be overwritten). 54 | """ 55 | 56 | for parameter, value in parameters.items(): 57 | self.setattr(parameter, value) 58 | 59 | 60 | -------------------------------------------------------------------------------- /woody/models/forest/src/include/util.h: -------------------------------------------------------------------------------- 1 | /* 2 | * util.h 3 | */ 4 | #ifndef COMMON_INCLUDE_UTIL_H_ 5 | #define COMMON_INCLUDE_UTIL_H_ 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #include "float.h" 22 | 23 | #define max(a,b) \ 24 | ({ __typeof__ (a) _a = (a); \ 25 | __typeof__ (b) _b = (b); \ 26 | _a > _b ? _a : _b; }) 27 | 28 | #define min(a,b) \ 29 | ({ __typeof__ (a) _a = (a); \ 30 | __typeof__ (b) _b = (b); \ 31 | _a < _b ? _a : _b; }) 32 | 33 | #define ELEM_SWAP(a,b) { register FLOAT_TYPE t=(a);(a)=(b);(b)=t; } 34 | #define median(a,n) kth_smallest(a,n,((n)/2)) 35 | 36 | /* -------------------------------------------------------------------------------- 37 | * Transposes an array (float) 38 | * -------------------------------------------------------------------------------- 39 | */ 40 | void transpose_array_float(float* array, int n, int d, float* array_transposed); 41 | 42 | /* -------------------------------------------------------------------------------- 43 | * Transposes an array (double) 44 | * -------------------------------------------------------------------------------- 45 | */ 46 | void transpose_array_double(double* array, int n, int d, 47 | double* array_transposed); 48 | 49 | int compare_floats(const void *p1, const void *p2); 50 | 51 | int compare_ints(const void *p1, const void *p2); 52 | 53 | FLOAT_TYPE kth_smallest(FLOAT_TYPE a[], int n, int k); 54 | int kth_smallest_idx(FLOAT_TYPE a[], int n, int k); 55 | 56 | #endif 57 | -------------------------------------------------------------------------------- /woody/util/url.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2013-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | from __future__ import print_function 7 | 8 | import os 9 | import sys 10 | 11 | try: 12 | import urllib.request as urllib2 13 | except ImportError: 14 | import urllib2 15 | 16 | def download_from_url(url, fname): 17 | """ Downloads data from a given url. 18 | 19 | Parameters 20 | ---------- 21 | url : str 22 | The target url from which the data 23 | shall be downloaded 24 | fname : str 25 | The local filename; if the corresponding 26 | directory does not exists, it will be created 27 | """ 28 | 29 | # create directory if needed 30 | d = os.path.dirname(fname) 31 | if not os.path.exists(d): 32 | os.makedirs(d) 33 | 34 | # open local file 35 | f = open(fname, 'wb') 36 | 37 | # get data from url; based on 38 | # http://stackoverflow.com/questions/22676/how-do-i-download-a-file-over-http-using-python 39 | u = urllib2.urlopen(url) 40 | meta = u.info() 41 | fsize = int(meta.getheaders("Content-Length")[0]) 42 | print("Downloading from %s (%i bytes) ... 
\n" % (url, fsize)) 43 | 44 | fsize_current = 0 45 | block_size = 8192 46 | 47 | print("Progress") 48 | while True: 49 | 50 | buff = u.read(block_size) 51 | if not buff: 52 | break 53 | 54 | fsize_current += len(buff) 55 | f.write(buff) 56 | 57 | percent = fsize_current * 100. / fsize 58 | 59 | sys.stdout.flush() 60 | sys.stdout.write("\r%2d%%" % percent) 61 | 62 | sys.stdout.flush() 63 | 64 | print("\n") 65 | f.close() 66 | -------------------------------------------------------------------------------- /woody/models/forest/src/tree/cpu/include/criteria.h: -------------------------------------------------------------------------------- 1 | /* 2 | * criteria.h 3 | * 4 | * Created on: 08.01.2015 5 | * Author: fgieseke 6 | */ 7 | 8 | #ifndef ENSEMBLE_HUGE_FOREST_INCLUDE_CRITERIA_H_ 9 | #define ENSEMBLE_HUGE_FOREST_INCLUDE_CRITERIA_H_ 10 | 11 | #include "../../include/global.h" 12 | #include "../../include/util.h" 13 | 14 | void criterion_improvement_via_threshold(FLOAT_TYPE threshold, PATTERN_LABEL_WEIGHT *XF_Y_W, TRAINING_DATA *train_data, 15 | TRAVERSAL_RECORD *trecord, PARAMETERS *params, SPLIT_RECORD *current_split); 16 | 17 | /* -------------------------------------------------------------------------------- 18 | * Computes the impurity for samples[start:end] 19 | * Similar to RegressionCriterion(Criterion) of sklearn 20 | * -------------------------------------------------------------------------------- 21 | */ 22 | FLOAT_TYPE cpu_criterion_leaf(int start, int end, 23 | TRAINING_DATA *train_data, PARAMETERS *params); 24 | 25 | /* -------------------------------------------------------------------------------- 26 | * Initializes a splitting criterion (which can be updated). 27 | * -------------------------------------------------------------------------------- 28 | */ 29 | void init_criterion_cpu(CRITERION_RECORD *crit_record, PATTERN_LABEL_WEIGHT *XF_Y_W, 30 | int n_XF_Y_W, PARAMETERS *params, TRAINING_DATA *train_data); 31 | 32 | void free_criterion_cpu(CRITERION_RECORD *crit_record, PARAMETERS *params, TRAINING_DATA *train_data); 33 | 34 | /* -------------------------------------------------------------------------------- 35 | * Updates a criterion. 36 | * -------------------------------------------------------------------------------- 37 | */ 38 | void inline update_criterion_cpu(CRITERION_RECORD *crit_record, 39 | PATTERN_LABEL_WEIGHT *XF_Y_W, int n_XF_Y_W, int new_pos, PARAMETERS *params, TRAINING_DATA *train_data); 40 | 41 | #endif /* ENSEMBLE_HUGE_FOREST_INCLUDE_CRITERIA_H_ */ 42 | -------------------------------------------------------------------------------- /woody/models/sampler.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | import copy 7 | import numpy 8 | 9 | class Sampler(object): 10 | """ 11 | """ 12 | 13 | def __init__(self, model, seed=0, n_estimators=10, percentage=0.5): 14 | 15 | self.model = model 16 | self.seed = seed 17 | self.n_estimators = n_estimators 18 | self.percentage = percentage 19 | 20 | self.models = [] 21 | for i in xrange(self.n_estimators): 22 | self.models.append(copy.deepcopy(self.model)) 23 | 24 | def fit(self, X, y): 25 | 26 | for i in xrange(self.n_estimators): 27 | print("Fitting model %i ..." 
% i) 28 | partition = numpy.random.permutation(X.shape[0]) 29 | partition = partition[:int(self.percentage * len(partition))] 30 | Xsub = X[partition] 31 | ysub = y[partition] 32 | self.models[i].fit(Xsub, ysub) 33 | 34 | def predict(self, X, operator="max"): 35 | 36 | all_predictions = self._predict_all(X) 37 | 38 | preds = [] 39 | for j in xrange(all_predictions.shape[0]): 40 | p = all_predictions[j,:] 41 | values, counts = numpy.unique(p,return_counts=True) 42 | ind = numpy.argmax(counts) 43 | preds.append(values[ind]) 44 | preds = numpy.array(preds) 45 | 46 | return preds 47 | 48 | def _predict_all(self, X): 49 | 50 | predictions = [] 51 | for i in xrange(self.n_estimators): 52 | print("Computing predictions for model %i ..." % i) 53 | preds = self.models[i].predict(X) 54 | predictions.append(preds) 55 | predictions = numpy.array(predictions).T 56 | 57 | return predictions 58 | -------------------------------------------------------------------------------- /woody/models/forest/classification.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | from .base import Wood 7 | 8 | class WoodClassifier(Wood): 9 | """ Random forest classifier. 10 | """ 11 | 12 | def __init__(self, 13 | seed=0, 14 | n_estimators=10, 15 | min_samples_split=2, 16 | max_features=None, 17 | bootstrap=False, 18 | max_depth=None, 19 | min_samples_leaf=1, 20 | criterion="gini", 21 | tree_traversal_mode="dfs", 22 | leaf_stopping_mode="all", 23 | tree_type="randomized", 24 | float_type="double", 25 | patts_trans=True, 26 | do_patts_trans=True, 27 | lam_criterion=0.0, 28 | n_jobs=1, 29 | verbose=1, 30 | ): 31 | 32 | super(WoodClassifier, self).__init__( 33 | seed=seed, 34 | n_estimators=n_estimators, 35 | min_samples_split=min_samples_split, 36 | max_features=max_features, 37 | bootstrap=bootstrap, 38 | max_depth=max_depth, 39 | min_samples_leaf=min_samples_leaf, 40 | learning_type="classification", 41 | criterion=criterion, 42 | tree_traversal_mode=tree_traversal_mode, 43 | leaf_stopping_mode=leaf_stopping_mode, 44 | tree_type=tree_type, 45 | float_type=float_type, 46 | patts_trans=patts_trans, 47 | do_patts_trans=do_patts_trans, 48 | lam_criterion=lam_criterion, 49 | n_jobs=n_jobs, 50 | verbose=verbose) -------------------------------------------------------------------------------- /woody/models/forest/regression.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | from .base import Wood 7 | 8 | class WoodRegressor(Wood): 9 | """ Random forest regressor. 
10 | """ 11 | 12 | def __init__(self, 13 | seed=0, 14 | n_estimators=10, 15 | min_samples_split=2, 16 | max_features=None, 17 | bootstrap=False, 18 | max_depth=None, 19 | min_samples_leaf=1, 20 | criterion="mse", 21 | tree_traversal_mode="dfs", 22 | leaf_stopping_mode="all", 23 | tree_type="randomized", 24 | float_type="double", 25 | patts_trans=True, 26 | do_patts_trans=True, 27 | lam_criterion=0.0, 28 | n_jobs=1, 29 | verbose=1, 30 | ): 31 | 32 | super(WoodRegressor, self).__init__( 33 | seed=seed, 34 | n_estimators=n_estimators, 35 | min_samples_split=min_samples_split, 36 | max_features=max_features, 37 | bootstrap=bootstrap, 38 | max_depth=max_depth, 39 | min_samples_leaf=min_samples_leaf, 40 | learning_type="regression", 41 | criterion=criterion, 42 | tree_traversal_mode=tree_traversal_mode, 43 | leaf_stopping_mode=leaf_stopping_mode, 44 | tree_type=tree_type, 45 | float_type=float_type, 46 | patts_trans=patts_trans, 47 | do_patts_trans=do_patts_trans, 48 | lam_criterion=lam_criterion, 49 | n_jobs=n_jobs, 50 | verbose=verbose) 51 | -------------------------------------------------------------------------------- /woody/data/artificial.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | import os 7 | import shutil 8 | 9 | from woody.io import DataGenerator 10 | 11 | from .util import save_to_h5pd 12 | 13 | def get_artificial_data(size=1000, seed=0): 14 | 15 | from sklearn.datasets import make_classification 16 | 17 | X, y = make_classification(n_samples=size, n_features=2, n_redundant=0, 18 | n_informative=2, random_state=seed, 19 | n_clusters_per_class=1) 20 | n_train = len(X) / 2 21 | X_train, y_train, X_test, y_test = X[:n_train], y[:n_train], X[n_train:], y[n_train:] 22 | 23 | return X_train, y_train, X_test, y_test 24 | 25 | def _convert_datasets(data_path, size=1000, seed=0): 26 | 27 | X_train, y_train, X_test, y_test = get_artificial_data(size=size, seed=seed) 28 | 29 | fname_store_train = os.path.join(data_path, "artificial/train_" + str(size) + ".h5pd") 30 | fname_store_test = os.path.join(data_path, "artificial/test_" + str(size) + ".h5pd") 31 | 32 | save_to_h5pd(X_train, y_train, fname_store_train) 33 | save_to_h5pd(X_test, y_test, fname_store_test) 34 | 35 | def get_artificial_generator(data_path, size=1000, seed=0, part="train", store="h5", patterns=True, target=True): 36 | 37 | if part=="train": 38 | fname = os.path.join(data_path, "artificial/train_" + str(size) + ".h5pd") 39 | elif part=="test": 40 | fname = os.path.join(data_path, "artificial/test_" + str(size) + ".h5pd") 41 | 42 | 43 | try: 44 | shutil.rmtree(fname) 45 | except: 46 | pass 47 | 48 | if not os.path.exists(fname): 49 | print("Store for artificial data does not exist. 
Generating all stores ...") 50 | _convert_datasets(data_path, size=size, seed=seed) 51 | 52 | return DataGenerator(fname=fname, seed=seed, patterns=patterns, target=target, chunksize=200000) 53 | -------------------------------------------------------------------------------- /woody/util/base.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | import os 7 | 8 | def makedirs(d): 9 | """ 10 | """ 11 | 12 | if not os.path.exists(d): 13 | os.makedirs(d) 14 | 15 | def ensure_dir_for_file(f): 16 | """ 17 | """ 18 | 19 | d = os.path.dirname(f) 20 | makedirs(d) 21 | 22 | def convert_to_libsvm(ifile_name, ofile_name, counter_print=1000000, label_offset=None): 23 | 24 | orig_labels = [] 25 | new_labels = [] 26 | 27 | ifile = open(ifile_name, 'r') 28 | ofile = open(ofile_name, 'w') 29 | 30 | # process file line-by-line 31 | counter = 0 32 | 33 | for line in ifile: 34 | 35 | new_line = [] 36 | 37 | if counter % counter_print == 0: 38 | print("Processing line %i ..." % counter) 39 | print("orig_labels=" + str(orig_labels)) 40 | print("new_labels=" + str(new_labels)) 41 | 42 | line = line.split(',') 43 | 44 | # append label 45 | label = line[0] 46 | orig_labels = list(orig_labels) 47 | orig_labels.append(label) 48 | orig_labels = set(orig_labels) 49 | 50 | if label_offset is not None: 51 | label = int(label) + label_offset 52 | new_labels = list(new_labels) 53 | new_labels.append(label) 54 | new_labels = set(new_labels) 55 | 56 | new_line.append(str(label)) 57 | 58 | # append features 59 | for i, item in enumerate(line[1:]): 60 | new_item = "%s:%s" % (i+1, item.strip()) 61 | new_line.append(new_item) 62 | 63 | new_line = " ".join(new_line) 64 | new_line += "\n" 65 | 66 | ofile.write(new_line) 67 | 68 | counter += 1 69 | 70 | ifile.close() 71 | ofile.close() 72 | 73 | print("orig_labels=" + str(orig_labels)) 74 | print("new_labels=" + str(new_labels)) -------------------------------------------------------------------------------- /woody/models/subset/regression.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | from woody.io import DiskStore 7 | 8 | from .base import SubsetWood 9 | 10 | class SubsetWoodRegressor(SubsetWood): 11 | """ Random forest regressor. 
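    Each estimator is trained on a random subset of the training patterns; the
    subsampling itself is done by the SubsetWood base class (its fit method
    permutes the patterns and keeps a fixed fraction of them).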
12 | """ 13 | 14 | def __init__(self, 15 | seed=0, 16 | n_estimators=10, 17 | min_samples_split=2, 18 | max_features=None, 19 | bootstrap=False, 20 | max_depth=None, 21 | min_samples_leaf=1, 22 | criterion="mse", 23 | tree_traversal_mode="dfs", 24 | leaf_stopping_mode="all", 25 | tree_type="randomized", 26 | float_type="double", 27 | patts_trans=True, 28 | do_patts_trans=True, 29 | lam_criterion = 1.0, 30 | n_jobs=1, 31 | verbose=1, 32 | odir=".subsetwood", 33 | store=DiskStore(), 34 | ): 35 | 36 | super(SubsetWoodRegressor, self).__init__( 37 | seed=seed, 38 | n_estimators=n_estimators, 39 | min_samples_split=min_samples_split, 40 | max_features=max_features, 41 | bootstrap=bootstrap, 42 | max_depth=max_depth, 43 | min_samples_leaf=min_samples_leaf, 44 | learning_type="regression", 45 | criterion=criterion, 46 | tree_traversal_mode=tree_traversal_mode, 47 | leaf_stopping_mode=leaf_stopping_mode, 48 | tree_type=tree_type, 49 | float_type=float_type, 50 | patts_trans=patts_trans, 51 | do_patts_trans=do_patts_trans, 52 | lam_criterion=lam_criterion, 53 | n_jobs=n_jobs, 54 | verbose=verbose, 55 | odir=odir, 56 | store=store) 57 | -------------------------------------------------------------------------------- /woody/models/subset/classification.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | from woody.io import DiskStore 7 | 8 | from .base import SubsetWood 9 | 10 | class SubsetWoodClassifier(SubsetWood): 11 | """ Random forest classifier. 12 | """ 13 | 14 | def __init__(self, 15 | seed=0, 16 | n_estimators=10, 17 | min_samples_split=2, 18 | max_features=None, 19 | bootstrap=False, 20 | max_depth=None, 21 | min_samples_leaf=1, 22 | criterion="gini", 23 | tree_traversal_mode="dfs", 24 | leaf_stopping_mode="all", 25 | tree_type="randomized", 26 | float_type="double", 27 | patts_trans=True, 28 | do_patts_trans=True, 29 | lam_criterion = 1.0, 30 | n_jobs=1, 31 | verbose=1, 32 | odir=".subsetwood", 33 | store=DiskStore(), 34 | ): 35 | 36 | super(SubsetWoodClassifier, self).__init__( 37 | seed=seed, 38 | n_estimators=n_estimators, 39 | min_samples_split=min_samples_split, 40 | max_features=max_features, 41 | bootstrap=bootstrap, 42 | max_depth=max_depth, 43 | min_samples_leaf=min_samples_leaf, 44 | learning_type="classification", 45 | criterion=criterion, 46 | tree_traversal_mode=tree_traversal_mode, 47 | leaf_stopping_mode=leaf_stopping_mode, 48 | tree_type=tree_type, 49 | float_type=float_type, 50 | patts_trans=patts_trans, 51 | do_patts_trans=do_patts_trans, 52 | lam_criterion=lam_criterion, 53 | n_jobs=n_jobs, 54 | verbose=verbose, 55 | odir=odir, 56 | store=store) -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | woody 2 | ===== 3 | 4 | A Python library for constructing very large random forests. The basic idea is to use "top trees" built for a small random subset of the data and to use these top trees to distribute all the training instances to the top trees' leaves. For each leaf, one or more bottom trees are built. For the bottom trees, woody resorts to pure C code that follows the random forest construction scheme provided by the `Scikit-Learn `_. 5 | 6 | Dependencies 7 | ------------ 8 | 9 | The woody package is tested under Python 2.7. See the requirements.txt for the packages that need to be installed. 
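In particular, the code base imports numpy, scipy, pandas, h5py, and scikit-learn; networkx and matplotlib are only needed for the optional tree plotting utilities in woody/util/draw.py.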
10 | 11 | Further, `Swig `_, `setuptools `_, and a working C/C++ compiler need to be available. 12 | 13 | Quickstart 14 | ---------- 15 | 16 | To install the package from the sources, first get the current development release via:: 17 | 18 | git clone https://github.com/gieseke/woody.git 19 | 20 | Afterwards, install a virtual environment via virtualenv. Go to the root of the woody package and type:: 21 | 22 | mkdir .venv 23 | cd .venv 24 | virtualenv woody 25 | source woody/bin/activate 26 | cd .. 27 | pip install -r requirements 28 | 29 | Next, you can install the package locally (development) via:: 30 | 31 | python setup.py clean 32 | python setup.py develop 33 | 34 | To run all the experiments, you also need to manually install:: 35 | 36 | git clone https://github.com/tgsmith61591/skutil 37 | cd skutil 38 | python setup.py install 39 | 40 | Experiments 41 | ----------- 42 | 43 | To run the experiments, simply run the launch.py file in the corresponding subdirectory. The associated run files will automatically download the datasets needed (in case this phase is interrupted, please delete the incomplete data files in the corresponding directory under woody/data). For instance:: 44 | 45 | cd experiments/small_data 46 | python launch.py 47 | 48 | Disclaimer 49 | ---------- 50 | 51 | The source code is published under the GNU General Public License (GPLv3). The authors are not responsible for any implications that stem from the use of this software. 52 | 53 | -------------------------------------------------------------------------------- /woody/models/forest/src/include/pqueue.h: -------------------------------------------------------------------------------- 1 | /* 2 | * heap.h 3 | * 4 | * Created on: 21.10.2014 5 | * Author: fgieseke 6 | */ 7 | 8 | #ifndef COMMON_INCLUDE_PQUEUE_H_ 9 | #define COMMON_INCLUDE_PQUEUE_H_ 10 | 11 | #include 12 | #include 13 | 14 | #define PQUEUE_MIN_SIZE 64 15 | 16 | typedef struct { 17 | void * data; 18 | int pri; 19 | } PQUEUE_ITEM; 20 | 21 | typedef struct { 22 | PQUEUE_ITEM *buf; 23 | int n; 24 | int alloc; 25 | } PQUEUE; 26 | 27 | // macros 28 | #define pqueue_purge(q) (q)->n = 1 29 | #define pqueue_size(q) ((q)->n - 1) 30 | 31 | /* -------------------------------------------------------------------------------- 32 | * Instantiates a new queue 33 | * -------------------------------------------------------------------------------- 34 | */ 35 | PQUEUE *pqueue_new(int size); 36 | 37 | /* -------------------------------------------------------------------------------- 38 | * Tests if the queue is empty 39 | * -------------------------------------------------------------------------------- 40 | */ 41 | inline int pqueue_is_empty(PQUEUE *q); 42 | 43 | /* -------------------------------------------------------------------------------- 44 | * Pushes "data" with priority "pri" 45 | * -------------------------------------------------------------------------------- 46 | */ 47 | void pqueue_push(PQUEUE *q, void *data, int pri); 48 | 49 | /* -------------------------------------------------------------------------------- 50 | * Removes top item (or returns 0 if queue is empty); *pri can be NULL. 
51 | * -------------------------------------------------------------------------------- 52 | */ 53 | void *pqueue_pop(PQUEUE *q, int *pri); 54 | 55 | /* -------------------------------------------------------------------------------- 56 | * Returns the top of the queue 57 | * -------------------------------------------------------------------------------- 58 | */ 59 | inline void *pqueue_top(PQUEUE *q, int *pri); 60 | 61 | /* -------------------------------------------------------------------------------- 62 | * Combines/merges two queues 63 | * -------------------------------------------------------------------------------- 64 | */ 65 | void pqueue_combine(PQUEUE *q1, PQUEUE *q2); 66 | 67 | #endif /* COMMON_INCLUDE_PQUEUE_H_ */ 68 | -------------------------------------------------------------------------------- /woody/models/forest/src/include/timing.h: -------------------------------------------------------------------------------- 1 | /* 2 | * timing.h 3 | */ 4 | #ifndef INCLUDE_TIMING_H_ 5 | #define INCLUDE_TIMING_H_ 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | // don't use time if not specified 22 | #ifndef TIMING 23 | #define TIMING 0 24 | #endif 25 | 26 | // struct for input parameters 27 | typedef struct timer_struct { 28 | 29 | long start_time; 30 | double elapsed_time; 31 | double elapsed_time_total; 32 | 33 | } TIMER; 34 | 35 | #define INIT_MY_TIMER init_my_timer 36 | #define START_MY_TIMER start_my_timer 37 | #define RESUME_MY_TIMER start_my_timer 38 | #define STOP_MY_TIMER stop_my_timer 39 | #define GET_MY_TIMER get_my_timer 40 | 41 | void start_my_timer(TIMER *timer); 42 | void resume_my_timer(TIMER *timer); 43 | void stop_my_timer(TIMER *timer); 44 | double get_my_timer(TIMER *timer); 45 | void init_my_timer(TIMER *timer); 46 | 47 | // timing macros 48 | #if TIMING > 0 49 | #define DEFINE_TIMER(num) long start_time##num = 0; double elapsed_time##num = 0.0f; double elapsed_time_total##num = 0.0f; 50 | #define DECLARE_TIMER(num) extern long start_time##num; extern double elapsed_time##num; extern double elapsed_time_total##num; 51 | #define START_TIMER(num) start_time##num = get_system_time_in_microseconds(); 52 | #define STOP_TIMER(num) elapsed_time##num = (((double)get_system_time_in_microseconds())-((double)start_time##num)); elapsed_time_total##num+=elapsed_time##num; 53 | #define GET_TIME(num) (double)(1.0*elapsed_time_total##num / 1000000.0) 54 | #define RESET_TIMER(num) start_time##num = 0; elapsed_time##num = 0.0f; elapsed_time_total##num = 0.0f; 55 | #else 56 | #define DEFINE_TIMER(num) 57 | #define DECLARE_TIMER(num) 58 | #define START_TIMER(num) 59 | #define STOP_TIMER(num) 60 | #define GET_TIME(num) 61 | #define RESET_TIMER(num) 62 | #endif 63 | 64 | /* -------------------------------------------------------------------------------- 65 | * Helper method for computing the current time (w.r.t to an offset). 
66 | * -------------------------------------------------------------------------------- 67 | */ 68 | long get_system_time_in_microseconds(void); 69 | 70 | #endif /* INCLUDE_TIMING_H_ */ 71 | -------------------------------------------------------------------------------- /woody/util/array/src/array.c: -------------------------------------------------------------------------------- 1 | #include "include/array.h" 2 | 3 | /* -------------------------------------------------------------------------------- 4 | * Splits the array X according to the indices 5 | * -------------------------------------------------------------------------------- 6 | */ 7 | void split_array(FLOAT_TYPE *X, int nX, int dX, 8 | FLOAT_TYPE *Xnew, int nXnew, int dXnew, 9 | int *indicator, int nindicator, 10 | int *chunks, int nchunks, 11 | int *cumsums_minus_counts, int ncumsums_minus_counts){ 12 | 13 | int i; 14 | 15 | int *offsets = (int*) malloc(nX * sizeof(int)); 16 | 17 | compute_split_offsets(offsets, nX, indicator, nindicator, chunks, nchunks, cumsums_minus_counts, ncumsums_minus_counts); 18 | 19 | for(i=0; i= nX){ 22 | printf("Bad offset: %i [%i, %i]\n!", offsets[i], 0, nX); 23 | exit(-1); 24 | } 25 | copy_pattern(X + i * dX, Xnew + offsets[i] * dX, dX); 26 | } 27 | 28 | free(offsets); 29 | 30 | } 31 | 32 | 33 | void compute_split_offsets(int *offsets, int noffsets, 34 | int *indicator, int nindicator, 35 | int *chunks, int nchunks, 36 | int *cumsums_minus_counts, int ncumsums_minus_counts){ 37 | 38 | int i; 39 | int *chunks_counters = (int*) calloc(nchunks, sizeof(int)); 40 | 41 | for(i=0; i 0 75 | #define FLOAT_TYPE double 76 | #else 77 | #define FLOAT_TYPE float 78 | #endif 79 | 80 | #define FREE_RESOURCES cpu_free_resources 81 | #define COMPUTE_SPLITS cpu_compute_splits 82 | #define INIT_BINDICES cpu_init_bindices 83 | #define FREE_BINDICES cpu_free_bindices 84 | #define INIT_TRAINING_DATA cpu_init_training_data 85 | #define FREE_TRAINING_DATA cpu_free_training_data 86 | #define INIT cpu_init 87 | #define INIT_AFTER_FITTING cpu_init_after_fitting 88 | #define PREDICT cpu_predict 89 | 90 | #define PRINT(params) if ((params->verbosity_level) > 0) printf 91 | 92 | #endif /* FOREST_STANDARD_GLOBAL_INCLUDE_H_ */ 93 | -------------------------------------------------------------------------------- /woody/util/array/setup.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | import os 7 | import numpy 8 | 9 | FILES_TO_BE_COMPILED_CPU = ["array.c", "util.c"] 10 | DIRS_TO_BE_INCLUDED_CPU = ["include"] 11 | 12 | SOURCES_RELATIVE_PATH = "src/" 13 | current_path = os.path.dirname(os.path.abspath(__file__)) 14 | sources_abs_path = os.path.abspath(os.path.join(current_path, SOURCES_RELATIVE_PATH)) 15 | 16 | # source files 17 | source_files_cpu = [os.path.abspath(os.path.join(sources_abs_path, x)) for x in FILES_TO_BE_COMPILED_CPU] 18 | include_paths_cpu = [os.path.abspath(os.path.join(sources_abs_path, x)) for x in DIRS_TO_BE_INCLUDED_CPU] 19 | 20 | numpy_include = numpy.get_include() 21 | 22 | def configuration(parent_package='', top_path=None): 23 | 24 | from numpy.distutils.misc_util import Configuration 25 | config = Configuration('util/c', parent_package, top_path) 26 | 27 | # CPU + FLOAT 28 | config.add_extension("_wrapper_utils_cpu_float", \ 29 | sources = ["swig/cpu_float.i"] + source_files_cpu, 30 | swig_opts=['-modern'], 31 | include_dirs = [numpy_include] +[include_paths_cpu], 32 | define_macros = 
[ 33 | ('USE_DOUBLE', 0), 34 | ], 35 | libraries=['gomp'], 36 | extra_compile_args=["-std=gnu89", "-fopenmp", '-pthread', '-O3', '-Wall', '-Wno-unused-label'] + ['-I'+ipath for ipath in include_paths_cpu]) 37 | 38 | # CPU + DOUBLE 39 | config.add_extension("_wrapper_utils_cpu_double", \ 40 | sources = ["swig/cpu_double.i"] + source_files_cpu, 41 | swig_opts=['-modern'], 42 | include_dirs = [numpy_include] +[include_paths_cpu], 43 | define_macros = [ 44 | ('USE_DOUBLE', 1), 45 | ], 46 | libraries=['gomp'], 47 | extra_compile_args=["-std=gnu89", "-fopenmp", '-pthread', '-O3', '-Wall', '-Wno-unused-label'] + ['-I'+ipath for ipath in include_paths_cpu]) 48 | 49 | return config 50 | 51 | if __name__ == '__main__': 52 | 53 | from numpy.distutils.core import setup 54 | setup(**configuration(top_path='').todict()) 55 | 56 | -------------------------------------------------------------------------------- /woody/models/forest/util.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | import numpy 7 | 8 | def ensure_data_types(X, y, numpy_dtype_float): 9 | 10 | # ensure floats everywhere (e.g., for split array computations) 11 | if X.dtype != numpy_dtype_float: 12 | X = X.astype(numpy_dtype_float) 13 | if y.dtype != numpy_dtype_float: 14 | y = y.astype(numpy_dtype_float) 15 | 16 | return X, y 17 | 18 | class PickableWoodyRFWrapper(object): 19 | """ 20 | """ 21 | 22 | def __init__(self, *args): 23 | 24 | self.args = args 25 | 26 | self.float_type = args[0] 27 | 28 | self._params_swig = self.module.PARAMETERS() 29 | self._forest_swig = self.module.FOREST() 30 | 31 | @property 32 | def params(self): 33 | 34 | return self._params_swig 35 | 36 | @property 37 | def forest(self): 38 | 39 | return self._forest_swig 40 | 41 | @property 42 | def module(self): 43 | 44 | return self._get_wrapper_module() 45 | 46 | def _get_wrapper_module(self): 47 | 48 | if self.float_type == "float": 49 | import wrapper_cpu_float 50 | return wrapper_cpu_float 51 | elif self.float_type == "double": 52 | import wrapper_cpu_double 53 | return wrapper_cpu_double 54 | 55 | def __setstate__(self, state): 56 | """ Is called when object is unpickled 57 | """ 58 | 59 | self.__dict__.update(state) 60 | 61 | self._params_swig = self.module.PARAMETERS() 62 | self._forest_swig = self.module.FOREST() 63 | 64 | self._get_wrapper_module().restore_forest_from_array_extern(self.params, self.forest, self._aforest) 65 | 66 | def __getstate__(self): 67 | """ Is called when object is pickled 68 | 69 | https://docs.python.org/3/library/pickle.html#pickle-state 70 | 71 | """ 72 | 73 | n_bytes_forest = self._get_wrapper_module().get_num_bytes_forest_extern(self.params, self.forest); 74 | n_bytes_forest = int((float(n_bytes_forest) / 4.0) + 4) 75 | 76 | aforest = numpy.empty(n_bytes_forest, dtype=numpy.int32) 77 | self._get_wrapper_module().get_forest_as_array_extern(self.params, self.forest, aforest) 78 | 79 | state = self.__dict__.copy() 80 | state['_aforest'] = aforest 81 | 82 | del state['_params_swig'] 83 | del state['_forest_swig'] 84 | 85 | return state 86 | -------------------------------------------------------------------------------- /woody/models/huge/util.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | import numpy 7 | import multiprocessing 8 | 9 | from woody.util import split_array 10 | 11 | from .. 
import Wood 12 | 13 | def distribute_patterns(toptree, X, y, verbose=0, logger=None): 14 | 15 | if logger is not None: 16 | logger.debug("\tUsing top tree to distribute patterns to leaves ...") 17 | 18 | leaves_ids = toptree.get_leaves_ids(X) 19 | unique_leaves_ids, counts = numpy.unique(leaves_ids, return_counts=True) 20 | 21 | if logger is not None: 22 | logger.debug("\tPatterns are distributed to %i leaves of the top tree ..." % len(unique_leaves_ids)) 23 | 24 | chunks = -1 * numpy.ones(int(unique_leaves_ids[-1]) + 1, dtype=numpy.int32) 25 | for i in xrange(len(unique_leaves_ids)): 26 | leaf_id = int(unique_leaves_ids[i]) 27 | chunks[leaf_id] = i 28 | 29 | Xsubs, ysubs = {}, {} 30 | 31 | Xnew = split_array(X, leaves_ids, chunks, counts) 32 | ynew = split_array(y, leaves_ids, chunks, counts) 33 | 34 | current_count = 0 35 | for i in xrange(len(unique_leaves_ids)): 36 | leaf_id = unique_leaves_ids[i] 37 | cts = counts[i] 38 | Xsubs[leaf_id] = Xnew[current_count:current_count + cts, :] 39 | ysubs[leaf_id] = ynew[current_count:current_count + cts] 40 | current_count += cts 41 | 42 | return Xsubs, ysubs, unique_leaves_ids 43 | 44 | def get_XY_subsets_from_store(dset, heavy_leaf_domsize): 45 | 46 | pure = False 47 | 48 | ychunk = numpy.array(dset[:, -1]) 49 | counts = numpy.bincount(ychunk.astype(numpy.int32)) 50 | 51 | dominant = numpy.argmax(counts) 52 | if len(ychunk) > heavy_leaf_domsize: 53 | rsubset = numpy.random.choice(len(ychunk), heavy_leaf_domsize) 54 | else: 55 | rsubset = numpy.arange(len(ychunk)) 56 | subindices = ychunk != dominant 57 | subindices = numpy.union1d(rsubset, subindices) 58 | subindices.sort() 59 | 60 | # random access slow in h5py, process in chunks 61 | Xsub, ysub = numpy.array(dset[:, :-1]), numpy.array(dset[:, -1]) 62 | Xsub, ysub = Xsub[subindices,:], ysub[subindices] 63 | 64 | print "REMOVE UPWARDS, no XSUB needed" 65 | if (counts != 0).sum() == 1: 66 | pure = True 67 | 68 | return Xsub, ysub, pure 69 | 70 | def _load_single_tree(store, fname, wrapped_instance, typ=None): 71 | 72 | assert typ in ["top", "bottom"] 73 | 74 | if typ == "top": 75 | return store.load(fname, Wood) 76 | 77 | elif typ == "bottom": 78 | return store.load(fname, wrapped_instance) 79 | 80 | return None 81 | -------------------------------------------------------------------------------- /woody/models/huge/regression.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | from woody.io import DiskStore 7 | 8 | from .base import HugeWood 9 | 10 | from .. import WoodRegressor 11 | 12 | class HugeWoodRegressor(HugeWood): 13 | """ Large-scale contruction of a random forest on 14 | a single workstation (with limited memory resources). 15 | Each tree belonging to the ensemble is constructed 16 | in a multi-stage fashion and the intermediate data 17 | are stored on disk (e.g., via h5py). 
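    Illustrative construction (a sketch only; the keyword arguments below are
    the ones exposed by __init__, whereas the actual training call is provided
    by the HugeWood base class and is not shown here):

        from woody.models import HugeWoodRegressor, WoodRegressor
        from woody.io import DiskStore

        model = HugeWoodRegressor(n_estimators=4,
                                  n_estimators_bottom=4,
                                  n_jobs=2,
                                  odir=".hugewood",
                                  wrapped_instance=WoodRegressor(),
                                  store=DiskStore())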
18 | """ 19 | 20 | TKEY_ALL_FIT = 0 21 | TKEY_TOP_TREE = 1 22 | TKEY_DISTR_PATTS = 2 23 | TKEY_BOTTOM_TREES = 3 24 | 25 | MAX_RAND_INT = 10000000 26 | 27 | def __init__(self, 28 | n_top="auto", 29 | n_patterns_leaf="auto", 30 | balanced_top_tree=True, 31 | top_tree_lambda=0.0, 32 | top_tree_max_depth=None, 33 | top_tree_type="randomized", 34 | top_tree_leaf_stopping_mode="ignore_impurity", 35 | n_estimators=1, 36 | n_estimators_bottom=1, 37 | n_jobs=1, 38 | seed=0, 39 | odir=".hugewood", 40 | verbose=1, 41 | plot_intermediate={}, 42 | chunk_max_megabytes=256, 43 | wrapped_instance=WoodRegressor(), 44 | store=DiskStore(), 45 | ): 46 | 47 | super(HugeWoodRegressor, self).__init__(n_top=n_top, 48 | n_patterns_leaf=n_patterns_leaf, 49 | balanced_top_tree=balanced_top_tree, 50 | top_tree_lambda=top_tree_lambda, 51 | top_tree_max_depth=top_tree_max_depth, 52 | top_tree_type=top_tree_type, 53 | top_tree_leaf_stopping_mode=top_tree_leaf_stopping_mode, 54 | n_estimators=n_estimators, 55 | n_estimators_bottom=n_estimators_bottom, 56 | n_jobs=n_jobs, 57 | seed=seed, 58 | odir=odir, 59 | verbose=verbose, 60 | plot_intermediate=plot_intermediate, 61 | chunk_max_megabytes=chunk_max_megabytes, 62 | wrapped_instance=wrapped_instance, 63 | store=store) 64 | -------------------------------------------------------------------------------- /woody/models/huge/classification.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | from woody.io import DiskStore 7 | 8 | from .base import HugeWood 9 | 10 | from .. import WoodClassifier 11 | 12 | class HugeWoodClassifier(HugeWood): 13 | """ Large-scale contruction of a random forest on 14 | a single workstation (with limited memory resources). 15 | Each tree belonging to the ensemble is constructed 16 | in a multi-stage fashion and the intermediate data 17 | are stored on disk (e.g., via h5py). 
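    The constructor takes the same arguments as HugeWoodRegressor; by default,
    a WoodClassifier instance is used as the wrapped estimator for the bottom
    trees.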
18 |     """
19 | 
20 |     TKEY_ALL_FIT = 0
21 |     TKEY_TOP_TREE = 1
22 |     TKEY_DISTR_PATTS = 2
23 |     TKEY_BOTTOM_TREES = 3
24 | 
25 |     MAX_RAND_INT = 10000000
26 | 
27 |     def __init__(self,
28 |                  n_top="auto",
29 |                  n_patterns_leaf="auto",
30 |                  balanced_top_tree=True,
31 |                  top_tree_lambda=0.0,
32 |                  top_tree_max_depth=None,
33 |                  top_tree_type="randomized",
34 |                  top_tree_leaf_stopping_mode="ignore_impurity",
35 |                  n_estimators=1,
36 |                  n_estimators_bottom=1,
37 |                  n_jobs=1,
38 |                  seed=0,
39 |                  odir=".hugewood",
40 |                  verbose=1,
41 |                  plot_intermediate={},
42 |                  chunk_max_megabytes=256,
43 |                  wrapped_instance=WoodClassifier(),
44 |                  store=DiskStore(),
45 |                  ):
46 | 
47 |         super(HugeWoodClassifier, self).__init__(n_top=n_top,
48 |                                         n_patterns_leaf=n_patterns_leaf,
49 |                                         balanced_top_tree=balanced_top_tree,
50 |                                         top_tree_lambda=top_tree_lambda,
51 |                                         top_tree_max_depth=top_tree_max_depth,
52 |                                         top_tree_type=top_tree_type,
53 |                                         top_tree_leaf_stopping_mode=top_tree_leaf_stopping_mode,
54 |                                         n_estimators=n_estimators,
55 |                                         n_estimators_bottom=n_estimators_bottom,
56 |                                         n_jobs=n_jobs,
57 |                                         seed=seed,
58 |                                         odir=odir,
59 |                                         verbose=verbose,
60 |                                         plot_intermediate=plot_intermediate,
61 |                                         chunk_max_megabytes=chunk_max_megabytes,
62 |                                         wrapped_instance=wrapped_instance,
63 |                                         store=store)
64 | 
--------------------------------------------------------------------------------
/woody/models/forest/src/util.c:
--------------------------------------------------------------------------------
1 | #include "include/util.h"
2 | #include "include/float.h"
3 | 
4 | /* --------------------------------------------------------------------------------
5 |  * Transposes an array (float)
6 |  * --------------------------------------------------------------------------------
7 |  */
8 | void transpose_array_float(float* array, int n, int d, float* array_transposed) {
9 | 
10 | 	int i, j;
11 | 
12 | 	for (j = 0; j < d; j++) {
13 | 		for (i = 0; i < n; i++) {
14 | 			array_transposed[j * n + i] = array[i * d + j];
15 | 		}
16 | 	}
17 | 
18 | }
19 | 
20 | /* --------------------------------------------------------------------------------
21 |  * Transposes an array (double)
22 |  * --------------------------------------------------------------------------------
23 |  */
24 | void transpose_array_double(double* array, int n, int d,
25 | 		double* array_transposed) {
26 | 
27 | 	int i, j;
28 | 
29 | 	for (j = 0; j < d; j++) {
30 | 		for (i = 0; i < n; i++) {
31 | 			array_transposed[j * n + i] = array[i * d + j];
32 | 		}
33 | 	}
34 | 
35 | }
36 | 
37 | int compare_floats(const void *p1, const void *p2) {
38 | 
39 | 	// the index is stored at the end of each element...
40 | 	FLOAT_TYPE *p1_point, *p2_point;
41 | 	p1_point = (FLOAT_TYPE *) p1;
42 | 	p2_point = (FLOAT_TYPE *) p2;
43 | 
44 | 	if (*p1_point < *p2_point) {
45 | 		return -1;
46 | 	}
47 | 	if (*p1_point > *p2_point) {
48 | 		return +1;
49 | 	}
50 | 
51 | 	return 0;
52 | 
53 | }
54 | 
55 | int compare_ints(const void *p1, const void *p2) {
56 | 
57 | 	// the index is stored at the end of each element...
58 | 	int *p1_point, *p2_point;
59 | 	p1_point = (int *) p1;
60 | 	p2_point = (int *) p2;
61 | 
62 | 	if (*p1_point < *p2_point) {
63 | 		return -1;
64 | 	}
65 | 	if (*p1_point > *p2_point) {
66 | 		return +1;
67 | 	}
68 | 
69 | 	return 0;
70 | 
71 | }
72 | 
73 | /*---------------------------------------------------------------------------
74 |    Function :   kth_smallest()
75 |    In       :   array of elements, # of elements in the array, rank k
76 |    Out      :   one element
77 |    Job      :   find the kth smallest element in the array
78 |    Notice   :   use the median() macro defined below to get the median.
79 | 80 | Reference: 81 | 82 | Author: Wirth, Niklaus 83 | Title: Algorithms + data structures = programs 84 | Publisher: Englewood Cliffs: Prentice-Hall, 1976 85 | Physical description: 366 p. 86 | Series: Prentice-Hall Series in Automatic Computation 87 | 88 | ---------------------------------------------------------------------------*/ 89 | FLOAT_TYPE kth_smallest(FLOAT_TYPE a[], int n, int k) { 90 | 91 | return a[kth_smallest_idx(a, n, k)]; 92 | 93 | } 94 | 95 | int kth_smallest_idx(FLOAT_TYPE a[], int n, int k) { 96 | 97 | register unsigned int i, j, l, m; 98 | register FLOAT_TYPE x; 99 | 100 | l = 0; 101 | m = n - 1; 102 | 103 | while (l < m) { 104 | x = a[k]; 105 | i = l; 106 | j = m; 107 | do { 108 | while (a[i] < x) 109 | i++; 110 | while (x < a[j]) 111 | j--; 112 | if (i <= j) { 113 | ELEM_SWAP(a[i], a[j]); 114 | i++; 115 | j--; 116 | } 117 | } while (i <= j); 118 | if (j < k) 119 | l = i; 120 | if (k < i) 121 | m = j; 122 | } 123 | 124 | return k; 125 | 126 | } 127 | -------------------------------------------------------------------------------- /woody/models/huge/predict.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import gc 4 | import numpy 5 | from scipy.stats import mode 6 | 7 | from woody.io import DiskStore 8 | from woody.util import perform_task_in_parallel 9 | from .util import distribute_patterns 10 | from .util import _load_single_tree 11 | 12 | def predict_array(X, n_estimators, n_estimators_bottom, numpy_dtype_float, odir, store, wrapped_instance, n_jobs): 13 | """ Returns predictions for a given set of patterns. 14 | """ 15 | 16 | params_parallel = [] 17 | 18 | for b in xrange(n_estimators): 19 | 20 | odir_local = os.path.join(odir, str(int(b))) 21 | fname = os.path.join(odir_local, "toptree.tree") 22 | toptree = _load_single_tree(store, fname, wrapped_instance, typ="top") 23 | args = [n_estimators_bottom, toptree, X, odir_local, store, wrapped_instance, numpy_dtype_float] 24 | params_parallel.append(args) 25 | 26 | if type(store) == DiskStore: 27 | results = perform_task_in_parallel(predict_bottom, params_parallel, n_jobs=n_jobs, backend="multiprocessing") 28 | else: 29 | results = [] 30 | for param in params_parallel: 31 | res = predict_bottom(param) 32 | results.append(res) 33 | allpreds = numpy.zeros((len(X), n_estimators*n_estimators_bottom), dtype=numpy_dtype_float) 34 | for i in xrange(len(results)): 35 | allpreds[:,i*n_estimators_bottom:(i+1)*n_estimators_bottom] = results[i] 36 | allpreds = numpy.array(allpreds) 37 | 38 | preds = _combine_preds(allpreds, wrapped_instance.learning_type, numpy_dtype_float) 39 | 40 | return preds 41 | 42 | def predict_bottom(args): 43 | """ FIXME: This is by far the slowest part during prediction. 
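    For every leaf of the top tree, the corresponding bottom tree is loaded from
    the store, queried for the patterns assigned to that leaf, and released
    again; this per-leaf (de)serialization is what dominates the runtime.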
44 | """ 45 | 46 | n_estimators_bottom, toptree, X, odir, store, wrapped_instance, numpy_dtype_float = args 47 | 48 | preds = numpy.zeros((len(X), n_estimators_bottom), dtype=numpy_dtype_float) 49 | 50 | oindices = numpy.array(xrange(len(X)), dtype=numpy.float64) 51 | 52 | Xsubs, isubs, unique_leaves_ids = distribute_patterns(toptree, X, oindices) 53 | 54 | for leaf_id in unique_leaves_ids: 55 | isubs[leaf_id] = isubs[leaf_id].astype(numpy.int64) 56 | unique_leaves_ids = unique_leaves_ids.astype(numpy.int64) 57 | 58 | for leaf_id in unique_leaves_ids: 59 | fname = os.path.join(odir, str(int(leaf_id)) + ".tree") 60 | btree = _load_single_tree(store, fname, wrapped_instance, typ="bottom") 61 | pleaf = btree.predict_all(Xsubs[leaf_id]) 62 | preds[isubs[leaf_id], :] = pleaf 63 | 64 | del btree 65 | gc.collect() 66 | 67 | return preds 68 | 69 | def _combine_preds(allpreds, learning_type, numpy_dtype_float): 70 | 71 | if learning_type == "regression": 72 | 73 | preds = allpreds.mean(axis=1) 74 | 75 | elif learning_type == "classification": 76 | 77 | preds, _ = mode(allpreds, axis=1) 78 | preds = preds[:, 0] 79 | 80 | else: 81 | raise Exception("Unknown learning type for wrapped instance: %s" % learning_type) 82 | 83 | if preds.dtype != numpy_dtype_float: 84 | preds = preds.astype(numpy_dtype_float) 85 | 86 | return preds 87 | -------------------------------------------------------------------------------- /woody/models/forest/src/tree/include/types.h: -------------------------------------------------------------------------------- 1 | /* 2 | * types.h 3 | * 4 | * Created on: 15.02.2016 5 | * Author: fgieseke 6 | */ 7 | 8 | #ifndef ENSEMBLE_INCLUDE_TYPES_H_ 9 | #define ENSEMBLE_INCLUDE_TYPES_H_ 10 | 11 | #include "../../include/float.h" 12 | #include "../../include/timing.h" 13 | 14 | typedef struct parameters { 15 | 16 | int seed; 17 | int n_estimators; 18 | int min_samples_split; 19 | int max_features; 20 | int bootstrap; 21 | int max_depth; 22 | int min_samples_leaf; 23 | int num_threads; 24 | int verbosity_level; 25 | int tree_traversal_mode; 26 | int leaf_stopping_mode; 27 | int criterion; 28 | int learning_type; 29 | int tree_type; 30 | int prediction_type; 31 | int patterns_transposed; 32 | double lam_crit; 33 | int n_subset_check; 34 | 35 | // training 36 | FLOAT_TYPE *Xtrain; 37 | int nXtrain; 38 | int dXtrain; 39 | FLOAT_TYPE max_ytrain_value; 40 | 41 | TIMER timers[10]; 42 | 43 | } PARAMETERS; 44 | 45 | typedef struct bootstrap_indices { 46 | int n_indices; 47 | int *indices; 48 | int *indices_wmappings; 49 | } BINDICES; 50 | 51 | typedef struct training_data { 52 | 53 | FLOAT_TYPE *Xtrain; 54 | FLOAT_TYPE *Ytrain; 55 | FLOAT_TYPE *Ytrain_mapped; 56 | int nXtrain; 57 | int dXtrain; 58 | BINDICES *bindices; 59 | int n_classes; 60 | FLOAT_TYPE *classes; 61 | 62 | } TRAINING_DATA; 63 | 64 | typedef struct split_record { 65 | 66 | unsigned int feature; 67 | int pos; 68 | FLOAT_TYPE threshold; 69 | FLOAT_TYPE improvement; 70 | FLOAT_TYPE impurity; 71 | FLOAT_TYPE impurity_left; 72 | FLOAT_TYPE impurity_right; 73 | FLOAT_TYPE prob_left; 74 | FLOAT_TYPE prob_right; 75 | int leaf_detected; 76 | 77 | } SPLIT_RECORD; 78 | 79 | typedef struct traversal_record { 80 | 81 | int start; 82 | int end; 83 | int depth; 84 | int parent_id; 85 | int is_left_child; 86 | int is_leaf; 87 | 88 | int n_constant_features; 89 | int *const_features; 90 | 91 | SPLIT_RECORD *split_record; 92 | 93 | } TRAVERSAL_RECORD; 94 | 95 | typedef struct tree_node { 96 | 97 | unsigned int left_id; 98 | unsigned int right_id; 99 
| unsigned int feature; 100 | FLOAT_TYPE thres_or_leaf; 101 | unsigned int leaf_criterion; 102 | 103 | } TREE_NODE; 104 | 105 | typedef struct tree { 106 | 107 | TREE_NODE *root; 108 | int n_allocated; 109 | int node_counter; 110 | 111 | } TREE; 112 | 113 | typedef struct forest { 114 | 115 | TREE *trees; 116 | int n_trees; 117 | 118 | } FOREST; 119 | 120 | typedef struct pattern_label_weight PATTERN_LABEL_WEIGHT; 121 | 122 | struct pattern_label_weight { 123 | FLOAT_TYPE pattern; 124 | FLOAT_TYPE label; 125 | int weight; 126 | }; 127 | 128 | typedef struct criterion_record CRITERION_RECORD; 129 | 130 | struct criterion_record { 131 | 132 | int current_pos; 133 | FLOAT_TYPE impurity; 134 | FLOAT_TYPE impurity_left; 135 | FLOAT_TYPE impurity_right; 136 | FLOAT_TYPE improvement; 137 | 138 | int weight_left; 139 | int weight_right; 140 | 141 | // needed for regression (MSE) 142 | FLOAT_TYPE sum_left; 143 | FLOAT_TYPE sum_right; 144 | FLOAT_TYPE sq_sum_left; 145 | FLOAT_TYPE sq_sum_right; 146 | 147 | // needed for classification (GINI and ENTROPY) 148 | int *class_counts_left; 149 | int *class_counts_right; 150 | 151 | }; 152 | 153 | #endif /* ENSEMBLE_INCLUDE_TYPES_H_ */ 154 | -------------------------------------------------------------------------------- /woody/models/forest/setup.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | import os 7 | import numpy 8 | 9 | TIMING = 1 10 | 11 | FILES_TO_BE_COMPILED_CPU = ["tree/base.c", 12 | "tree/cpu.c", 13 | "tree/tree.c", 14 | "tree/util.c", 15 | "tree/cpu/base.c", 16 | "tree/cpu/criteria.c", 17 | "tree/cpu/standard.c", 18 | "tree/cpu/fastsort.c", 19 | "timing.c", 20 | "util.c", 21 | "pqueue.c", 22 | ] 23 | 24 | DIRS_TO_BE_INCLUDED_CPU = ["tree/include", "tree/cpu/include"] 25 | 26 | SOURCES_RELATIVE_PATH = "src/" 27 | current_path = os.path.dirname(os.path.abspath(__file__)) 28 | sources_abs_path = os.path.abspath(os.path.join(current_path, SOURCES_RELATIVE_PATH)) 29 | 30 | # source files 31 | source_files_cpu = [os.path.abspath(os.path.join(sources_abs_path, x)) for x in FILES_TO_BE_COMPILED_CPU] 32 | include_paths_cpu = [os.path.abspath(os.path.join(sources_abs_path, x)) for x in DIRS_TO_BE_INCLUDED_CPU] 33 | 34 | numpy_include = numpy.get_include() 35 | 36 | def configuration(parent_package='', top_path=None): 37 | 38 | from numpy.distutils.misc_util import Configuration 39 | config = Configuration('models/forest', parent_package, top_path) 40 | 41 | # CPU + FLOAT 42 | config.add_extension("_wrapper_cpu_float", \ 43 | sources = ["swig/cpu_float.i"] + source_files_cpu, 44 | swig_opts=['-modern', '-threads'], 45 | include_dirs = [numpy_include] +[include_paths_cpu], 46 | define_macros = [ 47 | ('ABSOLUTE_PATH', os.path.join(sources_abs_path, "ensemble")), 48 | ('USE_DOUBLE', 0), 49 | ('TIMING', TIMING) 50 | ], 51 | libraries=['gomp'], 52 | extra_compile_args=["-std=gnu89", "-fopenmp", '-O3', '-Wall', '-pthread', '-Wno-unused-label'] + ['-I'+ipath for ipath in include_paths_cpu]) 53 | 54 | # CPU + DOUBLE 55 | config.add_extension("_wrapper_cpu_double", \ 56 | sources = ["swig/cpu_double.i"] + source_files_cpu, 57 | swig_opts=['-modern', '-threads'], 58 | include_dirs = [numpy_include] +[include_paths_cpu], 59 | define_macros = [ 60 | ('ABSOLUTE_PATH', os.path.join(sources_abs_path, "ensemble")), 61 | ('USE_DOUBLE', 1), 62 | ('TIMING', TIMING) 63 | ], 64 | libraries=['gomp'], 65 | extra_compile_args=["-std=gnu89", "-fopenmp", 
'-O3', '-Wall', '-pthread', '-Wno-unused-label'] + ['-I'+ipath for ipath in include_paths_cpu]) 66 | 67 | return config 68 | 69 | if __name__ == '__main__': 70 | 71 | from numpy.distutils.core import setup 72 | setup(**configuration(top_path='').todict()) 73 | -------------------------------------------------------------------------------- /woody/models/forest/src/tree/include/cpu.h: -------------------------------------------------------------------------------- 1 | /* 2 | * cpu.h 3 | * 4 | * Created on: 17.04.2015 5 | * Author: fgieseke 6 | */ 7 | 8 | #ifndef FORESTS_STANDARD_INCLUDE_CPU_H_ 9 | #define FORESTS_STANDARD_INCLUDE_CPU_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include "global.h" 17 | #include "util.h" 18 | #include "../cpu/include/base.h" 19 | 20 | 21 | 22 | 23 | /* -------------------------------------------------------------------------------- 24 | * Fits a model given the training data (and parameters) 25 | * -------------------------------------------------------------------------------- 26 | */ 27 | void fit_forest(FLOAT_TYPE *Xtrain, int nXtrain, int dXtrain, 28 | FLOAT_TYPE *Ytrain, int *bootstrap_indices, int *bootstrap_indices_weights, 29 | int n_bootstrap_indices, int d_bootstrap_indices, int use_bindices, PARAMETERS *params, FOREST *forest); 30 | 31 | /* -------------------------------------------------------------------------------- 32 | * Builds a single tree. 33 | * -------------------------------------------------------------------------------- 34 | */ 35 | void build_single_tree(TREE *tree, TRAINING_DATA *train_data, 36 | PARAMETERS *params, unsigned int *rstate); 37 | 38 | 39 | 40 | /* -------------------------------------------------------------------------------- 41 | * Process huge nodes 42 | * -------------------------------------------------------------------------------- 43 | */ 44 | void process_all_nodes(TREE *tree, TRAINING_DATA *train_data, PQUEUE *huge_traversal_queue, 45 | PARAMETERS *params, unsigned int *rstate); 46 | 47 | 48 | /* -------------------------------------------------------------------------------- 49 | * Returns a chunk of traversal records. 
50 | * -------------------------------------------------------------------------------- 51 | */ 52 | TRAVERSAL_RECORD **get_chunk_trecords(PQUEUE *traversal_queue, int *n_trecords, int n_to_be_removed); 53 | 54 | /* -------------------------------------------------------------------------------- 55 | * Generate traversal records for the children of a given traversal record 56 | * -------------------------------------------------------------------------------- 57 | */ 58 | void generate_traversal_records_children(TREE *tree, TRAINING_DATA *train_data, PQUEUE *huge_traversal_queue, 59 | TRAVERSAL_RECORD *trecord, 60 | PARAMETERS *params, int node_id); 61 | 62 | void generate_next_leaf_node(int node_id, int start, int end, unsigned int leaf_criterion, int depth, int prio, int child_flag, TREE *tree, 63 | PQUEUE *huge_traversal_queue, TRAVERSAL_RECORD *trecord, TRAINING_DATA *train_data, PARAMETERS *params); 64 | 65 | /* -------------------------------------------------------------------------------- 66 | * Generates leaves and nodes 67 | * -------------------------------------------------------------------------------- 68 | */ 69 | void generate_leaves_nodes(TREE *tree, TRAVERSAL_RECORD **trecords, int n_trecords, 70 | TRAINING_DATA *train_data, PARAMETERS *params, PQUEUE *huge_traversal_queue); 71 | 72 | /* -------------------------------------------------------------------------------- 73 | * Generates a single leaf node 74 | * -------------------------------------------------------------------------------- 75 | */ 76 | void generate_leaf(TREE *tree, int start, int end, int parent_id, int is_left_child, 77 | unsigned int leaf_criterion, TRAINING_DATA *train_data, PARAMETERS *params); 78 | 79 | 80 | #endif /* FORESTS_STANDARD_INCLUDE_CPU_H_ */ 81 | -------------------------------------------------------------------------------- /woody/util/draw.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | import os 7 | import numpy 8 | 9 | def draw_single_tree(tree, 10 | node_stats=None, 11 | ax=None, 12 | figsize=(200,20), 13 | fname="tree.pdf", 14 | with_labels=True, 15 | arrows=False, 16 | edge_width=1.0, 17 | font_size=7, 18 | alpha=0.5, 19 | edges_alpha=1.0, 20 | node_size=1000): 21 | 22 | try: 23 | import networkx as nx 24 | from networkx.drawing.nx_agraph import graphviz_layout 25 | import matplotlib.pyplot as plt 26 | except Exception as e: 27 | raise Exception("Module 'networkx' is required to export the tree structure: %s" % str(e)) 28 | 29 | d = os.path.dirname(fname) 30 | if len(d) > 0: 31 | if not os.path.exists(d): 32 | os.makedirs(d) 33 | 34 | if ax is None: 35 | fig = plt.figure(figsize=figsize) 36 | ax = fig.add_subplot(111) 37 | 38 | pos = graphviz_layout(tree, prog='dot') 39 | 40 | if node_stats is not None: 41 | lmin = numpy.array([node_stats[i] for i in node_stats.keys()]).min() 42 | lmax = numpy.array([node_stats[i] for i in node_stats.keys()]).max() 43 | 44 | internal_nodes = {'labels':{}, 'sizes':[], 'node_list':[]} 45 | leaves = {'labels':{}, 'sizes':[], 'node_list':[]} 46 | 47 | for i in xrange(len(tree.nodes())): 48 | if tree.node[i]['is_leaf'] == True: 49 | leaves['node_list'].append(i) 50 | if node_stats is not None: 51 | leaves['labels'][i] = "#" + str(node_stats[i]) + "(" + str(tree.node[i]['leaf_criterion']) + ")" 52 | leaves['sizes'].append((0.00001 + ((float(node_stats[i] - lmin) / (lmax-lmin)))) * node_size) 53 | else: 54 | 
leaves['sizes'].append(node_size) 55 | else: 56 | internal_nodes['node_list'].append(i) 57 | internal_nodes['labels'][i] = "" #str(i) 58 | internal_nodes['sizes'].append(node_size) 59 | 60 | # internal nodes 61 | nx.draw_networkx_nodes(tree, 62 | pos, 63 | nodelist=internal_nodes['node_list'], 64 | ax=ax, 65 | node_color='#0000FF', 66 | with_labels=False, 67 | linewidths=edge_width, 68 | alpha=alpha, 69 | node_size=internal_nodes['sizes']) 70 | if with_labels == True: 71 | nx.draw_networkx_labels(tree, pos, internal_nodes['labels'], font_size=7) 72 | 73 | # leaves 74 | nx.draw_networkx_nodes(tree, 75 | pos, 76 | nodelist=leaves['node_list'], 77 | ax=ax, 78 | node_color='#FF0000', 79 | with_labels=False, 80 | linewidths=edge_width, 81 | alpha=alpha, 82 | node_size=leaves['sizes']) 83 | if with_labels == True: 84 | nx.draw_networkx_labels(tree, pos, leaves['labels'], font_size=font_size) 85 | 86 | # draw edges 87 | nx.draw_networkx_edges(tree, pos, edge_color='#000000', width=edge_width, alpha=edges_alpha) 88 | 89 | plt.axis('off') 90 | plt.savefig(fname, bbox_inches='tight') 91 | plt.close() 92 | plt.close() 93 | -------------------------------------------------------------------------------- /woody/models/forest/.cproject: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /woody/util/parallel.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | import multiprocessing 7 | from multiprocessing.pool import ThreadPool 8 | 9 | def pool_init(): 10 | import gc 11 | gc.collect() 12 | 13 | def wrapped_task(queue, task, args, kwargs): 14 | 15 | queue.put(task(*args, **kwargs)) 16 | 17 | from multiprocessing import Queue 18 | 19 | # https://github.com/joblib/joblib/issues/138 20 | def start_via_single_process(task, args, kwargs): 21 | 22 | queue = Queue() 23 | 24 | proc = multiprocessing.Process(target=wrapped_task, args=(queue, task, args, kwargs)) 25 | proc.start() 26 | 27 | result = queue.get() 28 | 29 | # joining might yield errors ... 
30 | # https://gist.github.com/schlamar/2311116 31 | # see https://docs.python.org/2/library/multiprocessing.html#all-platforms 32 | #proc.join() 33 | return result 34 | 35 | 36 | # def perform_task_in_parallel_in_place(task, params_parallel, n_jobs=1): 37 | # """ Performas a task in parallel (in place, not return results are generated 38 | # 39 | # Parameters 40 | # ---------- 41 | # task : callable 42 | # The function/procedure that shall be executed 43 | # params_parallel : list 44 | # The parallel parameters 45 | # n_jobs : int, default 1 46 | # The number of jobs that shall be used 47 | # """ 48 | # 49 | # 50 | # # https://docs.python.org/2/library/multiprocessing.html#module-multiprocessing.pool 51 | # pool = multiprocessing.Pool(n_jobs, maxtasksperchild=1) 52 | # results = pool.apply_async(task, params_parallel) 53 | # 54 | # pool.close() 55 | # pool.join() 56 | # 57 | # return results 58 | 59 | 60 | def perform_task_in_parallel(task, params_parallel, n_jobs=1, backend="multiprocessing"): 61 | """ Performas a task in parallel 62 | 63 | Parameters 64 | ---------- 65 | task : callable 66 | The function/procedure that shall be executed 67 | params_parallel : list 68 | The parallel parameters 69 | n_jobs : int, default 1 70 | The number of jobs that shall be used 71 | backend : str, default 'multiprocessing' 72 | """ 73 | 74 | if backend == 'multiprocessing': 75 | 76 | # https://docs.python.org/2/library/multiprocessing.html#module-multiprocessing.pool 77 | pool = multiprocessing.Pool(n_jobs, maxtasksperchild=1, initializer=pool_init) 78 | results = pool.map(task, params_parallel) 79 | 80 | pool.close() 81 | pool.join() 82 | 83 | return results 84 | 85 | elif backend == 'threading': 86 | 87 | pool = ThreadPool(n_jobs) 88 | results = pool.map(task, params_parallel) 89 | pool.close() 90 | pool.join() 91 | 92 | return results 93 | 94 | else: 95 | raise Exception("Unknown backend: %s" % str(backend)) 96 | 97 | 98 | if __name__ == "__main__": 99 | 100 | def foo(x): 101 | print x 102 | return x*x 103 | 104 | params_parallel = range(10000) 105 | #perform_task_in_parallel(foo, params_parallel, backend="multiprocessing", n_jobs=4) 106 | results = perform_task_in_parallel(foo, params_parallel, backend="multiprocessing", n_jobs=4) 107 | print "results=", results 108 | #results = perform_task_in_parallel(foo, params_parallel, backend="threading", n_jobs=4) 109 | #print "results=", results 110 | -------------------------------------------------------------------------------- /woody/io/store.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | import h5py 7 | import numpy 8 | from woody.util import ensure_dir_for_file 9 | 10 | class Store(object): 11 | 12 | def __init__(self): 13 | pass 14 | 15 | class MemoryStore(Store): 16 | 17 | def __init__(self): 18 | 19 | self._containers = {} 20 | self._objects = {} 21 | 22 | def create_dataset(self, container_key, dkey, data): 23 | 24 | if container_key not in self._containers.keys(): 25 | self._containers[container_key] = {} 26 | 27 | self._containers[container_key][dkey] = data 28 | 29 | def append_to_dataset(self, container_key, dkey, data): 30 | 31 | if container_key not in self._containers.keys(): 32 | self._containers[container_key] = {} 33 | 34 | if not dkey in self._containers[container_key].keys(): 35 | 36 | self._containers[container_key][dkey] = data 37 | 38 | else: 39 | 40 | newdata = 
numpy.concatenate([self._containers[container_key][dkey], data], axis=0) 41 | self._containers[container_key][dkey] = newdata 42 | 43 | def get_dataset(self, container_key, dkey): 44 | 45 | return numpy.ascontiguousarray(self._containers[container_key][dkey]) 46 | 47 | def get_keys(self, container_key): 48 | 49 | return self._containers[container_key].keys() 50 | 51 | def save(self, key, obj): 52 | 53 | self._objects[key] = obj 54 | 55 | def load(self, key, loader): 56 | 57 | return self._objects[key] 58 | 59 | class DiskStore(Store): 60 | 61 | def __init__(self): 62 | pass 63 | 64 | def create_dataset(self, container_key, dkey, data): 65 | 66 | ensure_dir_for_file(container_key) 67 | s = h5py.File(container_key, 'a', driver="sec2", libver='latest') 68 | 69 | dset = s.create_dataset(dkey, data.shape, maxshape=(None, data.shape[1]), compression="lzf") 70 | dset[:,:] = data 71 | 72 | s.close() 73 | 74 | def append_to_dataset(self, container_key, dkey, data): 75 | 76 | ensure_dir_for_file(container_key) 77 | s = h5py.File(container_key, 'a', driver="sec2", libver='latest') 78 | 79 | offset = 0 80 | 81 | if not dkey in s.keys(): 82 | 83 | dset = s.create_dataset(dkey, data.shape, maxshape=(None, data.shape[1]), compression="lzf") 84 | 85 | else: 86 | 87 | dset = s.get(dkey) 88 | offset += dset.shape[0] 89 | dset.resize(dset.shape[0] + data.shape[0], axis=0) 90 | 91 | dset[offset:, :] = data 92 | 93 | s.close() 94 | 95 | def get_dataset(self, container_key, dkey): 96 | 97 | with h5py.File(container_key, 'r') as container: 98 | dset = numpy.array(container.get(dkey)) 99 | 100 | return dset[:,:] 101 | 102 | def get_keys(self, container_key): 103 | 104 | s = h5py.File(container_key, 'r') 105 | keys = s.keys() 106 | s.close() 107 | 108 | return keys 109 | 110 | def save(self, key, obj): 111 | 112 | obj.save(key) 113 | 114 | def load(self, key, loader): 115 | 116 | return loader.load(key) 117 | -------------------------------------------------------------------------------- /woody/models/forest/src/tree/include/tree.h: -------------------------------------------------------------------------------- 1 | /* 2 | * tree.h 3 | * 4 | * Created on: 25.10.2014 5 | * Author: fgieseke 6 | */ 7 | 8 | #ifndef FOREST_STANDARD_INCLUDE_TREE_H_ 9 | #define FOREST_STANDARD_INCLUDE_TREE_H_ 10 | 11 | #include "global.h" 12 | 13 | /* -------------------------------------------------------------------------------- 14 | * Initializes a forest 15 | * -------------------------------------------------------------------------------- 16 | */ 17 | inline void init_forest(FOREST *forest, int n_trees); 18 | 19 | /* -------------------------------------------------------------------------------- 20 | * Frees memory allocated for a forest 21 | * -------------------------------------------------------------------------------- 22 | */ 23 | inline void free_forest(FOREST *forest, int free_trees, int free_forest); 24 | 25 | /* -------------------------------------------------------------------------------- 26 | * Initializes a single tree 27 | * -------------------------------------------------------------------------------- 28 | */ 29 | inline void init_tree(TREE *tree, int n_allocated); 30 | 31 | /* -------------------------------------------------------------------------------- 32 | * Frees memory allocated for a single tree 33 | * -------------------------------------------------------------------------------- 34 | */ 35 | inline void free_tree(TREE *tree); 36 | 37 | /* 
-------------------------------------------------------------------------------- 38 | * Returns a node based on given node id 39 | * -------------------------------------------------------------------------------- 40 | */ 41 | inline TREE_NODE* get_node(TREE *tree, int node_id); 42 | 43 | /* -------------------------------------------------------------------------------- 44 | * Adds a node to a given tree 45 | * -------------------------------------------------------------------------------- 46 | */ 47 | int add_node_to_tree(TREE *tree, int parent_id, int is_left_child); 48 | 49 | /* -------------------------------------------------------------------------------- 50 | * Attaches tree to a leaf of another tree 51 | * -------------------------------------------------------------------------------- 52 | */ 53 | int attach_tree(TREE *tree, TREE *subtree, int leaf_id); 54 | 55 | /* -------------------------------------------------------------------------------- 56 | * Generates an internal node 57 | * -------------------------------------------------------------------------------- 58 | */ 59 | inline int generate_internal_tree_node(TREE *tree, int parent_id, int is_left_child, 60 | int is_leaf, int feature, FLOAT_TYPE threshold, int node_samples); 61 | 62 | /* -------------------------------------------------------------------------------- 63 | * Generates a single leaf 64 | * -------------------------------------------------------------------------------- 65 | */ 66 | inline int generate_tree_leaf(TREE *tree, int parent_id, int is_left_child, 67 | FLOAT_TYPE leaf_value, unsigned int leaf_criterion); 68 | 69 | /* -------------------------------------------------------------------------------- 70 | * Initializes node entries for internal node 71 | * -------------------------------------------------------------------------------- 72 | */ 73 | void init_internal_tree_node(TREE_NODE *node, int parent_id, int feature, 74 | FLOAT_TYPE threshold, int node_samples); 75 | 76 | /* -------------------------------------------------------------------------------- 77 | * Initializes node entries for leaf node 78 | * -------------------------------------------------------------------------------- 79 | */ 80 | void init_tree_leaf(TREE_NODE *node, int parent_id, FLOAT_TYPE leaf_value, unsigned int leaf_criterion); 81 | 82 | #endif /* FOREST_STANDARD_INCLUDE_TREE_H_ */ 83 | -------------------------------------------------------------------------------- /woody/models/forest/src/.cproject: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /woody/models/forest/src/tree/cpu/fastsort.c: -------------------------------------------------------------------------------- 1 | /* 2 | * fastsort.c 3 | * 4 | * Created on: 23.01.2017 5 | * Author: fgieseke 6 | */ 7 | #include "include/fastsort.h" 8 | 9 | 10 | #define swap_fast(a1, a2, s1, s2) { \ 11 | register FLOAT_TYPE tmp = *(a1); \ 12 | *(a1) = *(a2); \ 13 | *(a2) = tmp; \ 14 | register int tmpint = *(s1); \ 15 | *(s1) = *(s2); \ 16 | *(s2) = tmpint; \ 17 | } 18 | 19 | inline static int fast_partition(FLOAT_TYPE *a, int *samples, int lo, int hi, FLOAT_TYPE x); 20 | inline static int fast_floor_lg(int a); 21 | static FLOAT_TYPE 
fast_medianof3(FLOAT_TYPE *a, int lo, int mid, int hi); 22 | static void fast_downheap(FLOAT_TYPE *a, int i, int n, int lo); 23 | static void fast_heapsort(FLOAT_TYPE *a, int *samples, int lo, int hi); 24 | static void fast_introsort_loop(FLOAT_TYPE *a, int *samples, int lo, int hi, int depth_limit); 25 | static void fast_insertionsort(FLOAT_TYPE *a, int *samples, int lo, int hi); 26 | 27 | void combined_sort(FLOAT_TYPE *XF, int *samples, int n) { 28 | 29 | fast_introsort_loop(XF, samples, 0, n, 2 * fast_floor_lg(n)); 30 | fast_insertionsort(XF, samples, 0, n); 31 | } 32 | 33 | static void fast_introsort_loop(FLOAT_TYPE *a, int *samples, int lo, int hi, int depth_limit) { 34 | int p = -1; 35 | 36 | while (hi - lo > fast_size_threshold) { 37 | 38 | if (depth_limit == 0) { 39 | fast_heapsort(a, samples, lo, hi); 40 | return; 41 | } 42 | depth_limit--; 43 | 44 | p = fast_partition(a, samples, lo, hi, fast_medianof3(a, lo, lo + ((hi - lo) / 2) + 1, hi - 1)); 45 | 46 | fast_introsort_loop(a, samples, p, hi, depth_limit); 47 | hi = p; 48 | } 49 | } 50 | 51 | inline static int fast_partition(FLOAT_TYPE *a, int *samples, int lo, int hi, FLOAT_TYPE x) { 52 | int i = lo, j = hi; 53 | while (1) { 54 | while (a[i] < x) 55 | i++; 56 | j--; 57 | while (x < a[j]) 58 | j--; 59 | if (i >= j) 60 | return i; 61 | swap_fast(&a[i], &a[j], &samples[i], &samples[j]); 62 | i++; 63 | } 64 | } 65 | 66 | inline static FLOAT_TYPE fast_medianof3(FLOAT_TYPE *a, int lo, int mid, int hi) { 67 | 68 | if (a[mid] < a[lo]) { 69 | 70 | if (a[hi] < a[mid]) 71 | return a[mid]; 72 | else { 73 | if (a[hi] < a[lo]) 74 | return a[hi]; 75 | else 76 | return a[lo]; 77 | } 78 | } else { 79 | if (a[hi] < a[mid]) { 80 | if (a[hi] < a[lo]) 81 | return a[lo]; 82 | else 83 | return a[hi]; 84 | } else 85 | return a[mid]; 86 | } 87 | } 88 | 89 | static void fast_heapsort(FLOAT_TYPE *a, int *samples, int lo, int hi) { 90 | int n = hi - lo; 91 | int i; 92 | for (i = n / 2; i >= 1; i--) { 93 | fast_downheap(a, i, n, lo); 94 | } 95 | for (i = n; i > 1; i--) { 96 | swap_fast(&a[lo], &a[lo + i - 1], &samples[lo], &samples[lo + i -1]); 97 | fast_downheap(a, 1, i - 1, lo); 98 | } 99 | } 100 | 101 | inline static void fast_downheap(FLOAT_TYPE *a, int i, int n, int lo) { 102 | FLOAT_TYPE d = a[lo + i - 1]; 103 | int child; 104 | int n2 = n / 2; 105 | while (i <= n2) { 106 | child = 2 * i; 107 | if (child < n && a[lo + child - 1] < a[lo + child]) 108 | child++; 109 | if (d >= a[lo + child - 1]) 110 | break; 111 | a[lo + i - 1] = a[lo + child - 1]; 112 | i = child; 113 | } 114 | a[lo + i - 1] = d; 115 | } 116 | 117 | 118 | static void fast_insertionsort(FLOAT_TYPE *a, int *samples, int lo, int hi) { 119 | int i, j; 120 | FLOAT_TYPE tfloat; 121 | int tint; 122 | for (i = lo; i < hi; i++) { 123 | j = i; 124 | tfloat = a[i]; 125 | tint = samples[i]; 126 | while (j != lo && tfloat < a[j - 1]) { 127 | a[j] = a[j - 1]; 128 | samples[j] = samples[j - 1]; 129 | j--; 130 | } 131 | a[j] = tfloat; 132 | samples[j] = tint; 133 | } 134 | } 135 | 136 | inline static int fast_floor_lg(int a) { 137 | return (int) floor(log(a) / log(2)); 138 | } 139 | -------------------------------------------------------------------------------- /woody/util/array/base.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | import numpy 7 | 8 | import wrapper_utils_cpu_float, wrapper_utils_cpu_double 9 | 10 | def split_array_chunk(a, indicator, chunks, counts): 11 | 12 | 
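    # Dispatches to the float or double SWIG wrapper depending on the dtype of
    # `a` and reorders the rows of `a` according to `indicator`/`chunks`; see
    # the docstring of split_array below for the meaning of the arguments.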
if type(a[0,0]) == numpy.float64: 13 | wrapper = wrapper_utils_cpu_double 14 | elif type(a[0,0]) == numpy.float32: 15 | wrapper = wrapper_utils_cpu_float 16 | else: 17 | raise Exception("Invalid dtype for array: %s" % str(type(a[0,0]))) 18 | 19 | anew = numpy.empty(a.shape, dtype=a.dtype) 20 | 21 | cumsums = numpy.cumsum(counts).astype(numpy.int32) 22 | 23 | wrapper.split_array(a, anew, indicator, chunks, cumsums) 24 | 25 | return anew 26 | 27 | def split_array(a, indicator, chunks, counts, n_jobs=1): 28 | """ Splits an array according to an indicator array. 29 | 30 | Parameters 31 | ---------- 32 | a : array, numpy-like 33 | The input array that is supposed to 34 | be split according to the indicator array. 35 | indicator: array, numpy-like 36 | The array that contains the indices 37 | according to which the array should be 38 | split up. Each index is also contained in 39 | the chunks array (see below). 40 | chunks: array, numpy-like 41 | This array contains all possible chunk indices 42 | that occur in the indicator array. E.g., 43 | chunks = [-1,-1,0,-1,-1,1] means that we 44 | have two chunks in total and an indicator 45 | index 2 is mapped to chunk 0 and an 46 | indicator index 5 to chunk 1. 47 | counts: array, numpy-like The number of rows that belong to each chunk. 48 | """ 49 | 50 | reshaped = False 51 | 52 | if len(a.shape) == 1: 53 | reshaped = True 54 | a = a.reshape((len(a), 1)) 55 | 56 | if type(a[0,0]) == numpy.float64: 57 | wrapper = wrapper_utils_cpu_double 58 | elif type(a[0,0]) == numpy.float32: 59 | wrapper = wrapper_utils_cpu_float 60 | else: 61 | raise Exception("Invalid dtype for array: %s" % str(type(a[0,0]))) 62 | 63 | indicator = indicator.astype(numpy.int32) 64 | chunks = chunks.astype(numpy.int32) 65 | counts = counts.astype(numpy.int32) 66 | 67 | # sanity_check = True 68 | # # sanity checks (to be removed) 69 | # if sanity_check: 70 | # for indi in indicator: 71 | # assert chunks[indi] != -1 72 | # 73 | # anew_check = numpy.empty(a.shape, dtype=a.dtype) 74 | # # compute splits 75 | # counter = 0 76 | # unique, unique_counts = numpy.unique(indicator, return_counts=True) 77 | # for i in xrange(len(unique)): 78 | # u = unique[i] 79 | # selector = indicator == u 80 | # 81 | # sub = a[selector,:] 82 | # anew_check[counter:counter+len(sub),:] = sub 83 | # counter += len(sub) 84 | 85 | # compute new array 86 | anew = numpy.empty(a.shape, dtype=a.dtype) 87 | cumsums = numpy.cumsum(counts).astype(numpy.int32) 88 | cumsums_minus_counts = cumsums - counts 89 | wrapper.split_array(a, anew, indicator, chunks, cumsums_minus_counts) 90 | #anew = anew_check 91 | 92 | # if sanity_check == True: 93 | # assert numpy.allclose(anew_check, anew) 94 | 95 | if reshaped == True: 96 | a = a.reshape(a.shape[0]) 97 | anew = anew.reshape(anew.shape[0]) 98 | 99 | return anew 100 | 101 | def transpose_array(a, a_trans): 102 | 103 | if type(a[0,0]) == numpy.float64: 104 | wrapper = wrapper_utils_cpu_double 105 | else: 106 | wrapper = wrapper_utils_cpu_float 107 | 108 | wrapper.transpose_array(a, a_trans) 109 | 110 | -------------------------------------------------------------------------------- /woody/data/covtype.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | import os 7 | import numpy 8 | import pandas 9 | 10 | from woody.io import DataGenerator 11 | 12 | from .util import check_and_download, save_to_h5pd 13 | 14 | def get_covtype_files(data_path, train_size=100000): 15 | 16 | fname_train = 
os.path.join(data_path, "covtype/covtype-train-1.csv") 17 | fname_test = os.path.join(data_path, "covtype/covtype-test-1.csv") 18 | check_and_download(fname_train) 19 | check_and_download(fname_test) 20 | 21 | fname_train_size = os.path.join(data_path, "covtype/covtype-train-1_%s.csv" % str(train_size)) 22 | 23 | if not os.path.exists(fname_train_size): 24 | os.system("sed -n '%i,%ip;%iq' < %s > %s" % (1, train_size, train_size, fname_train, fname_train_size)) 25 | 26 | return fname_train_size, fname_test 27 | 28 | def get_covtype_data(data_path, train_size=100000, shuffle_train=False, shuffle_test=False, seed=0): 29 | 30 | numpy.random.seed(seed) 31 | 32 | fname_train, fname_test = get_covtype_files(data_path, train_size) 33 | 34 | # training data 35 | outcome_col = 55 36 | features = 54 37 | data = pandas.read_csv(fname_train, dtype="int", header=None) 38 | ytrain = numpy.ascontiguousarray(data[(outcome_col-1)].values) 39 | xcols = set(range(features+1)).difference(set([outcome_col-1])) 40 | Xtrain = numpy.ascontiguousarray(data.ix[:,xcols].values) 41 | 42 | if shuffle_train == True: 43 | train_partition = numpy.random.permutation(Xtrain.shape[0]) 44 | Xtrain = Xtrain[train_partition] 45 | ytrain = ytrain[train_partition] 46 | 47 | # testing data 48 | data = pandas.read_csv(fname_test, dtype=int, header=None) 49 | ytest = numpy.ascontiguousarray(data[(outcome_col-1)].values) 50 | xcols = set(range(features+1)).difference(set([outcome_col-1])) 51 | Xtest = numpy.ascontiguousarray(data.ix[:,xcols].values) 52 | 53 | if shuffle_test == True: 54 | test_partition = numpy.random.permutation(Xtest.shape[0]) 55 | Xtest = Xtest[test_partition] 56 | ytest = ytest[test_partition] 57 | 58 | return Xtrain, ytrain, Xtest, ytest 59 | 60 | def _convert_datasets(data_path, train_size): 61 | 62 | X_train, y_train, X_test, y_test = get_covtype_data(data_path, train_size, shuffle_train=False, shuffle_test=False) 63 | 64 | fname_store_train = os.path.join(data_path, "covtype/covtype-train-1_%s.csv.h5pd" % str(train_size)) 65 | fname_store_test = os.path.join(data_path, "covtype/covtype-test-1.csv.h5pd") 66 | 67 | save_to_h5pd(X_train, y_train, fname_store_train) 68 | save_to_h5pd(X_test, y_test, fname_store_test) 69 | 70 | def get_covtype_generator(data_path, train_size=100000, store="h5", seed=0, part="train", patterns=True, target=True): 71 | 72 | 73 | if store == "h5": 74 | 75 | if part=="train": 76 | fname = os.path.join(data_path, "covtype/covtype-train-1_%s.csv.h5pd" % str(train_size)) 77 | elif part=="test": 78 | fname = os.path.join(data_path, "covtype/covtype-test-1.csv.h5pd") 79 | 80 | if not os.path.exists(fname): 81 | print("Store for covtype data does not exist. 
Generating all stores ...") 82 | _convert_datasets(data_path, train_size) 83 | 84 | return DataGenerator(fname=fname, seed=seed, patterns=patterns, target=target, chunksize=200000) 85 | 86 | elif store == "mem": 87 | 88 | X_train, y_train, X_test, y_test = get_covtype_data(data_path, train_size=train_size, shuffle_train=False, shuffle_test=False) 89 | 90 | data = {} 91 | if part == "train": 92 | data['X'] = X_train 93 | data['y'] = y_train 94 | else: 95 | data['X'] = X_test 96 | data['y'] = y_test 97 | 98 | return DataGenerator(data=data, seed=seed, patterns=patterns, target=target, chunksize=200000) 99 | -------------------------------------------------------------------------------- /experiments/small_data/sk.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append(".") 3 | 4 | import params 5 | from util import evaluate 6 | 7 | import os 8 | import time 9 | import json 10 | 11 | from woody.util import ensure_dir_for_file 12 | from woody.data import * 13 | 14 | def single_run(dkey, train_size, param, seed, profile=False): 15 | 16 | print("Processing data set %s with train_size %s, seed %s, and parameters %s ..." % (str(dkey), str(train_size), str(seed), str(param))) 17 | 18 | if dkey == "covtype": 19 | Xtrain, ytrain, Xtest, ytest = covtype(train_size=train_size, seed=seed) 20 | elif dkey == "higgs": 21 | Xtrain, ytrain, Xtest, ytest = higgs(train_size=train_size, seed=seed) 22 | elif dkey == "susy": 23 | Xtrain, ytrain, Xtest, ytest = susy(train_size=train_size, seed=seed) 24 | else: 25 | raise Exception("Unknown data set!") 26 | 27 | print("") 28 | print("Number of training patterns:\t%i" % Xtrain.shape[0]) 29 | print("Number of test patterns:\t%i" % Xtest.shape[0]) 30 | print("Dimensionality of the data:\t%i\n" % Xtrain.shape[1]) 31 | 32 | if param['tree_type'] == "randomized": 33 | from sklearn.ensemble import ExtraTreesClassifier as RF 34 | elif param['tree_type'] == "standard": 35 | from sklearn.ensemble import RandomForestClassifier as RF 36 | 37 | model = RF( 38 | n_estimators=param['n_estimators'], 39 | criterion="gini", 40 | max_features=param['max_features'], 41 | min_samples_split=2, 42 | n_jobs=param['n_jobs'], 43 | random_state=seed, 44 | bootstrap=param['bootstrap'], 45 | min_samples_leaf=1, 46 | max_depth=None, 47 | verbose=0) 48 | 49 | if profile == True: 50 | import yep 51 | assert param['n_jobs'] == 1 52 | yep.start("train.prof") 53 | 54 | # training 55 | fit_start_time = time.time() 56 | model.fit(Xtrain, ytrain) 57 | fit_end_time = time.time() 58 | if profile == True: 59 | yep.stop() 60 | ypreds_train = model.predict(Xtrain) 61 | 62 | # testing 63 | test_start_time = time.time() 64 | ypred_test = model.predict(Xtest) 65 | test_end_time = time.time() 66 | 67 | results = {} 68 | results['dataset'] = dkey 69 | results['param'] = param 70 | results['training_time'] = fit_end_time - fit_start_time 71 | results['testing_time'] = test_end_time - test_start_time 72 | print("Training time: %f" % results['training_time']) 73 | print("Testing time: %f" % results['testing_time']) 74 | 75 | evaluate(ypreds_train, ytrain, results, "training") 76 | evaluate(ypred_test, ytest, results, "testing") 77 | 78 | fname = '%s_%s_%s_%s_%s_%s.json' % (str(param['n_estimators']), 79 | str(param['max_features']), 80 | str(param['n_jobs']), 81 | str(param['bootstrap']), 82 | str(param['tree_type']), 83 | str(seed), 84 | ) 85 | fname = os.path.join(params.odir, str(dkey), str(train_size), "sk", fname) 86 | ensure_dir_for_file(fname) 
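    # Persist the collected metrics (parameters, timings and the entries added by
    # evaluate()) as JSON under <odir>/<dataset>/<train_size>/sk/.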
87 | with open(fname, 'w') as fp: 88 | json.dump(results, fp) 89 | 90 | ################################################################################### 91 | import argparse 92 | parser = argparse.ArgumentParser() 93 | parser.add_argument('--dkey', nargs='?', const="covtype", type=str, default="covtype") 94 | parser.add_argument('--train_size', nargs='?', const=0, type=int, default=0) 95 | parser.add_argument('--seed', nargs='?', const=0, type=int, default=0) 96 | parser.add_argument('--key', type=str) 97 | args = parser.parse_args() 98 | dkey, train_size, seed, key = args.dkey, args.train_size, args.seed, args.key 99 | ################################################################################### 100 | 101 | single_run(dkey, train_size, params.parameters[key], seed) -------------------------------------------------------------------------------- /experiments/small_data/wood.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append(".") 3 | 4 | import params 5 | from util import evaluate 6 | 7 | import os 8 | import time 9 | import json 10 | 11 | from woody import WoodClassifier 12 | from woody.util import ensure_dir_for_file 13 | from woody.data import * 14 | 15 | def single_run(dkey, train_size, param, seed, profile=False): 16 | 17 | print("Processing data set %s with train_size %s, seed %s, and parameters %s ..." % (str(dkey), str(train_size), str(seed), str(param))) 18 | 19 | if dkey == "covtype": 20 | Xtrain, ytrain, Xtest, ytest = covtype(train_size=train_size, seed=seed) 21 | elif dkey == "higgs": 22 | Xtrain, ytrain, Xtest, ytest = higgs(train_size=train_size, seed=seed) 23 | elif dkey == "susy": 24 | Xtrain, ytrain, Xtest, ytest = susy(train_size=train_size, seed=seed) 25 | else: 26 | raise Exception("Unknown data set!") 27 | 28 | print("") 29 | print("Number of training patterns:\t%i" % Xtrain.shape[0]) 30 | print("Number of test patterns:\t%i" % Xtest.shape[0]) 31 | print("Dimensionality of the data:\t%i\n" % Xtrain.shape[1]) 32 | 33 | model = WoodClassifier( 34 | n_estimators=param['n_estimators'], 35 | criterion="gini", 36 | max_features=param['max_features'], 37 | min_samples_split=2, 38 | n_jobs=param['n_jobs'], 39 | seed=seed, 40 | bootstrap=param['bootstrap'], 41 | tree_traversal_mode="dfs", 42 | tree_type=param['tree_type'], 43 | min_samples_leaf=1, 44 | float_type="double", 45 | max_depth=None, 46 | verbose=0) 47 | 48 | if profile == True: 49 | import yep 50 | assert param['n_jobs'] == 1 51 | yep.start("train.prof") 52 | 53 | # training 54 | fit_start_time = time.time() 55 | model.fit(Xtrain, ytrain) 56 | fit_end_time = time.time() 57 | if profile == True: 58 | yep.stop() 59 | ypreds_train = model.predict(Xtrain) 60 | 61 | # testing 62 | test_start_time = time.time() 63 | ypred_test = model.predict(Xtest) 64 | test_end_time = time.time() 65 | 66 | results = {} 67 | results['dataset'] = dkey 68 | results['param'] = param 69 | results['training_time'] = fit_end_time - fit_start_time 70 | results['testing_time'] = test_end_time - test_start_time 71 | print("Training time: %f" % results['training_time']) 72 | print("Testing time: %f" % results['testing_time']) 73 | 74 | evaluate(ypreds_train, ytrain, results, "training") 75 | evaluate(ypred_test, ytest, results, "testing") 76 | 77 | fname = '%s_%s_%s_%s_%s_%s.json' % (str(param['n_estimators']), 78 | str(param['max_features']), 79 | str(param['n_jobs']), 80 | str(param['bootstrap']), 81 | str(param['tree_type']), 82 | str(seed), 83 | ) 84 | fname = 
os.path.join(params.odir, str(dkey), str(train_size), "wood", fname) 85 | ensure_dir_for_file(fname) 86 | with open(fname, 'w') as fp: 87 | json.dump(results, fp) 88 | 89 | ################################################################################### 90 | import argparse 91 | parser = argparse.ArgumentParser() 92 | parser.add_argument('--dkey', nargs='?', const="covtype", type=str, default="covtype") 93 | parser.add_argument('--train_size', nargs='?', const=0, type=int, default=0) 94 | parser.add_argument('--seed', nargs='?', const=0, type=int, default=0) 95 | parser.add_argument('--key', type=str) 96 | args = parser.parse_args() 97 | dkey, train_size, seed, key = args.dkey, args.train_size, args.seed, args.key 98 | ################################################################################### 99 | 100 | single_run(dkey, train_size, params.parameters[key], seed) 101 | -------------------------------------------------------------------------------- /experiments/influence_lamda/wood.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append(".") 3 | 4 | import params 5 | from util import evaluate 6 | 7 | import os 8 | import time 9 | import json 10 | 11 | from woody import WoodClassifier 12 | from woody.util import ensure_dir_for_file 13 | from woody.data import * 14 | 15 | def single_run(dkey, train_size, lamcrit, param, seed, profile=False): 16 | 17 | print("Processing data set %s with train_size %s, seed %s, and parameters %s ..." % (str(dkey), str(train_size), str(seed), str(param))) 18 | 19 | 20 | if dkey == "covtype": 21 | Xtrain, ytrain, Xtest, ytest = covtype(train_size=train_size, seed=seed) 22 | elif dkey == "higgs": 23 | Xtrain, ytrain, Xtest, ytest = higgs(train_size=train_size, seed=seed) 24 | elif dkey == "susy": 25 | Xtrain, ytrain, Xtest, ytest = susy(train_size=train_size, seed=seed) 26 | else: 27 | raise Exception("Unknown data set!") 28 | 29 | print("") 30 | print("Number of training patterns:\t%i" % Xtrain.shape[0]) 31 | print("Number of test patterns:\t%i" % Xtest.shape[0]) 32 | print("Dimensionality of the data:\t%i\n" % Xtrain.shape[1]) 33 | 34 | model = WoodClassifier( 35 | n_estimators=param['n_estimators'], 36 | criterion="even_gini", 37 | max_features=param['max_features'], 38 | min_samples_split=2, 39 | n_jobs=param['n_jobs'], 40 | seed=seed, 41 | bootstrap=param['bootstrap'], 42 | tree_traversal_mode="dfs", 43 | tree_type=param['tree_type'], 44 | min_samples_leaf=1, 45 | float_type="double", 46 | max_depth=None, 47 | lam_criterion=lamcrit, 48 | verbose=0) 49 | 50 | if profile == True: 51 | import yep 52 | assert param['n_jobs'] == 1 53 | yep.start("train.prof") 54 | 55 | # training 56 | fit_start_time = time.time() 57 | model.fit(Xtrain, ytrain) 58 | fit_end_time = time.time() 59 | if profile == True: 60 | yep.stop() 61 | 62 | print("Number of nodes: %i" % model.get_n_nodes(0)) 63 | ypreds_train = model.predict(Xtrain) 64 | 65 | # testing 66 | test_start_time = time.time() 67 | ypred_test = model.predict(Xtest) 68 | test_end_time = time.time() 69 | 70 | results = {} 71 | results['dataset'] = dkey 72 | results['param'] = param 73 | results['training_time'] = fit_end_time - fit_start_time 74 | results['testing_time'] = test_end_time - test_start_time 75 | print("Training time: %f" % results['training_time']) 76 | print("Testing time: %f" % results['testing_time']) 77 | 78 | evaluate(ypreds_train, ytrain, results, "training") 79 | evaluate(ypred_test, ytest, results, "testing") 80 | 81 | fname = 
'%s_%s_%s_%s_%s_%s.json' % (str(param['n_estimators']), 82 | str(param['max_features']), 83 | str(param['n_jobs']), 84 | str(param['bootstrap']), 85 | str(param['tree_type']), 86 | str(seed), 87 | ) 88 | fname = os.path.join(params.odir, str(dkey), str(train_size), str(lamcrit), "wood", fname) 89 | ensure_dir_for_file(fname) 90 | with open(fname, 'w') as fp: 91 | json.dump(results, fp) 92 | 93 | ################################################################################### 94 | import argparse 95 | parser = argparse.ArgumentParser() 96 | parser.add_argument('--dkey', nargs='?', const="covtype", type=str, default="covtype") 97 | parser.add_argument('--train_size', nargs='?', const=0, type=int, default=0) 98 | parser.add_argument('--seed', nargs='?', const=0, type=int, default=0) 99 | parser.add_argument('--key', type=str) 100 | parser.add_argument('--lamcrit', nargs='?', const=0.0, type=float, default=0.0) 101 | args = parser.parse_args() 102 | dkey, train_size, seed, key, lamcrit = args.dkey, args.train_size, args.seed, args.key, args.lamcrit 103 | ################################################################################### 104 | 105 | single_run(dkey, train_size, lamcrit, params.parameters[key], seed) 106 | 107 | -------------------------------------------------------------------------------- /woody/models/forest/src/qsort.c: -------------------------------------------------------------------------------- 1 | /* 2 | * qsort.c 3 | * 4 | * Created on: 12.11.2014 5 | * Author: fgieseke 6 | */ 7 | 8 | #include "include/qsort.h" 9 | // 10 | // qsort.c 11 | // 12 | // Quick sort 13 | // 14 | // Copyright (C) 2002 Michael Ringgaard. All rights reserved. 15 | // 16 | // Redistribution and use in source and binary forms, with or without 17 | // modification, are permitted provided that the following conditions 18 | // are met: 19 | // 20 | // 1. Redistributions of source code must retain the above copyright 21 | // notice, this list of conditions and the following disclaimer. 22 | // 2. Redistributions in binary form must reproduce the above copyright 23 | // notice, this list of conditions and the following disclaimer in the 24 | // documentation and/or other materials provided with the distribution. 25 | // 3. Neither the name of the project nor the names of its contributors 26 | // may be used to endorse or promote products derived from this software 27 | // without specific prior written permission. 28 | // 29 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 30 | // ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 31 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 32 | // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE 33 | // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 34 | // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 35 | // OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 36 | // HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 37 | // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 38 | // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 39 | // SUCH DAMAGE. 
40 | // 41 | 42 | #define INT_CUTOFF 8 43 | 44 | static void shortsort(char *lo, char *hi, unsigned width, 45 | int (*comp)(const void *, const void *, const void*), 46 | const void *comp_param); 47 | 48 | static void woody_swap(char *p, char *q, unsigned int width); 49 | 50 | void woody_qsort(void *base, unsigned num, unsigned width, 51 | int (*comp)(const void *, const void *, const void *), 52 | const void *comp_param) { 53 | 54 | char *lo, *hi; 55 | char *mid; 56 | char *l, *h; 57 | unsigned size; 58 | char *lostk[30], *histk[30]; 59 | int stkptr; 60 | 61 | if (num < 2 || width == 0) 62 | return; 63 | 64 | stkptr = 0; 65 | 66 | lo = base; 67 | hi = (char *) base + width * (num - 1); 68 | 69 | recurse: size = (hi - lo) / width + 1; 70 | 71 | if (size <= INT_CUTOFF) { 72 | shortsort(lo, hi, width, comp, comp_param); 73 | } else { 74 | mid = lo + (size / 2) * width; 75 | woody_swap(mid, lo, width); 76 | 77 | l = lo; 78 | h = hi + width; 79 | 80 | for (;;) { 81 | do { 82 | l += width; 83 | } while (l <= hi && comp(l, lo, comp_param) <= 0); 84 | do { 85 | h -= width; 86 | } while (h > lo && comp(h, lo, comp_param) >= 0); 87 | if (h < l) 88 | break; 89 | woody_swap(l, h, width); 90 | } 91 | 92 | woody_swap(lo, h, width); 93 | 94 | if (h - 1 - lo >= hi - l) { 95 | if (lo + width < h) { 96 | lostk[stkptr] = lo; 97 | histk[stkptr] = h - width; 98 | ++stkptr; 99 | } 100 | 101 | if (l < hi) { 102 | lo = l; 103 | goto recurse; 104 | } 105 | } else { 106 | if (l < hi) { 107 | lostk[stkptr] = l; 108 | histk[stkptr] = hi; 109 | ++stkptr; 110 | } 111 | 112 | if (lo + width < h) { 113 | hi = h - width; 114 | goto recurse; 115 | } 116 | } 117 | } 118 | 119 | --stkptr; 120 | if (stkptr >= 0) { 121 | lo = lostk[stkptr]; 122 | hi = histk[stkptr]; 123 | goto recurse; 124 | } 125 | 126 | } 127 | 128 | static void shortsort(char *lo, char *hi, unsigned width, 129 | int (*comp)(const void *, const void *, const void *), 130 | const void* comp_param) { 131 | 132 | char *p, *max; 133 | 134 | while (hi > lo) { 135 | max = lo; 136 | for (p = lo + width; p <= hi; p += width) 137 | if (comp(p, max, comp_param) > 0) 138 | max = p; 139 | woody_swap(max, hi, width); 140 | hi -= width; 141 | } 142 | 143 | } 144 | 145 | static void woody_swap(char *a, char *b, unsigned width) { 146 | 147 | char tmp; 148 | 149 | if (a != b) { 150 | while (width--) { 151 | tmp = *a; 152 | *a++ = *b; 153 | *b++ = tmp; 154 | } 155 | } 156 | 157 | } 158 | 159 | -------------------------------------------------------------------------------- /woody/data/util.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | import os 7 | import gzip 8 | import urllib 9 | import shutil 10 | import h5py 11 | import pandas 12 | 13 | def check_and_download(fname, remoteurl="REMOTE_URL"): 14 | 15 | if os.path.isfile(fname) == False: 16 | 17 | if os.path.exists(os.path.join(os.path.dirname(fname), remoteurl)): 18 | urlfname = os.path.join(os.path.dirname(fname), remoteurl) 19 | try: 20 | with open(urlfname,"r") as f: 21 | url = f.readlines()[0].strip() 22 | url = os.path.join(url, os.path.basename(fname)) 23 | except Exception as e: 24 | print("Could not retrieve urlf from file %s" % urlfname) 25 | 26 | elif os.path.exists(fname + ".download"): 27 | urlfname = fname + ".download" 28 | try: 29 | with open(urlfname,"r") as f: 30 | url = f.readlines()[0] 31 | except Exception as e: 32 | print("Could not retrieve urlf from file %s" % urlfname) 33 | 
34 | else: 35 | raise Exception("File and download url do not exist!") 36 | 37 | url = url.strip() 38 | 39 | try: 40 | if url.endswith(".gz"): 41 | fname_download = fname + ".gz" 42 | else: 43 | fname_download = fname 44 | 45 | print("Downloading data from %s to %s ..." % (url, fname_download)) 46 | urllib.urlretrieve (url, fname_download) 47 | 48 | print("Successfully downloaded the data!") 49 | if url.endswith(".gz"): 50 | print("Extracting zipped file ...") 51 | inF = gzip.open(fname_download, 'rb') 52 | outF = open(fname, 'wb') 53 | outF.write(inF.read()) 54 | inF.close() 55 | outF.close() 56 | print("Done!") 57 | except Exception as e: 58 | print(str(e)) 59 | try: 60 | # remove incomplete data 61 | shutil.rmtree(fname) 62 | except: 63 | pass 64 | return False 65 | 66 | return True 67 | 68 | def save_to_h5(X, y, fname, compression="lzf"): 69 | 70 | d = os.path.dirname(fname) 71 | if not os.path.exists(d): 72 | os.makedirs(d) 73 | 74 | y = y.reshape((len(y), 1)) 75 | 76 | # create store and data sets 77 | store = h5py.File(fname, 'w') 78 | dsetX = store.create_dataset("X", X.shape, compression=compression) 79 | dsety = store.create_dataset("y", y.shape, compression=compression) 80 | 81 | dsetX[:,:] = X 82 | dsety[:,:] = y 83 | 84 | store.close() 85 | 86 | def save_to_h5pd(X, y, fname, compression="bzip2", complevel=3, delete_before=True): 87 | 88 | d = os.path.dirname(fname) 89 | if not os.path.exists(d): 90 | os.makedirs(d) 91 | 92 | y = y.reshape((len(y), 1)) 93 | 94 | if delete_before == True: 95 | if os.path.exists(fname): 96 | os.remove(fname) 97 | 98 | df_X = pandas.DataFrame(X, index=range(len(X))) 99 | df_y = pandas.DataFrame(y, index=range(len(y))) 100 | 101 | df_X.to_hdf(fname, 'X', append=True, complib=compression, complevel=complevel) 102 | df_y.to_hdf(fname, 'y', append=True, complib=compression, complevel=complevel) 103 | 104 | def convert_to_h5pd(reader, fname, transform, compression="bzip2", complevel=3, delete_before=True): 105 | 106 | d = os.path.dirname(fname) 107 | if not os.path.exists(d): 108 | os.makedirs(d) 109 | 110 | if delete_before == True: 111 | if os.path.exists(fname): 112 | os.remove(fname) 113 | 114 | for chunk in reader: 115 | 116 | X, y = transform(chunk) 117 | y = y.reshape((len(y), 1)) 118 | 119 | df_X = pandas.DataFrame(X, index=range(len(X))) 120 | df_y = pandas.DataFrame(y, index=range(len(y))) 121 | 122 | df_X.to_hdf(fname, 'X', append=True, complib=compression, complevel=complevel) 123 | df_y.to_hdf(fname, 'y', append=True, complib=compression, complevel=complevel) 124 | 125 | -------------------------------------------------------------------------------- /woody/data/susy.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | import os 7 | import time 8 | import numpy 9 | import pandas 10 | 11 | from woody.io import DataGenerator 12 | 13 | from .util import check_and_download, save_to_h5pd 14 | 15 | ALLOWED_TRAIN_SIZES = [500000, 1000000, 16 | 1500000, 2000000, 17 | 2500000, 3000000, 18 | 3500000, 4000000] 19 | 20 | def get_susy_files(data_path, train_size=1000000): 21 | 22 | assert train_size in ALLOWED_TRAIN_SIZES 23 | 24 | fname = os.path.join(data_path, "susy/SUSY.csv") 25 | check_and_download(fname) 26 | time.sleep(1) 27 | 28 | fname_train = os.path.join(data_path, "susy/SUSY.train_%s.csv" % str(train_size)) 29 | fname_test = os.path.join(data_path, "susy/SUSY.test.csv") 30 | 31 | if not os.path.exists(fname_train): 32 | 
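        # sed prints lines 1..train_size of SUSY.csv and then quits, so the first
        # train_size rows become the training split; the test split below uses
        # rows 4,500,001 to 5,000,000.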
os.system("sed -n '%i,%ip;%iq' < %s > %s" % (1, train_size, train_size, fname, fname_train)) 33 | if not os.path.exists(fname_test): 34 | os.system("sed -n '%i,%ip;%iq' < %s > %s" % (4500001, 5000000, 5000000, fname, fname_test)) 35 | 36 | return fname_train, fname_test 37 | 38 | def get_susy_data(data_path, train_size=1000000, shuffle_train=False, shuffle_test=False, seed=0): 39 | 40 | assert train_size in ALLOWED_TRAIN_SIZES 41 | 42 | numpy.random.seed(seed) 43 | fname_train, fname_test = get_susy_files(data_path, train_size) 44 | 45 | # training data 46 | label_col = 0 47 | features_cols = range(1,19) 48 | 49 | data = pandas.read_csv(fname_train, dtype="float", sep=",", header=None) 50 | ytrain = numpy.ascontiguousarray(data.ix[:,label_col].values) 51 | Xtrain = numpy.ascontiguousarray(data.ix[:,features_cols].values) 52 | 53 | data = pandas.read_csv(fname_test, dtype="float", sep=",", header=None) 54 | ytest = numpy.ascontiguousarray(data.ix[:,label_col].values) 55 | Xtest = numpy.ascontiguousarray(data.ix[:,features_cols].values) 56 | 57 | if shuffle_train == True: 58 | train_partition = numpy.random.permutation(Xtrain.shape[0]) 59 | Xtrain = Xtrain[train_partition] 60 | ytrain = ytrain[train_partition] 61 | 62 | if shuffle_test == True: 63 | test_partition = numpy.random.permutation(Xtest.shape[0]) 64 | Xtest = Xtest[test_partition] 65 | ytest = ytest[test_partition] 66 | 67 | return Xtrain, ytrain, Xtest, ytest 68 | 69 | def _convert_susy_data(data_path, train_size): 70 | 71 | X_train, y_train, X_test, y_test = get_susy_data(data_path, train_size=train_size, shuffle_train=False, shuffle_test=False) 72 | 73 | fname_store_train = os.path.join(data_path, "susy/SUSY.train_%s.h5pd" % str(train_size)) 74 | fname_store_test = os.path.join(data_path, "susy/SUSY.test.h5pd") 75 | 76 | save_to_h5pd(X_train, y_train, fname_store_train) 77 | save_to_h5pd(X_test, y_test, fname_store_test) 78 | 79 | def get_susy_generator(data_path, train_size=1000000, store="h5", seed=0, part="train", patterns=True, target=True): 80 | 81 | if store == "h5": 82 | 83 | if part=="train": 84 | fname = os.path.join(data_path, "susy/SUSY.train_%s.h5pd" % str(train_size)) 85 | elif part=="test": 86 | fname = os.path.join(data_path, "susy/SUSY.test.h5pd") 87 | 88 | if not os.path.exists(fname): 89 | print("Store for susy data does not exist. 
Generating all stores ...") 90 | _convert_susy_data(data_path, train_size) 91 | 92 | if part == "test": 93 | chunksize = 250000 94 | else: 95 | if train_size <= 2000000: 96 | chunksize = 500000 97 | else: 98 | chunksize = 2000000 99 | 100 | return DataGenerator(fname=fname, seed=seed, patterns=patterns, target=target, chunksize=chunksize) 101 | 102 | elif store == "mem": 103 | 104 | X_train, y_train, X_test, y_test = get_susy_data(data_path, train_size=train_size, shuffle_train=False, shuffle_test=False) 105 | 106 | data = {} 107 | if part == "train": 108 | data['X'] = X_train 109 | data['y'] = y_train 110 | else: 111 | data['X'] = X_test 112 | data['y'] = y_test 113 | 114 | return DataGenerator(data=data, seed=seed, patterns=patterns, target=target, chunksize=1000000) 115 | -------------------------------------------------------------------------------- /experiments/small_data/subsetwood.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append(".") 3 | 4 | import os 5 | import json 6 | from util import evaluate 7 | import params 8 | 9 | import time 10 | 11 | from woody import SubsetWoodClassifier 12 | 13 | from woody.io import MemoryStore, DiskStore 14 | from woody.util import ensure_dir_for_file 15 | from woody.data import * 16 | 17 | def single_run(dkey, train_size, param, seed, profile=False): 18 | 19 | print("Processing data set %s with train_size %s, seed %s, and parameters %s ..." % (str(dkey), str(train_size), str(seed), str(param))) 20 | 21 | if dkey == "covtype": 22 | traingen, testgen = covtype_generators(train_size=train_size, store="mem", seed=seed) 23 | n_subset = 50000 24 | elif dkey == "higgs": 25 | traingen, testgen = higgs_generators(train_size=train_size, store="mem", seed=seed) 26 | n_subset = 500000 27 | elif dkey == "susy": 28 | traingen, testgen = susy_generators(train_size=train_size, store="mem", seed=seed) 29 | n_subset = 500000 30 | else: 31 | raise Exception("Unknown data set!") 32 | 33 | print("") 34 | print("Number of training patterns:\t%i" % traingen.get_shapes()[0][0]) 35 | print("Number of test patterns:\t%i" % testgen.get_shapes()[0][0]) 36 | print("Dimensionality of the data:\t%i\n" % traingen.get_shapes()[0][1]) 37 | 38 | model = SubsetWoodClassifier( 39 | n_estimators=param['n_estimators'], 40 | criterion="gini", 41 | max_features=param['max_features'], 42 | min_samples_split=2, 43 | n_jobs=param['n_jobs'], 44 | seed=seed, 45 | bootstrap=param['bootstrap'], 46 | tree_traversal_mode="dfs", 47 | tree_type=param['tree_type'], 48 | min_samples_leaf=1, 49 | float_type="double", 50 | max_depth=None, 51 | verbose=1, 52 | store=MemoryStore()) 53 | 54 | # training 55 | if profile == True: 56 | import yep 57 | assert param['n_jobs'] == 1 58 | yep.start("train.prof") 59 | 60 | fit_start_time = time.time() 61 | model.fit(traingen, n_subset=n_subset) 62 | fit_end_time = time.time() 63 | if profile == True: 64 | yep.stop() 65 | ypreds_train = model.predict(generator=traingen) 66 | 67 | # testing 68 | test_start_time = time.time() 69 | ypred_test = model.predict(generator=testgen) 70 | test_end_time = time.time() 71 | 72 | results = {} 73 | results['dataset'] = dkey 74 | results['param'] = param 75 | results['training_time'] = fit_end_time - fit_start_time 76 | results['testing_time'] = test_end_time - test_start_time 77 | print("Training time:\t\t%f" % results['training_time']) 78 | print("Testing time:\t\t%f" % results['testing_time']) 79 | 80 | evaluate(ypreds_train, traingen.get_all_target(), results, 
"training") 81 | evaluate(ypred_test, testgen.get_all_target(), results, "testing") 82 | 83 | fname = '%s_%s_%s_%s_%s_%s.json' % (str(param['n_estimators']), 84 | str(param['max_features']), 85 | str(param['n_jobs']), 86 | str(param['bootstrap']), 87 | str(param['tree_type']), 88 | str(seed), 89 | ) 90 | fname = os.path.join(params.odir, str(dkey), str(train_size), "subsetwood", fname) 91 | ensure_dir_for_file(fname) 92 | with open(fname, 'w') as fp: 93 | json.dump(results, fp) 94 | 95 | del(testgen) 96 | del(traingen) 97 | model.cleanup() 98 | 99 | time.sleep(1) 100 | 101 | ################################################################################### 102 | import argparse 103 | parser = argparse.ArgumentParser() 104 | parser.add_argument('--dkey', nargs='?', const="covtype", type=str, default="covtype") 105 | parser.add_argument('--train_size', nargs='?', const=0, type=int, default=0) 106 | parser.add_argument('--seed', nargs='?', const=0, type=int, default=0) 107 | parser.add_argument('--key', type=str) 108 | args = parser.parse_args() 109 | dkey, train_size, seed, key = args.dkey, args.train_size, args.seed, args.key 110 | ################################################################################### 111 | 112 | single_run(dkey, train_size, params.parameters[key], seed) 113 | -------------------------------------------------------------------------------- /woody/io/csv.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | import numpy 7 | import pandas 8 | 9 | from .reader import Reader 10 | 11 | class CSVReader(Reader): 12 | """ 13 | """ 14 | 15 | def __init__(self, 16 | fname, 17 | patterns=True, 18 | target=True, 19 | chunksize=32000, 20 | target_column=None, 21 | patterns_columns=None, 22 | seed=0, 23 | parsing_args={} 24 | ): 25 | 26 | super(CSVReader, self).__init__(fname=fname, 27 | patterns=patterns, 28 | target=target, 29 | chunksize=chunksize, 30 | seed=seed) 31 | 32 | self.target_column = target_column 33 | self.patterns_columns = patterns_columns 34 | self.parsing_args = parsing_args 35 | 36 | def reset(self): 37 | 38 | self.close() 39 | 40 | self._reader = pandas.read_csv(self.fname, iterator=True, chunksize=self.chunksize, **self.parsing_args) 41 | 42 | def get_random_subset(self, size, chunk_percent=0.5, shuffle=True): 43 | """ 44 | NOTE: Seems to interfer with yep (multiprocessing, deadlock?) 
45 | 46 | """ 47 | 48 | data = None 49 | 50 | rand_per_chunk = int(self.chunksize * chunk_percent) 51 | 52 | while data is None or len(data) < size: 53 | 54 | self.reset() 55 | 56 | for chunk in self._reader: 57 | 58 | data_chunk = self._transform_csv(chunk) 59 | choice = sorted(self._randomgen.sample(xrange(len(data_chunk)), rand_per_chunk)) 60 | data_chunk = data_chunk[choice] 61 | 62 | if data is None: 63 | data = data_chunk 64 | else: 65 | data = numpy.concatenate((data, data_chunk), axis=0) 66 | 67 | if len(data) >= size: 68 | break 69 | 70 | self.close() 71 | 72 | if shuffle == True: 73 | partition = range(len(data)) 74 | self._randomgen.shuffle(partition) 75 | data = data[partition] 76 | data = data[:size] 77 | 78 | return self._get_patterns_labels(data) 79 | 80 | def get_chunk(self, extract=True): 81 | 82 | chunk = self._reader.get_chunk() 83 | data_chunk = self._transform_csv(chunk) 84 | 85 | if extract == True: 86 | data_chunk = self._get_patterns_labels(data_chunk) 87 | 88 | return data_chunk 89 | 90 | def transform(self, chunk): 91 | 92 | return chunk.ix[:,:].values 93 | 94 | def _get_patterns_labels(self, data): 95 | 96 | if self.patterns == True and self.target == True: 97 | 98 | X = numpy.ascontiguousarray(data[:, self.patterns_columns]) 99 | y = numpy.ascontiguousarray(data[:, self.target_column]) 100 | return X, y 101 | 102 | elif self.patterns == True: 103 | 104 | X = numpy.ascontiguousarray(data[:, self.patterns_columns]) 105 | return X 106 | 107 | elif self.target == True: 108 | 109 | y = numpy.ascontiguousarray(data[:, self.target_column]) 110 | return y 111 | 112 | raise Exception("Both patterns and target are set to False!") 113 | 114 | 115 | def get_all(self): 116 | 117 | self.reset() 118 | 119 | data = None 120 | 121 | while True: 122 | 123 | try: 124 | 125 | data_chunk = self.get_chunk(extract=False) 126 | 127 | except Exception as e: 128 | break 129 | 130 | if data is None: 131 | data = data_chunk 132 | else: 133 | data = numpy.concatenate((data, data_chunk), axis=0) 134 | 135 | return self._get_patterns_labels(data) 136 | -------------------------------------------------------------------------------- /woody/data/higgs.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2017 Fabian Gieseke 3 | # License: GPL v2 4 | # 5 | 6 | 7 | import os 8 | import numpy 9 | import pandas 10 | 11 | from woody.io import DataGenerator 12 | 13 | from .util import check_and_download, save_to_h5pd 14 | 15 | ALLOWED_TRAIN_SIZES = [500000, 1000000, 16 | 1500000, 2000000, 17 | 2500000, 3000000, 18 | 3500000, 4000000, 19 | 4500000, 5000000, 20 | 5500000, 6000000, 21 | 6500000, 7000000, 22 | 7500000, 8000000, 23 | 8500000, 9000000, 24 | 9500000, 10000000] 25 | 26 | def get_higgs_files(data_path, train_size=1000000): 27 | 28 | assert train_size <= 10000000 29 | 30 | fname = os.path.join(data_path, "higgs/HIGGS.csv") 31 | check_and_download(fname) 32 | 33 | fname_train = os.path.join(data_path, "higgs/HIGGS.train_%s.csv" % str(train_size)) 34 | fname_test = os.path.join(data_path, "higgs/HIGGS.test.csv") 35 | 36 | if not os.path.exists(fname_train): 37 | os.system("sed -n '%i,%ip;%iq' < %s > %s" % (1, train_size, train_size, fname, fname_train)) 38 | if not os.path.exists(fname_test): 39 | os.system("sed -n '%i,%ip;%iq' < %s > %s" % (10000001, 11000000, 11000000, fname, fname_test)) 40 | 41 | return fname_train, fname_test 42 | 43 | def get_higgs_data(data_path, train_size=1000000, shuffle_train=False, 
shuffle_test=False, seed=0): 44 | 45 | assert train_size in ALLOWED_TRAIN_SIZES 46 | 47 | numpy.random.seed(seed) 48 | fname_train, fname_test = get_higgs_files(data_path, train_size) 49 | 50 | # training data 51 | label_col = 0 52 | features_cols = range(1,29) 53 | 54 | data = pandas.read_csv(fname_train, dtype="float", header=None) 55 | ytrain = numpy.ascontiguousarray(data.ix[:,label_col].values) 56 | Xtrain = numpy.ascontiguousarray(data.ix[:,features_cols].values) 57 | 58 | data = pandas.read_csv(fname_test, dtype="float", header=None) 59 | ytest = numpy.ascontiguousarray(data.ix[:,label_col].values) 60 | Xtest = numpy.ascontiguousarray(data.ix[:,features_cols].values) 61 | 62 | if shuffle_train == True: 63 | train_partition = numpy.random.permutation(Xtrain.shape[0]) 64 | Xtrain = Xtrain[train_partition] 65 | ytrain = ytrain[train_partition] 66 | 67 | if shuffle_test == True: 68 | test_partition = numpy.random.permutation(Xtest.shape[0]) 69 | Xtest = Xtest[test_partition] 70 | ytest = ytest[test_partition] 71 | 72 | return Xtrain, ytrain, Xtest, ytest 73 | 74 | def _convert_higgs_data(data_path, train_size): 75 | 76 | X_train, y_train, X_test, y_test = get_higgs_data(data_path, train_size=train_size, shuffle_train=False, shuffle_test=False) 77 | 78 | fname_store_train = os.path.join(data_path, "higgs/HIGGS.train_%s.h5pd" % str(train_size)) 79 | fname_store_test = os.path.join(data_path, "higgs/HIGGS.test.h5pd") 80 | 81 | save_to_h5pd(X_train, y_train, fname_store_train) 82 | save_to_h5pd(X_test, y_test, fname_store_test) 83 | 84 | def get_higgs_generator(data_path, train_size=1000000, store="h5", seed=0, part="train", patterns=True, target=True): 85 | 86 | if store == "h5": 87 | 88 | if part=="train": 89 | fname = os.path.join(data_path, "higgs/HIGGS.train_%s.h5pd" % str(train_size)) 90 | elif part=="test": 91 | fname = os.path.join(data_path, "higgs/HIGGS.test.h5pd") 92 | 93 | if not os.path.exists(fname): 94 | print("Store for higgs data does not exist. Generating all stores ...") 95 | _convert_higgs_data(data_path, train_size) 96 | 97 | if part == "test": 98 | chunksize = 250000 99 | else: 100 | if train_size <= 2000000: 101 | chunksize = 500000 102 | else: 103 | chunksize = 2000000 104 | 105 | return DataGenerator(fname=fname, seed=seed, patterns=patterns, target=target, chunksize=chunksize) 106 | 107 | elif store == "mem": 108 | 109 | X_train, y_train, X_test, y_test = get_higgs_data(data_path, train_size=train_size, shuffle_train=False, shuffle_test=False) 110 | 111 | data = {} 112 | if part == "train": 113 | data['X'] = X_train 114 | data['y'] = y_train 115 | else: 116 | data['X'] = X_test 117 | data['y'] = y_test 118 | 119 | return DataGenerator(data=data, seed=seed, patterns=patterns, target=target, chunksize=10000000) 120 | -------------------------------------------------------------------------------- /experiments/landsat/sk.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append(".") 3 | 4 | import params 5 | from util import evaluate 6 | 7 | import os 8 | import time 9 | import json 10 | 11 | from woody.util import ensure_dir_for_file 12 | from woody.data import * 13 | from woody.io import DataGenerator 14 | 15 | def single_run(dkey, train_size, param, seed, profile=False): 16 | 17 | print("Processing data set %s with train_size %s and parameters %s ..." 
% (str(dkey), str(train_size), str(param))) 18 | 19 | if dkey == "landsat": 20 | 21 | # TODO: Download file manually if needed (9,7GB and 524MB): 22 | # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_train_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd 23 | # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd 24 | 25 | # TODO: Adapt paths accordingly 26 | fname_train = "data/landsat_train_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd" 27 | fname_test = "data/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd" 28 | 29 | traingen = DataGenerator(fname=fname_train, seed=seed, patterns=True, target=True, chunksize=1000000, n_lines_max=train_size) 30 | testgen = DataGenerator(fname=fname_test, seed=seed, patterns=True, target=True, chunksize=1000000, n_lines_max=20000000) 31 | 32 | else: 33 | raise Exception("Unknown data set!") 34 | 35 | Xtrain, ytrain = traingen.get_all() 36 | Xtest, ytest = testgen.get_all() 37 | 38 | print("") 39 | print("Number of training patterns:\t%i" % Xtrain.shape[0]) 40 | print("Number of test patterns:\t%i" % Xtest.shape[0]) 41 | print("Dimensionality of the data:\t%i\n" % Xtrain.shape[1]) 42 | 43 | if param['tree_type'] == "randomized": 44 | from sklearn.ensemble import ExtraTreesClassifier as RF 45 | elif param['tree_type'] == "standard": 46 | from sklearn.ensemble import RandomForestClassifier as RF 47 | 48 | model = RF( 49 | n_estimators=param['n_estimators'], 50 | criterion="gini", 51 | max_features=param['max_features'], 52 | min_samples_split=2, 53 | n_jobs=param['n_jobs'], 54 | random_state=seed, 55 | bootstrap=param['bootstrap'], 56 | min_samples_leaf=1, 57 | max_depth=None, 58 | verbose=0) 59 | 60 | if profile == True: 61 | import yep 62 | assert param['n_jobs'] == 1 63 | yep.start("train.prof") 64 | 65 | # training 66 | fit_start_time = time.time() 67 | model.fit(Xtrain, ytrain) 68 | fit_end_time = time.time() 69 | if profile == True: 70 | yep.stop() 71 | ypreds_train = model.predict(Xtrain) 72 | 73 | # testing 74 | test_start_time = time.time() 75 | ypred_test = model.predict(Xtest) 76 | test_end_time = time.time() 77 | 78 | results = {} 79 | results['dataset'] = dkey 80 | results['param'] = param 81 | results['training_time'] = fit_end_time - fit_start_time 82 | results['testing_time'] = test_end_time - test_start_time 83 | print("Training time: %f" % results['training_time']) 84 | print("Testing time: %f" % results['testing_time']) 85 | 86 | evaluate(ypreds_train, ytrain, results, "training") 87 | evaluate(ypred_test, ytest, results, "testing") 88 | 89 | fname = '%s_%s_%s_%s_%s_%s.json' % (str(param['n_estimators']), 90 | str(param['max_features']), 91 | str(param['n_jobs']), 92 | str(param['bootstrap']), 93 | str(param['tree_type']), 94 | str(seed), 95 | ) 96 | fname = os.path.join(params.odir, str(dkey), str(train_size), "sk", fname) 97 | ensure_dir_for_file(fname) 98 | with open(fname, 'w') as fp: 99 | json.dump(results, fp) 100 | 101 | ################################################################################### 102 | import argparse 103 | parser = argparse.ArgumentParser() 104 | parser.add_argument('--dkey', nargs='?', const="covtype", type=str, default="covtype") 105 | parser.add_argument('--train_size', nargs='?', const=0, type=int, default=0) 106 | parser.add_argument('--seed', nargs='?', const=0, type=int, default=0) 107 | parser.add_argument('--key', type=str) 
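# Example invocation (values are placeholders; --key must name an entry in
# params.parameters):
#   python sk.py --dkey landsat --train_size 1000000 --seed 0 --key <key>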
108 | args = parser.parse_args() 109 | dkey, train_size, seed, key = args.dkey, args.train_size, args.seed, args.key 110 | ################################################################################### 111 | 112 | single_run(dkey, train_size, params.parameters[key], seed) 113 | -------------------------------------------------------------------------------- /experiments/small_data/h2.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append(".") 3 | 4 | # test 5 | import params 6 | from util import evaluate 7 | 8 | import os 9 | import time 10 | import json 11 | import numpy 12 | import math 13 | 14 | from woody.util import ensure_dir_for_file 15 | from woody.data import * 16 | 17 | def single_run(dkey, train_size, param, seed, profile=False): 18 | 19 | print("Processing data set %s with train_size %s and parameters %s ..." % (str(dkey), str(train_size), str(param))) 20 | 21 | import h2o 22 | from skutil.h2o import h2o_col_to_numpy 23 | h2o.init(max_mem_size = "12G", nthreads=param['n_jobs']) 24 | h2o.remove_all() 25 | from h2o.estimators.random_forest import H2ORandomForestEstimator 26 | 27 | # get and convert data 28 | if dkey == "covtype": 29 | fname_train, fname_test = covtype_files(train_size=train_size) 30 | train_df = h2o.import_file(fname_train) 31 | test_df = h2o.import_file(fname_test) 32 | Xcols, ycol = train_df.col_names[:-1], train_df.col_names[-1] 33 | elif dkey == "higgs": 34 | fname_train, fname_test = higgs_files(train_size=train_size) 35 | train_df = h2o.import_file(fname_train) 36 | test_df = h2o.import_file(fname_test) 37 | Xcols, ycol = train_df.col_names[1:], train_df.col_names[0] 38 | elif dkey == "susy": 39 | fname_train, fname_test = susy_files(train_size=train_size) 40 | train_df = h2o.import_file(fname_train) 41 | test_df = h2o.import_file(fname_test) 42 | Xcols, ycol = train_df.col_names[1:], train_df.col_names[0] 43 | 44 | print("") 45 | print("Number of training patterns:\t%i" % train_df.shape[0]) 46 | print("Number of test patterns:\t%i" % test_df.shape[0]) 47 | print("Dimensionality of the data:\t%i\n" % train_df.shape[1]) 48 | 49 | if param['max_features'] is None: 50 | mtries = train_df.shape[1] - 2 51 | elif param['max_features'] == "sqrt": 52 | mtries = int(math.sqrt(train_df.shape[1] - 2)) 53 | 54 | if param['bootstrap'] == False: 55 | sample_rate = 1.0 56 | else: 57 | sample_rate = 0.632 58 | 59 | model = H2ORandomForestEstimator( 60 | mtries=mtries, 61 | sample_rate=sample_rate, 62 | #nbins=1000, #crash 63 | min_rows=1, 64 | build_tree_one_node=True, 65 | max_depth=20, 66 | balance_classes=False, 67 | ntrees=param['n_estimators'], 68 | seed=seed) 69 | 70 | # training 71 | fit_start_time = time.time() 72 | model.train(Xcols, ycol, training_frame=train_df) 73 | fit_end_time = time.time() 74 | ypreds_train = model.predict(train_df) 75 | 76 | # testing 77 | test_start_time = time.time() 78 | ypreds_test = model.predict(test_df) 79 | test_end_time = time.time() 80 | 81 | results = {} 82 | results['dataset'] = dkey 83 | results['param'] = param 84 | results['training_time'] = fit_end_time - fit_start_time 85 | results['testing_time'] = test_end_time - test_start_time 86 | print("Training time: %f" % results['training_time']) 87 | print("Testing time: %f" % results['testing_time']) 88 | 89 | evaluate(numpy.rint(ypreds_train.as_data_frame().values), train_df[ycol].as_data_frame().values, results, "training") 90 | evaluate(numpy.rint(ypreds_test.as_data_frame().values), 
test_df[ycol].as_data_frame().values, results, "testing") 91 | 92 | fname = '%s_%s_%s_%s_%s_%s.json' % (str(param['n_estimators']), 93 | str(param['max_features']), 94 | str(param['n_jobs']), 95 | str(param['bootstrap']), 96 | str(param['tree_type']), 97 | str(seed), 98 | ) 99 | 100 | fname = os.path.join(params.odir, str(dkey), str(train_size), "h2", fname) 101 | ensure_dir_for_file(fname) 102 | with open(fname, 'w') as fp: 103 | json.dump(results, fp) 104 | 105 | ################################################################################### 106 | import argparse 107 | parser = argparse.ArgumentParser() 108 | parser.add_argument('--dkey', nargs='?', const="covtype", type=str, default="covtype") 109 | parser.add_argument('--train_size', nargs='?', const=0, type=int, default=0) 110 | parser.add_argument('--seed', nargs='?', const=0, type=int, default=0) 111 | parser.add_argument('--key', type=str) 112 | args = parser.parse_args() 113 | dkey, train_size, seed, key = args.dkey, args.train_size, args.seed, args.key 114 | ################################################################################### 115 | 116 | single_run(dkey, train_size, params.parameters[key], seed) 117 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2015-2018 Fabian Gieseke 3 | # License: GPL v3 4 | # 5 | 6 | import os 7 | import sys 8 | import shutil 9 | from distutils.command.clean import clean 10 | 11 | DISTNAME = 'woody' 12 | DESCRIPTION = 'A Python library for large-scale random forests.' 13 | LONG_DESCRIPTION = open('README.rst').read() 14 | MAINTAINER = 'Fabian Gieseke' 15 | MAINTAINER_EMAIL = 'fabian.gieseke@di.ku.dk' 16 | URL = 'https://github.com/gieseke/woody' 17 | LICENSE = 'GNU GENERAL PUBLIC LICENSE Version 3' 18 | DOWNLOAD_URL = 'https://github.com/gieseke/woody' 19 | 20 | import woody 21 | VERSION = woody.__version__ 22 | 23 | # adapted from scikit-learn 24 | if len(set(('develop', 'release')).intersection(sys.argv)) > 0: 25 | import setuptools 26 | extra_setuptools_args = dict(zip_safe=False) 27 | else: 28 | extra_setuptools_args = dict() 29 | 30 | def configuration(parent_package='', top_path=None): 31 | 32 | from numpy.distutils.misc_util import Configuration 33 | config = Configuration(None, parent_package, top_path) 34 | config.set_options(ignore_setup_xxx_py=True, 35 | assume_default_configuration=True, 36 | delegate_options_to_subpackages=True, 37 | quiet=True) 38 | config.add_subpackage('woody') 39 | 40 | return config 41 | 42 | class CleanCommand(clean): 43 | 44 | description = "Cleaning up code ..." 
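    # run() removes editor backup files, compiled artifacts (.pyc, .so, .pyd, .dll),
    # SWIG-generated wrappers, and the build/, dist/, docs/_build and egg-info
    # directories so that a subsequent build starts from a clean tree.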
45 | 46 | def run(self): 47 | 48 | clean.run(self) 49 | 50 | # remove hidden '~' files 51 | for dirpath, dirnames, filenames in os.walk('.'): 52 | for filename in filenames: 53 | if filename.endswith('~') or filename.endswith('.pyc'): 54 | os.unlink(os.path.join(dirpath, filename)) 55 | 56 | # build related files and directories 57 | if os.path.exists('build'): 58 | shutil.rmtree('build') 59 | if os.path.exists('woody.egg-info'): 60 | shutil.rmtree('woody.egg-info') 61 | if os.path.exists('docs/_build'): 62 | shutil.rmtree('docs/_build') 63 | 64 | # remaining files and directories in woody dir (recursively) 65 | for dirpath, dirnames, filenames in os.walk('woody'): 66 | 67 | for filename in filenames: 68 | if (filename.endswith('.so') or 69 | filename.endswith('.pyd') or 70 | filename.endswith('.dll') or 71 | filename.endswith('.pyc') or 72 | filename.endswith('_wrap.c') or 73 | filename.startswith('wrapper_') or 74 | filename.endswith('~')): 75 | os.unlink(os.path.join(dirpath, filename)) 76 | 77 | for dirname in dirnames: 78 | if dirname == '__pycache__' or dirname == 'build' or dirname == '_build': 79 | shutil.rmtree(os.path.join(dirpath, dirname)) 80 | 81 | try: 82 | shutil.rmtree("dist") 83 | except: 84 | pass 85 | 86 | def setup_package(): 87 | 88 | metadata = dict(name=DISTNAME, 89 | maintainer=MAINTAINER, 90 | maintainer_email=MAINTAINER_EMAIL, 91 | description=DESCRIPTION, 92 | license=LICENSE, 93 | url=URL, 94 | version=VERSION, 95 | download_url=DOWNLOAD_URL, 96 | long_description=LONG_DESCRIPTION, 97 | classifiers=[ 98 | 'Intended Audience :: Science/Research', 99 | 'Intended Audience :: Developers', 100 | 'License :: OSI Approved :: GNU General Public License v2 (GPLv2)', 101 | 'Programming Language :: C', 102 | 'Programming Language :: Python', 103 | 'Programming Language :: Python :: 2', 104 | 'Programming Language :: Python :: 2.6', 105 | 'Programming Language :: Python :: 2.7', 106 | ], 107 | cmdclass={'clean': CleanCommand}, 108 | install_requires=["numpy>=1.6.1"], 109 | include_package_data=True, 110 | package_data={'woody': []}, 111 | **extra_setuptools_args) 112 | 113 | if (len(sys.argv) >= 2 and ('--help' in sys.argv[1:] or sys.argv[1] in ('--version', 'clean'))): 114 | 115 | try: 116 | from setuptools import setup 117 | except ImportError: 118 | from distutils.core import setup 119 | metadata['version'] = VERSION 120 | 121 | else: 122 | 123 | try: 124 | from numpy.distutils.core import setup 125 | metadata['configuration'] = configuration 126 | except: 127 | print("woody requires numpy>=1.6.1") 128 | sys.exit(0) 129 | 130 | setup(**metadata) 131 | 132 | if __name__ == "__main__": 133 | 134 | setup_package() 135 | 136 | -------------------------------------------------------------------------------- /woody/models/forest/src/pqueue.c: -------------------------------------------------------------------------------- 1 | // Adapted from http://rosettacode.org/wiki/Priority_queue#C 2 | 3 | #include "include/pqueue.h" 4 | 5 | /* -------------------------------------------------------------------------------- 6 | * Tests if the queue is empty (first element in array not used to simplify indices) 7 | * -------------------------------------------------------------------------------- 8 | */ 9 | PQUEUE *pqueue_new(int size) { 10 | 11 | if (size < PQUEUE_MIN_SIZE) { 12 | size = PQUEUE_MIN_SIZE; 13 | } 14 | 15 | // allocate space for priority queue 16 | PQUEUE *q = (PQUEUE*) malloc(sizeof(PQUEUE)); 17 | 18 | // allocate space for size queue items 19 | q->buf = (PQUEUE_ITEM*) 
malloc(size * sizeof(PQUEUE_ITEM)); 20 | 21 | // set size and number of elements (first element is not used) 22 | q->alloc = size; 23 | q->n = 1; 24 | 25 | return q; 26 | } 27 | 28 | /* -------------------------------------------------------------------------------- 29 | * Tests if the queue is empty 30 | * -------------------------------------------------------------------------------- 31 | */ 32 | inline int pqueue_is_empty(PQUEUE *q) { 33 | 34 | if (q->n == 1) { 35 | return 1; 36 | } else { 37 | return 0; 38 | } 39 | 40 | } 41 | 42 | /* -------------------------------------------------------------------------------- 43 | * Pushes "data" with priority "pri" 44 | * -------------------------------------------------------------------------------- 45 | */ 46 | void pqueue_push(PQUEUE *q, void *data, int pri) { 47 | 48 | // pointer for queue item 49 | PQUEUE_ITEM *b; 50 | int n, m; 51 | 52 | // allocate more memory if needed 53 | if (q->n >= q->alloc) { 54 | q->alloc *= 2; 55 | b = q->buf = (PQUEUE_ITEM*) realloc(q->buf, 56 | sizeof(PQUEUE_ITEM) * q->alloc); 57 | } else { 58 | b = q->buf; 59 | } 60 | 61 | // append at end and perform an up-heap operation 62 | // (move up in case parent has a larger priority) 63 | n = q->n++; 64 | while ((m = n / 2) && pri < b[m].pri) { 65 | b[n] = b[m]; 66 | n = m; 67 | } 68 | 69 | b[n].data = data; 70 | b[n].pri = pri; 71 | 72 | } 73 | 74 | /* -------------------------------------------------------------------------------- 75 | * Removes top item (or returns 0 if queue is empty); *pri can be NULL. 76 | * -------------------------------------------------------------------------------- 77 | */ 78 | void *pqueue_pop(PQUEUE *q, int *pri) { 79 | 80 | void *out; 81 | if (q->n == 1) { 82 | return 0; 83 | } 84 | 85 | PQUEUE_ITEM *b = q->buf; 86 | 87 | // get item from the root and store priority in *pri if pri!=NULL 88 | out = b[1].data; 89 | if (pri) { 90 | *pri = b[1].pri; 91 | } 92 | 93 | // reduce size by one 94 | --q->n; 95 | 96 | int n = 1, m; 97 | while ((m = n * 2) < q->n) { 98 | 99 | if (m + 1 < q->n && b[m].pri > b[m + 1].pri) { 100 | m++; 101 | } 102 | 103 | if (b[q->n].pri <= b[m].pri) { 104 | break; 105 | } 106 | 107 | b[n] = b[m]; 108 | n = m; 109 | } 110 | b[n] = b[q->n]; 111 | 112 | // reduce size if needed 113 | if (q->n < q->alloc / 2 && q->n >= PQUEUE_MIN_SIZE) { 114 | q->buf = (PQUEUE_ITEM*) realloc(q->buf, (q->alloc /= 2) * sizeof(b[0])); 115 | } 116 | 117 | // return data 118 | return out; 119 | 120 | } 121 | 122 | /* -------------------------------------------------------------------------------- 123 | * Returns the top of the queue 124 | * -------------------------------------------------------------------------------- 125 | */ 126 | inline void* pqueue_top(PQUEUE *q, int *pri) { 127 | if (q->n == 1) { 128 | return NULL; 129 | } 130 | if (pri) { 131 | *pri = q->buf[1].pri; 132 | } 133 | return q->buf[1].data; 134 | } 135 | 136 | /* -------------------------------------------------------------------------------- 137 | * Combines/merges two queues 138 | * -------------------------------------------------------------------------------- 139 | */ 140 | void pqueue_combine(PQUEUE *q1, PQUEUE *q2) { 141 | int i; 142 | PQUEUE_ITEM *e = q2->buf + 1; 143 | 144 | for (i = q2->n - 1; i >= 1; i--, e++) { 145 | pqueue_push(q1, e->data, e->pri); 146 | } 147 | 148 | pqueue_purge(q2); 149 | 150 | } 151 | 152 | /*int main() { 153 | int i, p; 154 | char *c, *tasks[] = { "Clear drains", "Feed cat", "Make tea", "Solve RC tasks", "Tax return" }; 155 | int pri[] = 
{ 3, 4, 5, 1, 2 }; 156 | 157 | //make two queues 158 | PQUEUE *q = pqueue_new(0); 159 | PQUEUE *q2 = pqueue_new(0); 160 | 161 | //push all 5 tasks into q 162 | for (i = 0; i < 5; i++) 163 | pqueue_push(q, tasks[i], pri[i]); 164 | 165 | //pop them and print one by one 166 | while ((c = pqueue_pop(q, &p))) 167 | printf("%d: %s\n", p, c); 168 | 169 | //put a million random tasks in each queue 170 | for (i = 0; i < 1 << 20; i++) { 171 | p = rand() / ( RAND_MAX / 5); 172 | pqueue_push(q, tasks[p], pri[p]); 173 | 174 | p = rand() / ( RAND_MAX / 5); 175 | pqueue_push(q2, tasks[p], pri[p]); 176 | } 177 | 178 | printf("\nq has %d items, q2 has %d items\n", pqueue_size(q), pqueue_size(q2)); 179 | 180 | // merge q2 into q; q2 is empty 181 | pqueue_combine(q, q2); 182 | printf("After merge, q has %d items, q2 has %d items\n", pqueue_size(q), 183 | pqueue_size(q2)); 184 | 185 | // pop q until it's empty 186 | for (i = 0; (c = pqueue_pop(q, 0)); i++) 187 | ; 188 | printf("Popped %d items out of q\n", i); 189 | 190 | return 0; 191 | }*/ 192 | -------------------------------------------------------------------------------- /experiments/small_data/hugewood_lam.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append(".") 3 | 4 | import os 5 | import json 6 | from util import evaluate 7 | import params 8 | 9 | import time 10 | 11 | from woody import HugeWoodClassifier, WoodClassifier 12 | 13 | from woody.io import MemoryStore, DiskStore 14 | from woody.util import ensure_dir_for_file 15 | from woody.data import * 16 | 17 | def single_run(dkey, train_size, param, seed, profile=False): 18 | 19 | print("Processing data set %s with train_size %s, seed %s, and parameters %s ..." % (str(dkey), str(train_size), str(seed), str(param))) 20 | 21 | if dkey == "covtype": 22 | traingen, testgen = covtype_generators(train_size=train_size, store="mem", seed=seed) 23 | elif dkey == "higgs": 24 | traingen, testgen = higgs_generators(train_size=train_size, store="mem", seed=seed) 25 | elif dkey == "susy": 26 | traingen, testgen = susy_generators(train_size=train_size, store="mem", seed=seed) 27 | else: 28 | raise Exception("Unknown data set!") 29 | 30 | print("") 31 | print("Number of training patterns:\t%i" % traingen.get_shapes()[0][0]) 32 | print("Number of test patterns:\t%i" % testgen.get_shapes()[0][0]) 33 | print("Dimensionality of the data:\t%i\n" % traingen.get_shapes()[0][1]) 34 | 35 | param_wood = param['param_wood'] 36 | 37 | wood = WoodClassifier( 38 | n_estimators=1, 39 | criterion="gini", 40 | max_features=param_wood['max_features'], 41 | min_samples_split=2, 42 | n_jobs=param_wood['n_jobs'], 43 | seed=seed, 44 | bootstrap=param_wood['bootstrap'], 45 | tree_traversal_mode="dfs", 46 | tree_type=param_wood['tree_type'], 47 | min_samples_leaf=1, 48 | float_type="double", 49 | max_depth=None, 50 | verbose=0) 51 | top_tree_lambda = 0.1 52 | model = HugeWoodClassifier( 53 | n_estimators=param['n_estimators'], 54 | n_estimators_bottom=param['n_estimators_bottom'], 55 | n_top="auto", 56 | n_patterns_leaf="auto", 57 | balanced_top_tree=True, 58 | top_tree_lambda=top_tree_lambda, 59 | top_tree_max_depth=None, 60 | top_tree_type="standard", 61 | top_tree_leaf_stopping_mode="ignore_impurity", 62 | n_jobs=param_wood['n_jobs'], 63 | seed=seed, 64 | verbose=1, 65 | plot_intermediate={}, 66 | chunk_max_megabytes=2048, 67 | wrapped_instance=wood, 68 | store=MemoryStore(), 69 | ) 70 | 71 | # training 72 | if profile == True: 73 | import yep 74 | assert 
param_wood['n_jobs'] == 1 75 | yep.start("train.prof") 76 | 77 | fit_start_time = time.time() 78 | model.fit(traingen) 79 | fit_end_time = time.time() 80 | if profile == True: 81 | yep.stop() 82 | ypreds_train = model.predict(generator=traingen) 83 | 84 | # testing 85 | test_start_time = time.time() 86 | ypred_test = model.predict(generator=testgen) 87 | test_end_time = time.time() 88 | 89 | results = {} 90 | results['dataset'] = dkey 91 | results['param'] = param 92 | results['training_time'] = fit_end_time - fit_start_time 93 | results['testing_time'] = test_end_time - test_start_time 94 | print("Training time:\t\t%f" % results['training_time']) 95 | print("Testing time:\t\t%f" % results['testing_time']) 96 | 97 | evaluate(ypreds_train, traingen.get_all_target(), results, "training") 98 | evaluate(ypred_test, testgen.get_all_target(), results, "testing") 99 | 100 | fname = '%s_%s_%s_%s_%s_%s.json' % (str(param_wood['n_estimators']), 101 | str(param_wood['max_features']), 102 | str(param_wood['n_jobs']), 103 | str(param_wood['bootstrap']), 104 | str(param_wood['tree_type']), 105 | str(seed), 106 | ) 107 | fname = os.path.join(params.odir, str(dkey), str(train_size), "hugewood_" + str(top_tree_lambda), fname) 108 | ensure_dir_for_file(fname) 109 | with open(fname, 'w') as fp: 110 | json.dump(results, fp) 111 | 112 | del(testgen) 113 | del(traingen) 114 | model.cleanup() 115 | 116 | time.sleep(1) 117 | 118 | ################################################################################### 119 | import argparse 120 | parser = argparse.ArgumentParser() 121 | parser.add_argument('--dkey', nargs='?', const="covtype", type=str, default="covtype") 122 | parser.add_argument('--train_size', nargs='?', const=0, type=int, default=0) 123 | parser.add_argument('--seed', nargs='?', const=0, type=int, default=0) 124 | parser.add_argument('--key', type=str) 125 | args = parser.parse_args() 126 | dkey, train_size, seed, key = args.dkey, args.train_size, args.seed, args.key 127 | ################################################################################### 128 | 129 | single_run(dkey, train_size, params.parameters_hugewood[key], seed) 130 | -------------------------------------------------------------------------------- /experiments/influence_n_bottom/hugewood_10K.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append(".") 3 | 4 | import os 5 | import json 6 | from util import evaluate 7 | import params 8 | 9 | import time 10 | 11 | from woody import HugeWoodClassifier, WoodClassifier 12 | 13 | from woody.io import MemoryStore, DiskStore 14 | from woody.util import ensure_dir_for_file 15 | from woody.data import * 16 | 17 | def single_run(dkey, train_size, n_bottom, param, seed, profile=False): 18 | 19 | print("Processing data set %s with train_size %s, n_bottom %s, seed %s, and parameters %s ..." 
% (str(dkey), str(train_size), str(n_bottom), str(seed), str(param))) 20 | 21 | if dkey == "covtype": 22 | traingen, testgen = covtype_generators(train_size=train_size, store="mem", seed=seed) 23 | elif dkey == "higgs": 24 | traingen, testgen = higgs_generators(train_size=train_size, store="mem", seed=seed) 25 | elif dkey == "susy": 26 | traingen, testgen = susy_generators(train_size=train_size, store="mem", seed=seed) 27 | else: 28 | raise Exception("Unknown data set!") 29 | 30 | print("") 31 | print("Number of training patterns:\t%i" % traingen.get_shapes()[0][0]) 32 | print("Number of test patterns:\t%i" % testgen.get_shapes()[0][0]) 33 | print("Dimensionality of the data:\t%i\n" % traingen.get_shapes()[0][1]) 34 | 35 | param_wood = param['param_wood'] 36 | 37 | wood = WoodClassifier( 38 | n_estimators=1, 39 | criterion="gini", 40 | max_features=param_wood['max_features'], 41 | min_samples_split=2, 42 | n_jobs=param_wood['n_jobs'], 43 | seed=seed, 44 | bootstrap=param_wood['bootstrap'], 45 | tree_traversal_mode="dfs", 46 | tree_type=param_wood['tree_type'], 47 | min_samples_leaf=1, 48 | float_type="double", 49 | max_depth=None, 50 | verbose=0) 51 | 52 | model = HugeWoodClassifier( 53 | n_estimators=int(24 / n_bottom), 54 | n_estimators_bottom=int(n_bottom), 55 | n_top="auto", 56 | n_patterns_leaf=10000, 57 | balanced_top_tree=True, 58 | top_tree_lambda=1.0, 59 | top_tree_max_depth=None, 60 | top_tree_type="standard", 61 | top_tree_leaf_stopping_mode="ignore_impurity", 62 | n_jobs=param_wood['n_jobs'], 63 | seed=seed, 64 | verbose=1, 65 | plot_intermediate={}, 66 | chunk_max_megabytes=2048, 67 | wrapped_instance=wood, 68 | store=MemoryStore(), 69 | ) 70 | 71 | # training 72 | if profile == True: 73 | import yep 74 | assert param_wood['n_jobs'] == 1 75 | yep.start("train.prof") 76 | 77 | fit_start_time = time.time() 78 | model.fit(traingen) 79 | fit_end_time = time.time() 80 | if profile == True: 81 | yep.stop() 82 | ypreds_train = model.predict(generator=traingen) 83 | 84 | # testing 85 | test_start_time = time.time() 86 | ypred_test = model.predict(generator=testgen) 87 | test_end_time = time.time() 88 | 89 | results = {} 90 | results['dataset'] = dkey 91 | results['param'] = param 92 | results['training_time'] = fit_end_time - fit_start_time 93 | results['testing_time'] = test_end_time - test_start_time 94 | print("Training time:\t\t%f" % results['training_time']) 95 | print("Testing time:\t\t%f" % results['testing_time']) 96 | 97 | evaluate(ypreds_train, traingen.get_all_target(), results, "training") 98 | evaluate(ypred_test, testgen.get_all_target(), results, "testing") 99 | 100 | fname = '%s_%s_%s_%s_%s_%s.json' % (str(param_wood['n_estimators']), 101 | str(param_wood['max_features']), 102 | str(param_wood['n_jobs']), 103 | str(param_wood['bootstrap']), 104 | str(param_wood['tree_type']), 105 | str(seed), 106 | ) 107 | fname = os.path.join(params.odir, str(dkey), str(train_size), str(n_bottom), "hugewood_10K", fname) 108 | ensure_dir_for_file(fname) 109 | with open(fname, 'w') as fp: 110 | json.dump(results, fp) 111 | 112 | del(testgen) 113 | del(traingen) 114 | model.cleanup() 115 | 116 | time.sleep(1) 117 | 118 | ################################################################################### 119 | import argparse 120 | parser = argparse.ArgumentParser() 121 | parser.add_argument('--dkey', nargs='?', const="covtype", type=str, default="covtype") 122 | parser.add_argument('--train_size', nargs='?', const=0, type=int, default=0) 123 | parser.add_argument('--seed', nargs='?', 
const=0, type=int, default=0) 124 | parser.add_argument('--key', type=str) 125 | parser.add_argument('--n_bottom', nargs='?', const=0.0, type=float, default=0.0) 126 | args = parser.parse_args() 127 | dkey, train_size, seed, key, n_bottom = args.dkey, args.train_size, args.seed, args.key, args.n_bottom 128 | ################################################################################### 129 | 130 | single_run(dkey, train_size, n_bottom, params.parameters_hugewood[key], seed) 131 | -------------------------------------------------------------------------------- /experiments/influence_n_bottom/hugewood_1K.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append(".") 3 | 4 | import os 5 | import json 6 | from util import evaluate 7 | import params 8 | 9 | import time 10 | 11 | from woody import HugeWoodClassifier, WoodClassifier 12 | 13 | from woody.io import MemoryStore, DiskStore 14 | from woody.util import ensure_dir_for_file 15 | from woody.data import * 16 | 17 | def single_run(dkey, train_size, n_bottom, param, seed, profile=False): 18 | 19 | print("Processing data set %s with train_size %s, n_bottom %s, seed %s, and parameters %s ..." % (str(dkey), str(train_size), str(n_bottom), str(seed), str(param))) 20 | 21 | if dkey == "covtype": 22 | traingen, testgen = covtype_generators(train_size=train_size, store="mem", seed=seed) 23 | elif dkey == "higgs": 24 | traingen, testgen = higgs_generators(train_size=train_size, store="mem", seed=seed) 25 | elif dkey == "susy": 26 | traingen, testgen = susy_generators(train_size=train_size, store="mem", seed=seed) 27 | else: 28 | raise Exception("Unknown data set!") 29 | 30 | print("") 31 | print("Number of training patterns:\t%i" % traingen.get_shapes()[0][0]) 32 | print("Number of test patterns:\t%i" % testgen.get_shapes()[0][0]) 33 | print("Dimensionality of the data:\t%i\n" % traingen.get_shapes()[0][1]) 34 | 35 | param_wood = param['param_wood'] 36 | 37 | wood = WoodClassifier( 38 | n_estimators=1, 39 | criterion="gini", 40 | max_features=param_wood['max_features'], 41 | min_samples_split=2, 42 | n_jobs=param_wood['n_jobs'], 43 | seed=seed, 44 | bootstrap=param_wood['bootstrap'], 45 | tree_traversal_mode="dfs", 46 | tree_type=param_wood['tree_type'], 47 | min_samples_leaf=1, 48 | float_type="double", 49 | max_depth=None, 50 | verbose=0) 51 | 52 | model = HugeWoodClassifier( 53 | n_estimators=int(24 / n_bottom), 54 | n_estimators_bottom=int(n_bottom), 55 | n_top="auto", 56 | n_patterns_leaf=1000, 57 | balanced_top_tree=True, 58 | top_tree_lambda=1.0, 59 | top_tree_max_depth=None, 60 | top_tree_type="standard", 61 | top_tree_leaf_stopping_mode="ignore_impurity", 62 | n_jobs=param_wood['n_jobs'], 63 | seed=seed, 64 | verbose=1, 65 | plot_intermediate={}, 66 | chunk_max_megabytes=2048, 67 | wrapped_instance=wood, 68 | store=MemoryStore(), 69 | ) 70 | 71 | # training 72 | if profile == True: 73 | import yep 74 | assert param_wood['n_jobs'] == 1 75 | yep.start("train.prof") 76 | 77 | fit_start_time = time.time() 78 | model.fit(traingen) 79 | fit_end_time = time.time() 80 | if profile == True: 81 | yep.stop() 82 | ypreds_train = model.predict(generator=traingen) 83 | 84 | # testing 85 | test_start_time = time.time() 86 | ypred_test = model.predict(generator=testgen) 87 | test_end_time = time.time() 88 | 89 | results = {} 90 | results['dataset'] = dkey 91 | results['param'] = param 92 | results['training_time'] = fit_end_time - fit_start_time 93 | results['testing_time'] = test_end_time 
- test_start_time 94 | print("Training time:\t\t%f" % results['training_time']) 95 | print("Testing time:\t\t%f" % results['testing_time']) 96 | 97 | evaluate(ypreds_train, traingen.get_all_target(), results, "training") 98 | evaluate(ypred_test, testgen.get_all_target(), results, "testing") 99 | 100 | fname = '%s_%s_%s_%s_%s_%s.json' % (str(param_wood['n_estimators']), 101 | str(param_wood['max_features']), 102 | str(param_wood['n_jobs']), 103 | str(param_wood['bootstrap']), 104 | str(param_wood['tree_type']), 105 | str(seed), 106 | ) 107 | fname = os.path.join(params.odir, str(dkey), str(train_size), str(n_bottom), "hugewood_1K", fname) 108 | ensure_dir_for_file(fname) 109 | with open(fname, 'w') as fp: 110 | json.dump(results, fp) 111 | 112 | del(testgen) 113 | del(traingen) 114 | model.cleanup() 115 | 116 | time.sleep(1) 117 | 118 | ################################################################################### 119 | import argparse 120 | parser = argparse.ArgumentParser() 121 | parser.add_argument('--dkey', nargs='?', const="covtype", type=str, default="covtype") 122 | parser.add_argument('--train_size', nargs='?', const=0, type=int, default=0) 123 | parser.add_argument('--seed', nargs='?', const=0, type=int, default=0) 124 | parser.add_argument('--key', type=str) 125 | parser.add_argument('--n_bottom', nargs='?', const=0.0, type=float, default=0.0) 126 | args = parser.parse_args() 127 | dkey, train_size, seed, key, n_bottom = args.dkey, args.train_size, args.seed, args.key, args.n_bottom 128 | ################################################################################### 129 | 130 | single_run(dkey, train_size, n_bottom, params.parameters_hugewood[key], seed) 131 | -------------------------------------------------------------------------------- /experiments/influence_n_bottom/hugewood_75K.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append(".") 3 | 4 | import os 5 | import json 6 | from util import evaluate 7 | import params 8 | 9 | import time 10 | 11 | from woody import HugeWoodClassifier, WoodClassifier 12 | 13 | from woody.io import MemoryStore, DiskStore 14 | from woody.util import ensure_dir_for_file 15 | from woody.data import * 16 | 17 | def single_run(dkey, train_size, n_bottom, param, seed, profile=False): 18 | 19 | print("Processing data set %s with train_size %s, n_bottom %s, seed %s, and parameters %s ..." 
% (str(dkey), str(train_size), str(n_bottom), str(seed), str(param))) 20 | 21 | if dkey == "covtype": 22 | traingen, testgen = covtype_generators(train_size=train_size, store="mem", seed=seed) 23 | elif dkey == "higgs": 24 | traingen, testgen = higgs_generators(train_size=train_size, store="mem", seed=seed) 25 | elif dkey == "susy": 26 | traingen, testgen = susy_generators(train_size=train_size, store="mem", seed=seed) 27 | else: 28 | raise Exception("Unknown data set!") 29 | 30 | print("") 31 | print("Number of training patterns:\t%i" % traingen.get_shapes()[0][0]) 32 | print("Number of test patterns:\t%i" % testgen.get_shapes()[0][0]) 33 | print("Dimensionality of the data:\t%i\n" % traingen.get_shapes()[0][1]) 34 | 35 | param_wood = param['param_wood'] 36 | 37 | wood = WoodClassifier( 38 | n_estimators=1, 39 | criterion="gini", 40 | max_features=param_wood['max_features'], 41 | min_samples_split=2, 42 | n_jobs=param_wood['n_jobs'], 43 | seed=seed, 44 | bootstrap=param_wood['bootstrap'], 45 | tree_traversal_mode="dfs", 46 | tree_type=param_wood['tree_type'], 47 | min_samples_leaf=1, 48 | float_type="double", 49 | max_depth=None, 50 | verbose=0) 51 | 52 | model = HugeWoodClassifier( 53 | n_estimators=int(24 / n_bottom), 54 | n_estimators_bottom=int(n_bottom), 55 | n_top="auto", 56 | n_patterns_leaf=75000, 57 | balanced_top_tree=True, 58 | top_tree_lambda=1.0, 59 | top_tree_max_depth=None, 60 | top_tree_type="standard", 61 | top_tree_leaf_stopping_mode="ignore_impurity", 62 | n_jobs=param_wood['n_jobs'], 63 | seed=seed, 64 | verbose=1, 65 | plot_intermediate={}, 66 | chunk_max_megabytes=2048, 67 | wrapped_instance=wood, 68 | store=MemoryStore(), 69 | ) 70 | 71 | # training 72 | if profile == True: 73 | import yep 74 | assert param_wood['n_jobs'] == 1 75 | yep.start("train.prof") 76 | 77 | fit_start_time = time.time() 78 | model.fit(traingen) 79 | fit_end_time = time.time() 80 | if profile == True: 81 | yep.stop() 82 | ypreds_train = model.predict(generator=traingen) 83 | 84 | # testing 85 | test_start_time = time.time() 86 | ypred_test = model.predict(generator=testgen) 87 | test_end_time = time.time() 88 | 89 | results = {} 90 | results['dataset'] = dkey 91 | results['param'] = param 92 | results['training_time'] = fit_end_time - fit_start_time 93 | results['testing_time'] = test_end_time - test_start_time 94 | print("Training time:\t\t%f" % results['training_time']) 95 | print("Testing time:\t\t%f" % results['testing_time']) 96 | 97 | evaluate(ypreds_train, traingen.get_all_target(), results, "training") 98 | evaluate(ypred_test, testgen.get_all_target(), results, "testing") 99 | 100 | fname = '%s_%s_%s_%s_%s_%s.json' % (str(param_wood['n_estimators']), 101 | str(param_wood['max_features']), 102 | str(param_wood['n_jobs']), 103 | str(param_wood['bootstrap']), 104 | str(param_wood['tree_type']), 105 | str(seed), 106 | ) 107 | fname = os.path.join(params.odir, str(dkey), str(train_size), str(n_bottom), "hugewood_75K", fname) 108 | ensure_dir_for_file(fname) 109 | with open(fname, 'w') as fp: 110 | json.dump(results, fp) 111 | 112 | del(testgen) 113 | del(traingen) 114 | model.cleanup() 115 | 116 | time.sleep(1) 117 | 118 | ################################################################################### 119 | import argparse 120 | parser = argparse.ArgumentParser() 121 | parser.add_argument('--dkey', nargs='?', const="covtype", type=str, default="covtype") 122 | parser.add_argument('--train_size', nargs='?', const=0, type=int, default=0) 123 | parser.add_argument('--seed', nargs='?', 
const=0, type=int, default=0) 124 | parser.add_argument('--key', type=str) 125 | parser.add_argument('--n_bottom', nargs='?', const=0.0, type=float, default=0.0) 126 | args = parser.parse_args() 127 | dkey, train_size, seed, key, n_bottom = args.dkey, args.train_size, args.seed, args.key, args.n_bottom 128 | ################################################################################### 129 | 130 | single_run(dkey, train_size, n_bottom, params.parameters_hugewood[key], seed) 131 | --------------------------------------------------------------------------------
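Note (illustrative sketch, not part of the repository): the three influence_n_bottom scripts above (hugewood_1K.py, hugewood_10K.py, hugewood_75K.py) differ only in the n_patterns_leaf value handed to HugeWoodClassifier; each one is driven through the same argparse interface (--dkey, --train_size, --seed, --key, --n_bottom) and writes its results as a JSON file under params.odir. The snippet below is a minimal sketch of how such a sweep could be launched from the repository root. The concrete values used here ("covtype", the training size 500000, the parameter key "default", and the n_bottom values 2, 4, 6) are assumptions chosen for illustration; the actual sweep and its parameter grid live in experiments/influence_n_bottom/launch.py and params.py.

    # sketch_launch_n_bottom.py -- hypothetical helper, not shipped with woody
    import subprocess

    for n_bottom in (2, 4, 6):  # assumed example values for the number of bottom estimators
        subprocess.run([
            "python", "experiments/influence_n_bottom/hugewood_10K.py",
            "--dkey", "covtype",        # data set key dispatched inside single_run()
            "--train_size", "500000",   # assumed training set size
            "--seed", "0",
            "--key", "default",         # assumed key into params.parameters_hugewood
            "--n_bottom", str(n_bottom),
        ], check=True)                  # stop the sweep if any run fails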