├── .gitignore ├── COPYING ├── Makefile ├── README.md ├── addheaders.py ├── cythonsrc └── kmeans.pyx ├── examples ├── dense_200_5_header.txt ├── dense_200_5_noheader.txt ├── examples.py ├── linking_example │ ├── README │ └── main.cpp ├── sparse_randdim10_test_header.txt ├── sparse_randdim10_test_noheader.txt ├── sparse_randdim10_train_header.txt ├── sparse_randdim10_train_noheader.txt └── testing_BF.py ├── setup.py └── src ├── BaseGrowBatch.h ├── BaseGrowBatchMse.h ├── BaseGrowBatchPartitional.h ├── BaseSparseGrowBatch.h ├── BaseSparseGrowBatchMse.h ├── GBMse3v1.h ├── GBMseSimple.h ├── GBPSimple.h ├── SparseGBMse3v1.h ├── SparseGBMseSimple.h ├── YY17v2.h ├── YY17v3.h ├── YY17v5.h ├── YY17v6.h ├── YY21v3.h ├── YY21v4.h ├── YY21v5.h ├── alg_X_selkSN.h ├── arrutilv2copy.h ├── arrutilv2discrete.h ├── arrutilv2l0.h ├── arrutilv2l0blasless.h ├── arrutilv2l0withblas.h ├── arrutilv2l1.h ├── arrutilv2l2.h ├── arrutilv2l3.h ├── arrutilv2minmax.h ├── arrutilv2mse.h ├── barrierutil.cpp ├── barrierutil.h ├── baseYY.h ├── baseYYMNS.h ├── baseYYMSN.h ├── baseYYSMN.h ├── basecluster.h ├── basedensecentroidkmeans.h ├── baseelkan.h ├── baseelkanminibatch.h ├── baseexact.h ├── baseexponion.h ├── basehamerly.h ├── basekmeans.h ├── baseminibatch.h ├── basesimpleexact.h ├── basesimpleminibatch.h ├── basesparseelkan.h ├── basesparseexact.h ├── basesparsekmeans.h ├── basesparseminibatch.h ├── blastemplates.cpp ├── blastemplates.h ├── elkan3v0.h ├── elkan4v2.h ├── elkan5v1.h ├── elkan6v0.h ├── exactsimplebatch.h ├── growbatchapp.h ├── hamerly11v0.h ├── hamerly12v6.h ├── hamerly12v7.h ├── hamerly13v0.h ├── initialise2.h ├── main.cpp ├── mb3v0.h ├── minibatch.h ├── minibatchapp.h ├── optionsutil.cpp ├── optionsutil.h ├── pllcluster.h ├── pllkmeansfuncs.cpp ├── pllkmeansfuncs.hpp ├── pllkmeansfuncs_nonvoid.h ├── pllkmeansfuncs_void.h ├── processingfilename.py ├── randomarray.h ├── randomsparse.h ├── sample.h ├── simple1.h ├── simplest.h ├── sortutil.h ├── sparsedatasets.h ├── sparseelkan3v0.h 
├── sparseinitialise.h ├── sparseminibatch.h ├── sparsesimple.h ├── sparsestandardminibatch.h ├── sparseutil.h ├── standardminibatch.h ├── stringutilbase.cpp ├── stringutilbase.h ├── stringutilclustering.cpp ├── stringutilclustering.h ├── stringutilfile.cpp ├── stringutilfile.h ├── templatedbarrierutil.h └── txtdatasets.h /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.so 3 | *.swp 4 | *.pyc 5 | *.pyxbldc 6 | *.bbl 7 | *.blg 8 | *.log 9 | *bin* 10 | build/* 11 | *junk* 12 | *eakmeans/* 13 | python/batch* 14 | python/other* 15 | bin 16 | build 17 | cythonsrc/kmeans.cpp 18 | lib 19 | obj 20 | 21 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | # Written by James Newling 4 | # All rights reserved. 5 | # 6 | # eakmeans is a library for exact and approximate k-means written in C++ and 7 | # Python. This file is part of eakmeans. See file COPYING for more details. 8 | # 9 | # This file is part of eakmeans. 10 | # 11 | # eakmeans is free software: you can redistribute it and/or modify 12 | # it under the terms of the 3-Clause BSD Licence. See 13 | # https://opensource.org/licenses/BSD-3-Clause for more details. 14 | # 15 | # eakmeans is distributed in the hope that it will be useful, 16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | # COPYING for more details. 19 | # 20 | ########################################################## 21 | #compile with blas? 
USEBLAS = YES
# Paths to the BLAS installation (only used when USEBLAS = YES).
# NOTE: keep comments on their own lines. GNU make keeps the whitespace that
# precedes an inline '#' as part of the variable value, and these values are
# exported to setup.py, which would then receive a path with a trailing space.
# Alternative (home machine) locations:
#   /home/james/openblas/lib
#   /home/james/openblas/include
export LIBBLASDIR=/idiap/user/jnewling/openblas/lib
export INCBLASDIR=/idiap/user/jnewling/openblas/include
##########################################################


CXX := g++
CXXFLAGS := -std=c++11 -O3 -Wall -pedantic -fPIC
# Optional extra linker flag, if needed on your platform: -Wl,--no-as-needed
LINKER := g++
LFLAGS := -lpthread
ifeq ($(USEBLAS), YES)
LFLAGS := ${LFLAGS} -lopenblas -L${LIBBLASDIR}
CXXFLAGS := ${CXXFLAGS} -D WITHBLAS
INCLUDEPATHS = -I${INCBLASDIR}
TARGET := withblaskmeans
LIBNAME := libwithblaskmeans
# setup.py only tests whether WITHBLAS is present in the environment;
# give it an explicit value so the export does not rely on an undefined variable.
export WITHBLAS := YES
else
TARGET := blaslesskmeans
LIBNAME := libblaslesskmeans
# Make sure a WITHBLAS inherited from the caller's environment does not
# make setup.py link the Python module against the BLAS build.
unexport WITHBLAS
endif


SOURCES := $(wildcard src/*.cpp)
# Every object is rebuilt when any header changes (coarse but safe);
# include the .hpp headers too so that edits to them are not missed.
HEADERS := $(wildcard src/*.h src/*.hpp)
OBJECTS := $(SOURCES:src/%.cpp=obj/%.o)
OBJECTS_FORLIB := $(filter-out obj/main.o,$(OBJECTS))

# None of these targets names a file that the rule produces. In particular
# `lib` would otherwise be confused with the lib/ directory created by its
# own recipe, and could be reported as up to date.
.PHONY: all main lib pythonkmeans clean remove

all : main lib pythonkmeans


# Command-line executable: bin/$(TARGET)
main : $(OBJECTS)
	@mkdir -p bin
	$(LINKER) -o bin/${TARGET} $(OBJECTS) $(LFLAGS)
	@echo "Linking for main of ${TARGET} done!"

# Shared library: lib/$(LIBNAME).so (everything except main.o)
lib : $(OBJECTS_FORLIB)
	@mkdir -p lib
	$(CXX) -shared $^ -o lib/$(LIBNAME).so
	@echo "Shared library ${LIBNAME} made!"

# Python extension; setup.py links against the shared library in lib/,
# so the library has to exist first (matters for `make -j` and for
# `make pythonkmeans` run on its own).
pythonkmeans: lib
	python setup.py build_ext -b lib

obj/%.o : src/%.cpp $(HEADERS)
	@mkdir -p obj
	$(CXX) -c $(CXXFLAGS) $(INCLUDEPATHS) $< -o $@
	@echo "compiled "$<" successfully!"


# Remove object files only.
clean:
	rm -f $(OBJECTS)
	@echo "cleanup done!"

# Remove everything that the build generates.
remove: clean
	-rm -f cythonsrc/kmeans.cpp
	-rm -rf build
	-rm -rf bin
	-rm -rf lib
	-rm -rf obj
	@echo "should be 100% clean!"
88 | 89 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | WHAT 2 | ==== 3 | Implementations of fast exact k-means algorithms as described in http://arxiv.org/abs/1602.02514 and implementations of turbo-charged mini-batch k-means as described in http://arxiv.org/pdf/1602.02934 4 | 5 | Three interfaces are provided: 6 | - (LIB) Shared library with accompanying C++ header file 7 | - (EX) Command-line executable 8 | - (PY) Python library 9 | 10 | 11 | REQUIREMENTS 12 | ============ 13 | Minimal installation requirements: 14 | - C++ compiler supporting C++11 15 | - Linux operating system 16 | 17 | Optional but recommended: 18 | - BLAS implementation, we recommend this one : http://www.openblas.net/ 19 | 20 | Specific to Python library: 21 | - Python and Cython 22 | 23 | 24 | CONFIGURATION 25 | ============= 26 | In `Makefile`, set `USEBLAS` to either `NO` or `YES`. 27 | If `USEBLAS = YES`, then set `LIBBLASDIR` and `INCBLASDIR` (unless the BLAS paths will be found automatically). 28 | 29 | 30 | BUILDING 31 | ======== 32 | - For (LIB) and (EX) and (PY) : `make all` 33 | - For (EX) : `make main` 34 | - For (LIB) : `make lib` 35 | 36 | USING 37 | ===== 38 | (EX) If successfully installed, you should find an executable in directory bin. 39 | Run the executable with the -h flag to see the options. 40 | 41 | (LIB) You need to add the lib directory to your LD_LIBRARY_PATH : put the following line in your `~/.bashrc` file: 42 | ``` 43 | export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/path/to/kmeans/lib 44 | ``` 45 | (PY) If successfully installed, you should be able to `import kmeans` when in directory lib. 
46 | To use from a different directory, 47 | 48 | (a) as per (LIB), and 49 | 50 | (b) add the path to lib to your python path, either by: 51 | ``` 52 | export PYTHONPATH=${PYTHONPATH}:/path/to/kmeans/lib 53 | ``` 54 | or directly in your python script : 55 | ``` 56 | import sys 57 | sys.path.insert(0,'/path/to/kmeans/lib') 58 | ``` 59 | Example use is found in `examples/examples.py` 60 | 61 | 62 | 63 | DOESN'T WORK? 64 | ============= 65 | Please contact James Newling at 66 | -------------------------------------------------------------------------------- /addheaders.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | # Written by James Newling 4 | # All rights reserved. 5 | # 6 | # eakmeans is a library for exact and approximate k-means written in C++ and 7 | # Python. This file is part of eakmeans. See file COPYING for more details. 8 | # 9 | # This file is part of eakmeans. 10 | # 11 | # eakmeans is free software: you can redistribute it and/or modify 12 | # it under the terms of the 3-Clause BSD Licence. See 13 | # https://opensource.org/licenses/BSD-3-Clause for more details. 14 | # 15 | # eakmeans is distributed in the hope that it will be useful, 16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | # COPYING for more details. 19 | # 20 | from IPython.core.debugger import Tracer 21 | 22 | 23 | old_rawheader = r"""Copyright (c) 2015 Idiap Research Institute, http://www.idiap.ch/ 24 | Written by James Newling 25 | 26 | eakmeans is a library for exact and approximate k-means written in C++ and Python. This file is part of eakmeans. eakmeans is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License version 3 as published by the Free Software Foundation. 
eakmeans is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with eakmeans. If not, see . 27 | 28 | """ 29 | 30 | old_hashheader = r"""#Copyright (c) 2015 Idiap Research Institute, http://www.idiap.ch/ 31 | #Written by James Newling 32 | 33 | #eakmeans is a library for exact and approximate k-means written in C++ and Python. This file is part of eakmeans. eakmeans is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License version 3 as published by the Free Software Foundation. eakmeans is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with eakmeans. If not, see . 34 | 35 | """ 36 | 37 | old_cppheader = r"""/* 38 | %s 39 | */ 40 | 41 | """%(old_rawheader, ) 42 | 43 | new_rawheader = r""" 44 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 45 | Written by James Newling 46 | All rights reserved. 47 | 48 | eakmeans is a library for exact and approximate k-means written in C++ and 49 | Python. This file is part of eakmeans. See file COPYING for more details. 50 | 51 | This file is part of eakmeans. 52 | 53 | eakmeans is free software: you can redistribute it and/or modify 54 | it under the terms of the 3-Clause BSD Licence. See 55 | https://opensource.org/licenses/BSD-3-Clause for more details. 56 | 57 | eakmeans is distributed in the hope that it will be useful, 58 | but WITHOUT ANY WARRANTY; without even the implied warranty of 59 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See file 60 | COPYING for more details. 61 | """ 62 | 63 | new_hashheader = '\n'.join(['# ' + l for l in new_rawheader.split('\n')]) + '\n' 64 | new_cppheader = "/*%s*/\n\n" % new_rawheader 65 | 66 | import os 67 | import sys 68 | 69 | import commands 70 | 71 | 72 | hfiles = commands.getstatusoutput("find . -name \"*.h\"")[1].split("\n") 73 | cppfiles = commands.getstatusoutput("find . -name \"*.cpp\"")[1].split("\n") 74 | hppfiles = commands.getstatusoutput("find . -name \"*.hpp\"")[1].split("\n") 75 | cppheaderable = hfiles + cppfiles + hppfiles 76 | 77 | makefiles = commands.getstatusoutput("find . -name \"Makefile\"")[1].split("\n") 78 | pyfiles = commands.getstatusoutput("find . -name \"*.py\"")[1].split("\n") 79 | pyxfiles = commands.getstatusoutput("find . -name \"*.pyx\"")[1].split("\n") 80 | pyxbldfiles = commands.getstatusoutput("find . -name \"*.pyxbld\"")[1].split("\n") 81 | hashheaderable = makefiles + pyfiles + pyxfiles + pyxbldfiles 82 | 83 | 84 | for files, old_header, new_header in zip( 85 | [cppheaderable, hashheaderable], 86 | [old_cppheader, old_hashheader], 87 | [new_cppheader, new_hashheader] 88 | ): 89 | for fn in files: 90 | 91 | if fn: 92 | sys.stdout.write("headering " + fn +"...") 93 | 94 | filly = open(fn, "r") 95 | lines = filly.read() 96 | filly.close() 97 | 98 | if lines.startswith(old_header): 99 | lines = lines[len(old_header):] 100 | 101 | if lines.startswith(new_header): 102 | # already with header, skip this file 103 | sys.stdout.write(" already done, skip.\n") 104 | continue 105 | 106 | filly = open(fn, "w") 107 | filly.write(new_header) 108 | filly.write(lines) 109 | filly.close() 110 | sys.stdout.write(" done.\n") 111 | -------------------------------------------------------------------------------- /examples/examples.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | # Written by James Newling 4 | # All 
rights reserved. 5 | # 6 | # eakmeans is a library for exact and approximate k-means written in C++ and 7 | # Python. This file is part of eakmeans. See file COPYING for more details. 8 | # 9 | # This file is part of eakmeans. 10 | # 11 | # eakmeans is free software: you can redistribute it and/or modify 12 | # it under the terms of the 3-Clause BSD Licence. See 13 | # https://opensource.org/licenses/BSD-3-Clause for more details. 14 | # 15 | # eakmeans is distributed in the hope that it will be useful, 16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | # COPYING for more details. 19 | # 20 | import numpy as np 21 | import numpy.random as npr 22 | import time 23 | from IPython.core.debugger import Tracer 24 | 25 | import sys 26 | sys.path.insert(0, "../lib") 27 | 28 | import kmeans 29 | reload(kmeans) 30 | 31 | 32 | def example_1(ndata = 1e4, dimension = 100, dtype = np.float64): 33 | """ 34 | basic use : cluster random data 35 | """ 36 | data = npr.randn(ndata, dimension).astype(dtype) 37 | clustering = kmeans.get_clustering(X = data, n_clusters = 60, algorithm = 'auto', verbose = 2) 38 | 39 | 40 | def example_2(): 41 | """ 42 | compare algorithms which may be good in low-d 43 | ham, ann, expSN, expNS, syinSN, syinNS, yin, 44 | on dataset ldfpads.txt (~160000 points in 3 dimensions) 45 | """ 46 | data = np.loadtxt('ldfpads.txt') 47 | print "Data shape : ", data.shape 48 | seed = npr.randint(100000) 49 | algs = ['ham','ann', 'exp-sn', 'exp-ns', 'syin-sn', 'syin-ns', 'yin'] 50 | 51 | times = dict.fromkeys(algs) 52 | for alg in algs: 53 | clustering = kmeans.get_clustering(X = data, n_clusters = 1000, algorithm = alg, verbose = 1, n_threads = 1, seed = seed) 54 | times[alg] = clustering['duration'] 55 | 56 | 57 | print "TIMES:" 58 | for alg in algs: 59 | print alg, " : ", times[alg] 60 | 61 | 62 | def example_3(): 63 | """ 64 | compare selkNS to standard algorithm on random 
data. selkNS is faster, but not by as much as when the data has structure. 65 | """ 66 | algs = ['sta','selk-ns'] 67 | times = dict.fromkeys(algs) 68 | data = npr.randn(50000, 25).astype(np.float64) 69 | seed = 1011 70 | for alg in algs: 71 | clustering = kmeans.get_clustering(X = data, n_clusters = 1000, algorithm = alg, verbose = 1, n_threads = 1, seed = seed) 72 | times[alg] = clustering['duration'] 73 | 74 | 75 | print "TIMES:" 76 | for alg in algs: 77 | print alg, " : ", times[alg] 78 | 79 | 80 | def example_4(): 81 | """ 82 | compare to scikitlearn implementation of kmeans 83 | """ 84 | 85 | import sklearn.cluster as skc 86 | import time 87 | 88 | ndata = 50000 89 | dimension = 10 90 | ncentroids = 1000 91 | data = npr.randn(ndata, dimension).astype(np.float64) 92 | 93 | centroids0 = data[0:ncentroids, :] 94 | 95 | t0 = time.time() 96 | kmeans.get_clustering(X = data, init = centroids0, n_clusters = ncentroids, algorithm = 'auto', verbose = 1, n_threads = 1) 97 | t1 = time.time() 98 | 99 | sklearner = skc.k_means(X = data, n_clusters = ncentroids, max_iter = 1000, n_init = 1, init = centroids0, precompute_distances = False, verbose = True, n_jobs = 1, return_n_iter = True, tol = 0.0) 100 | t2 = time.time() 101 | 102 | kmeans_time = t1 - t0 103 | sklearner_time = t2 - t1 104 | 105 | print "sklearn : ", sklearner_time, " s" 106 | print "this kmeans: ", kmeans_time, " s" 107 | 108 | 109 | 110 | -------------------------------------------------------------------------------- /examples/linking_example/README: -------------------------------------------------------------------------------- 1 | at terminal (assuming you've installed with blas and that openblas .so file is in ../../openblas/lib): 2 | g++ -std=c++11 -I../src/ -L../lib/ -lwithblaskmeans -L../../openblas/lib -lopenblas main.cpp -o xme 3 | 4 | at terminal (assuming no blas) 5 | g++ -std=c++11 -I../src/ -L../lib/ -lblaslesskmeans main.cpp -o xme 6 | 
-------------------------------------------------------------------------------- /examples/linking_example/main.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 19 | */ 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | #include "pllkmeansfuncs_nonvoid.h" 28 | 29 | 30 | 31 | int main(){ 32 | 33 | 34 | /* full list of algorithms can be found in pllcluster.h, for exact k-means on dense data, you'll probably want one of: 35 | * exp-ns (exponion - ns), selk-ns (simplified elkan - ns), syin-ns (simplified yinyang - ns) 36 | * See ICML paper Fast K-Means with Accurate Bounds, or --help flag in executable or python function string for more info. 
37 | * */ 38 | std::string algorithm = "exp-ns"; 39 | 40 | size_t nthreads = 2; 41 | 42 | /* we make some data */ 43 | size_t ndata = 10000; 44 | size_t dimension = 3; 45 | std::vector v_data (ndata*dimension, 0); 46 | for (size_t i = 0; i < ndata*dimension; ++i){ 47 | v_data[i]= static_cast (rand() % 100); 48 | } 49 | 50 | size_t ncentroids = 100; 51 | int cout_verbosity = 2; 52 | std::string initialisation_method = "from_indices"; 53 | const float * const C_init = nullptr; 54 | std::vector v_data_indices_init_from(ncentroids); 55 | std::iota(v_data_indices_init_from.begin(), v_data_indices_init_from.end(), 0); 56 | bool setseed = true; 57 | size_t seed = 1011; 58 | float maxtime = 1000; 59 | size_t maxrounds = 1000; 60 | size_t minibatchsize = 0; 61 | bool capture_verbose = false; 62 | 63 | 64 | std::cout << "entering solveiolessf" << std::endl; 65 | /* the double version is sloveiolessd : see pllkmeansfuncs_nonvoid */ 66 | auto results = cluster::solveiolessf(algorithm, nthreads, ndata, dimension, v_data.data(), ncentroids, cout_verbosity, initialisation_method, C_init, v_data_indices_init_from.data(), setseed, seed, maxtime, maxrounds, minibatchsize, capture_verbose); 67 | 68 | /* return : C, L, inds0, duration, niterations, mse */ 69 | float * C = std::get<0> (results).get(); 70 | size_t * labels = std::get<1> (results).get(); 71 | size_t * starting_indices_returned = std::get<2> (results).get(); 72 | size_t duration = std::get<3> (results); 73 | size_t niterations = std::get<4> (results); 74 | double mse = std::get<5> (results); 75 | 76 | std::vector counts (ncentroids, 0); 77 | for (size_t i = 0; i < ndata; ++i){ 78 | ++counts[labels[i]]; 79 | } 80 | 81 | std::cout << "- -- - - - --- - - -- - - - --- - -- - -- -" << std::endl; 82 | for (size_t k = 0; k < ncentroids; ++k){ 83 | std::cout << "in cluster " << k << " : " << counts[k] << std::endl; 84 | } 85 | 86 | 87 | 88 | return 0; 89 | } 90 | 
-------------------------------------------------------------------------------- /examples/sparse_randdim10_test_header.txt: -------------------------------------------------------------------------------- 1 | 111 10 2 | 1011 9:0.755 3 | 1011 1:0.320 2:0.460 3:0.967 5:0.100 6:0.220 9:0.715 4 | 1011 0:0.011 9:0.611 5 | 1011 9:0.255 6 | 1011 2:0.697 9:0.682 7 | 1011 0:0.931 7:0.491 8 | 1011 6:0.010 9 | 1011 1:0.640 6:0.322 10 | 1011 1:0.366 6:0.709 11 | 1011 0:0.936 1:0.414 2:0.654 4:0.929 7:0.702 12 | 1011 1:0.880 3:0.165 5:0.494 6:0.692 13 | 1011 14 | 1011 6:0.008 15 | 1011 4:0.987 6:0.359 8:0.113 9:0.761 16 | 1011 2:0.943 4:0.716 8:0.634 17 | 1011 8:0.731 18 | 1011 0:0.530 1:0.030 2:0.942 3:0.878 5:0.513 7:0.095 19 | 1011 0:0.491 1:0.870 4:0.815 7:0.284 20 | 1011 0:0.666 4:0.733 6:0.609 7:0.489 9:0.349 21 | 1011 2:0.999 6:0.325 8:0.723 9:0.678 22 | 1011 7:0.255 23 | 1011 7:0.856 9:0.930 24 | 1011 1:0.803 6:0.708 25 | 1011 1:0.417 2:0.404 6:0.027 7:0.173 26 | 1011 5:0.511 27 | 1011 3:0.499 8:0.876 28 | 1011 0:0.043 1:0.264 2:0.826 3:0.573 4:0.363 8:0.519 9:0.084 29 | 1011 2:0.028 5:0.154 7:0.536 8:0.608 30 | 1011 0:0.040 1:0.238 4:0.952 5:0.778 31 | 1011 1:0.656 3:0.249 7:0.108 32 | 1011 4:0.213 8:0.725 33 | 1011 0:0.344 1:0.433 7:0.283 8:0.285 34 | 1011 8:0.067 35 | 1011 1:0.973 4:0.139 9:0.723 36 | 1011 0:0.043 1:0.388 5:0.124 6:0.573 7:0.596 9:0.165 37 | 1011 0:0.315 2:0.591 5:0.538 38 | 1011 0:0.330 1:0.809 2:0.074 9:0.903 39 | 1011 1:0.156 4:0.825 9:0.044 40 | 1011 41 | 1011 0:0.270 3:0.628 6:0.894 42 | 1011 0:0.058 1:0.704 2:0.842 6:0.832 8:0.174 43 | 1011 1:0.354 6:0.908 8:0.270 44 | 1011 1:0.347 2:0.626 5:0.563 6:0.400 45 | 1011 7:0.980 8:0.952 46 | 1011 5:0.661 47 | 1011 9:0.724 48 | 1011 0:0.384 5:0.911 7:0.833 8:0.915 49 | 1011 0:0.356 3:0.769 5:0.979 50 | 1011 1:0.586 2:0.190 3:0.247 51 | 1011 2:0.930 4:0.232 8:0.013 52 | 1011 3:0.538 8:0.104 9:0.174 53 | 1011 0:0.848 1:0.635 54 | 1011 0:0.250 1:0.015 55 | 1011 1:0.595 8:0.139 56 | 1011 3:0.533 7:0.093 
8:0.528 57 | 1011 7:0.075 8:0.506 58 | 1011 59 | 1011 2:0.466 5:0.185 8:0.512 60 | 1011 9:0.440 61 | 1011 1:0.753 3:0.261 4:0.789 62 | 1011 1:0.385 7:0.465 63 | 1011 0:0.347 2:0.726 5:0.607 6:0.092 8:0.040 64 | 1011 1:0.871 6:0.311 9:0.989 65 | 1011 2:0.418 5:0.284 66 | 1011 0:0.539 1:0.159 6:0.408 67 | 1011 2:0.868 7:0.617 8:0.665 68 | 1011 0:0.967 5:0.060 69 | 1011 1:0.747 5:0.966 70 | 1011 5:0.803 71 | 1011 2:0.377 6:0.680 7:0.792 72 | 1011 6:0.926 9:0.216 73 | 1011 0:0.231 8:0.242 74 | 1011 0:0.657 2:0.512 4:0.570 7:0.819 9:0.829 75 | 1011 2:0.878 5:0.987 6:0.953 7:0.911 76 | 1011 0:0.146 1:0.978 2:0.558 7:0.530 9:0.136 77 | 1011 0:0.761 1:0.050 7:0.457 78 | 1011 5:0.200 6:0.192 79 | 1011 1:0.202 2:0.186 4:0.164 6:0.825 8:0.454 80 | 1011 0:0.899 5:0.719 8:0.176 81 | 1011 2:0.314 6:0.202 82 | 1011 3:0.094 5:0.591 83 | 1011 0:0.619 9:0.779 84 | 1011 1:0.611 3:0.585 5:0.882 8:0.574 9:0.181 85 | 1011 1:0.886 3:0.533 4:0.444 5:0.858 86 | 1011 0:0.132 2:0.439 3:0.364 5:0.838 8:0.088 87 | 1011 2:0.079 5:0.700 6:0.640 9:0.356 88 | 1011 2:0.138 3:0.282 5:0.645 6:0.398 89 | 1011 0:0.574 1:0.966 4:0.902 5:0.042 90 | 1011 2:0.331 6:0.843 91 | 1011 0:0.352 7:0.352 8:0.579 92 | 1011 9:0.097 93 | 1011 3:0.262 7:0.072 8:0.406 94 | 1011 3:0.634 6:0.095 95 | 1011 1:0.152 6:0.888 96 | 1011 0:0.821 1:0.378 3:0.625 4:0.361 97 | 1011 0:0.326 3:0.770 7:0.207 98 | 1011 3:0.085 6:0.393 99 | 1011 1:0.691 2:0.729 9:0.957 100 | 1011 3:0.045 4:0.328 5:0.278 8:0.502 101 | 1011 5:0.452 7:0.430 102 | 1011 1:0.564 2:0.152 3:0.118 5:0.315 9:0.056 103 | 1011 3:0.637 4:0.803 104 | 1011 0:0.182 1:0.892 3:0.879 7:0.470 9:0.557 105 | 1011 0:0.695 7:0.190 106 | 1011 2:0.124 6:0.271 107 | 1011 3:0.226 5:0.708 108 | 1011 1:0.005 2:0.096 5:0.967 109 | 1011 1:0.678 5:0.601 110 | 1011 0:0.020 2:0.391 4:0.480 111 | 1011 4:0.746 9:0.505 112 | 1011 1:0.734 4:0.862 5:0.305 7:0.928 113 | -------------------------------------------------------------------------------- 
/examples/sparse_randdim10_test_noheader.txt: -------------------------------------------------------------------------------- 1 | 1011 9:0.755 2 | 1011 1:0.320 2:0.460 3:0.967 5:0.100 6:0.220 9:0.715 3 | 1011 0:0.011 9:0.611 4 | 1011 9:0.255 5 | 1011 2:0.697 9:0.682 6 | 1011 0:0.931 7:0.491 7 | 1011 6:0.010 8 | 1011 1:0.640 6:0.322 9 | 1011 1:0.366 6:0.709 10 | 1011 0:0.936 1:0.414 2:0.654 4:0.929 7:0.702 11 | 1011 1:0.880 3:0.165 5:0.494 6:0.692 12 | 1011 13 | 1011 6:0.008 14 | 1011 4:0.987 6:0.359 8:0.113 9:0.761 15 | 1011 2:0.943 4:0.716 8:0.634 16 | 1011 8:0.731 17 | 1011 0:0.530 1:0.030 2:0.942 3:0.878 5:0.513 7:0.095 18 | 1011 0:0.491 1:0.870 4:0.815 7:0.284 19 | 1011 0:0.666 4:0.733 6:0.609 7:0.489 9:0.349 20 | 1011 2:0.999 6:0.325 8:0.723 9:0.678 21 | 1011 7:0.255 22 | 1011 7:0.856 9:0.930 23 | 1011 1:0.803 6:0.708 24 | 1011 1:0.417 2:0.404 6:0.027 7:0.173 25 | 1011 5:0.511 26 | 1011 3:0.499 8:0.876 27 | 1011 0:0.043 1:0.264 2:0.826 3:0.573 4:0.363 8:0.519 9:0.084 28 | 1011 2:0.028 5:0.154 7:0.536 8:0.608 29 | 1011 0:0.040 1:0.238 4:0.952 5:0.778 30 | 1011 1:0.656 3:0.249 7:0.108 31 | 1011 4:0.213 8:0.725 32 | 1011 0:0.344 1:0.433 7:0.283 8:0.285 33 | 1011 8:0.067 34 | 1011 1:0.973 4:0.139 9:0.723 35 | 1011 0:0.043 1:0.388 5:0.124 6:0.573 7:0.596 9:0.165 36 | 1011 0:0.315 2:0.591 5:0.538 37 | 1011 0:0.330 1:0.809 2:0.074 9:0.903 38 | 1011 1:0.156 4:0.825 9:0.044 39 | 1011 40 | 1011 0:0.270 3:0.628 6:0.894 41 | 1011 0:0.058 1:0.704 2:0.842 6:0.832 8:0.174 42 | 1011 1:0.354 6:0.908 8:0.270 43 | 1011 1:0.347 2:0.626 5:0.563 6:0.400 44 | 1011 7:0.980 8:0.952 45 | 1011 5:0.661 46 | 1011 9:0.724 47 | 1011 0:0.384 5:0.911 7:0.833 8:0.915 48 | 1011 0:0.356 3:0.769 5:0.979 49 | 1011 1:0.586 2:0.190 3:0.247 50 | 1011 2:0.930 4:0.232 8:0.013 51 | 1011 3:0.538 8:0.104 9:0.174 52 | 1011 0:0.848 1:0.635 53 | 1011 0:0.250 1:0.015 54 | 1011 1:0.595 8:0.139 55 | 1011 3:0.533 7:0.093 8:0.528 56 | 1011 7:0.075 8:0.506 57 | 1011 58 | 1011 2:0.466 5:0.185 8:0.512 59 | 1011 
9:0.440 60 | 1011 1:0.753 3:0.261 4:0.789 61 | 1011 1:0.385 7:0.465 62 | 1011 0:0.347 2:0.726 5:0.607 6:0.092 8:0.040 63 | 1011 1:0.871 6:0.311 9:0.989 64 | 1011 2:0.418 5:0.284 65 | 1011 0:0.539 1:0.159 6:0.408 66 | 1011 2:0.868 7:0.617 8:0.665 67 | 1011 0:0.967 5:0.060 68 | 1011 1:0.747 5:0.966 69 | 1011 5:0.803 70 | 1011 2:0.377 6:0.680 7:0.792 71 | 1011 6:0.926 9:0.216 72 | 1011 0:0.231 8:0.242 73 | 1011 0:0.657 2:0.512 4:0.570 7:0.819 9:0.829 74 | 1011 2:0.878 5:0.987 6:0.953 7:0.911 75 | 1011 0:0.146 1:0.978 2:0.558 7:0.530 9:0.136 76 | 1011 0:0.761 1:0.050 7:0.457 77 | 1011 5:0.200 6:0.192 78 | 1011 1:0.202 2:0.186 4:0.164 6:0.825 8:0.454 79 | 1011 0:0.899 5:0.719 8:0.176 80 | 1011 2:0.314 6:0.202 81 | 1011 3:0.094 5:0.591 82 | 1011 0:0.619 9:0.779 83 | 1011 1:0.611 3:0.585 5:0.882 8:0.574 9:0.181 84 | 1011 1:0.886 3:0.533 4:0.444 5:0.858 85 | 1011 0:0.132 2:0.439 3:0.364 5:0.838 8:0.088 86 | 1011 2:0.079 5:0.700 6:0.640 9:0.356 87 | 1011 2:0.138 3:0.282 5:0.645 6:0.398 88 | 1011 0:0.574 1:0.966 4:0.902 5:0.042 89 | 1011 2:0.331 6:0.843 90 | 1011 0:0.352 7:0.352 8:0.579 91 | 1011 9:0.097 92 | 1011 3:0.262 7:0.072 8:0.406 93 | 1011 3:0.634 6:0.095 94 | 1011 1:0.152 6:0.888 95 | 1011 0:0.821 1:0.378 3:0.625 4:0.361 96 | 1011 0:0.326 3:0.770 7:0.207 97 | 1011 3:0.085 6:0.393 98 | 1011 1:0.691 2:0.729 9:0.957 99 | 1011 3:0.045 4:0.328 5:0.278 8:0.502 100 | 1011 5:0.452 7:0.430 101 | 1011 1:0.564 2:0.152 3:0.118 5:0.315 9:0.056 102 | 1011 3:0.637 4:0.803 103 | 1011 0:0.182 1:0.892 3:0.879 7:0.470 9:0.557 104 | 1011 0:0.695 7:0.190 105 | 1011 2:0.124 6:0.271 106 | 1011 3:0.226 5:0.708 107 | 1011 1:0.005 2:0.096 5:0.967 108 | 1011 1:0.678 5:0.601 109 | 1011 0:0.020 2:0.391 4:0.480 110 | 1011 4:0.746 9:0.505 111 | 1011 1:0.734 4:0.862 5:0.305 7:0.928 112 | -------------------------------------------------------------------------------- /examples/testing_BF.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 
2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | # Written by James Newling 4 | # All rights reserved. 5 | # 6 | # eakmeans is a library for exact and approximate k-means written in C++ and 7 | # Python. This file is part of eakmeans. See file COPYING for more details. 8 | # 9 | # This file is part of eakmeans. 10 | # 11 | # eakmeans is free software: you can redistribute it and/or modify 12 | # it under the terms of the 3-Clause BSD Licence. See 13 | # https://opensource.org/licenses/BSD-3-Clause for more details. 14 | # 15 | # eakmeans is distributed in the hope that it will be useful, 16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | # COPYING for more details. 19 | # 20 | import kmeans 21 | 22 | import numpy as np 23 | import numpy.random as npr 24 | 25 | old_seed = npr.randint(100000) 26 | 27 | ndata = 10000 28 | dimension = 300 29 | n_clusters = 10 30 | npr.seed(1012) 31 | 32 | X = npr.randn(ndata, dimension) 33 | C0 = 1.001*npr.randn(n_clusters, dimension) 34 | 35 | npr.seed(old_seed) 36 | 37 | bla = kmeans.get_clustering(X = X, n_clusters = n_clusters, init = "BF", verbose = 1, seed = old_seed) 38 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | # Written by James Newling 4 | # All rights reserved. 5 | # 6 | # eakmeans is a library for exact and approximate k-means written in C++ and 7 | # Python. This file is part of eakmeans. See file COPYING for more details. 8 | # 9 | # This file is part of eakmeans. 10 | # 11 | # eakmeans is free software: you can redistribute it and/or modify 12 | # it under the terms of the 3-Clause BSD Licence. See 13 | # https://opensource.org/licenses/BSD-3-Clause for more details. 
14 | # 15 | # eakmeans is distributed in the hope that it will be useful, 16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | # COPYING for more details. 19 | # 20 | 21 | 22 | from distutils.core import Extension, setup 23 | from Cython.Build import cythonize 24 | 25 | import os 26 | 27 | #X_library_dir = "./" 28 | openblaslibdir = os.environ["LIBBLASDIR"] #/idiap/user/jnewling/openblas/lib" 29 | 30 | #LIBBLASDIR 31 | 32 | libname = "kmeans" 33 | 34 | if "WITHBLAS" not in os.environ.keys(): 35 | librariestouse = ["blaslesskmeans"] 36 | print "will build the python library pkmeans without blas (building with blas will make it faster)" 37 | 38 | else: 39 | librariestouse = ["withblaskmeans", "openblas"] 40 | print "will build the python library pkmeans with blas (good choice)" 41 | 42 | #TODO : sort out libname vs "kmeans" below 43 | 44 | ########## Using Cython directly ################################### 45 | ext = Extension(libname, sources = [os.path.abspath("cythonsrc/kmeans.pyx")], include_dirs = ["cythonsrc", "src"], library_dirs = [openblaslibdir, "lib"], libraries = librariestouse, language = "c++") 46 | setup(name = libname, ext_modules = cythonize(ext)) 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | ############ Using precompiled cpp file (no need for Cython) ########## 56 | #ext = Extension("kmeans", sources = [os.path.abspath("./precythonised/kmeans.cpp")], include_dirs = [X_include_dir], library_dirs = [X_library_dir, openblaslibdir], libraries = librariestouse, language = "c++") 57 | #setup(name = "kmeans", ext_modules = [ext]) 58 | 59 | 60 | 61 | 62 | 63 | #if hostname == "goudurix12": 64 | #X_include_dir = "/idiap/home/jnewling/libraries/utility/pllkmeans/include" 65 | #X_library_dir = "/idiap/user/jnewling/own/templib" 66 | #openblaslibdir = "/idiap/user/jnewling/openblas/lib" 67 | 68 | #else: 69 | #X_include_dir = 
"/home/james/libraries/utility/pllkmeans/include" 70 | #X_library_dir = "/home/james/oak/own/templib" 71 | #openblaslibdir = "/home/james/oak/openblas/lib" 72 | -------------------------------------------------------------------------------- /src/BaseGrowBatch.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 19 | */ 20 | 21 | #ifndef BASEGROWBATCH_H 22 | #define BASEGROWBATCH_H 23 | 24 | #include "basekmeans.h" 25 | 26 | #include "growbatchapp.h" 27 | 28 | 29 | namespace kmeans{ 30 | template 31 | class BaseGrowBatch : public kmeans::BaseKmeans{ 32 | 33 | private: 34 | virtual void endroundupdate() override final{ 35 | this->iscomplete = (this->nchanges == 0 && (this->gba.ndata_active == this->ndata)) || (this->duration > this->maxtime) || (this->round >= this->maxrounds); 36 | this->nchanges = 0; 37 | ++this->round; 38 | } 39 | 40 | protected: 41 | 42 | growbatchapp::GBApp gba; 43 | //For the data used in an X update, some algorithms 44 | //do full on data x centroid multiplications, how much 45 | //data can be used per thread in a full data x centroid 46 | // product? 
47 | TInt maxpermultiplyblock; 48 | 49 | virtual void set_X_tasks() = 0; 50 | virtual void set_C_tasks() = 0; 51 | 52 | inline TFloat * const get_delta_C(){ 53 | return this->gba.delta_C.get(); 54 | } 55 | 56 | virtual void set_initialisation_tasks() = 0; 57 | 58 | void BGB_constructor_helper_densebits(){ 59 | 60 | //stuff specific to dense goes here. 61 | 62 | this->setalgname("Dense Base Grow Batch"); 63 | 64 | this->maxpermultiplyblock = 65 | std::max(static_cast (1), 66 | static_cast ((this->getndata() * this->getdimension())/(2 * this->getncentroids() * this->nthreads))); 67 | } 68 | 69 | 70 | //A c&p from minibatch base. Not as code reducing as the baseexact version, but easier to understand 71 | template 72 | void gb_pll_principal_X(const Function & X_updater, TInt ti, Args&&... args){ 73 | 74 | arrutilv2::pll_update_L_etc( 75 | //The compulsory parameters to pll_update_L_etc, 76 | X_updater, 77 | this->ncentroids, this->dimension, this->get_sums(), this->get_dsums() + ti*this->dimension*this->ncentroids, this->get_counts(), this->get_dcounts() + ti*this->ncentroids, this->nchanges, this->ndcalcs_X, this->work_mutex 78 | //The additional parameters to pll_update_L_etc with correct offset 79 | , std::forward(args)...); 80 | } 81 | 82 | 83 | private: 84 | virtual void set_summaries() override final{ 85 | this->set_summaries_growbatch(this->gba); 86 | } 87 | 88 | 89 | //Note that BaseGrowBatchMse overrides this. 90 | virtual void set_mse() override { 91 | //if not all data is active, refuse to compute the mse 92 | if (this->gba.ndata_active != this->ndata){ 93 | this->mse = -1; 94 | } 95 | 96 | else{ 97 | this->mse = arrutilv2::getmeanl22at(this->ncentroids, this->dimension, this->get_C(), this->ndata, this->data, this->get_L(), this->get_C_l22s(), this->get_data_l22s()); 98 | } 99 | } 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | public: 108 | 109 | template 110 | BaseGrowBatch(TInt batchsize0, Args&&... args): kmeans::BaseKmeans (std::forward(args)...) 
111 | { 112 | this->BGB_constructor_helper(batchsize0, this->gba); 113 | this->BGB_constructor_helper_densebits(); 114 | } 115 | virtual ~BaseGrowBatch(){}; 116 | }; 117 | } 118 | 119 | #endif 120 | -------------------------------------------------------------------------------- /src/BaseGrowBatchMse.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 19 | */ 20 | 21 | #ifndef BASEGROWBATCHMSE_H 22 | #define BASEGROWBATCHMSE_H 23 | 24 | #include "BaseGrowBatch.h" 25 | #include "growbatchapp.h" 26 | 27 | #include 28 | #include 29 | namespace kmeans{ 30 | 31 | template 32 | /* A type of GrowBatch, so batch size doubles when determined to be appropriate. 
33 | * Specifically, we monitor 34 | * (1) mse per cluster, 35 | * (2) delta_C per cluster 36 | * and if *median* of mse/delta_C > *1*, double (while can) see basecluster function for details (in basecluster so that sparse can use as well) 37 | * */ 38 | 39 | class BaseGrowBatchMse : public kmeans::BaseGrowBatch{ 40 | 41 | private: 42 | virtual void set_mse() override final { 43 | this->gbmse_set_mse(this->gba, this->gbmseapp); 44 | } 45 | 46 | protected: 47 | 48 | growbatchapp::GBMseApp gbmseapp; 49 | 50 | TFloat * const get_dn(){ 51 | return this->gbmseapp.dn.get(); 52 | } 53 | 54 | virtual bool should_double() override final{ 55 | return this->gbmse_should_double(this->gba, this->gbmseapp.mse_by_cluster.data()); 56 | } 57 | 58 | virtual void set_L_dn(TInt x0, TInt x1) override final { 59 | TInt local_ndcalcs = 0; 60 | arrutilv2::set_rargminmins(x1 - x0, this->dimension, this->data + x0*this->dimension, this->ncentroids, this->get_C(), this->get_data_l22s() + x0, this->get_C_l22s(), this->get_L() + x0, this->get_dn() + x0, local_ndcalcs); 61 | this->ndcalcs_X += local_ndcalcs; 62 | } 63 | 64 | virtual void set_initialisation_tasks() = 0; 65 | 66 | virtual void set_X_tasks() override final{ 67 | /* using function as defined in basedensecentroidkmeans */ 68 | this->X_tasks = this->bgbmse_update_L_dn_etc_S_H_batch_switch_mati(this->gba); 69 | } 70 | 71 | virtual void set_C_tasks() = 0; 72 | 73 | public: 74 | 75 | template 76 | BaseGrowBatchMse(TInt batchsize0, Args&&... args): kmeans::BaseGrowBatch (batchsize0, std::forward(args)...) 
77 | { 78 | this->BGBM_constructor_helper(this->gbmseapp); 79 | } 80 | 81 | virtual ~BaseGrowBatchMse(){}; 82 | 83 | }; 84 | } 85 | 86 | #endif 87 | -------------------------------------------------------------------------------- /src/BaseSparseGrowBatch.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 
19 | */ 20 | 21 | //Based blindly on BaseSparseGrowBatch.h 22 | 23 | #ifndef BASESPARSEGROWBATCH_H 24 | #define BASESPARSEGROWBATCH_H 25 | 26 | #include "basesparsekmeans.h" 27 | #include "growbatchapp.h" 28 | 29 | 30 | namespace kmeans{ 31 | template 32 | class BaseSparseGrowBatch : public kmeans::BaseSparseKmeans{ 33 | 34 | private: 35 | 36 | virtual void endroundupdate() override final{ 37 | this->iscomplete = (this->nchanges == 0 && (this->gba.ndata_active == this->ndata)) || (this->duration > this->maxtime) || (this->round >= this->maxrounds); 38 | this->nchanges = 0; 39 | ++this->round; 40 | } 41 | 42 | virtual void sgb_update_L_etc(TInt x0, TInt x1, TInt ti) = 0; 43 | virtual void sgb_set_L_etc(TInt x0, TInt x1, TInt ti) = 0; 44 | 45 | protected: 46 | 47 | growbatchapp::GBApp gba; 48 | 49 | virtual void set_C_tasks() = 0; 50 | 51 | inline TFloat * const get_delta_C(){ 52 | return this->gba.delta_C.get(); 53 | } 54 | 55 | virtual void set_initialisation_tasks() = 0; 56 | 57 | void BGB_constructor_helper_sparsebits(){ 58 | this->setalgname("Sparse Base Grow Batch"); 59 | } 60 | 61 | 62 | //This may be a bit premature : hoping that my assumption of one task per update round is ~accurate. 
63 | virtual void set_X_tasks() override final{ 64 | this->X_tasks = this->sgb_update_L_dn_etc_S_H_mati(); 65 | } 66 | 67 | 68 | std::vector > sgb_update_L_dn_etc_S_H_mati(){ 69 | std::vector > tasks = {}; 70 | tasks.emplace_back ( 71 | //update L and dn of data used in previous round 72 | [this](TInt ti){ 73 | TInt x0 = (ti*this->gba.ndata_active_previous)/this->nthreads; 74 | TInt x1 = ((ti+1)*this->gba.ndata_active_previous)/this->nthreads; 75 | 76 | this->sgb_update_L_etc(x0, x1, ti); 77 | 78 | } 79 | ); 80 | 81 | tasks.emplace_back( 82 | [this](TInt ti){ 83 | if (ti == 0){ 84 | sparse::update_S_H_from_label_changes(*this->ptrdata, this->where_label_changes, this->get_sums(), this->get_counts(), this->nchanges); 85 | } 86 | } 87 | ); 88 | 89 | tasks.emplace_back( 90 | [this](TInt ti){ 91 | //set L and dn of unused 92 | if (this->gba.ndata_active != this->gba.ndata_active_previous){ 93 | TInt ndata_tail = this->gba.ndata_active - this->gba.ndata_active_previous; 94 | TInt x0 = this->gba.ndata_active_previous + (ti*ndata_tail)/this->nthreads; 95 | TInt x1 = this->gba.ndata_active_previous + ((ti + 1)*ndata_tail)/this->nthreads; 96 | this->sgb_set_L_etc(x0, x1, ti); 97 | this->nchanges += x1 - x0; 98 | } 99 | } 100 | ); 101 | 102 | tasks.emplace_back( 103 | [this](TInt ti){ 104 | if (ti == 0){ 105 | if (this->gba.ndata_active != this->gba.ndata_active_previous){ 106 | sparse::increment_S_H(this->gba.ndata_active_previous, 107 | this->gba.ndata_active, 108 | *this->ptrdata, 109 | this->get_L(), 110 | this->get_sums(), 111 | this->get_counts()); 112 | } 113 | } 114 | } 115 | ); 116 | return tasks; 117 | } 118 | 119 | 120 | private: 121 | virtual void set_summaries() override final{ 122 | this->set_summaries_growbatch(this->gba); 123 | } 124 | 125 | virtual void set_mse() override { 126 | if (this->gba.ndata_active != this->ndata){ 127 | this->mse = -1; 128 | } 129 | 130 | else{ 131 | this->mse = this->getmeanl22at(); 132 | } 133 | } 134 | 135 | 136 | 137 | 138 | 139 
| 140 | 141 | public: 142 | 143 | template 144 | BaseSparseGrowBatch(TInt batchsize0, Args&&... args): kmeans::BaseSparseKmeans (std::forward(args)...) 145 | { 146 | this->BGB_constructor_helper(batchsize0, this->gba); 147 | this->BGB_constructor_helper_sparsebits(); 148 | 149 | } 150 | virtual ~BaseSparseGrowBatch(){}; 151 | }; 152 | } 153 | 154 | #endif 155 | -------------------------------------------------------------------------------- /src/BaseSparseGrowBatchMse.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 19 | */ 20 | 21 | //based blindly on BaseSparseGrowBatchMse.h, almost identical. 22 | 23 | #ifndef BASESPARSEGROWBATCHMSE_H 24 | #define BASESPARSEGROWBATCHMSE_H 25 | 26 | #include "BaseSparseGrowBatch.h" 27 | #include "growbatchapp.h" 28 | 29 | #include 30 | #include 31 | namespace kmeans{ 32 | 33 | template 34 | /* A type of GrowBatch, so batch size doubles when determined to be appropriate. 
35 | * Specifically, we monitor 36 | * (1) mse per cluster, 37 | * (2) delta_C per cluster 38 | * and if *median* of mse/delta_C > *1*, double (while can) see basecluster function for details (in basecluster so that sparse can use as well) 39 | * */ 40 | 41 | class BaseSparseGrowBatchMse : public kmeans::BaseSparseGrowBatch{ 42 | 43 | private: 44 | virtual void set_mse() override final { 45 | this->gbmse_set_mse(this->gba, this->gbmseapp); 46 | } 47 | 48 | 49 | 50 | 51 | 52 | protected: 53 | 54 | growbatchapp::GBMseApp gbmseapp; 55 | 56 | TFloat * const get_dn(){ 57 | return this->gbmseapp.dn.get(); 58 | } 59 | 60 | virtual bool should_double() override final{ 61 | return this->gbmse_should_double(this->gba, this->gbmseapp.mse_by_cluster.data()); 62 | } 63 | 64 | virtual void set_L_dn(TInt x0, TInt x1) override final { 65 | sparse::set_L_dn(*this->ptrdata, x0, x1, this->ncentroids, this->get_C(), 66 | this->get_data_l22s(), this->get_C_l22s(), this->get_L(), this->get_dn()); 67 | this->ndcalcs_X += (x1 - x0)*this->ncentroids; 68 | } 69 | 70 | virtual void set_initialisation_tasks() = 0; 71 | 72 | virtual void set_C_tasks() = 0; 73 | 74 | public: 75 | 76 | template 77 | BaseSparseGrowBatchMse(TInt batchsize0, Args&&... args): kmeans::BaseSparseGrowBatch (batchsize0, std::forward(args)...) 78 | { 79 | this->BGBM_constructor_helper(this->gbmseapp); 80 | } 81 | 82 | virtual ~BaseSparseGrowBatchMse(){}; 83 | 84 | }; 85 | } 86 | 87 | #endif 88 | -------------------------------------------------------------------------------- /src/GBMse3v1.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 
10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 19 | */ 20 | 21 | #ifndef GBMse3v1_H 22 | #define GBMse3v1_H 23 | 24 | 25 | 26 | #include "BaseGrowBatchMse.h" 27 | #include "alg_X_selkSN.h" 28 | #include "arrutilv2l3.h" 29 | 30 | namespace kmeans{ 31 | 32 | //The updating with 3v1 is similar to that of 3v0, but takes advantage of the fact that upper bounds are always tight. (dn is always the distance to the nearest centroid). 33 | 34 | template 35 | 36 | class GBMse3v1 : public kmeans::BaseGrowBatchMse{ 37 | 38 | private: 39 | 40 | virtual void update_already_used(TInt x0, TInt x1, TInt ti) override final{ 41 | 42 | this->gb_pll_principal_X( 43 | kmeans::update_L_lowers_upper_S_H_3v1, 44 | ti, 45 | x1 - x0, 46 | this->data + x0*this->dimension, 47 | this->get_C(), 48 | this->get_data_l22s() + x0, 49 | this->get_C_l22s(), 50 | this->gba.delta_C.get(), 51 | this->get_L() + x0, 52 | this->get_lowers() + x0*this->ncentroids, 53 | this->get_dn() + x0, 54 | this->round); 55 | 56 | } 57 | 58 | virtual void update_unused(TInt x0, TInt x1, TInt ti) override final{ 59 | 60 | arrutilv2::set_L_lowers_dn_and_increment_S_H(x1 - x0, this->dimension, this->data + this->dimension*x0, this->ncentroids, this->get_C(), this->get_data_l22s() + x0, this->get_C_l22s(), this->get_L() + x0, this->get_lowers() + x0*this->ncentroids, this->get_dn() + x0, this->get_dsums() + ti*this->dimension*this->ncentroids, this->get_dcounts() + ti*this->ncentroids, this->get_sums(), this->get_counts(), this->nchanges, this->work_mutex); 61 | this->ndcalcs_X += (x1 - x0)*this->ncentroids; 62 | } 63 | 64 | 65 | protected: 66 
| 67 | TFloat * const get_lowers(){ 68 | return this->elkan_lowers_base.get(); 69 | } 70 | 71 | virtual void set_initialisation_tasks() override final{ 72 | 73 | auto init_tasks_A = this->bgbmse_makeset_C_C_l22s_L_lowers_dn_inds0_mati(this->gba); 74 | 75 | auto init_task_B = this->base_set_S_H_ati(static_cast(0), this->gba.ndata_active); 76 | this->initialisation_tasks.insert(this->initialisation_tasks.end(), init_tasks_A.begin(), init_tasks_A.end()); 77 | this->initialisation_tasks.push_back(init_task_B); 78 | } 79 | 80 | virtual void set_C_tasks() override final{ 81 | this->C_tasks = {}; 82 | this->C_tasks.push_back( 83 | arrutilv2::update_C_C_l22s_delta_C_from_SH_ati(this->nthreads, this->ncentroids, this->dimension, this->get_sums(), this->get_counts(), this->get_C(), this->get_C_l22s(), this->gba.delta_C.get(), this->ndcalcs_notX) 84 | ); 85 | this->C_tasks.push_back( 86 | this->set_mse_sse_by_cluster_ati(this->gba, this->gbmseapp) 87 | ); 88 | this->C_tasks.push_back( 89 | this->update_ndata_active_ati(this->gba) 90 | ); 91 | } 92 | 93 | virtual void set_L_lowers_dn(TInt x0, TInt x1) override final{ 94 | 95 | arrutilv2::set_rrl2ss_argminmins(x1 - x0, this->dimension, 96 | this->data + x0*this->dimension, this->ncentroids, 97 | this->get_C(), this->get_data_l22s() + x0, this->get_C_l22s(), 98 | this->elkan_lowers_base.get() + x0*this->ncentroids, 99 | this->get_L() + x0, 100 | this->gbmseapp.dn.get() + x0 101 | 102 | ); 103 | } 104 | 105 | public: 106 | 107 | template 108 | GBMse3v1(TInt batchsize0, Args&&... args): kmeans::BaseGrowBatchMse (batchsize0, std::forward(args)...) 
109 | { 110 | this->assignmemory_elkan_lowers(); 111 | this->algname = "GBMse Elkan 3v1"; 112 | } 113 | 114 | virtual ~GBMse3v1(){}; 115 | 116 | }; 117 | 118 | } 119 | 120 | #endif 121 | -------------------------------------------------------------------------------- /src/GBMseSimple.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 
19 | */ 20 | 21 | #ifndef GBMSESIMPLE_H 22 | #define GBMSESIMPLE_H 23 | 24 | #include "BaseGrowBatchMse.h" 25 | 26 | namespace kmeans{ 27 | 28 | template 29 | 30 | class GBMseSimple : public kmeans::BaseGrowBatchMse{ 31 | 32 | private: 33 | 34 | virtual void update_already_used(TInt x0, TInt x1, TInt ti) override final{ 35 | arrutilv2::update_L_dn_S_H_batch(x1 - x0, this->maxpermultiplyblock, this->dimension, this->data + x0*this->dimension, this->ncentroids, this->get_C(), this->get_data_l22s() + x0, this->get_C_l22s(), this->get_L() + x0, this->get_dn() + x0, this->get_dsums() + ti*this->dimension*this->ncentroids, this->get_dcounts() + ti*this->ncentroids, this->get_sums(), this->get_counts(), this->nchanges, this->work_mutex); 36 | this->ndcalcs_X += (x1 - x0)*this->ncentroids; 37 | } 38 | 39 | virtual void update_unused(TInt x0, TInt x1, TInt ti) override final{ 40 | arrutilv2::update_L_dn_S_H_batch_increment_only(x1 - x0, this->maxpermultiplyblock, this->dimension, this->data + this->dimension*x0, this->ncentroids, this->get_C(), this->get_data_l22s() + x0, this->get_C_l22s(), this->get_L() + x0, this->get_dn() + x0, this->get_dsums() + ti*this->dimension*this->ncentroids, this->get_dcounts() + ti*this->ncentroids, this->get_sums(), this->get_counts(), this->nchanges, this->work_mutex); 41 | this->ndcalcs_X += (x1 - x0)*this->ncentroids; 42 | } 43 | 44 | 45 | protected: 46 | 47 | virtual void set_initialisation_tasks() override final{ 48 | auto init_tasks_A = this->bgbmse_makeset_C_C_l22s_L_dn_inds0_mati(this->gba); 49 | auto init_task_B = this->base_set_S_H_ati(static_cast(0), this->gba.ndata_active); 50 | this->initialisation_tasks.insert(this->initialisation_tasks.end(), init_tasks_A.begin(), init_tasks_A.end()); 51 | this->initialisation_tasks.push_back(init_task_B); 52 | } 53 | 54 | virtual void set_C_tasks() override final{ 55 | this->C_tasks = {}; 56 | this->C_tasks.push_back( 57 | arrutilv2::update_C_C_l22s_delta_C_from_SH_ati(this->nthreads, 
this->ncentroids, this->dimension, this->get_sums(), this->get_counts(), this->get_C(), this->get_C_l22s(), this->gba.delta_C.get(), this->ndcalcs_notX) 58 | ); 59 | this->C_tasks.push_back( 60 | this->set_mse_sse_by_cluster_ati(this->gba, this->gbmseapp) 61 | ); 62 | this->C_tasks.push_back( 63 | this->update_ndata_active_ati(this->gba) 64 | ); 65 | } 66 | 67 | public: 68 | 69 | template 70 | GBMseSimple(TInt batchsize0, Args&&... args): kmeans::BaseGrowBatchMse (batchsize0, std::forward(args)...) 71 | { 72 | this->algname = "GBMse Simple Dense"; 73 | } 74 | 75 | virtual ~GBMseSimple(){}; 76 | 77 | }; 78 | 79 | 80 | } 81 | 82 | #endif 83 | -------------------------------------------------------------------------------- /src/SparseGBMse3v1.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 19 | */ 20 | 21 | #ifndef SPARSEGBMSE3V0_H 22 | #define SPARSEGBMSE3V0_H 23 | 24 | #include "BaseSparseGrowBatchMse.h" 25 | 26 | #include "alg_X_selkSN.h" 27 | 28 | namespace kmeans{ 29 | 30 | template 31 | 32 | class SparseGBMse3v1 : public kmeans::BaseSparseGrowBatchMse{ 33 | 34 | private: 35 | 36 | //updates L, dn, this->mba.nchanges_on_batch[ti], ... 
37 | virtual void sgb_update_L_etc(TInt x0, TInt x1, TInt ti){ 38 | 39 | this->where_label_changes[ti].clear(); //index, old, new. 40 | 41 | TInt ndcalcs_local = 0; 42 | kmeans::sparse_update_L_lowers_upper_where_changes_3v1(this->ncentroids, this->dimension, x0, x1, *this->ptrdata, this->get_C(), this->get_data_l22s() + x0, this->get_C_l22s(), this->gba.delta_C.get(), this->where_label_changes[ti], ndcalcs_local, this->get_L() + x0, this->get_lowers() + x0*this->ncentroids, this->get_dn() + x0); 43 | 44 | std::lock_guard gluk(this->work_mutex); 45 | this->ndcalcs_X += ndcalcs_local; 46 | } 47 | 48 | //sets L, dn, ... 49 | virtual void sgb_set_L_etc(TInt x0, TInt x1, TInt ti){ 50 | 51 | sparse::set_L_lowers_dn(*this->ptrdata, x0, x1, this->ncentroids, this->get_C(), this->get_data_l22s(), this->get_C_l22s(), this->get_L(), this->get_lowers(), this->get_dn()); 52 | 53 | std::lock_guard gluk(this->work_mutex); 54 | this->ndcalcs_X += this->ncentroids*(x1 - x0); 55 | } 56 | 57 | 58 | 59 | 60 | protected: 61 | 62 | TFloat * const get_lowers(){ 63 | return this->elkan_lowers_base.get(); 64 | } 65 | 66 | 67 | 68 | virtual void set_initialisation_tasks() override final{ 69 | auto init_tasks_A = this->bgbmse_makeset_C_C_l22s_L_lowers_dn_inds0_mati(this->gba); 70 | auto init_task_B = this->base_set_S_H_ati(static_cast(0), this->gba.ndata_active); 71 | this->initialisation_tasks.insert(this->initialisation_tasks.end(), init_tasks_A.begin(), init_tasks_A.end()); 72 | this->initialisation_tasks.push_back(init_task_B); 73 | } 74 | 75 | virtual void set_C_tasks() override final{ 76 | this->C_tasks = {}; 77 | this->C_tasks.push_back( 78 | arrutilv2::update_C_C_l22s_delta_C_from_SH_ati(this->nthreads, this->ncentroids, this->dimension, this->get_sums(), this->get_counts(), this->get_C(), this->get_C_l22s(), this->gba.delta_C.get(), this->ndcalcs_notX) 79 | ); 80 | this->C_tasks.push_back( 81 | this->set_mse_sse_by_cluster_ati(this->gba, this->gbmseapp) 82 | ); 83 | 
this->C_tasks.push_back( 84 | this->update_ndata_active_ati(this->gba) 85 | ); 86 | } 87 | 88 | //this->C_tasks.push_back( 89 | //[this](TInt ti){ 90 | //if (ti == 0){ 91 | //std::cout << "\n----------------------------------------------\n"; 92 | ////TInt mincount = 100000; 93 | //for (TInt ci = 0; ci < this->ncentroids; ++ci){ 94 | //std::cout << this->get_counts()[ci] << " "; 95 | ////if (mincount > this->get_counts()[ci]){ 96 | ////mincount = this->get_counts()[ci]; 97 | ////} 98 | //} 99 | //std::cout << "\n----------------------------------------------\n"; 100 | 101 | 102 | //} 103 | //} 104 | //); 105 | 106 | 107 | virtual void set_L_lowers_dn(TInt x0, TInt x1) override final{ 108 | 109 | ////TODO : do I need to increment ndcalcs_X? 110 | sparse::set_L_lowers_dn(*this->ptrdata, x0, x1, this->ncentroids, this->get_C(), this->get_data_l22s(), this->get_C_l22s(), this->get_L(), this->get_lowers(), this->get_dn()); 111 | 112 | 113 | 114 | 115 | //arrutilv2::set_rrl2ss_argminmins(x1 - x0, this->dimension, 116 | //this->data + x0*this->dimension, this->ncentroids, 117 | //this->get_C(), this->get_data_l22s() + x0, this->get_C_l22s(), 118 | //this->elkan_lowers_base.get() + x0*this->ncentroids, 119 | //this->get_L() + x0, 120 | //this->gbmseapp.dn.get() + x0 121 | 122 | //); 123 | } 124 | 125 | public: 126 | 127 | template 128 | SparseGBMse3v1(TInt batchsize0, Args&&... args): kmeans::BaseSparseGrowBatchMse (batchsize0, std::forward(args)...) 
129 | { 130 | this->assignmemory_elkan_lowers(); 131 | this->algname = "GBMse 3v0 Sparse (turbocharged-rho)"; 132 | } 133 | 134 | virtual ~SparseGBMse3v1(){}; 135 | 136 | }; 137 | 138 | 139 | } 140 | 141 | #endif 142 | -------------------------------------------------------------------------------- /src/SparseGBMseSimple.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 19 | */ 20 | 21 | //blindly based on GBMseSimple, again very similar. 22 | 23 | #ifndef SPARSEGBMSESIMPLE_H 24 | #define SPARSEGBMSESIMPLE_H 25 | 26 | #include "BaseSparseGrowBatchMse.h" 27 | 28 | namespace kmeans{ 29 | 30 | template 31 | 32 | class SparseGBMseSimple : public kmeans::BaseSparseGrowBatchMse{ 33 | 34 | private: 35 | 36 | //updates L, dn, this->mba.nchanges_on_batch[ti] 37 | virtual void sgb_update_L_etc(TInt x0, TInt x1, TInt ti){ 38 | 39 | this->where_label_changes[ti].clear(); //index, old, new. 
40 | sparse::update_L_dn(*this->ptrdata, x0, x1, this->ncentroids, this->get_C(), this->get_data_l22s(), this->get_C_l22s(), this->get_L(), this->get_dn(), this->where_label_changes[ti]); 41 | 42 | std::lock_guard gluk(this->work_mutex); 43 | //this->nchanges += this->where_label_changes[ti].size(); 44 | this->ndcalcs_X += this->ncentroids*(x1 - x0); 45 | } 46 | 47 | //sets L, dn 48 | virtual void sgb_set_L_etc(TInt x0, TInt x1, TInt ti){ 49 | 50 | sparse::set_L_dn(*this->ptrdata, x0, x1, this->ncentroids, this->get_C(), this->get_data_l22s(), this->get_C_l22s(), this->get_L(), this->get_dn()); 51 | 52 | std::lock_guard gluk(this->work_mutex); 53 | this->ndcalcs_X += this->ncentroids*(x1 - x0); 54 | } 55 | 56 | 57 | 58 | 59 | protected: 60 | 61 | virtual void set_initialisation_tasks() override final{ 62 | auto init_tasks_A = this->bgbmse_makeset_C_C_l22s_L_dn_inds0_mati(this->gba); 63 | auto init_task_B = this->base_set_S_H_ati(static_cast(0), this->gba.ndata_active); 64 | this->initialisation_tasks.insert(this->initialisation_tasks.end(), init_tasks_A.begin(), init_tasks_A.end()); 65 | this->initialisation_tasks.push_back(init_task_B); 66 | } 67 | 68 | virtual void set_C_tasks() override final{ 69 | this->C_tasks = {}; 70 | 71 | 72 | //we use this codefrag to confirm that S and H are correctly set. 
73 | if (true == false){ 74 | this->C_tasks.push_back( 75 | [this](TInt ti){ 76 | if (ti == 0){ 77 | sparse::todense::set_S_H(*this->ptrdata, static_cast (0), this->gba.ndata_active, this->ncentroids, this->get_L(), this->get_sums(), this->get_counts()); 78 | } 79 | } 80 | ); 81 | } 82 | 83 | 84 | 85 | this->C_tasks.push_back( 86 | arrutilv2::update_C_C_l22s_delta_C_from_SH_ati(this->nthreads, this->ncentroids, this->dimension, this->get_sums(), this->get_counts(), this->get_C(), this->get_C_l22s(), this->gba.delta_C.get(), this->ndcalcs_notX) 87 | ); 88 | this->C_tasks.push_back( 89 | this->set_mse_sse_by_cluster_ati(this->gba, this->gbmseapp) 90 | ); 91 | this->C_tasks.push_back( 92 | this->update_ndata_active_ati(this->gba) 93 | ); 94 | 95 | } 96 | 97 | public: 98 | 99 | template 100 | SparseGBMseSimple(TInt batchsize0, Args&&... args): kmeans::BaseSparseGrowBatchMse (batchsize0, std::forward(args)...) 101 | { 102 | this->algname = "GBMse Simple Sparse"; 103 | } 104 | 105 | virtual ~SparseGBMseSimple(){}; 106 | 107 | }; 108 | 109 | 110 | } 111 | 112 | #endif 113 | -------------------------------------------------------------------------------- /src/YY17v2.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See file 18 | COPYING for more details. 19 | */ 20 | 21 | #ifndef PLL_ELKANKMEANS_17V2_H 22 | #define PLL_ELKANKMEANS_17V2_H 23 | 24 | #include "baseYYSMN.h" //for initial clustering of the centroids 25 | 26 | 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | 33 | namespace kmeans{ 34 | 35 | 36 | template 37 | /* the same as a17v2.h : no delta_C test as per yy, so if group bound fails all distances calculated. 38 | * a few percent slower, strange as most pxx are faster than axx. 39 | * */ 40 | void update_L_glowers_upb_S_H_17v2(TInt ncentroids, TInt dimension, TFloat * const S, TInt * const H, TInt & nchanges, TInt & ndcalcs, TInt ndata, const TFloat * const data, const TFloat * const C, const TFloat * const data_l22s, const TFloat * const C_l22s, const TFloat * const delta_C, const TFloat * const delta_G, TInt * const L, TInt ngroups, const TInt * const groupparts, const TInt * const groupsizes, TInt * const group, TFloat * const glowers, TFloat * const upb, const TInt & round){ 41 | 42 | 43 | std::unique_ptr distances( new TFloat [ncentroids] ); 44 | TInt group_nearest_index; 45 | TFloat group_nearest; 46 | TFloat group_second_nearest; 47 | 48 | for (TInt i = 0; i < ndata; ++i){ 49 | arrutilv2::set_l2(dimension, data + i*dimension, C + L[i]*dimension, data_l22s[i], C_l22s[L[i]], upb[i] , ndcalcs ); 50 | TInt label_before = L[i]; 51 | 52 | for (TInt gi = 0; gi < ngroups; ++gi){ 53 | glowers[i*ngroups + gi] -= delta_G[gi]; 54 | } 55 | 56 | for (TInt gi = 0; gi < ngroups; ++gi){ 57 | if (glowers[i*ngroups + gi] < upb[i]){ 58 | arrutilv2::set_rl2s(dimension, data + i*dimension, groupsizes[gi], C + groupparts[gi]*dimension, data_l22s[i], C_l22s + groupparts[gi], distances.get(), ndcalcs); 59 | 60 | if (gi != group[i]){ 61 | arrutilv2::set_argminmin2(groupsizes[gi], distances.get(), group_nearest_index, group_nearest, group_second_nearest); 62 | group_nearest_index += groupparts[gi]; 63 | if (group_nearest < upb[i]){ 64 | if (gi < group[i]){ 65 
| glowers[i*ngroups + group[i]] = std::min(upb[i], glowers[i*ngroups + group[i]]); 66 | } 67 | else{ 68 | glowers[i*ngroups + group[i]] = upb[i]; 69 | } 70 | glowers[i*ngroups + gi] = group_second_nearest; 71 | L[i] = group_nearest_index; 72 | group[i] = gi; 73 | upb[i] = group_nearest; 74 | } 75 | 76 | else{ 77 | glowers[i*ngroups + gi] = group_nearest; 78 | } 79 | } 80 | 81 | else{ 82 | arrutilv2::set_argminmin2(groupsizes[gi], distances.get(), L[i], upb[i], glowers[i*ngroups + gi]); 83 | L[i] += groupparts[gi]; 84 | } 85 | } 86 | } 87 | 88 | if (L[i] != label_before){ 89 | ++nchanges; 90 | ++H[L[i]]; 91 | --H[label_before]; 92 | arrutilv2::addto(dimension, data + i*dimension, S + dimension*L[i]); 93 | arrutilv2::subtractfrom(dimension, data + i*dimension, S + dimension*label_before); 94 | } 95 | } 96 | } 97 | 98 | 99 | 100 | 101 | 102 | template 103 | class P17V2 : public kmeans::BaseYYSMN{ 104 | 105 | 106 | protected: 107 | 108 | std::function update_L_glowers_upb_S_H_17v2_ati(){ 109 | return [this](TInt ti){ 110 | TInt x0 = (ti*this->getndata())/this->getnthreads(); 111 | this->pll_principal_X(update_L_glowers_upb_S_H_17v2, ti, 112 | this->get_delta_C(), this->get_delta_G(), this->get_L() + x0, this->get_ngroups(), this->get_groupparts(), this->get_groupsizes(), this->get_group() + x0, this->get_glowers() + x0*this->get_ngroups(), this->get_upb() + x0, this->round); 113 | }; 114 | } 115 | 116 | public: 117 | typedef kmeans::BaseYYSMN BC; 118 | template 119 | P17V2(Args&&... args): BC(std::forward(args)...) 
120 | 121 | { 122 | this->setalgname("p17v2"); 123 | } 124 | 125 | virtual ~P17V2(){} 126 | 127 | virtual void verbose_write_additional(){ 128 | this->get_verbose_file() << "\n\n ..not implemented down to 17v2..\n\n"; 129 | } 130 | 131 | virtual void set_initialisation_tasks(){ 132 | this->yinyang_initialisation_tasks(); 133 | } 134 | 135 | virtual void set_C_tasks(){ 136 | 137 | this->C_tasks = { 138 | arrutilv2::update_C_C_l22s_delta_C_from_SH_ati(this->getnthreads(), this->getncentroids(), this->getdimension(), this->get_sums(), this->get_counts(), this->get_C(), this->get_C_l22s(), this->get_delta_C(), this->ndcalcs_notX), 139 | 140 | arrutilv2::update_delta_G_from_delta_C_ati(this->getncentroids(), this->get_delta_C(), this->get_ngroups(), this->get_groupparts(), this->get_groupsizes(), this->get_delta_G()) 141 | }; 142 | } 143 | 144 | virtual void set_X_tasks(){ 145 | 146 | this->X_tasks = { 147 | this->update_L_glowers_upb_S_H_17v2_ati() 148 | }; 149 | } 150 | }; 151 | 152 | } 153 | 154 | #endif 155 | 156 | -------------------------------------------------------------------------------- /src/arrutilv2copy.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 
19 | */ 20 | 21 | #ifndef ARRUTILV2COPY_H 22 | #define ARRUTILV2COPY_H 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | 32 | namespace arrutilv2{ 33 | 34 | template 35 | void copyatindices(TSize ndata_from, TSize ndata_to, TSize dimension, const DType * const data_from, DType * const data_to, const TSize * const indices){ 36 | 37 | for (TSize index_i = 0; index_i < ndata_to; ++index_i){ 38 | std::memcpy(data_to + index_i*dimension, data_from + indices[index_i]*dimension, dimension*sizeof(DType)); 39 | } 40 | } 41 | 42 | 43 | /* calls copy at indices but first does some checks for uniqueness, bounds etc*/ 44 | template 45 | void copyatuniqueindices(TSize ndata_from, TSize ndata_to, TSize dimension, const DType * const data_from, DType * const data_to, const TSize * const indices){ 46 | 47 | std::vector used (ndata_from,false); 48 | TSize index; 49 | for (TSize i = 0; i < ndata_to; ++i){ 50 | index = indices[i]; 51 | if (index >= ndata_from){ 52 | throw std::runtime_error("index of the data to copy (" + std::to_string(index) + ") seems to be out of range in copyatuniqueindices (" + std::to_string(ndata_from) + "). In other words, the following is false, causing the throw " + std::to_string(index) + " >= " + std::to_string(ndata_from)); 53 | } 54 | 55 | else if (used[index] == true){ 56 | throw std::runtime_error("index to copy (" +std::to_string(index) +") seems to have already been copied. 
Possible cause: indices to be copied are not unique, which contradicts assumptions in this function"); 57 | } 58 | 59 | else{ 60 | used[i] = true; 61 | } 62 | } 63 | copyatindices(ndata_from, ndata_to, dimension, data_from, data_to, indices); 64 | } 65 | 66 | 67 | 68 | template 69 | void copyatindices(TSize ndata_from, TSize dimension, const DType * const data_from, DType * const data_to, const std::vector & indices){ 70 | TSize ndata_to = indices.size(); 71 | copyatindices(ndata_from, ndata_to, dimension, data_from, data_to, indices.data()); 72 | } 73 | 74 | 75 | template 76 | void copyatuniqueindices(TSize ndata_from, TSize dimension, const DType * const data_from, DType * const data_to, const std::vector & indices){ 77 | TSize ndata_to = indices.size(); 78 | copyatuniqueindices(ndata_from, ndata_to, dimension, data_from, data_to, indices.data()); 79 | } 80 | 81 | //untested 82 | template 83 | std::unique_ptr getatuniqueindices(TSize ndata_from, TSize dimension, const DType * const data_from, const std::vector & indices){ 84 | std::unique_ptr data_to (new DType[dimension*indices.size()]); 85 | copyatuniqueindices(ndata_from, dimension, data_from, data_to.get(), indices); 86 | return data_to; 87 | } 88 | 89 | template 90 | std::unique_ptr copy_uptrarr_to_uptrarr(TSize n, const std::unique_ptr & uptr){ 91 | std::unique_ptr thecopy (new T[n]); 92 | std::memcpy(thecopy.get(), uptr.get(), n*sizeof(T)); 93 | return thecopy; 94 | } 95 | 96 | 97 | template 98 | std::unique_ptr copy_ptrarr_to_uptrarr(TSize n, const T * const ptrarr){ 99 | std::unique_ptr thecopy (new T[n]); 100 | std::memcpy(thecopy.get(), ptrarr, n*sizeof(T)); 101 | return thecopy; 102 | } 103 | 104 | 105 | template 106 | std::unique_ptr get_initialised_uptrarr(TSize npts, TNumber defval){ 107 | std::unique_ptr upined (new TNumber [npts]); 108 | std::fill_n(upined.get(), npts, defval); 109 | return upined; 110 | } 111 | 112 | 113 | template 114 | std::unique_ptr < TInt []> get_with_offset(TInt n, const 
TInt * const vals, TInt offset){ 115 | std::unique_ptr < TInt []> newvals (new TInt [n]); 116 | for (TInt ci = 0; ci < n; ++ ci){ 117 | newvals[ci] = offset + vals[ci]; 118 | } 119 | return newvals; 120 | } 121 | 122 | 123 | } 124 | 125 | #endif 126 | -------------------------------------------------------------------------------- /src/arrutilv2discrete.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 19 | */ 20 | 21 | #ifndef ARRUTILV2DISCRETE_H 22 | #define ARRUTILV2DISCRETE_H 23 | 24 | #include 25 | #include 26 | #include 27 | 28 | /* histograms, where non zero, where above threshold etc. 
29 | * templates should : 30 | * -- have no typename TFloat (better off in arrutilv2lxxx)*/ 31 | 32 | //TODO : think about when to return std::unique_ptr< []> and when to return std::vector 33 | 34 | namespace arrutilv2{ 35 | 36 | template 37 | std::unique_ptr get_vhisto(TInt nrows, TInt ncols, TInt range, const TLabel * const labels){ 38 | std::unique_ptr vhisto (new TLabel [range*ncols]); 39 | std::fill_n(vhisto.get(), range*ncols, 0); 40 | for (TInt r = 0; r < nrows; ++r){ 41 | for (TInt c = 0; c < ncols; ++ c){ 42 | ++vhisto[ncols*labels[r*ncols + c] + c]; 43 | } 44 | } 45 | return vhisto; 46 | } 47 | 48 | template 49 | std::vector get_where_nonzero(TInt N, const TLabel * const X){ 50 | std::vector where; 51 | for (TInt i = 0; i < N; ++i){ 52 | if (X[i] > 0){ 53 | where.push_back(i); 54 | } 55 | } 56 | return where; 57 | } 58 | 59 | 60 | template 61 | std::vector get_where_above_threshold(TInt N, const TLabel * const X, TInt threshold){ 62 | std::vector where; 63 | for (TInt i = 0; i < N; ++i){ 64 | if (X[i] > threshold){ 65 | where.push_back(i); 66 | } 67 | } 68 | return where; 69 | } 70 | 71 | 72 | 73 | 74 | template 75 | std::unique_ptr gethistogram(TInt N, TInt range, const TInt * const L){ 76 | std::unique_ptr hist(new TInt[range]); 77 | std::fill_n(hist.get(), range, 0); 78 | for (TInt i = 0; i < N; ++i){ 79 | ++hist[L[i]]; 80 | } 81 | return hist; 82 | } 83 | 84 | /* set max in partitions where partitions are ~equal */ 85 | template 86 | void set_maxinpartition(TInt npts, TInt npartitions, const TNumber * const vals, TNumber * const maxinpartition){ 87 | TInt p_end = 0; 88 | TInt p_start; 89 | for (TInt p = 0; p < npartitions; ++p){ 90 | p_start = p_end; 91 | p_end = ((p+1)*npts)/npartitions; 92 | maxinpartition[p] = std::numeric_limits::min(); 93 | for (TInt ci = p_start ; ci < p_end; ++ci){ 94 | if (vals[ci] > maxinpartition[p]){ 95 | maxinpartition[p] = vals[ci]; 96 | } 97 | } 98 | } 99 | } 100 | 101 | /* same as inline void set_minexclusionnocheck in 
arrutilv2l1.h */ 102 | template 103 | void set_min_excluding(TInt nvals, const TNumber * const vals, TInt excl, TNumber & toset){ 104 | toset = std::numeric_limits::max(); 105 | for (TInt i = 0; i < excl; ++i){ 106 | if (vals[i] < toset){ 107 | toset = vals[i]; 108 | } 109 | } 110 | 111 | for (TInt i = excl+1; i < nvals; ++i){ 112 | if (vals[i] < toset){ 113 | toset = vals[i]; 114 | } 115 | } 116 | } 117 | 118 | template 119 | TInt get_sum_int_array(TSize size, TInt * arr){ 120 | TInt sum = arr[0]; 121 | for (TSize i = 1; i < size; ++i){ 122 | sum += arr[i]; 123 | } 124 | return sum; 125 | } 126 | 127 | template 128 | bool get_sum_int_array_iszero(TSize size, const TInt * const arr){ 129 | TInt sum = arr[0]; 130 | TSize i = 0; 131 | while (sum == 0 && i < size){ 132 | sum += arr[i]; 133 | ++i; 134 | } 135 | if (sum == 0){ 136 | return true; 137 | } 138 | else{ 139 | return false; 140 | } 141 | } 142 | 143 | 144 | template 145 | void integraladdto(TInt N, const TIntArray * const to_add, TIntArray * const to){ 146 | for (TInt i = 0; i < N; ++i){ 147 | to[i] += to_add[i]; 148 | } 149 | } 150 | 151 | 152 | template 153 | std::vector intlinspace(TInt i0, TInt i1, TInt npts){ 154 | std::vector linspaced (npts, i1); 155 | for (TInt k = 0; k < npts - 1; ++k){ 156 | linspaced[k] = i0 + (k*(i1 - i0)/(npts - 1)); 157 | } 158 | 159 | return linspaced; 160 | } 161 | 162 | 163 | template 164 | void make_balanced(TSize minclustersize, TSize ndata, TLabel * const L, TSize nclusters, TSize * const groupsizes){ 165 | 166 | TLabel argminc; 167 | TSize minc; 168 | TLabel argmaxc; 169 | TSize maxc; 170 | 171 | arrutilv2::set_argminmin(nclusters, groupsizes, argminc, minc); 172 | arrutilv2::set_argmaxmax(nclusters, groupsizes, argmaxc, maxc); 173 | 174 | while (minc < minclustersize){ 175 | if (maxc <= minclustersize){ 176 | throw std::runtime_error("In get contig by cluster 3, trying to balance clusters. minc < minclustersize, so balancing required. 
But maxc <= minclustersize, so balancing will cause another hole to appear, it will be impossible to fill all the holes there are simply not enough plugs"); 177 | } 178 | 179 | *(std::find(L, L + ndata, argmaxc)) = argminc; 180 | ++groupsizes[argminc]; 181 | --groupsizes[argmaxc]; 182 | arrutilv2::set_argminmin(nclusters, groupsizes, argminc, minc); 183 | arrutilv2::set_argmaxmax(nclusters, groupsizes, argmaxc, maxc); 184 | } 185 | } 186 | 187 | 188 | } 189 | 190 | #endif 191 | -------------------------------------------------------------------------------- /src/arrutilv2l0.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 
19 | */ 20 | 21 | #ifndef ARRUTILV2L0_H 22 | #define ARRUTILV2L0_H 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | 36 | /* Herein all things doable directly by blas 37 | * Herein all distances, distances squared, mins, maxs, combos therof (which if using blas would be different) 38 | * 39 | * rules of functions to make easier to use / remember: 40 | * (1) dimensions of arrays must appear before arrays, but as late as possible 41 | * (2) functions which return must be getxxx 42 | * (3) functions which set_ must be set_xxx (or subtractfrom , addto , update , something obvious) 43 | * (4) thing(s) being set_ should come as late as possible (excluding flag like parameters, background increment parameters etc.) without violating above rules 44 | * (5) if array being set_ is dimension d, there should be d trailing 's' to function name 45 | * (6) if operation is on 1-D and 2-D array, should have r/c somewhere telling whether row or column 46 | * (7) if operation on 2-D and 2-D array should have rr/rc/cr/cc as above (unless a flag like bool asrow) 47 | * (8) nrows before ncols in parameter list 48 | * for [TFloat = double, TInt = unsigned] autogeneration of functions to arrutilv2.cpp is done by python function 49 | * */ 50 | 51 | #ifdef WITHBLAS 52 | #include "arrutilv2l0withblas.h" 53 | #else 54 | #include "arrutilv2l0blasless.h" 55 | #endif 56 | 57 | #endif 58 | 59 | -------------------------------------------------------------------------------- /src/arrutilv2l0blasless.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 
8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 19 | */ 20 | 21 | #ifndef ARRUTILV2L0BLASLESS_H 22 | #define ARRUTILV2L0BLASLESS_H 23 | 24 | namespace arrutilv2{ 25 | 26 | inline void proxy_openblas_set_num_threads(int nthreads){ 27 | 28 | } 29 | 30 | 31 | template 32 | void set_rargmaxabs(TInt nrows, TInt ncols, const TFloat * const A, TInt * argmaxabss){ 33 | TFloat absval; 34 | TFloat vmax; 35 | for (TInt r = 0; r < nrows; ++r){ 36 | argmaxabss[r] = 0; 37 | vmax = A[r*ncols]; 38 | for (TInt c = 1; c < ncols; ++c){ 39 | absval = std::abs(A[r*ncols + c]); 40 | if (absval > vmax){ 41 | argmaxabss[r] = c; 42 | vmax = absval; 43 | } 44 | } 45 | } 46 | } 47 | 48 | template 49 | void scale(TInt N, TScaleFloatType factor, TFloat * const toscale){ 50 | TFloat factor_ct = static_cast (factor); 51 | for (TInt i = 0; i < N; ++i){ 52 | toscale[i] *= factor_ct; 53 | } 54 | } 55 | 56 | 57 | 58 | template 59 | void rank1rowupdate(TInt ncols, const TFloat * const row, TFloat scale, TInt nrows, TFloat * const toupdate){ 60 | std::unique_ptr scaledrow (new TFloat [ncols]); 61 | for (TInt c = 0; c < ncols; ++c){ 62 | scaledrow[c] = row[c]*scale; 63 | } 64 | for (TInt r = 0; r < nrows; ++r){ 65 | for (TInt c = 0; c < ncols; ++c){ 66 | toupdate[r*ncols + c] += scaledrow[c]; 67 | } 68 | } 69 | } 70 | 71 | template 72 | inline void set_l22(const TInt & ndata, const TFloat * const a, TFloat & l22){ 73 | l22 = 0; 74 | for (TInt i = 0; i < ndata; ++i){ 75 | l22 += a[i]*a[i]; 76 | } 77 | } 78 | 79 | template 80 | inline void set_l22(const TInt & dimension, const TFloat * 
const a, const TFloat * const b, const TFloat & a_l22, const TFloat & b_l22, TFloat & l22){ 81 | l22 = 0; 82 | for (TInt i = 0; i < dimension; ++ i){ 83 | l22 += a[i]*b[i]; 84 | } 85 | l22 *= -2; 86 | l22 += a_l22; 87 | l22 += b_l22; 88 | } 89 | 90 | template 91 | inline void set_sum(const TInt & ndata, const TFloat * const a, TFloat & sum){ 92 | sum = 0; 93 | for (TInt i = 0; i < ndata; ++i){ 94 | sum += a[i]; 95 | } 96 | } 97 | 98 | template 99 | void set_l22s(TInt nrows, TInt ncols, const TFloat * const A, TFloat * const l22s, bool byrow){ 100 | if (byrow == true){ 101 | for (TInt r = 0; r < nrows; ++r){ 102 | set_l22(ncols, A + r*ncols, l22s[r]); 103 | } 104 | } 105 | 106 | else{ 107 | for (TInt c = 0; c < ncols; ++c){ 108 | l22s[c] = 0; 109 | } 110 | 111 | for (TInt r = 0; r < nrows; ++r){ 112 | for (TInt c = 0; c < ncols; ++c){ 113 | l22s[c] += A[r*ncols + c]*A[r*ncols + c]; 114 | } 115 | } 116 | } 117 | } 118 | 119 | 120 | 121 | 122 | template 123 | void set_sums(TInt nrows, TInt ncols, const TFloat * const A, TFloat * const sums, bool byrow){ 124 | if (byrow == true){ 125 | for (TInt r = 0; r < nrows; ++r){ 126 | set_sum(ncols, A + r*ncols, sums[r]); 127 | } 128 | } 129 | 130 | else{ 131 | for (TInt c = 0; c < ncols; ++c){ 132 | sums[c] = 0; 133 | } 134 | for (TInt r = 0; r < nrows; ++r){ 135 | for (TInt c = 0; c < ncols; ++c){ 136 | sums[c] += A[r*ncols + c]; 137 | } 138 | } 139 | } 140 | } 141 | 142 | /* v : 1-D of size ncols 143 | * B : nrows x ncols 144 | * l22s[i] : |v - B[i]|_2 145 | * */ 146 | template 147 | inline void set_rl22s(const TInt & ncols, const TFloat * const v, const TInt & nrows, const TFloat * const B, const TFloat & v_l22s, const TFloat * const B_l22s, TFloat * const l22s){ 148 | for (TInt r = 0; r < nrows; ++r){ 149 | l22s[r] = 0; 150 | for (TInt c = 0; c < ncols; ++c){ 151 | l22s[r] += B[r*ncols + c]*v[c]; 152 | } 153 | l22s[r] *= -2; 154 | l22s[r] += v_l22s; 155 | l22s[r] += B_l22s[r]; 156 | } 157 | } 158 | 159 | 160 | /* A : nrowsA 
x ncols 161 | * B : nrowsB x ncols 162 | * C[i,j] : |A[i] - B[j]|_2 for 0 <= i < nrowsA 0 <= j < nrowsB 163 | * */ 164 | 165 | template 166 | void set_rrl22ss(TInt nrowsA, TInt ncols, const TFloat * const A, TInt nrowsB, const TFloat * const B, const TFloat * const A_l22s, const TFloat * const B_l22s, TFloat * const l22ss){ 167 | for (TInt r = 0; r < nrowsA; ++r){ 168 | set_rl22s(ncols, A + r*ncols, nrowsB, B, A_l22s[r], B_l22s, l22ss + r*nrowsB); 169 | } 170 | } 171 | 172 | 173 | template 174 | void subtractfrom(TInt N, const TFloat * const tosubtract, TFloat * const from){ 175 | for (TInt i = 0; i < N; ++i){ 176 | from[i] -= tosubtract[i]; 177 | } 178 | } 179 | 180 | template 181 | void addto(TInt N, const TFloat * const toadd, TFloat * const to){ 182 | for (TInt i = 0; i < N; ++i){ 183 | to[i] += toadd[i]; 184 | } 185 | } 186 | 187 | 188 | 189 | 190 | 191 | } 192 | 193 | #endif 194 | 195 | 196 | 197 | 198 | -------------------------------------------------------------------------------- /src/arrutilv2mse.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 
19 | */ 20 | 21 | #ifndef ARRUTILV2MSE_H 22 | #define ARRUTILV2MSE_H 23 | 24 | #include 25 | #include 26 | 27 | 28 | 29 | namespace arrutilv2{ 30 | 31 | template 32 | TFloat get_mse(TSize ndata, TInt ncentroids, const TFloat * const distances2, const TInt * const labels){ 33 | TSize s_ncentroids = static_cast (ncentroids); 34 | TFloat suml2s = 0; 35 | TFloat d2; 36 | for (TSize i = 0; i < ndata; ++i){ 37 | d2 = distances2[i*s_ncentroids + labels[i]]; 38 | if (d2 > 0){ 39 | suml2s += std::sqrt(d2); 40 | } 41 | else if (d2 <-1e-5){ 42 | throw std::runtime_error("negative value in get_mse of magnitude less than 1e-5. Probable cause: user has provided negative distance squared value"); 43 | } 44 | else{ 45 | //assumed rounding error (provide warning?) 46 | } 47 | } 48 | 49 | return suml2s/static_cast (ndata); 50 | } 51 | 52 | template 53 | TFloat get_mse(TSize ndata, TFloat * distances){ 54 | TFloat mse; 55 | set_row_sum_squares(static_cast(1), ndata, distances, &mse); 56 | mse/=ndata; 57 | return mse; 58 | } 59 | 60 | 61 | //[ 0.5 * ridgeterm * sum (count - mean count)^2 ] / ndata 62 | template 63 | TFloat get_meanridge(TFloat ridgeterm, TSize ncentroids, TSize * const counts){ 64 | 65 | auto ndata = 0; 66 | for (TSize si = 0; si < ncentroids; ++si){ 67 | ndata += counts[si]; 68 | }//std::accumulate(counts, counts + ncentroids); 69 | 70 | TFloat meancount = static_cast (ndata ) / static_cast (ncentroids); 71 | 72 | TFloat ridge_penalty = 0.; 73 | for (TSize ci = 0; ci < ncentroids; ++ ci){ 74 | ridge_penalty += (counts[ci] - meancount)*(counts[ci] - meancount); 75 | } 76 | //TFloat ridge_penalty = static_cast (ridge_penalty_st); 77 | ridge_penalty *= ridgeterm/2.; 78 | ridge_penalty /= static_cast (ndata); 79 | return ridge_penalty; 80 | } 81 | 82 | 83 | 84 | template 85 | TFloat getmeanl22at(TInt ncentroids, TInt dimension, const TFloat * const centroids, TInt ndata, const TFloat * const data, const TInt * const labels, const TFloat * const centroid_l22s, const 
TFloat * const data_l22s){ 86 | TFloat sum_variances = 0; 87 | for (TInt i = 0; i < ndata; ++ i){ 88 | TFloat variance = 0; 89 | for (TInt d = 0; d < dimension; ++ d){ 90 | variance += data[i*dimension + d]*centroids[labels[i]*dimension + d]; 91 | } 92 | variance *= -2; 93 | variance += data_l22s[i]; 94 | variance += centroid_l22s[labels[i]]; 95 | sum_variances += variance; 96 | } 97 | TFloat variance_estimate = sum_variances/static_cast(ndata); 98 | return variance_estimate; 99 | } 100 | 101 | //mse + meanridgeerror 102 | template 103 | TFloat get_mse_ridge(TSize ndata, TFloat * distances, TFloat ridgeterm, TSize ncentroids, TSize * const counts){ 104 | TFloat mse; 105 | set_row_sum_squares(static_cast(1), ndata, distances, &mse); 106 | mse/=ndata; 107 | TFloat meanridge = get_meanridge(ridgeterm, ncentroids, counts); 108 | return mse + meanridge; 109 | } 110 | 111 | 112 | 113 | 114 | 115 | template 116 | TFloat get_sse_batchwise(TInt ndata, TInt nperbatch, TInt dimension, const TFloat * const data, TInt ncentroids, const TFloat * const centroids, const TFloat * const data_l22s, const TFloat * const centroid_l22s, TInt & ndcalcs){ 117 | 118 | TInt nfullbatches = ndata/nperbatch; 119 | TInt nfinalbatch = ndata - nfullbatches*nperbatch; 120 | std::unique_ptr distances_squared (new TFloat [nperbatch*ncentroids]); 121 | //data from the full batches 122 | TFloat sse = 0; 123 | for (TInt bi = 0; bi < nfullbatches; ++bi){ 124 | set_rrl22ss(nperbatch, dimension, data + bi*dimension*nperbatch, ncentroids, centroids, data_l22s +bi*nperbatch, centroid_l22s, distances_squared.get()); 125 | for (TInt i = nperbatch*bi; i < nperbatch*(bi + 1); ++ i){ 126 | sse += *std::min_element(distances_squared.get() + (i - nperbatch*bi)*ncentroids, distances_squared.get() + (i - nperbatch*bi + 1)*ncentroids); 127 | } 128 | } 129 | //data from the tail 130 | set_rrl22ss(nfinalbatch, dimension, data + nfullbatches*dimension*nperbatch, ncentroids, centroids, data_l22s + nfullbatches*nperbatch, 
centroid_l22s, distances_squared.get()); 131 | 132 | for (TInt i = nperbatch*nfullbatches; i < ndata; ++ i){ 133 | sse += *std::min_element(distances_squared.get() + (i - nperbatch*nfullbatches)*ncentroids, distances_squared.get() + (i - nperbatch*nfullbatches + 1)*ncentroids); 134 | } 135 | 136 | return sse; 137 | } 138 | 139 | 140 | 141 | 142 | } 143 | 144 | 145 | #endif 146 | -------------------------------------------------------------------------------- /src/barrierutil.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 
*/

#ifndef BARRIERUTIL_H
#define BARRIERUTIL_H

// NOTE(review): the header names of the six includes below, and the template arguments of
// every std::function / std::vector in this file, were lost in extraction. They cannot be
// recovered from this header alone -- restore them from barrierutil.cpp / version control.
#include
#include
#include
#include
#include
#include

namespace stdthreadutil{


/* btask for `barriered task'
 * used for situation when
 * (1) several threads perform same task
 * (2) when all finished, one thread performs finishing task
 * (3) all threads are released */

void btask(
/* identity of thread, not used in function but left as parameter as potentially useful for debugging */
const size_t & ti,
/* number of threads performing the task*/
const size_t & nthreads,
/* reference to number of threads which have completed the task*/
size_t & completions,
/* when work of this thread is complete, use the following tools (shared by all workers on this task) to notify the others */
std::mutex & workend_mutex,
std::condition_variable & condvar,
/* task to perform, and task to perform at end if this thread finishes last*/
const std::function & task,
const std::function & endtask);


/* btasks for `barriered tasks'
 * used in situation when several threads of equal status perform
 for (# tasks) { do task | do end task if last to complete | }
 */
// NOTE(review): this is declared `inline` but no definition is visible in this header; an
// inline function must be defined in every translation unit that uses it -- confirm
// whether `inline` is intended here.
inline void btasks(
const size_t & ti,
const size_t & nthreads,
std::vector & section_completions,
std::vector & sectionend_mutexes,
std::vector & section_condvars,
const std::vector> & section_tasks,
const std::vector> & sectionend_tasks
);

/* btask_rbtasks for `barriered task then repeat barriered tasks'
 * used in situation when
 * several threads of equal status perform
 * do initialisation task | do initialisation end task if last to complete |
 * while (condition is true) for (# tasks) do task | do end task if last to complete |
 */
void btask_rbtasks(
size_t ti,
size_t nthreads,
std::vector & section_completions,
std::vector & sectionend_mutexes,
std::vector & section_condvars,
const std::function & initialisation_task,
const std::function & initialisationend_task,
const std::vector> & section_tasks,
const std::vector> & sectionend_tasks,
const std::function & getiscomplete);

/* barriered tasks (initialisation) then while condition repeat barriered tasks */
void btasks_rbtasks(
size_t ti,
size_t nthreads,
std::vector & x_completions,
std::vector & xend_mutexes,
std::vector & x_condvars,
const std::vector< std::function> & initialisation_tasks,
const std::vector< std::function> & initialisationend_tasks,
const std::vector< std::function> & section_tasks,
const std::vector< std::function> & sectionend_tasks,
const std::function & getiscomplete);

/* launch barriered tasks */
// NOTE(review): presumably creates the nthreads workers and the shared barrier state itself
// (no mutex / condition variable parameters here); the meaning of the int return value is
// not visible from this header -- confirm in barrierutil.cpp.
int launch_btasks(
size_t nthreads,
const std::vector> & section_tasks,
const std::vector> & sectionend_tasks
);


/* launch version of btasks_rbtasks, with an additional closing task */
int launch_btasks_rbtasks(
size_t nthreads,
const std::vector> & initialisation_tasks,
const std::vector> & initialisationend_tasks,
const std::vector> & section_tasks,
const std::vector> & sectionend_tasks,
const std::function & getiscomplete,
const std::function & closing_task
);



/* launch version of btask_rbtasks, with an additional closing task */
int launch_btask_rbtasks(
size_t nthreads,
const std::function & initialisation_task,
const std::function & initialisationend_task,
const std::vector> & section_tasks,
const std::vector> & sectionend_tasks,
const std::function & getiscomplete,
const std::function & closing_task
);


}


#endif

--------------------------------------------------------------------------------
/src/baseYYMSN.h:
--------------------------------------------------------------------------------
/*
Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 19 | */ 20 | 21 | // UNDER CONSTRUCTION. 22 | 23 | #ifndef PLL_PLLYINYANGMSNBASEKMEANS_H 24 | #define PLL_PLLYINYANGMSNBASEKMEANS_H 25 | 26 | #include "baseYY.h" 27 | 28 | #include 29 | #include 30 | #include 31 | #include 32 | 33 | namespace kmeans{ 34 | 35 | 36 | template 37 | /* " max sum norm" for lower bounds versions inherit from here. Owns two TFloat arrays allocated in the constructor: delta_C (getncentroids() entries) and u_delta_G (get_ngroups() entries). The three task setters are left pure virtual. */ 38 | class YYMSNBase : public kmeans::BaseYY{ 39 | 40 | private: 41 | std::unique_ptr delta_C; 42 | std::unique_ptr u_delta_G; 43 | 44 | protected: /* accessors: the first two forward to the BaseYY base, the last two expose the raw pointers of the arrays owned here */ 45 | TFloat * const get_glowers(){ 46 | return this->get_glowers_base(); 47 | } 48 | 49 | TFloat * const get_upb(){ 50 | return this->get_upb_base(); 51 | } 52 | 53 | TFloat * const get_delta_C(){ 54 | return delta_C.get(); 55 | } 56 | 57 | TFloat * const get_u_delta_G(){ 58 | return u_delta_G.get(); 59 | } 60 | 61 | 62 | 63 | public: 64 | typedef kmeans::BaseYY YYB; /* forwards every argument to the BaseYY constructor, then allocates the two arrays and sets the algorithm name */ 65 | template 66 | YYMSNBase(Args&&... 
args): YYB(std::forward(args)...), 67 | delta_C{ new TFloat [this->getncentroids()] }, 68 | /* the initialiser must name the declared member u_delta_G; delta_G is not a member of this class */ u_delta_G{ new TFloat [this->get_ngroups()] } 69 | 70 | { 71 | this->setalgname("YYMSNBase"); 72 | } 73 | 74 | virtual ~YYMSNBase(){} 75 | 76 | /* base-class estimate plus the bytes of the two arrays allocated in the constructor */ 77 | virtual TInt get_approximate_memory_requirement(){ 78 | return YYB::get_approximate_memory_requirement() + 79 | sizeof(TFloat)*( 80 | this->getncentroids() + //delta_C 81 | this->get_ngroups()); //u_delta_G 82 | } 83 | 84 | virtual void verbose_write_additional(){ 85 | this->get_verbose_file() << "\n\n ..not implemented down to YYMSNBase..\n\n"; 86 | } 87 | 88 | 89 | 90 | 91 | virtual void set_initialisation_tasks() = 0; 92 | 93 | 94 | virtual void set_C_tasks() = 0; 95 | virtual void set_X_tasks() = 0; 96 | }; 97 | 98 | } 99 | 100 | #endif 101 | 102 | -------------------------------------------------------------------------------- /src/baseYYSMN.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 
19 | */ 20 | 21 | #ifndef PLL_PLLYINYANGSMNBASEKMEANS_H 22 | #define PLL_PLLYINYANGSMNBASEKMEANS_H 23 | 24 | #include "baseYY.h" 25 | 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | namespace kmeans{ 32 | 33 | 34 | template 35 | /* "sum max norm" for lower bounds versions inherit from here. Owns two TFloat arrays allocated in the constructor: delta_C (getncentroids() entries) and delta_G (get_ngroups() entries). The three task setters are left pure virtual. */ 36 | class BaseYYSMN : public kmeans::BaseYY{ 37 | 38 | private: 39 | std::unique_ptr delta_C; 40 | std::unique_ptr delta_G; 41 | 42 | protected: /* accessors: the first two forward to the BaseYY base, the last two expose the raw pointers of the arrays owned here */ 43 | TFloat * const get_glowers(){ 44 | return this->get_glowers_base(); 45 | } 46 | 47 | TFloat * const get_upb(){ 48 | return this->get_upb_base(); 49 | } 50 | 51 | TFloat * const get_delta_C(){ 52 | return delta_C.get(); 53 | } 54 | 55 | TFloat * const get_delta_G(){ 56 | return delta_G.get(); 57 | } 58 | 59 | 60 | 61 | public: 62 | typedef kmeans::BaseYY YYB; /* forwards every argument to the BaseYY constructor, then allocates the two arrays and sets the algorithm name */ 63 | template 64 | BaseYYSMN(Args&&... args): YYB(std::forward(args)...), 65 | delta_C{ new TFloat [this->getncentroids()] }, 66 | delta_G{ new TFloat [this->get_ngroups()] } 67 | 68 | { 69 | this->setalgname("BaseYYSMN"); 70 | } 71 | 72 | virtual ~BaseYYSMN(){} 73 | 74 | /* base-class estimate plus the bytes of the two arrays allocated in the constructor */ 75 | virtual TInt get_approximate_memory_requirement(){ 76 | return YYB::get_approximate_memory_requirement() + 77 | sizeof(TFloat)*( 78 | this->getncentroids() + //delta_C 79 | this->get_ngroups()); //delta_G 80 | } 81 | 82 | virtual void verbose_write_additional(){ 83 | this->get_verbose_file() << "\n\n ..not implemented down to BaseYYSMN..\n\n"; 84 | } 85 | 86 | 87 | 88 | 89 | virtual void set_initialisation_tasks() = 0; 90 | 91 | 92 | virtual void set_C_tasks() = 0; 93 | virtual void set_X_tasks() = 0; 94 | }; 95 | 96 | } 97 | 98 | #endif 99 | 100 | -------------------------------------------------------------------------------- /src/baseelkan.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 
5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 19 | */ 20 | 21 | #ifndef PLL_BASEELKANKMEANS__H 22 | #define PLL_BASEELKANKMEANS__H 23 | 24 | namespace kmeans{ 25 | 26 | template 27 | /* Abstract base for the Elkan variants built on BaseExact: the constructor forwards its arguments, calls assignmemory_elkan_upper_lowers() and sets the algorithm name; the three task setters stay pure virtual. NOTE(review): this header has no #include although it derives from kmeans::BaseExact -- presumably the including file provides it; confirm. */ class BaseElkan : public kmeans::BaseExact{ 28 | 29 | protected: 30 | 31 | 32 | public: 33 | typedef kmeans::BaseExact BC; 34 | template 35 | BaseElkan(Args&&... args): BC(std::forward(args)...) 36 | 37 | { 38 | this->assignmemory_elkan_upper_lowers(); 39 | this->setalgname("elkan base"); 40 | } 41 | 42 | virtual ~BaseElkan(){} 43 | 44 | /* intentionally writes nothing */ virtual void verbose_write_additional() override {} 45 | virtual void set_initialisation_tasks() = 0; 46 | virtual void set_C_tasks() = 0; 47 | virtual void set_X_tasks() = 0; 48 | /* base-class estimate plus get_elkan_base_memory() */ 49 | virtual TInt get_approximate_memory_requirement(){ 50 | return BC::get_approximate_memory_requirement() + this->get_elkan_base_memory(); 51 | } 52 | }; 53 | 54 | } 55 | 56 | #endif 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /src/baseelkanminibatch.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 
8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 19 | */ 20 | 21 | #ifndef PLL_BASEELKANMINIBATCHKMEANS_H 22 | #define PLL_BASEELKANMINIBATCHKMEANS_H 23 | 24 | #include "baseminibatch.h" 25 | 26 | namespace kmeans{ 27 | template 28 | /* Abstract base for the Elkan variants built on BaseMiniBatch: the constructor forwards its arguments, calls assignmemory_elkan_upper_lowers() and sets the algorithm name; the three task setters stay pure virtual. */ class BaseElkanMiniBatch : public kmeans::BaseMiniBatch{ 29 | 30 | 31 | protected: 32 | 33 | 34 | public: 35 | typedef kmeans::BaseMiniBatch BC; 36 | template 37 | BaseElkanMiniBatch(Args&&... args): BC(std::forward(args)...) 38 | 39 | { 40 | this->assignmemory_elkan_upper_lowers(); 41 | this->setalgname("elkan minibatch base"); 42 | } 43 | 44 | virtual ~BaseElkanMiniBatch(){} 45 | 46 | /* intentionally writes nothing */ virtual void verbose_write_additional() override {} 47 | virtual void set_initialisation_tasks() = 0; 48 | virtual void set_C_tasks() = 0; 49 | virtual void set_X_tasks() = 0; 50 | /* base-class estimate plus get_elkan_base_memory() */ 51 | virtual TInt get_approximate_memory_requirement(){ 52 | return BC::get_approximate_memory_requirement() + this->get_elkan_base_memory(); 53 | } 54 | }; 55 | 56 | } 57 | 58 | #endif 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /src/baseexact.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 
10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 19 | */ 20 | 21 | #ifndef PLL_BASEEXACTKMEANSTRUE_H 22 | #define PLL_BASEEXACTKMEANSTRUE_H 23 | 24 | #include "basekmeans.h" 25 | 26 | namespace kmeans{ 27 | 28 | template 29 | /* Base for the exact variants: implements set_summaries and set_mse, and offers helpers that call the BaseKmeans routines with the index range 0 .. ndata. The three task setters stay pure virtual. */ class BaseExact : public kmeans::BaseKmeans { 30 | 31 | private: 32 | virtual void set_summaries() { 33 | this->set_summaries_exact(); 34 | } 35 | /* stores in this->mse the value returned by arrutilv2::getmeanl22at for the current centroids, data and get_L() */ 36 | virtual void set_mse() override { 37 | this->mse = arrutilv2::getmeanl22at(this->ncentroids, this->dimension, this->get_C(), this->ndata, this->data, this->get_L(), this->get_C_l22s(), this->get_data_l22s()); 38 | } 39 | 40 | protected: 41 | 42 | virtual void set_initialisation_tasks() = 0; 43 | virtual void set_X_tasks() = 0; 44 | virtual void set_C_tasks() = 0; 45 | /* forwards to base_pll_principal_X with the bounds 0 and ndata inserted before the caller's arguments */ 46 | template 47 | void pll_principal_X(const Function & X_updater, TInt ti, Args&&... args){ 48 | this->base_pll_principal_X(static_cast (0), this->ndata, X_updater, ti, std::forward(args)...); 49 | } 50 | /* zero-argument convenience overload: returns set_L_ati(0, ndata) */ 51 | std::function set_L_ati(){ 52 | return this->set_L_ati(0, this->ndata); 53 | } 54 | 55 | public: 56 | template 57 | BaseExact(Args&&... args): kmeans::BaseKmeans(std::forward(args)...){} 58 | 59 | virtual ~BaseExact(){} 60 | 61 | }; 62 | 63 | } 64 | 65 | #endif 66 | -------------------------------------------------------------------------------- /src/basehamerly.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 
5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 19 | */ 20 | 21 | #ifndef PLL_HAMERLYBASEKMEANS_H 22 | #define PLL_HAMERLYBASEKMEANS_H 23 | 24 | 25 | namespace kmeans{ 26 | 27 | template 28 | /* Base for the Hamerly variants built on BaseExact. Owns five TFloat arrays allocated in the constructor: CC (ncentroids*ncentroids entries), halfminCC and delta_C (ncentroids entries each), lower_base and upper_base (ndata entries each). NOTE(review): this header has no #include for BaseExact, std::unique_ptr or std::vector -- presumably the including file provides them; confirm. */ class BaseHamerly : public kmeans::BaseExact{ 29 | 30 | private: 31 | 32 | std::unique_ptr CC; 33 | std::unique_ptr halfminCC; 34 | std::unique_ptr delta_C; 35 | std::unique_ptr lower_base; 36 | std::unique_ptr upper_base; 37 | 38 | /* Builds the first part of the initialisation task list: (1) the tasks returned by exact_makeset_C_C_l22s_inds0_mati(); (2) a per-thread task that calls arrutilv2::set_L2_dn on this thread's slice [x0, x1) of the data, passing L, upper_base and lower_base at offset x0, and adds the local distance-calculation count to ndcalcs_notX; (3) a task in which thread 0 sets mse to the mean of the squared upper_base values, skipped when the initialisation method is "kmeans++". */ 39 | std::vector > makeset_C_C_l22s_L_inds0_lower_upper_mati(){ 40 | 41 | std::vector > tasks; 42 | 43 | tasks = this->exact_makeset_C_C_l22s_inds0_mati(); 44 | tasks.push_back( 45 | [this](TInt ti){ 46 | TInt local_ndcalcs = 0; 47 | TInt x0 = (ti*this->getndata())/this->getnthreads(); 48 | TInt x1 = ((ti+1)*this->getndata())/this->getnthreads(); 49 | arrutilv2::set_L2_dn(x1 - x0, this->getdimension(), this->getdata() + x0*this->getdimension(), this->getncentroids(), this->get_C(), this->get_data_l22s() + x0, this->get_C_l22s(), this->get_L() + x0, this->get_upper_base() + x0, this->get_lower_base() + x0, local_ndcalcs); 50 | this->ndcalcs_notX += local_ndcalcs; 51 | } 52 | ); 53 | tasks.push_back( 54 | //set starting mse. TODO : parallelise. 
55 | [this](TInt ti){ 56 | if (ti == 0 && this->get_initialisation_method().compare("kmeans++") != 0){ // && this->getfileptr()->is_open() 57 | this->mse = 0; 58 | for (TInt i = 0; i < this->getndata(); ++i){ 59 | this->mse += (this->get_upper_base()[i])*(this->get_upper_base()[i]); 60 | } 61 | this->mse /= static_cast(this->getndata()); 62 | } 63 | 64 | } 65 | ); 66 | 67 | return tasks; 68 | } 69 | 70 | 71 | protected: 72 | /* raw-pointer accessors for the arrays owned by this class */ 73 | TFloat * const get_CC(){ 74 | return CC.get(); 75 | } 76 | 77 | TFloat * const get_halfminCC(){ 78 | return halfminCC.get(); 79 | } 80 | 81 | TFloat * const get_lower_base(){ 82 | return lower_base.get(); 83 | } 84 | 85 | TFloat * const get_upper_base(){ 86 | return upper_base.get(); 87 | } 88 | 89 | TFloat * const get_delta_C(){ 90 | return delta_C.get(); 91 | } 92 | /* full initialisation list: the tasks from makeset_C_C_l22s_L_inds0_lower_upper_mati() followed by the task returned by arrutilv2::set_S_H_ati(...) */ 93 | std::vector > makeset_C_C_l22s_L_inds0_lower_upper_S_H_mati(){ 94 | auto init_tasks_A = this->makeset_C_C_l22s_L_inds0_lower_upper_mati(); 95 | 96 | 97 | 98 | auto init_task_B = arrutilv2::set_S_H_ati(this->nthreads, this->ndata, this->dimension, this->data, this->ncentroids, this->get_C(), this->get_data_l22s(), this->get_C_l22s(), this->get_L(), this->get_dsums(), this->get_dcounts(), this->get_sums(), this->get_counts(), this->work_mutex); 99 | 100 | auto initialisation_tasks = std::move(init_tasks_A); 101 | initialisation_tasks.push_back(std::move(init_task_B)); 102 | return initialisation_tasks; 103 | } 104 | 105 | public: 106 | template 107 | BaseHamerly(Args&&... 
args): kmeans::BaseExact(std::forward(args)...), 108 | 109 | CC{ new TFloat [this->getncentroids()*this->getncentroids()] }, 110 | halfminCC{ new TFloat [this->getncentroids()] }, 111 | delta_C{ new TFloat [this->getncentroids()] }, 112 | lower_base{ new TFloat [this->getndata()] }, 113 | upper_base{ new TFloat [this->getndata()] } 114 | 115 | { 116 | this->setalgname("BaseHamerly"); 117 | } 118 | virtual ~BaseHamerly(){} 119 | /* after the BaseExact output, writes only element 0 of lower_base and of upper_base to the verbose file */ 120 | virtual void verbose_write_additional(){ 121 | kmeans::BaseExact::verbose_write_additional(); 122 | this->get_verbose_file() << "\nlower_base:\n" << lower_base[0] << "\n"; 123 | this->get_verbose_file() << "\n\nupper_base:\n" << upper_base[0] << "\n"; 124 | /* anything else to print ? */ 125 | } 126 | 127 | virtual void set_initialisation_tasks() = 0; 128 | virtual void set_C_tasks() = 0; 129 | virtual void set_X_tasks() = 0; 130 | 131 | }; 132 | 133 | } 134 | 135 | #endif 136 | 137 | 138 | 139 | -------------------------------------------------------------------------------- /src/baseminibatch.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 
19 | */ 20 | 21 | #ifndef PLL_BASEMINIBATCHKMEANS_H 22 | #define PLL_BASEMINIBATCHKMEANS_H 23 | 24 | #include "basekmeans.h" 25 | #include "minibatchapp.h" 26 | 27 | namespace kmeans{ 28 | template 29 | /* Base for the mini-batch variants built on BaseKmeans: holds a minibatchapp::MiniBatchApp (mba) and maxpermultiplyblock. set_mse and set_summaries are final and delegate to minibatch_set_mse(mba) and set_summaries_minibatch(mba). */ class BaseMiniBatch : public kmeans::BaseKmeans{ 30 | 31 | private: 32 | 33 | virtual void set_mse() override final { 34 | this->minibatch_set_mse(this->mba); 35 | } 36 | 37 | virtual void set_summaries() override final { 38 | this->set_summaries_minibatch(this->mba); 39 | } 40 | 41 | protected: 42 | 43 | minibatchapp::MiniBatchApp mba; 44 | 45 | TInt maxpermultiplyblock; 46 | 47 | 48 | 49 | virtual void set_C_tasks() = 0; 50 | 51 | 52 | //set S, H from first batch 53 | std::function set_S_H_ati(){ 54 | return this->base_set_S_H_ati(static_cast (0), this->mba.initialising_batch_size); 55 | } 56 | 57 | 58 | //Not as code reducing as the baseexact version, but easier to understand 59 | /* forwards to arrutilv2::pll_update_L_etc with this thread's slices of dsums and dcounts and the change counter of batch (subround + 1) % nsubrounds, followed by the caller's extra arguments */ template 60 | void mb_pll_principal_X(const Function & X_updater, TInt ti, Args&&... args){ 61 | 62 | arrutilv2::pll_update_L_etc( 63 | //The compulsory parameters to pll_update_L_etc, 64 | X_updater, 65 | this->ncentroids, this->dimension, this->get_sums(), this->get_dsums() + ti*this->dimension*this->ncentroids, this->get_counts(), this->get_dcounts() + ti*this->ncentroids, this->mba.nchanges_on_batch[(this->mba.subround + 1)%this->mba.nsubrounds], this->ndcalcs_X, this->work_mutex 66 | //The additional parameters to pll_update_L_etc with correct offset 67 | , std::forward(args)...); 68 | } 69 | 70 | 71 | 72 | public: 73 | /* builds mba from (batchsize, ndata), sets the algorithm name, sets maxpermultiplyblock to max(1, ndata*dimension / (2*ncentroids*nthreads)) and prints the batch settings to std::cout */ void constructor_helper(const TInt & batchsize){ 74 | 75 | this->mba = minibatchapp::MiniBatchApp(batchsize, this->ndata); 76 | this->setalgname("Base Mini Batch Kmeans"); 77 | 78 | this->maxpermultiplyblock = //10000000; 79 | std::max(static_cast (1), 80 | static_cast ((this->getndata() * this->getdimension())/(2 * this->getncentroids() * this->nthreads))); 81 | 82 | 83 | 84 | 85 | //TODO : move to summaries: 86 | std::cout << "batchsize : " << batchsize << " 
nsubrounds : " << this->mba.nsubrounds << " lastbatchsize : " << this->mba.lastbatchsize << " maxpermultiply : " << this->maxpermultiplyblock << " initialising_batch_size : " << this->mba.initialising_batch_size << std::endl; 87 | 88 | 89 | } 90 | 91 | /* overly hungry, consult Meyers to see how I can prevent this. 92 | * if non-standard constructor args, use variadic args ala Eli Bendersky. 93 | * Note that these won't be initialised by extern template class, so changes here will require full remake 94 | * The first argument is the batch size; all remaining arguments are forwarded to the BaseKmeans constructor. */ 95 | template 96 | BaseMiniBatch(TInt batchsize, Args&&... args): kmeans::BaseKmeans (std::forward(args)...){ 97 | this->constructor_helper(batchsize); 98 | } 99 | 100 | virtual ~BaseMiniBatch(){}; 101 | 102 | }; 103 | 104 | 105 | } 106 | 107 | 108 | #endif 109 | 110 | -------------------------------------------------------------------------------- /src/basesimpleexact.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 
19 | */ 20 | 21 | #ifndef PLL_SIMPLEKBASEKMEANS_H 22 | #define PLL_SIMPLEKBASEKMEANS_H 23 | 24 | #include "baseexact.h" 25 | 26 | 27 | namespace kmeans{ 28 | 29 | /* Implements set_initialisation_tasks and set_C_tasks on top of BaseExact; set_X_tasks stays pure virtual. */ //two simple versions inherit from this class : simplebatch (distances calculated in batches of data) and simple (memory light version) 30 | template 31 | class BaseSimpleExactKmeans : public kmeans::BaseExact{ 32 | 33 | public: 34 | 35 | template 36 | /* variadic args ala Eli Bendersky */ 37 | BaseSimpleExactKmeans(Args&&... args): kmeans::BaseExact (std::forward(args)...) {this->setalgname("simple base");} 38 | virtual ~BaseSimpleExactKmeans(){}; 39 | 40 | protected: 41 | /* initialisation_tasks = the tasks from makeset_C_C_l22s_L_inds0_mati() followed by base_set_S_H_ati(0, ndata) */ virtual void set_initialisation_tasks(){ 42 | 43 | auto init_tasks_A = this->makeset_C_C_l22s_L_inds0_mati(); 44 | 45 | auto init_task_B = this->base_set_S_H_ati(static_cast(0), this->ndata); 46 | 47 | this->initialisation_tasks = std::move(init_tasks_A); 48 | this->initialisation_tasks.push_back(std::move(init_task_B)); 49 | } 50 | 51 | virtual void set_X_tasks() = 0; 52 | 53 | /* C_tasks holds a single task, the one returned by arrutilv2::update_C_C_l22s_from_SH_ati(...) */ virtual void set_C_tasks(){ 54 | this->C_tasks = { 55 | //[](TInt ti){std::cout << "C task start " << std::endl; }, 56 | arrutilv2::update_C_C_l22s_from_SH_ati(this->nthreads, this->ncentroids, this->dimension, this->get_sums(), this->get_counts(), this->get_C(), this->get_C_l22s()) 57 | //[](TInt ti){std::cout << "C task end " << std::endl; } 58 | 59 | }; 60 | } 61 | }; 62 | 63 | } 64 | 65 | #endif 66 | 67 | -------------------------------------------------------------------------------- /src/basesimpleminibatch.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 
10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 19 | */ 20 | 21 | #ifndef PLL_BASESIMPLEMINIBATCHKMEANS_H 22 | #define PLL_BASESIMPLEMINIBATCHKMEANS_H 23 | 24 | #include "baseminibatch.h" 25 | 26 | namespace kmeans{ 27 | template 28 | /* Mini-batch base whose X step is built around the pure virtual update_L_S_H(x0, x1, ti). Also owns delta_C (ncentroids entries, allocated in the constructor). The initialisation, C and X task setters are final here. */ class BaseSimpleMiniBatch : public kmeans::BaseMiniBatch{ 29 | 30 | 31 | private: 32 | 33 | virtual void update_L_S_H(TInt x0, TInt x1, TInt ti) = 0; 34 | 35 | /* Returns the per-thread X task: selects batch (subround + 1) % nsubrounds, clips its end to ndata, splits it evenly over nthreads, calls update_L_S_H on this thread's range [x0, x1) and adds (x1 - x0)*ncentroids to ndcalcs_X. */ virtual std::function update_L_S_H_ati() override final{ 36 | 37 | return [this](TInt ti){ 38 | //the batch to use this round (same for all threads) 39 | TInt ind0 = this->mba.batchsize*((this->mba.subround + 1) % this->mba.nsubrounds); //(this->round%this->mba.nsubrounds); //not this->mba.subround! 
40 | TInt ind1 = std::min(ind0 + this->mba.batchsize, this->ndata); 41 | TInt thisbatchsize = ind1 - ind0; 42 | 43 | //absolute indices of data to process on this threads 44 | TInt x0 = ind0 + (ti*thisbatchsize)/this->nthreads; 45 | TInt x1 = ind0 + ((ti + 1)*thisbatchsize)/this->nthreads; 46 | 47 | this->update_L_S_H(x0, x1, ti); 48 | 49 | this->ndcalcs_X += (x1 - x0)*this->ncentroids; 50 | }; 51 | } 52 | 53 | 54 | 55 | 56 | 57 | 58 | protected: 59 | 60 | 61 | std::unique_ptr delta_C; 62 | 63 | /* C_tasks holds a single task, the one returned by arrutilv2::update_C_C_l22s_delta_C_from_SH_ati(...), which also receives delta_C and ndcalcs_notX */ virtual void set_C_tasks() override final { 64 | this->C_tasks = { 65 | arrutilv2::update_C_C_l22s_delta_C_from_SH_ati(this->nthreads, this->ncentroids, this->dimension, this->get_sums(), this->get_counts(), this->get_C(), this->get_C_l22s(), this->delta_C.get(), this->ndcalcs_notX) 66 | }; 67 | } 68 | 69 | 70 | //some initialisation, using the first batch if necessary 71 | std::vector > makeset_C_C_l22s_L_inds0_mati(){ 72 | return this->minibatch_makeset_C_C_l22s_L_inds0_mati(this->mba); 73 | } 74 | 75 | 76 | 77 | 78 | /* initialisation_tasks = the tasks from makeset_C_C_l22s_L_inds0_mati() followed by set_S_H_ati() */ virtual void set_initialisation_tasks() override final{ 79 | auto init_tasks_A = this->makeset_C_C_l22s_L_inds0_mati(); 80 | auto init_task_B = this->set_S_H_ati(); 81 | this->initialisation_tasks = std::move(init_tasks_A); 82 | this->initialisation_tasks.push_back(std::move(init_task_B)); 83 | } 84 | 85 | 86 | 87 | 88 | /* X_tasks = update_L_S_H_ati() followed by minibatch_subround_update(mba) */ virtual void set_X_tasks() override final { 89 | this->X_tasks = { 90 | 91 | //[this](TInt ti){ 92 | //std::cout << "\nnchanges_on_batch" << std::endl; 93 | //for (TInt a = 0; a < this->mba.nsubrounds; ++a){ 94 | //std::cout << this->mba.nchanges_on_batch[a] << " "; 95 | //} 96 | //std::cout << std::endl; 97 | //}, 98 | 99 | this->update_L_S_H_ati(), 100 | this->minibatch_subround_update(this->mba) 101 | }; 102 | } 103 | 104 | /* the two helpers below forward the range [x0, x1) and this thread's dsums/dcounts slices to the arrutilv2 routine of the same name; derived classes can call them from update_L_S_H */ 105 | void update_L_S_H_batch_increment_only(TInt x0, TInt x1, TInt ti){ 106 | arrutilv2::update_L_S_H_batch_increment_only(x1-x0, this->maxpermultiplyblock, this->dimension, this->data + x0*this->dimension, 
this->ncentroids, this->get_C(), this->get_data_l22s() + x0, this->get_C_l22s(), this->get_L() + x0, this->get_dsums() + ti*this->dimension*this->ncentroids, this->get_dcounts() + ti*this->ncentroids, this->get_sums(), this->get_counts(), this->mba.nchanges_on_batch[(this->mba.subround + 1)%this->mba.nsubrounds], this->work_mutex); 107 | } 108 | 109 | 110 | void update_L_S_H_batch(TInt x0, TInt x1, TInt ti){ 111 | arrutilv2::update_L_S_H_batch(x1-x0, this->maxpermultiplyblock, this->dimension, this->data + x0*this->dimension, this->ncentroids, this->get_C(), this->get_data_l22s() + x0, this->get_C_l22s(), this->get_L() + x0, this->get_dsums() + ti*this->dimension*this->ncentroids, this->get_dcounts() + ti*this->ncentroids, this->get_sums(), this->get_counts(), this->mba.nchanges_on_batch[(this->mba.subround + 1)%this->mba.nsubrounds], this->work_mutex); 112 | } 113 | 114 | 115 | 116 | 117 | 118 | public: 119 | /* forwards every argument to the BaseMiniBatch constructor, then allocates delta_C with ncentroids entries */ 120 | template 121 | BaseSimpleMiniBatch(Args&&... args): kmeans::BaseMiniBatch (std::forward(args)...){ 122 | this->delta_C = std::unique_ptr (new TFloat [this->ncentroids]); 123 | } 124 | 125 | virtual ~BaseSimpleMiniBatch(){}; 126 | 127 | }; 128 | } 129 | 130 | #endif 131 | -------------------------------------------------------------------------------- /src/basesparseelkan.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 
14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 19 | */ 20 | 21 | #ifndef PLL_BASESPARSEELKANKMEANS__H 22 | #define PLL_BASESPARSEELKANKMEANS__H 23 | 24 | #include "basesparseexact.h" 25 | #include "sparseutil.h" 26 | 27 | namespace kmeans{ 28 | 29 | template 30 | /* Abstract base for the Elkan variants built on BaseSparseExact: implements set_upper_lowers_L for sparse data, and its constructor calls assignmemory_elkan_upper_lowers(). The three task setters stay pure virtual. */ class BaseSparseElkan : public kmeans::BaseSparseExact{ 31 | 32 | protected: 33 | /* For each sample i in [x0, x1): passes the sample's slice of the sparse indices/values arrays (delimited by starts[i] and starts[i+1]) to sparse::set_argminmin_rl2s together with L[i], elkan_upper_base[i] and row i of elkan_lowers_base. Afterwards adds ncentroids*(x1 - x0) to ndcalcs_X. */ 34 | virtual void set_upper_lowers_L(TInt x0, TInt x1) override final{ /* from basedensecentroidkmeans */ 35 | for (TInt i = x0; i < x1; ++i){ 36 | sparse::set_argminmin_rl2s(this->ptrdata->starts[i+1] - this->ptrdata->starts[i], this->ptrdata->indices.data() + this->ptrdata->starts[i], this->ptrdata->values.data() + this->ptrdata->starts[i], this->ptrdata->dimension, this->ncentroids, this->get_C(), this->data_l22s[i], this->get_C_l22s(), this->L[i], this->elkan_upper_base[i], this->elkan_lowers_base.get() + i*this->ncentroids); 37 | } 38 | 39 | this->ndcalcs_X += this->ncentroids*(x1 - x0); 40 | } 41 | 42 | public: 43 | typedef kmeans::BaseSparseExact BC; 44 | template 45 | BaseSparseElkan(Args&&... args): BC(std::forward(args)...) 
46 | 47 | { 48 | this->assignmemory_elkan_upper_lowers(); 49 | this->setalgname("sparse elkan base"); 50 | } 51 | 52 | virtual ~BaseSparseElkan(){} 53 | 54 | /* intentionally writes nothing */ virtual void verbose_write_additional() override {} 55 | virtual void set_initialisation_tasks() = 0; 56 | virtual void set_C_tasks() = 0; 57 | virtual void set_X_tasks() = 0; 58 | /* base-class estimate plus get_elkan_base_memory() */ 59 | virtual TInt get_approximate_memory_requirement(){ 60 | return BC::get_approximate_memory_requirement() + this->get_elkan_base_memory(); 61 | } 62 | }; 63 | 64 | } 65 | 66 | #endif 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /src/basesparseexact.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 
19 | */ 20 | 21 | #ifndef PLL_BASESPARSEEXACT_H 22 | #define PLL_BASESPARSEEXACT_H 23 | 24 | #include "basesparsekmeans.h" 25 | #include 26 | 27 | namespace kmeans{ 28 | 29 | template 30 | /* Base for the exact variants built on BaseSparseKmeans: implements set_mse and set_summaries; the three task setters stay pure virtual. */ class BaseSparseExact : public kmeans::BaseSparseKmeans { 31 | 32 | private: 33 | /* mse = (1/ndata) * sum over samples i of data_l22s[i] + C_l22s[L[i]] - 2*inner(sample i, centroid L[i]), with the inner product computed by sparse::get_inner on the sample's slice of the sparse arrays. Presumably the *_l22s arrays hold squared l2 norms, which would make each term a squared distance -- TODO confirm. */ virtual void set_mse() override{ 34 | TFloat sse = 0; 35 | for (TInt i = 0; i < this->ndata; ++i){ 36 | sse += 37 | this->data_l22s[i] + this->C_l22s[this->L[i]] 38 | -2.*sparse::get_inner(this->ptrdata->starts[i+1] - this->ptrdata->starts[i], 39 | this->ptrdata->indices.data() + this->ptrdata->starts[i], 40 | this->ptrdata->values.data() + this->ptrdata->starts[i], 41 | this->get_C() + this->dimension*this->L[i]); 42 | } 43 | this->mse = sse / static_cast (this->ndata); 44 | } 45 | 46 | 47 | public: 48 | template 49 | BaseSparseExact(Args&&... args): kmeans::BaseSparseKmeans (std::forward(args)...) { 50 | this->setalgname("base-sparse-exact-kmeans"); 51 | } 52 | 53 | virtual ~BaseSparseExact(){}; 54 | 55 | protected: 56 | virtual void set_initialisation_tasks() = 0; 57 | virtual void set_X_tasks() = 0; 58 | virtual void set_C_tasks() = 0; 59 | 60 | virtual void set_summaries(){ 61 | this->set_summaries_exact(); 62 | } 63 | 64 | 65 | 66 | 67 | /* returns base_set_S_H_ati(0, ndata) */ //A hack as no pllsation as suggested by ati suffix 68 | std::function set_S_H_ati(){ 69 | return this->base_set_S_H_ati(static_cast(0), this->ndata); 70 | } 71 | 72 | 73 | virtual void verbose_write_additional(){ 74 | throw std::runtime_error("verbose_write_additional needs implementing in basesparseexact"); 75 | } 76 | 77 | 78 | 79 | }; 80 | } 81 | 82 | 83 | #endif 84 | -------------------------------------------------------------------------------- /src/basesparseminibatch.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 
5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 19 | */ 20 | 21 | #ifndef PLL_BASESPARSEMINIBATCHKMEANS_H 22 | #define PLL_BASESPARSEMINIBATCHKMEANS_H 23 | 24 | #include "basesparsekmeans.h" 25 | #include "minibatchapp.h" 26 | 27 | namespace kmeans{ /* BaseSparseMiniBatch: base class for mini-batch k-means on sparse data. Keeps the batching state in mba, builds the initialisation, X and C task lists, and leaves post_L_adjust_S_H() to derived classes. */ 28 | template 29 | class BaseSparseMiniBatch : public kmeans::BaseSparseKmeans{ 30 | 31 | private: 32 | 33 | 34 | 35 | //different versions for sparsestandardminibatch and sparseminibatch (my version, where not just a naive add) 36 | virtual void post_L_adjust_S_H() = 0; 37 | 38 | //update L and where_label_changes in parallel on the batch selected by round: the batch is [data0, data1) and thread ti handles its share [x0, x1) of it. 39 | virtual std::function update_L_label_changes_ati(){ 40 | return [this](TInt ti){ 41 | 42 | 43 | TInt data0 = this->mba.batchsize*(this->round%this->mba.nsubrounds); 44 | TInt data1 = std::min(data0 + this->mba.batchsize, this->ndata); 45 | TInt ndata_batch = data1 - data0; 46 | TInt x0 = data0 + (ti*ndata_batch)/this->nthreads; 47 | TInt x1 = data0 + ((ti+1)*ndata_batch)/this->nthreads; 48 | 49 | //std::cout << "\nupdating in [ " << x0 << ", " << x1 << " ] " << std::endl; 50 | 51 | 52 | this->where_label_changes[ti].clear(); //index, old, new. 
53 | sparse::update_L(*this->ptrdata, x0, x1, this->ncentroids, this->get_C(), this->get_data_l22s(), this->get_C_l22s(), this->get_L(), this->where_label_changes[ti]); 54 | this->ndcalcs_X += this->ncentroids*(x1 - x0); 55 | /* hold work_mutex while adding this thread's change count to the shared per-batch counter. NOTE(review): the batch is chosen with round % nsubrounds but the counter is indexed with mba.subround; presumably the two are kept in step elsewhere -- confirm */ 56 | std::lock_guard gluk(this->work_mutex); 57 | this->mba.nchanges_on_batch[this->mba.subround] += this->where_label_changes[ti].size(); 58 | }; 59 | } 60 | 61 | 62 | virtual void set_mse() override final { 63 | this->minibatch_set_mse(this->mba); 64 | } 65 | 66 | virtual void set_summaries() override final { 67 | this->set_summaries_minibatch(this->mba); 68 | } 69 | 70 | protected: 71 | /* batching state (batch size, number of sub-rounds, per-batch change counters) */ 72 | minibatchapp::MiniBatchApp mba; 73 | /* C step: a single task, update_C_C_l22s_from_SH_ati, built from the sums, counts, C and C_l22s arrays */ 74 | virtual void set_C_tasks() override final { 75 | this->C_tasks = { 76 | arrutilv2::update_C_C_l22s_from_SH_ati(this->nthreads, this->ncentroids, this->dimension, this->get_sums(), this->get_counts(), this->get_C(), this->get_C_l22s()) 77 | }; 78 | } 79 | 80 | //set S, H from the first initialising_batch_size samples; only thread 0 does the work 81 | std::function set_S_H_from_initial_batch_ati(){ 82 | return [this](TInt ti){ 83 | if (ti == 0){ 84 | this->set_S_H(static_cast(0), this->mba.initialising_batch_size); 85 | } 86 | }; 87 | } 88 | 89 | 90 | std::vector > makeset_C_C_l22s_L_inds0_mati(){ 91 | return this->minibatch_makeset_C_C_l22s_L_inds0_mati(this->mba); 92 | } 93 | /* Initialisation: the tasks from makeset_C_C_l22s_L_inds0_mati(), then an empty task, then the S/H task for the initial batch. */ 94 | virtual void set_initialisation_tasks(){ 95 | auto init_tasks_A = this->makeset_C_C_l22s_L_inds0_mati(); 96 | auto init_task_B = this->set_S_H_from_initial_batch_ati(); 97 | this->initialisation_tasks = std::move(init_tasks_A); 98 | /* NOTE(review): the task pushed next has an empty body; it looks like a placeholder (or a deliberate extra step between the two phases) -- confirm it is intentional */ 99 | this->initialisation_tasks.push_back([this](TInt ti) 100 | { 101 | 102 | }); 103 | 104 | this->initialisation_tasks.push_back(std::move(init_task_B)); 105 | } 106 | 107 | 108 | 109 | /* X step: relabel the current batch, let thread 0 run post_L_adjust_S_H(), then run the task returned by minibatch_subround_update. */ 110 | virtual void set_X_tasks(){ 111 | this->X_tasks = { 112 | this->update_L_label_changes_ati(), 113 | [this](TInt ti){ 114 | if (ti == 0){ 115 | this->post_L_adjust_S_H(); 116 | } 117 | }, 118 | this->minibatch_subround_update(this->mba) 119 | }; 120 | } 121 | 122 | 123 | 
public: 124 | /* Rebuilds mba from batchsize and ndata and sets the algorithm name; called by the constructor. */ void constructor_helper(const TInt & batchsize){ 125 | 126 | 127 | this->mba = minibatchapp::MiniBatchApp(batchsize, this->ndata); 128 | this->setalgname("Base Sparse Mini Batch Kmeans"); 129 | 130 | } 131 | /* The first argument is the batch size; the remaining arguments are forwarded to kmeans::BaseSparseKmeans. */ 132 | template 133 | BaseSparseMiniBatch(TInt batchsize, Args&&... args): kmeans::BaseSparseKmeans (std::forward(args)...){ 134 | 135 | this->constructor_helper(batchsize); 136 | } 137 | 138 | virtual ~BaseSparseMiniBatch(){}; 139 | 140 | }; 141 | 142 | 143 | } 144 | 145 | 146 | #endif 147 | 148 | -------------------------------------------------------------------------------- /src/elkan3v0.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 
19 | */ 20 | 21 | #ifndef PLL_ELKANKMEANS_3V0_H 22 | #define PLL_ELKANKMEANS_3V0_H 23 | 24 | #include "baseelkan.h" 25 | #include "alg_X_selkSN.h" 26 | 27 | namespace kmeans{ 28 | 29 | /* discrepancy in ndcalcs as compared to a3v0 due to not computing CC initially (I propose) */ 30 | /* P3V0: k-means variant built on kmeans::BaseElkan. It allocates the per-centroid delta_C array and registers update_L_lowers_upper_S_H_3v0 as its only X task. */ 31 | template 32 | class P3V0 : public kmeans::BaseElkan{ 33 | 34 | protected: 35 | /* raw-pointer accessors for the arrays owned by the base class */ TFloat * const get_lowers(){ 36 | return this->elkan_lowers_base.get(); 37 | } 38 | 39 | TFloat * const get_upbs(){ 40 | return this->elkan_upper_base.get(); 41 | } 42 | 43 | TFloat * const get_delta_C(){ 44 | return this->elkan_delta_C.get(); 45 | } 46 | /* Task for thread ti: x0 = ti x ndata / nthreads is used to offset L, lowers (ncentroids entries per sample) and upbs before handing over to pll_principal_X. */ 47 | std::function update_L_lowers_upper_S_H_3v0_ati(){ 48 | return [this](TInt ti){ 49 | TInt x0 = (ti*this->getndata())/this->getnthreads(); 50 | this->pll_principal_X(update_L_lowers_upper_S_H_3v0, ti, this->get_delta_C(), this->get_L() + x0, this->get_lowers() + x0*this->getncentroids(), this->get_upbs() + x0, this->round); 51 | }; 52 | } 53 | 54 | 55 | public: 56 | typedef kmeans::BaseElkan EB; 57 | /* Forwards all arguments to BaseElkan, names the algorithm and allocates delta_C with one entry per centroid. */ template 58 | P3V0(Args&&... args): EB(std::forward(args)...) 59 | 60 | 61 | { 62 | this->setalgname("p3v0"); 63 | this->elkan_delta_C.reset(new TFloat [this->getncentroids()]); 64 | } 65 | virtual ~P3V0(){} 66 | /* base-class estimate plus the bytes taken by delta_C */ 67 | virtual TInt get_approximate_memory_requirement(){ 68 | return EB::get_approximate_memory_requirement() + 69 | sizeof(TFloat)*this->getncentroids(); // delta_C 70 | } 71 | 72 | virtual void verbose_write_additional(){ 73 | this->EB_verbose_write_additional(); 74 | /* anything else to add ? 
*/ 75 | } 76 | 77 | virtual void set_initialisation_tasks(){ 78 | /* all Elkan variants have same initialisation tasks */ 79 | this->ElkBase_set_initialisation_tasks(); 80 | } 81 | /* C step: one task that updates C, C_l22s and delta_C from the sums and counts; ndcalcs_notX is passed along as the counter argument */ 82 | virtual void set_C_tasks(){ 83 | this->C_tasks = { 84 | arrutilv2::update_C_C_l22s_delta_C_from_SH_ati(this->getnthreads(), this->getncentroids(), this->getdimension(), this->get_sums(), this->get_counts(), this->get_C(), this->get_C_l22s(), this->get_delta_C(), this->ndcalcs_notX) 85 | }; 86 | } 87 | 88 | virtual void set_X_tasks(){ 89 | this->X_tasks = { 90 | this->update_L_lowers_upper_S_H_3v0_ati() 91 | }; 92 | } 93 | }; 94 | 95 | } 96 | 97 | #endif 98 | -------------------------------------------------------------------------------- /src/elkan5v1.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 
19 | */ 20 | 21 | #ifndef PLL_ELKANKMEANS_5V1_H 22 | #define PLL_ELKANKMEANS_5V1_H 23 | 24 | #include "elkan3v0.h" 25 | 26 | namespace kmeans{ 27 | 28 | /* Assignment step of the 5v1 variant for a chunk of ndata samples. Resets nchanges and ndcalcs, then for every sample updates its label L[i], its upper bound upbs[i] and its per-centroid lower bounds lowers[i x ncentroids + ci]; when a label changes, the count H and the sum S are moved from the old centroid to the new one. CC is indexed as an ncentroids x ncentroids array, halfminCC and delta_C per centroid. The round argument is not used in this body. */ 29 | template 30 | void update_L_lowers_upbs_S_H_5v1(TInt ncentroids, TInt dimension, TFloat * const S, TInt * const H , TInt & nchanges, TInt &ndcalcs, 31 | TInt ndata, const TFloat * const data, const TFloat * const C, const TFloat * const data_l22s, const TFloat * const C_l22s, const TFloat * const CC, const TFloat * const halfminCC, const TFloat * const delta_C, TInt * const L, TFloat * const lowers, TFloat * const upbs, const TInt & round){ 32 | 33 | 34 | nchanges = 0; 35 | ndcalcs = 0; 36 | 37 | /* experiments with this in or out the loop show that it makes little difference (@test1); presumably this subtracts delta_C[c] from the c-th lower bound of every sample (scale factor -1) -- confirm against arrutilv2::rank1rowupdate */ 38 | arrutilv2::rank1rowupdate(ncentroids, delta_C, static_cast(-1.), ndata, lowers); 39 | 40 | 41 | for (TInt i = 0; i < ndata; ++i){ 42 | /* (@test1) */ 43 | /* loosen the upper bound by delta_C of the current centroid; samples whose upper bound does not exceed halfminCC of their centroid are skipped */ 44 | upbs[i] += delta_C[L[i]]; 45 | if (halfminCC[L[i]] < upbs[i]){ 46 | TInt label_before = L[i]; 47 | TInt ci = 0; /* first loop: look for the first other centroid that passes both bound tests; only then is the distance to the current centroid computed (written into upbs[i]), the tests repeated with that value, and the loop left */ 48 | while (ci < ncentroids){ 49 | if ((L[i] != ci) && (upbs[i] > lowers[i*ncentroids + ci]) && (upbs[i] > 0.5*CC[ci*ncentroids + L[i]])){ 50 | arrutilv2::set_l2(dimension, data + i*dimension, C + L[i]*dimension, data_l22s[i], C_l22s[L[i]], upbs[i], ndcalcs); 51 | lowers[i*ncentroids + L[i]] = upbs[i]; 52 | if ((upbs[i] > lowers[i*ncentroids + ci]) && (upbs[i] > 0.5*CC[ci*ncentroids + L[i]])){ 53 | arrutilv2::set_l2(dimension, data + i*dimension, C + ci*dimension, data_l22s[i], C_l22s[ci], lowers[i*ncentroids + ci], ndcalcs); 54 | if (upbs[i] > lowers[i*ncentroids + ci]){ 55 | upbs[i] = lowers[i*ncentroids + ci]; 56 | L[i] = ci; 57 | } 58 | } 59 | ++ci; 60 | break; 61 | } 62 | ++ci; 63 | } /* second loop: the same tests for the remaining centroids; it only runs if the first loop left early, i.e. after upbs[i] was recomputed */ 64 | while (ci < ncentroids){ 65 | if ((upbs[i] > lowers[i*ncentroids + ci]) && (upbs[i] > 0.5*CC[ci*ncentroids + L[i]])){ // (L[i] != ci) && 66 | arrutilv2::set_l2(dimension, data + i*dimension, C + ci*dimension, data_l22s[i], C_l22s[ci], lowers[i*ncentroids + ci], 
ndcalcs); 67 | if (upbs[i] > lowers[i*ncentroids + ci]){ 68 | upbs[i] = lowers[i*ncentroids + ci]; 69 | L[i] = ci; 70 | } 71 | } 72 | ++ci; 73 | } /* label changed: move the sample's contribution from the old centroid's count and sum to the new one's */ 74 | if (L[i] != label_before){ 75 | ++nchanges; 76 | ++H[L[i]]; 77 | --H[label_before]; 78 | arrutilv2::addto(dimension, data + i*dimension, S + dimension*L[i]); 79 | arrutilv2::subtractfrom(dimension, data + i*dimension, S + dimension*label_before); 80 | } 81 | } 82 | } 83 | } 84 | 85 | 86 | /* P5V1: extends P3V0 with two owned arrays, CC (ncentroids x ncentroids entries) and halfminCC (ncentroids entries), which are refreshed in the C step and passed to update_L_lowers_upbs_S_H_5v1 in the X step. NOTE(review): unlike P3V0, no get_approximate_memory_requirement override here accounts for these two arrays -- confirm whether that is intended. */ 87 | template 88 | class P5V1 : public P3V0{ 89 | 90 | 91 | 92 | private: 93 | std::unique_ptr CC; 94 | std::unique_ptr halfminCC; 95 | 96 | protected: 97 | TFloat * const get_CC(){ 98 | return CC.get(); 99 | } 100 | 101 | TFloat * const get_halfminCC(){ 102 | return halfminCC.get(); 103 | } 104 | 105 | /* Task for thread ti: same offsetting by x0 = ti x ndata / nthreads as in P3V0, with CC and halfminCC passed in addition. */ 106 | std::function update_L_lowers_upbs_S_H_5v1_ati(){ 107 | return [this](TInt ti){ 108 | TInt x0 = (ti*this->getndata())/this->getnthreads(); 109 | this->pll_principal_X(update_L_lowers_upbs_S_H_5v1, ti, this->get_CC(), this->get_halfminCC(), this->get_delta_C(), this->get_L() + x0, this->get_lowers() + x0*this->getncentroids(), this->get_upbs() + x0, this->round); 110 | }; 111 | } 112 | 113 | public: 114 | typedef kmeans::P3V0 PC; 115 | /* Forwards all arguments to P3V0, then allocates CC and halfminCC from getncentroids(). */ template 116 | P5V1(Args&&... args): PC(std::forward(args)...), 117 | 118 | CC{ new TFloat [this->getncentroids()*this->getncentroids()] }, 119 | halfminCC{ new TFloat [this->getncentroids()] } 120 | { 121 | this->setalgname("p5v1"); 122 | } 123 | 124 | virtual ~P5V1(){} 125 | 126 | virtual void verbose_write_additional(){ 127 | PC::verbose_write_additional(); 128 | /* do I want to write CC as well ? */ 129 | } 130 | 131 | virtual void set_initialisation_tasks(){ 132 | 133 | 134 | this->initialisation_tasks = this->exact_makeset_C_C_l22s_L_inds0_lowers_upper_S_H_mati(); 135 | 136 | /* note : CC and halfminCC don't need to be set at this point. C and C_l22s must be set so that S&H can be set above. 
Maybe with smarter initialisations or initialisations with kmeans++ */ 137 | 138 | } 139 | /* C step: first update C, C_l22s and delta_C from the sums and counts, then refresh CC and halfminCC from the new C */ 140 | virtual void set_C_tasks(){ 141 | 142 | this->C_tasks = { 143 | arrutilv2::update_C_C_l22s_delta_C_from_SH_ati(this->getnthreads(), this->getncentroids(), this->getdimension(), this->get_sums(), this->get_counts(), this->get_C(), this->get_C_l22s(), this->get_delta_C(), this->ndcalcs_notX), 144 | 145 | arrutilv2::update_CC_halfminCC_ati(this->getnthreads(), this->getncentroids(), this->getdimension(), this->get_C(), this->get_C_l22s(), this->get_CC(), this->get_halfminCC(), this->ndcalcs_notX) 146 | }; 147 | } 148 | 149 | virtual void set_X_tasks(){ 150 | 151 | this->X_tasks = { 152 | this->update_L_lowers_upbs_S_H_5v1_ati() 153 | }; 154 | } 155 | }; 156 | 157 | 158 | } 159 | 160 | #endif 161 | -------------------------------------------------------------------------------- /src/exactsimplebatch.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 
19 | */ 20 | 21 | #ifndef PLL_EXACTSIMPLEBATCHKMEANS_H 22 | #define PLL_EXACTSIMPLEBATCHKMEANS_H 23 | 24 | #include "basesimpleexact.h" 25 | 26 | 27 | 28 | namespace kmeans{ 29 | /* SimpleExactBatchKmeans: exact k-means built on kmeans::BaseSimpleExactKmeans whose only X task is arrutilv2::update_L_S_H_batch_ati, called with a batch size of nperbatch. */ 30 | template 31 | class SimpleExactBatchKmeans : public kmeans::BaseSimpleExactKmeans{ 32 | 33 | private: 34 | TInt nperbatch; 35 | 36 | public: 37 | TInt get_nperbatch(){ 38 | return this->nperbatch; 39 | } 40 | /* Forwards all arguments to the base class, names the algorithm and derives nperbatch. */ 41 | template 42 | SimpleExactBatchKmeans(Args&&... args): kmeans::BaseSimpleExactKmeans (std::forward(args)...) { 43 | this->setalgname("Exact Simple Batch K-Means"); 44 | //set so that the batch step does not cause memory in assignment to exceed half memory of data itself: nperbatch = max(1, (ndata x dimension) / (2 x ncentroids x nthreads)) 45 | nperbatch = std::max( 46 | static_cast (1), static_cast ((this->getndata() * this->getdimension())/(2 * this->getncentroids() * this->nthreads)) 47 | ); 48 | } 49 | 50 | virtual ~SimpleExactBatchKmeans(){}; 51 | 52 | 53 | protected: 54 | virtual void set_X_tasks(){ 55 | this->X_tasks = { 56 | arrutilv2::update_L_S_H_batch_ati(this->getnthreads(), this->getndata(), this->nperbatch, this->getdimension(), this->getdata(), this->getncentroids(), this->get_C(), this->get_data_l22s(), this->get_C_l22s(), this->get_L(), this->get_dsums(), this->get_dcounts(), this->get_sums(), this->get_counts(), this->nchanges, this->work_mutex, this->ndcalcs_X) 57 | }; 58 | 59 | 60 | //update_L_S_H_batch_ati(TInt nthreads, TInt ndata, TInt nperbatch, TInt dimension, const TFloat * const data, TInt ncentroids, const TFloat * const C, const TFloat * const data_l22s, const TFloat * const C_l22s, TInt * const L, TFloat * const dsums, TInt * const dcounts, TFloat * const sums, TInt * const counts, TInt & nchanges, std::mutex & work_mutex, std::atomic & ndcalcs){ 61 | 62 | 63 | } 64 | }; 65 | 66 | } 67 | 68 | 69 | 70 | #endif 71 | 72 | 73 | 74 | //extern template class kmeans::SimpleExactBatchKmeans; 75 | //extern template class kmeans::SimpleExactBatchKmeans; 76 | 77 | 78 | 79 | 
-------------------------------------------------------------------------------- /src/growbatchapp.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 19 | */ 20 | 21 | #ifndef GBAPP_H 22 | #define GBAPP_H 23 | 24 | #include 25 | 26 | namespace growbatchapp{ 27 | /* GBApp: plain holder of the state used by the grow-batch algorithms (active data size, growth factor, threshold, delta_C). All members are public and no constructor is declared here, so the members are not initialised by this class. */ 28 | template 29 | class GBApp { 30 | public: 31 | 32 | 33 | /* amount of data which is active. Initially determined by user, thereafter grows by factor of growthfactor when necessary */ 34 | TInt ndata_active; 35 | 36 | /* Hacky variable for printing purposes */ 37 | TFloat d_C__over__d_AB; 38 | 39 | /* will be 2 */ 40 | TFloat growthfactor; 41 | 42 | /* definition depends on class, for Grow Batch Partitional: if \|C_A - C_B\|_2 > threshold * \|C_{t} - C_{t-1}\|, then grow by growthfactor. will be 1.0 */ 43 | TFloat threshold; 44 | 45 | /* amount of data which was active in previous round. Either ndata_active or ndata_active/2. 
 */ 46 | TInt ndata_active_previous; 47 | 48 | /* used to determine if expansion should take place (and maybe other things) */ 49 | std::unique_ptr delta_C; 50 | 51 | 52 | }; 53 | /* GBMseApp: per-cluster sse and mse vectors plus the dn array; presumably used by the MSE-based grow-batch variants -- inferred from the names, confirm. */ 54 | template 55 | class GBMseApp { 56 | public: 57 | std::vector sse_by_cluster; 58 | std::vector mse_by_cluster; 59 | std::unique_ptr dn; //distance to nearest. TODO: check that not this quantity squared. 60 | }; 61 | 62 | } 63 | 64 | 65 | 66 | 67 | 68 | 69 | #endif 70 | 71 | -------------------------------------------------------------------------------- /src/hamerly11v0.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 
19 | */ 20 | 21 | #ifndef PLL_HAMERLYKMEANS_11V0_H 22 | #define PLL_HAMERLYKMEANS_11V0_H 23 | 24 | #include "basehamerly.h" 25 | 26 | namespace kmeans{ 27 | 28 | 29 | 30 | 31 | /* Assignment step of the 11v0 variant for a chunk of ndata samples, keeping one lower and one upper bound per sample. Resets nchanges and ndcalcs. Each sample's bounds are first loosened by max_deltaC_previous_round (presumably the largest entry of delta_C, as filled in by set_argmaxmax -- confirm). A sample is re-examined only if its upper bound exceeds max(halfminCC[L[i]], lower[i]): first the distance to its current centroid is recomputed into upper[i] and, if that is still above the threshold, all ncentroids distances are computed into a scratch array from which the label and both bounds are reset. H and S are moved between centroids when the label changes. CC and round are not used in this body. */ 32 | template 33 | void update_L_lower_upper_S_H_11v0(TInt ncentroids, TInt dimension, TFloat * const S, TInt * const H , TInt & nchanges, TInt &ndcalcs, 34 | TInt ndata, const TFloat * const data, const TFloat * const C, const TFloat * const data_l22s, const TFloat * const C_l22s, const TFloat * const CC, const TFloat * const halfminCC, const TFloat * const delta_C, TInt * const L, TFloat * const lower, TFloat * const upper, const TInt & round){ 35 | 36 | nchanges = 0; 37 | ndcalcs = 0; 38 | 39 | 40 | TFloat m; 41 | TInt oldlabel; /* scratch array holding one distance per centroid, reused for every sample */ 42 | std::unique_ptr distances (new TFloat [ncentroids]); 43 | 44 | //TODO: Hamerly checks that the label of data point is not max-mover (if fail, add second biggest budge). 45 | TFloat max_deltaC_previous_round; 46 | TInt index_max_deltaC_previous_round; 47 | arrutilv2::set_argmaxmax(ncentroids, delta_C, index_max_deltaC_previous_round, max_deltaC_previous_round); 48 | for (TInt i = 0; i < ndata; ++i){ 49 | lower[i] -= max_deltaC_previous_round; 50 | upper[i] += max_deltaC_previous_round; 51 | m = std::max(halfminCC[L[i]], lower[i]); 52 | if (upper[i] > m){ 53 | arrutilv2::set_l2(dimension, data + i*dimension, C + L[i]*dimension, data_l22s[i], C_l22s[L[i]], upper[i], ndcalcs); 54 | if (upper[i] > m){ 55 | oldlabel = L[i]; 56 | arrutilv2::set_rl2s(dimension, data + i*dimension, ncentroids, C, data_l22s[i], C_l22s, distances.get(), ndcalcs); 57 | arrutilv2::set_argminmin2nocheck(ncentroids, distances.get(), L[i], upper[i], lower[i]); 58 | if (L[i] != oldlabel){ 59 | ++nchanges; 60 | ++H[L[i]]; 61 | --H[oldlabel]; 62 | arrutilv2::addto(dimension, data + i*dimension, S + dimension*L[i]); 63 | arrutilv2::subtractfrom(dimension, data + i*dimension, S + dimension*oldlabel); 64 | } 65 | } 66 | } 67 | } 68 | } 69 | 70 | 71 | 72 | 73 | /* discrepancy in ndcalcs with a11v0 
is due to a11v0 performing CC computation before first round */ 74 | /* P11V0: k-means variant built on kmeans::BaseHamerly; its X task runs update_L_lower_upper_S_H_11v0 and its C step refreshes C, C_l22s, delta_C, CC and halfminCC. */ 75 | template 76 | class P11V0 : public kmeans::BaseHamerly{ 77 | 78 | private: 79 | 80 | 81 | protected: 82 | /* thin wrappers over the bound arrays exposed by the base class */ 83 | TFloat * const get_lower(){ 84 | return this->get_lower_base(); 85 | } 86 | 87 | TFloat * const get_upper(){ 88 | return this->get_upper_base(); 89 | } 90 | /* Task for thread ti: x0 = ti x ndata / nthreads offsets L, lower and upper (one entry per sample each) before handing over to pll_principal_X. */ 91 | std::function update_L_lower_upper_S_H_11v0_ati(){ 92 | return [this](TInt ti){ 93 | TInt x0 = (ti*this->getndata())/this->getnthreads(); 94 | 95 | this->pll_principal_X(update_L_lower_upper_S_H_11v0, ti, this->get_CC(), this->get_halfminCC(), this->get_delta_C(), this->get_L() + x0, this->get_lower() + x0, this->get_upper() + x0, this->round); 96 | }; 97 | } 98 | 99 | public: 100 | typedef kmeans::BaseHamerly BH; 101 | /* Forwards all arguments to BaseHamerly and names the algorithm. */ template 102 | P11V0(Args&&... args): BH(std::forward(args)...) 103 | 104 | { 105 | this->setalgname("p11v0"); 106 | } 107 | virtual ~P11V0(){} 108 | 109 | virtual void verbose_write_additional(){ 110 | BH::verbose_write_additional(); 111 | /* anything else to print ? */ 112 | } 113 | 114 | 115 | virtual void set_initialisation_tasks(){ 116 | 117 | 118 | this->initialisation_tasks = this->makeset_C_C_l22s_L_inds0_lower_upper_S_H_mati(); 119 | 120 | /* note : CC and halfminCC don't need to be set at this point. C and C_l22s must be set so that S&H can be set above. 
Maybe with smarter initialisations or initialisations with kmeans++ */ 121 | 122 | } 123 | /* C step: first update C, C_l22s and delta_C from the sums and counts, then refresh CC and halfminCC from the new C */ 124 | virtual void set_C_tasks(){ 125 | this->C_tasks = { 126 | arrutilv2::update_C_C_l22s_delta_C_from_SH_ati(this->getnthreads(), this->getncentroids(), this->getdimension(), this->get_sums(), this->get_counts(), this->get_C(), this->get_C_l22s(), this->get_delta_C(), this->ndcalcs_notX), 127 | 128 | arrutilv2::update_CC_halfminCC_ati(this->getnthreads(), this->getncentroids(), this->getdimension(), this->get_C(), this->get_C_l22s(), this->get_CC(), this->get_halfminCC(), this->ndcalcs_notX) 129 | }; 130 | } 131 | 132 | virtual void set_X_tasks(){ 133 | 134 | this->X_tasks = { 135 | 136 | 137 | this->update_L_lower_upper_S_H_11v0_ati() 138 | }; 139 | } 140 | }; 141 | 142 | } 143 | 144 | #endif 145 | 146 | -------------------------------------------------------------------------------- /src/minibatch.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 
19 | */ 20 | 21 | #ifndef PLL_MINIBATCHKMEANS_H 22 | #define PLL_MINIBATCHKMEANS_H 23 | 24 | #include "basesimpleminibatch.h" 25 | 26 | namespace kmeans{ 27 | template 28 | //Like D Sculley's mini-batch k-means, but instead of just adding newly labelled data to centroids, data that has already been used is first removed from the centroid it was previously assigned to. This breaks the 1/t convergence to the local minimum 29 | class MiniBatch : public kmeans::BaseSimpleMiniBatch{ 30 | 31 | private: 32 | /* While round < mba.nsubrounds (presumably the first pass over the data, when samples have no earlier assignment to undo -- confirm) use the increment-only update; afterwards use the full batch update. */ 33 | virtual void update_L_S_H(TInt x0, TInt x1, TInt ti) override final{ 34 | 35 | 36 | if (this->round < this->mba.nsubrounds){ 37 | this->update_L_S_H_batch_increment_only(x0, x1, ti); 38 | } 39 | 40 | else{ 41 | this->update_L_S_H_batch(x0, x1, ti); 42 | } 43 | } 44 | 45 | public: 46 | 47 | /* Forwards all arguments to kmeans::BaseSimpleMiniBatch and names the algorithm. */ 48 | template 49 | MiniBatch(Args&&... args): kmeans::BaseSimpleMiniBatch (std::forward(args)...) { 50 | this->setalgname("(Improved) Mini Batch Kmeans"); 51 | } 52 | 53 | 54 | virtual ~MiniBatch(){}; 55 | 56 | }; 57 | 58 | 59 | } 60 | 61 | 62 | #endif 63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /src/minibatchapp.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See file 18 | COPYING for more details. 19 | */ 20 | 21 | #ifndef MINIBATCHAPP_H 22 | #define MINIBATCHAPP_H 23 | 24 | namespace minibatchapp{ 25 | /* MiniBatchApp: bookkeeping for splitting ndata samples into consecutive batches of batchsize samples (the last batch may be smaller). */ 26 | template 27 | class MiniBatchApp{ 28 | public: 29 | //number of data used for each centroid update 30 | TInt batchsize; 31 | //number of centroid updates in one complete round of data 32 | TInt nsubrounds; 33 | //this round mod nsubrounds 34 | TInt subround; 35 | //the amount of data used in the nsubround'th update, the other updates use batchsize datapoints 36 | TInt lastbatchsize; 37 | //The size of the first batch. 38 | TInt initialising_batch_size; 39 | //Number of changes on each batch since previous time that batch processed. 40 | std::vector nchanges_on_batch; 41 | 42 | 43 | MiniBatchApp() = default; 44 | /* nsubrounds is ndata / batchsize rounded up; lastbatchsize is what remains for the final batch; the initialising batch is min(batchsize, ndata); the change counters start at 0. NOTE(review): batchsize == 0 would divide by zero here -- presumably callers guarantee batchsize >= 1, confirm. */ 45 | MiniBatchApp(TInt batchsize, TInt ndata){ 46 | this->batchsize = batchsize; 47 | if (ndata % batchsize == 0){ 48 | this->nsubrounds = ndata/batchsize; 49 | } 50 | 51 | else{ 52 | this->nsubrounds = 1 + ndata/batchsize; 53 | } 54 | this->lastbatchsize = ndata - batchsize*(this->nsubrounds - 1); 55 | this->subround = 0; 56 | 57 | 58 | this->initialising_batch_size = std::min(this->batchsize, ndata); 59 | this->nchanges_on_batch = std::vector (this->nsubrounds, 0); 60 | } 61 | }; 62 | 63 | //template 64 | //void set_summaries_minibatch(cluster::BaseCluster & basecluster, const minibatchapp::MiniBatchApp & mba){ 65 | 66 | 67 | 68 | 69 | 70 | 71 | } 72 | 73 | 74 | 75 | 76 | #endif 77 | 78 | -------------------------------------------------------------------------------- /src/optionsutil.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 
10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 19 | */ 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include "optionsutil.h" 28 | namespace optionsutil{ 29 | /* Build an option. definition becomes the usage string made of the full name, the short name and the type, with the default value appended in parentheses when one is given. */ 30 | Option::Option(std::string fn, std::string sn, std::string desc, std::string tp, std::string dv): fullname(std::move(fn)), shortname(std::move(sn)), description(std::move(desc)), type(std::move(tp)), defval(std::move(dv)), isset(false) { 31 | if (defval.compare("") == 0){ 32 | definition = "--" + fullname + " -" + shortname + " " + type; 33 | } 34 | else{ 35 | definition = "--" + fullname + " -" + shortname + " " + type + " (" + defval + ") "; 36 | } 37 | } 38 | /* Not meant to be used: always throws std::logic_error. */ 39 | Option::Option(){ 40 | throw std::logic_error("Default constructor for Option called, this should never happen"); 41 | } 42 | 43 | /* Print the definition, pad (or wrap) to column tab1, then print the description in chunks of tab2 - tab1 characters, each continuation line indented by tab1. When tab2 <= tab1 there is no usable wrap width and the description is printed unwrapped; previously tab2 == tab1 divided by zero. */ 44 | void Option::print(unsigned tab1, unsigned tab2){ 45 | unsigned width = (tab2 > tab1) ? tab2 - tab1 : 0; 46 | unsigned margin = 2; 47 | 48 | std::cout << definition; /* written as an addition so that tab1 < margin cannot wrap around in unsigned arithmetic */ 49 | if (definition.size() + margin > tab1){ 50 | std::cout << " \n"; 51 | std::cout << std::setw(tab1) << " "; 52 | } 53 | else{ 54 | std::cout << std::setw(tab1 - definition.size()) << " "; 55 | } 56 | 57 | 58 | unsigned fragi = 0; 59 | //unsigned currenti = 0; 60 | //std::string nextline(""); 61 | //while (currenti < description.size()){ 62 | //nextline = description.substr(currenti, width); 63 | //if (nextline.find("\0") != std::string::npos) { 64 | 65 | //} 66 | //} 67 | /* width == 0 means no wrapping: skip the loop and print everything below */ 68 | while(width != 0 && fragi < description.size()/width){ 69 | 70 | //\033 71 | 72 | //if (description.substr(fragi*width, width).find("\\0") != std::string::npos){ 73 | 
//std::cout << "-------------------------------------------------------------" << std::endl; 74 | //} 75 | std::cout << description.substr(fragi*width, width) << " \n"; 76 | std::cout << std::setw(tab1) << " "; 77 | ++fragi; 78 | } 79 | std::cout << description.substr(fragi*width) << " \n" << std::endl; 80 | } 81 | 82 | /* To do : is it possible to have the following class using variadic templates: 83 | * Options options; 84 | * options.add("name1", anint) 85 | * options.add("name2", astring) 86 | * ... 87 | * ? 88 | * */ 89 | /* Register an option under its full name and record the short-name to full-name mapping. */ 90 | void Options::add(Option && o){ 91 | fullname.emplace(std::make_pair(o.shortname, o.fullname)); 92 | options.emplace(std::make_pair(o.fullname, std::move(o))); 93 | } 94 | /* Same as above, building the Option from its parts. */ 95 | void Options::add(std::string fn, std::string sn, std::string desc, std::string type, std::string defval){ 96 | fullname.emplace(std::make_pair(sn, fn)); 97 | options.emplace(std::make_pair(fn, Option(fn, std::move(sn), std::move(desc), std::move(type), std::move(defval)))); 98 | } 99 | /* Print the usage header, every registered option, then tail in chunks of tab2 characters. When tab2 is 0 the tail is printed unwrapped; previously that divided by zero. */ 100 | void Options::print(unsigned tab1, unsigned tab2){ 101 | 102 | //std::cout << "\n-------------------------------------------------\n"; 103 | std::cout << "\nThe options are of the form,\n--full_option_name -abridged_name type (default) \n\n"; 104 | 105 | 106 | std::cout << std::endl; 107 | for (auto & p : options){ 108 | p.second.print(tab1, tab2); 109 | } 110 | 111 | unsigned fragi = 0; 112 | while(tab2 != 0 && fragi < tail.size()/tab2){ 113 | std::cout << tail.substr(fragi*tab2, tab2) << "\n"; 114 | ++fragi; 115 | } 116 | std::cout << tail.substr(fragi*tab2) <<"\n" << std::endl; 117 | 118 | } 119 | 120 | } 121 | -------------------------------------------------------------------------------- /src/optionsutil.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 
5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 19 | */ 20 | 21 | #include 22 | #include 23 | 24 | namespace optionsutil{ 25 | 26 | class Option{ 27 | public: 28 | std::string fullname; 29 | std::string shortname; 30 | std::string description; 31 | std::string type; 32 | std::string defval; 33 | bool isset; 34 | 35 | Option(std::string fn, std::string sn, std::string desc, std::string tp, std::string dv); 36 | Option(); 37 | void print(unsigned tab1, unsigned tab2); 38 | 39 | private: 40 | std::string definition; 41 | }; 42 | 43 | class Options{ 44 | public: 45 | std::map options; 46 | std::map fullname; 47 | std::string tail; 48 | void add(Option && o); 49 | void add(std::string fn, std::string sn, std::string desc, std::string type, std::string defval); 50 | void print(unsigned tab1 = 40, unsigned tab2 = 85); 51 | }; 52 | 53 | } 54 | 55 | -------------------------------------------------------------------------------- /src/pllkmeansfuncs.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 
10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 19 | */ 20 | 21 | #ifndef PLLKMEANSFUNCS_HPP 22 | #define PLLKMEANSFUNCS_HPP 23 | 24 | #include 25 | 26 | 27 | #include "pllcluster.h" 28 | 29 | 30 | namespace cluster{ 31 | 32 | //boilerplate 33 | template 34 | std::tuple, std::unique_ptr, std::unique_ptr, size_t, size_t, TFloat, std::string> 35 | 36 | solveioless(const std::string & algorithm, size_t nthreads, size_t ndata, size_t dimension, const TFloat * const data, size_t ncentroids, int cout_verbosity, const std::string & initialisation_method, const TFloat * const C_init, const size_t * const data_indices_init_from, bool setseed, size_t seed, TFloat maxtime, size_t maxrounds, size_t minibatchsize, size_t nvaldata, const TFloat * const valdata, size_t valperiod, bool captureverbose){ 37 | 38 | 39 | 40 | std::stringstream buffer; 41 | auto cout_buff = std::cout.rdbuf(); 42 | 43 | if (captureverbose == true){ 44 | auto bizzle = buffer.rdbuf(); 45 | std::cout.rdbuf(bizzle); 46 | } 47 | 48 | std::ofstream nowhere; 49 | 50 | 51 | //I assume cmse not wanted 52 | size_t cmserate = 0; 53 | 54 | 55 | TFloat gbphi = 1e-3; //ooph...... 
56 | 57 | 58 | auto pretro = solve6<'d', size_t, TFloat>(algorithm, minibatchsize, nthreads, ndata, dimension, data, ncentroids, cout_verbosity, 0, nowhere, initialisation_method, C_init, data_indices_init_from, setseed, seed, maxtime, maxrounds, "", nvaldata, valdata, valperiod, "", cmserate, gbphi); 59 | 60 | 61 | std::string text; 62 | 63 | if (captureverbose == true){ 64 | text = buffer.str(); 65 | std::cout.rdbuf(cout_buff); 66 | } 67 | 68 | else{ 69 | text = "captureverbose was false, so nothing here"; 70 | } 71 | 72 | 73 | auto retro = std::move(std::tuple_cat(std::move(pretro), std::make_tuple(text)));//, std::make_tuple("bwerlk")); 74 | 75 | return retro; 76 | //} 77 | } 78 | } 79 | 80 | #endif 81 | -------------------------------------------------------------------------------- /src/pllkmeansfuncs_nonvoid.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 
19 | */ 20 | 21 | #ifndef PLLKMEANSFUNCS_H 22 | #define PLLKMEANSFUNCS_H 23 | 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | 30 | 31 | namespace cluster{ 32 | 33 | 34 | /* useful for direct use in C++ code */ 35 | /* return : C, L, inds0, duration, niterations, mse */ 36 | std::tuple, std::unique_ptr, std::unique_ptr, size_t, size_t, float, std::string> 37 | solveiolessf(const std::string & algorithm, size_t nthreads, size_t ndata, size_t dimension, const float * const data, size_t ncentroids, int cout_verbosity, const std::string & initialisation_method, const float * const C_init, const size_t * const data_indices_init_from, bool setseed, size_t seed, float maxtime, size_t maxrounds, size_t minibatchsize, bool captureverbose); 38 | 39 | std::tuple, std::unique_ptr, std::unique_ptr, size_t, size_t, double, std::string> 40 | solveiolessd(const std::string & algorithm, size_t nthreads, size_t ndata, size_t dimension, const double * const data, size_t ncentroids, int cout_verbosity, const std::string & initialisation_method, const double * const C_init, const size_t * const data_indices_init_from, bool setseed, size_t seed, double maxtime, size_t maxrounds, size_t minibatchsize, bool captureverbose); 41 | 42 | 43 | } 44 | 45 | #endif 46 | -------------------------------------------------------------------------------- /src/pllkmeansfuncs_void.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. 
See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 19 | */ 20 | 21 | #ifndef PLLKMEANSVOIDFUNCS_H 22 | #define PLLKMEANSVOIDFUNCS_H 23 | 24 | namespace cluster { 25 | 26 | 27 | 28 | 29 | /* As per nonvoid versions, but C, L, inds0, duration, niterations, mse set inplace. They should be initialised to be the right dimension before entering. (useful function for Cython so that no messing around with smart pointers, although apparently it is straightforward...) */ 30 | void v_solveiolessf(const std::string & algorithm, size_t nthreads, size_t ndata, size_t dimension, const float * const data, size_t ncentroids, int cout_verbosity, const std::string & initialisation_method, const float * const C_init, const size_t * const data_indices_init_from, bool setseed, size_t seed, float maxtime, size_t maxrounds, float * const C, size_t * const L, size_t * const inds0, size_t & duration, size_t & niterations, float & mse, size_t minibatchsize, size_t nvaldata, const float * const valdata, size_t valperiod , bool captureverbose, std::string & verbosestring); 31 | 32 | 33 | void v_solveiolessd( 34 | const std::string & algorithm, size_t nthreads, size_t ndata, size_t dimension, const double * const data, size_t ncentroids, int cout_verbosity, const std::string & initialisation_method, const double * const C_init, const size_t * const data_indices_init_from, bool setseed, size_t seed, double maxtime, size_t maxrounds, double * const C, size_t * const L, size_t * const inds0,size_t & duration, size_t & niterations, double & mse, size_t minibatchsize, size_t nvaldata, const double * const valdata, size_t valperiod , bool captureverbose, std::string & verbosestring); 35 | 36 | 37 | /* functions used in kmeans executable */ 38 | 
void solvewrited( 39 | const std::string & algorithm, 40 | bool issparse, 41 | size_t nruns, 42 | size_t nthreads, 43 | int cout_verbosity, 44 | int file_verbosity, 45 | const std::string & datainfn, 46 | const std::string & coutfn, 47 | const std::string & loutfn, 48 | const std::string & ioutfn, 49 | const std::string & soutfn, 50 | const std::string & voutfn, 51 | const std::string & moutfn, 52 | const std::string & moutdir, 53 | const std::string & cinf, 54 | const std::string & ind0fn, 55 | const std::string & init0, 56 | bool setseed, 57 | size_t seed, 58 | size_t ncentroids, 59 | size_t maxiter, 60 | double maxtime, 61 | const std::string & valinfn, 62 | size_t valperiod, 63 | size_t minibatchsize, 64 | std::string & cmsewritefn, 65 | size_t cmserate, //27 66 | double gbphi 67 | ); 68 | 69 | void solvewritef(const std::string & algorithm, bool issparse, size_t nruns, size_t nthreads, int cout_verbosity, int file_verbosity, const std::string & datainfn, const std::string & coutfn, const std::string & loutfn, const std::string & ioutfn, const std::string & soutfn, const std::string & voutfn, const std::string & moutfn, const std::string & moutdir, const std::string & cinf, const std::string & ind0fn, const std::string & init0, bool setseed, size_t seed, size_t ncentroids, size_t maxiter, double maxtime, const std::string & valinfn, size_t valperiod, size_t minibatchsize, std::string & cmsewritefn, size_t cmserate, float gbphi); 70 | 71 | 72 | 73 | 74 | } 75 | 76 | #endif 77 | -------------------------------------------------------------------------------- /src/processingfilename.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | # Written by James Newling 4 | # All rights reserved. 5 | # 6 | # eakmeans is a library for exact and approximate k-means written in C++ and 7 | # Python. This file is part of eakmeans. 
See file COPYING for more details. 8 | # 9 | # This file is part of eakmeans. 10 | # 11 | # eakmeans is free software: you can redistribute it and/or modify 12 | # it under the terms of the 3-Clause BSD Licence. See 13 | # https://opensource.org/licenses/BSD-3-Clause for more details. 14 | # 15 | # eakmeans is distributed in the hope that it will be useful, 16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | # COPYING for more details. 19 | # 20 | import sys 21 | import os 22 | import commands 23 | import shutil 24 | 25 | names = commands.getstatusoutput('find .. -name "*.hpp" -type "f"')[1].split("\n") 26 | 27 | bobs = [] 28 | for n in names: 29 | shutil.copy(n, n.split("/")[-1]) 30 | 31 | #if "whileprototying" not in n and "test" not in n and "experiments" not in n and "junk" not in n: 32 | #if "util" in n and "main" in n: 33 | #pass 34 | ##print "--------> ", n 35 | #else: 36 | #bobs.append(n) 37 | 38 | #for b in bobs: 39 | #shutil.copy(b, b.split("/")[-1]) 40 | #print b 41 | 42 | #print n 43 | 44 | #bobs.append(n.split("/")[-1]) 45 | #bobs.sort() 46 | 47 | #for b in bobs: 48 | #print b 49 | ##for n in names: 50 | ##if "arrutilv2l0" not in n and "kmeansstandalone" not in n: 51 | ##shutil.copy(n, n.split("/")[-1]) 52 | -------------------------------------------------------------------------------- /src/randomarray.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. 
See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 19 | */ 20 | 21 | #ifndef RANDOMARRAY_H 22 | #define RANDOMARRAY_H 23 | 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | namespace randomutil{ 33 | namespace randomarray{ 34 | 35 | template 36 | void filluniform_int(SizeType size_tofill, IntType * tofill, IntType lower, IntType upper){ 37 | IntType range = upper - lower; 38 | for (SizeType i = 0; i < size_tofill; ++ i){ 39 | tofill[i] = lower + rand() % range; 40 | } 41 | } 42 | 43 | template 44 | void filluniform_float(SizeType size_tofill, FloatType * tofill, FloatType lower, FloatType upper){ 45 | FloatType range = upper - lower; 46 | for (SizeType i = 0; i < size_tofill; ++ i){ 47 | tofill[i] = lower + range * (static_cast (rand()) / static_cast (RAND_MAX)); 48 | } 49 | } 50 | 51 | template 52 | void filluniform(SizeType size_tofill, NumberType * tofill, NumberType lower, NumberType upper); 53 | 54 | template 55 | void filluniform(SizeType size_tofill, float * tofill, float lower, float upper){ 56 | filluniform_float(size_tofill, tofill, lower, upper); 57 | } 58 | 59 | template 60 | void filluniform(SizeType size_tofill, double * tofill, double lower, double upper){ 61 | filluniform_float(size_tofill, tofill, lower, upper); 62 | } 63 | 64 | 65 | template 66 | void filluniform(SizeType size_tofill, unsigned * tofill, unsigned lower, unsigned upper){ 67 | filluniform_int(size_tofill, tofill, lower, upper); 68 | } 69 | 70 | template 71 | void filluniform(SizeType size_tofill, int * tofill, int lower, int upper){ 72 | filluniform_int(size_tofill, tofill, lower, upper); 73 | } 74 | 75 | /* fill tofill with values chosen from options uniformly at random */ 76 | 
/* Container overload of filluniform: draws one index per output element with the four-argument filluniform over [0, options.size()) and copies options[index] into tofill. options must not be empty. NOTE(review): the template parameter lists and the angle-bracket arguments (std::vector, static_cast, std::unique_ptr) are missing throughout this listing; the code is left exactly as found. */
template 77 | void filluniform(SizeType size_tofill, NumberType * tofill, Container && options){ 78 | unsigned n_options = options.size(); 79 | std::vector option_numbers (size_tofill); 80 | filluniform(size_tofill, option_numbers.data(), static_cast (0), n_options); 81 | for (SizeType i = 0; i < size_tofill; ++i){ 82 | tofill[i] = options[option_numbers[i]]; 83 | } 84 | } 85 | 86 | 87 |
/* Return a vector of N values picked from options via the container overload of filluniform. */
template 88 | std::vector getuniform(SizeType N, Container && options){ 89 | std::vector sampled (N,0); 90 | filluniform(N, sampled.data(), options); 91 | return sampled; 92 | } 93 | 94 | 95 |
/* Return a vector of N values produced by the (lower, upper) form of filluniform. */
// untested function: 96 | template 97 | std::vector getuniform(SizeType N, NumberType lower, NumberType upper){ 98 | std::vector sampled (N,0); 99 | filluniform(N, sampled.data(), lower, upper); 100 | return sampled; 101 | } 102 | 103 |
/* Same as the previous function, but the N values are returned in a heap array owned by a std::unique_ptr. */
// untested function: 104 | template 105 | std::unique_ptr getuniform_uptr(SizeType N, NumberType lower, NumberType upper){ 106 | std::unique_ptr sampled (new NumberType [N]); 107 | filluniform(N, sampled.get(), lower, upper); 108 | return sampled; 109 | } 110 | 111 | 112 |
//return vector of length ndraws of sorted vectors; each value (TInt) in range [0, N) lies in a given vector with probability p, values outside that range never do.
/* get_p_sample: builds `samples`, one vector per draw, using a std::default_random_engine seeded from rand() and a binomial distribution with parameters (N, p). NOTE(review): this listing is damaged from the loop header onwards -- the rest of the body (original lines 122-156) and the template header of signflip are missing, so the function cannot be reviewed or rewritten from here. */
113 | template 114 | std::vector> get_p_sample(TInt ndraws, TFloat p, TInt N){ 115 | 116 | TInt proposal; 117 | bool goodproposal; 118 | std::vector> samples (ndraws); 119 | 120 | std::default_random_engine generator(rand()); 121 | std::binomial_distribution distribution(N,p); 122 | for (TInt draw=0; draw
/* signflip: multiplies each of the ndata entries of data by -1 when a rand()-based number in [0, 1] is below switch_probability. The comparison is done in single precision (float). */
157 | void signflip(SizeType ndata, NumberType * const data, ProbType switch_probability){ 158 | for (SizeType i = 0; i < ndata; ++i){ 159 | if ((float(rand()) / float(RAND_MAX)) < switch_probability){ 160 | data[i]*=(-1); 161 | } 162 | } 163 | } 164 | 165 | } 166 | } 167 | 168 | #endif 169 | -------------------------------------------------------------------------------- /src/randomsparse.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 19 | */ 20 | 21 | #ifndef RANDOMSPARSE_H 22 | #define RANDOMSPARSE_H 23 | 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include "sparsedatasets.h" 32 | #include "randomarray.h" 33 | 34 | namespace randomutil{ 35 | namespace randomsparse{ 36 | 37 | //TODO : make function accept a random number generator. 
/* get_sparsedata: builds a random sparse data set of ndata rows. The non-zero column indices of row i come from get_p_sample(ndata, sparsity, dimension); every non-zero value is rand() / RAND_MAX; starts[i] is the offset of row i in indices/values (starts has ndata + 1 entries); every row gets the label "1011". NOTE(review): the template parameter list and the element types of the vectors and of sparse::SparseData are missing from this listing; the code is left exactly as found. */
38 | template 39 | sparse::SparseData get_sparsedata(TInt ndata, TInt dimension, TFloat sparsity){ 40 | 41 | 42 | std::vector values; 43 | std::vector indices; 44 | std::vector starts (1,0); 45 | std::vector labels; 46 | 47 | 48 | //vector of vectors of indices 49 | auto vvinds = randomutil::randomarray::get_p_sample(ndata, sparsity, dimension); 50 | 51 | 52 | for (TInt i = 0; i < ndata; ++i){ 53 | for (auto & index : vvinds[i]){ 54 | indices.push_back(index); 55 | TFloat value = (static_cast (rand()) / static_cast (RAND_MAX)); 56 | values.push_back(value); 57 | } 58 | //for (TInt j = 0; j < dimension; ++j){ //really slow way to do it! TODO: 59 | //bool nonzero = (sparsity > (rand() / (RAND_MAX + 0.))); 60 | //if (nonzero == true){ 61 | //TFloat value = (static_cast (rand()) / static_cast (RAND_MAX)); 62 | //values.push_back(value); 63 | //indices.push_back(j); 64 | //} 65 | //} 66 | starts.push_back(indices.size()); 67 | labels.push_back("1011"); //give everything the same label, "1011" 68 | } 69 | 70 | return sparse::SparseData(std::move(values), std::move(indices), std::move(starts), std::move(labels)); 71 | 72 | } 73 | 74 | //TODO : make function accept a random number generator. 
75 | void write_sparsedata(unsigned ndata, unsigned dimension, double sparsity, const std::string & filename, bool dimheader = true){ 76 | auto sd = get_sparsedata (ndata, dimension, sparsity); 77 | sd.write(filename, dimheader); 78 | } 79 | 80 | void write_sparse_and_dense_data(unsigned ndata, unsigned dimension, double sparsity, const std::string & sparsefilename, const std::string & densefilename){ 81 | auto sd = get_sparsedata (ndata, dimension, sparsity); 82 | sd.write(sparsefilename, true); 83 | sd.write_dense(densefilename, true); 84 | } 85 | 86 | 87 | } 88 | 89 | } 90 | 91 | 92 | #endif 93 | 94 | -------------------------------------------------------------------------------- /src/sample.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 19 | */ 20 | 21 | #ifndef SAMPLE_H 22 | #define SAMPLE_H 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | 32 | 33 | namespace randomutil{ 34 | namespace sample{ 35 | 36 | /*TODO based on code at: //codegolf.stackexchange.com/questions/4772/random-sampling-without-replacement 37 | * but changed to make uninclusive of upperbound! 38 | g by universal ref?? 
I really need to clarify when universal ref should be used.. 39 | see https://isocpp.org/blog/2012/11/universal-references-in-c11-scott-meyers */ 40 | 41 | //O(max - min) algorithm for almost uniform sampling. 42 | 43 | template 44 | 45 | 46 | void range_no_replacement(OutputIterator out, IntegerType n, IntegerType min, IntegerType max, URNG && g){ 47 | if (n < 0) 48 | throw std::runtime_error("negative sample size"); 49 | if (max < min) 50 | throw std::runtime_error("invalid range"); 51 | if (n > max-min+1) 52 | throw std::runtime_error("sample size larger than range"); 53 | 54 | while (n>0) 55 | { 56 | double r = g()/(RAND_MAX+1.0); 57 | if (r*(max-min) < n) 58 | { 59 | *out++ = min; 60 | --n; 61 | } 62 | ++min; 63 | } 64 | } 65 | 66 | 67 | template 68 | void range_no_replacement(OutputIterator out, IntegerType n, IntegerType min, IntegerType max){ 69 | //std::minstd_rand0 generator (time(NULL)); 70 | //range_no_replacement(out, n, min, max, generator); 71 | range_no_replacement(out, n, min, max, rand); 72 | } 73 | 74 | 75 | 76 | 77 | template 78 | std::vector get_range_no_replacement(IntegerType n, IntegerType min, IntegerType max){ 79 | std::vector samples(n); 80 | range_no_replacement(samples.data(), n, min, max); 81 | return samples; 82 | } 83 | 84 | 85 | 86 | 87 | template 88 | std::vector get_range_no_replacement(IntegerType n, IntegerType min, IntegerType max, URNG && g){ 89 | std::vector samples(n); 90 | range_no_replacement(samples.data(), n, min, max, g); 91 | return samples; 92 | } 93 | 94 | 95 | 96 | template 97 | std::vector get_permuted_range(IntegerType n, URNG && g){ 98 | std::vector shuffled(n, 0); 99 | std::iota(shuffled.begin(), shuffled.end(), 0); 100 | std::random_shuffle(shuffled.begin(), shuffled.end(), [&g](IntegerType i){return g()%i;}); 101 | return shuffled; 102 | } 103 | 104 | // See http://en.cppreference.com/w/cpp/algorithm/random_shuffle for inspiration :) 105 | template 106 | void inplace_shuffle_by_row(TInt nrows, TInt ncols, 
TFloat * const data){ 107 | 108 | std::unique_ptr ptrtemp ( new TFloat [ncols] ); 109 | TFloat * const temp = ptrtemp.get(); 110 | TInt copyindex; 111 | 112 | 113 | for (int i = nrows-1; i > 0; --i) { 114 | copyindex = rand()%(i+1); 115 | 116 | std::memcpy(temp, data + i*ncols, sizeof(TFloat)*ncols); 117 | std::memcpy(data + i*ncols, data + copyindex*ncols, sizeof(TFloat)*ncols); 118 | std::memcpy(data + copyindex*ncols, temp, sizeof(TFloat)*ncols); 119 | } 120 | } 121 | 122 | 123 | 124 | } 125 | } 126 | 127 | 128 | 129 | #endif 130 | -------------------------------------------------------------------------------- /src/simple1.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 
19 | */ 20 | 21 | #ifndef PLL_SIMPLEKMEANS_H 22 | #define PLL_SIMPLEKMEANS_H 23 | 24 | #include "basesimpleexact.h" 25 | 26 | namespace kmeans{ 27 | 28 |
/* SimpleKmeans1: a BaseSimpleExactKmeans whose X_tasks list holds a single task, the one returned by arrutilv2::update_L_S_H_ati; it registers itself under the algorithm name "simple kmeans". Constructor arguments are forwarded unchanged to the base class. NOTE(review): the template parameter lists and the base class template arguments are missing from this listing; the code is left exactly as found. */
template 29 | class SimpleKmeans1 : public kmeans::BaseSimpleExactKmeans{ 30 | 31 | 32 | protected: 33 | 34 | virtual void set_X_tasks(){ 35 | 36 | this->X_tasks = { 37 | 38 | arrutilv2::update_L_S_H_ati(this->nthreads, this->ndata, this->dimension, this->data, this->ncentroids, this->get_C(), this->get_data_l22s(), this->get_C_l22s(), this->get_L(), this->get_dsums(), this->get_dcounts(), this->get_sums(), this->get_counts(), this->nchanges, this->work_mutex, this->ndcalcs_X), 39 | 40 | 41 | 42 | }; 43 | 44 | } 45 | 46 | public: 47 | template 48 | SimpleKmeans1(Args&&... args): kmeans::BaseSimpleExactKmeans (std::forward(args)...) { 49 | this->setalgname("simple kmeans"); 50 | } 51 | virtual ~SimpleKmeans1(){}; 52 | 53 | }; 54 | 55 | } 56 | 57 | //extern template class kmeans::SimpleKmeans1; 58 | //extern template class kmeans::SimpleKmeans1; 59 | 60 | #endif 61 | -------------------------------------------------------------------------------- /src/simplest.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 
19 | */ 20 | 21 | #ifndef PLL_SIMPLESTKMEANS_H 22 | #define PLL_SIMPLESTKMEANS_H 23 | 24 | #include 25 | 26 | #include "basekmeans.h" 27 | 28 | namespace kmeans{ 29 | 30 |
/* SimplestKmeans: a BaseExact with hand-written assignment and update steps, registered under the algorithm name "simplest". Both steps do all their work in the task invoked with ti == 0; calls with any other ti return immediately. NOTE(review): the template parameter lists and every angle-bracket argument (BaseExact, std::forward, std::numeric_limits, static_cast, std::vector) are missing from this listing; the code is left exactly as found. */
template 31 | class SimplestKmeans : public kmeans::BaseExact{ 32 | 33 | 34 | public: 35 | template 36 | /* variadic args ala Eli Bendersky */ 37 | SimplestKmeans(Args&&... args): kmeans::BaseExact (std::forward(args)...) {this->setalgname("simplest");} 38 | virtual ~SimplestKmeans(){}; 39 | 40 | protected: 41 |
/* Initialisation is the task list from makeset_C_C_l22s_L_inds0_mati() followed by the task from set_S_H_ati(). */
virtual void set_initialisation_tasks(){ 42 | 43 | auto init_tasks_A = kmeans::BaseExact::makeset_C_C_l22s_L_inds0_mati(); 44 | 45 | auto init_task_B = kmeans::BaseExact::set_S_H_ati(); 46 | 47 | this->initialisation_tasks = std::move(init_tasks_A); 48 | this->initialisation_tasks.push_back(std::move(init_task_B)); 49 | 50 | } 51 | 52 |
/* Assignment step: for every data point compute the Euclidean distance to every centroid and store the index of the nearest one in L; because the comparison is <=, ties go to the centroid with the larger index. nchanges counts the points whose label changed, and ndcalcs_X grows by ndata * ncentroids. */
virtual void set_X_tasks(){ 53 | //set Ls 54 | this->X_tasks = { 55 | [this](TInt ti){ 56 | 57 | if (ti == 0){ 58 | this->nchanges = 0; 59 | 60 | for (TInt i = 0; i < this->getndata(); ++i){ 61 | TFloat best_distance = std::numeric_limits::max(); 62 | TInt oldlabel = this->get_L()[i]; 63 | for (TInt ci = 0; ci < this->getncentroids(); ++ci){ 64 | TFloat distance2 = 0; 65 | for (TInt di = 0; di < this->getdimension(); ++di){ 66 | TFloat diffy = this->get_C()[ci*this->getdimension() + di] - this->getdata()[i*this->getdimension() + di]; 67 | distance2 += diffy*diffy; 68 | } 69 | TFloat distance = std::sqrt(std::max(static_cast(0), distance2)); 70 | if (distance <= best_distance){ 71 | best_distance = distance; 72 | this->get_L()[i] = ci; 73 | } 74 | } 75 | if (this->get_L()[i] != oldlabel){ 76 | this->nchanges += 1; 77 | } 78 | } 79 | 80 | this->ndcalcs_X += this->ndata * this->ncentroids; 81 | } 82 | } 83 | 84 | 85 | 86 | 87 | 88 | }; 89 | } 90 | 91 |
/* Update step: accumulate per-centroid coordinate sums and member counts from the current labels, then overwrite each centroid that has at least one member with the mean of its members; centroids without members keep their previous position. */
virtual void set_C_tasks(){ 92 | //set C 93 | this->C_tasks = { 94 | 95 | 96 | 97 | [this](TInt ti){ 98 | if (ti == 0){ 99 | std::vector simplesums (this->getncentroids()*this->getdimension(),0); 100 | 
std::vector simplecounts (this->getncentroids(),0); 101 | for (TInt i = 0; i < this->getndata(); ++i){ 102 | for (TInt di = 0; di < this->getdimension(); ++di){ 103 | simplesums[this->get_L()[i]*this->getdimension() + di] += this->getdata()[i*this->getdimension() + di]; 104 | } 105 | simplecounts[this->get_L()[i]]+=1; 106 | } 107 | for (TInt ci = 0; ci < this->getncentroids(); ++ ci){ 108 | for (TInt di = 0; di < this->getdimension(); ++di){ 109 | if (simplecounts[ci] != 0){ 110 | this->get_C()[ci*this->getdimension() + di] = simplesums[ci*this->getdimension() + di] / static_cast (simplecounts[ci]); 111 | } 112 | } 113 | } 114 | } 115 | } 116 | 117 | 118 | 119 | 120 | 121 | }; 122 | } 123 | }; 124 | 125 | } 126 | 127 | #endif 128 | -------------------------------------------------------------------------------- /src/sparseelkan3v0.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 
19 | */ 20 | 21 | #ifndef PLL_SPARSEELKANKMEANS_3V0_H 22 | #define PLL_SPARSEELKANKMEANS_3V0_H 23 | 24 | #include "basesparseelkan.h" 25 | #include "alg_X_selkSN.h" 26 | 27 | namespace kmeans{ 28 | 29 | /* discrepency in ndcalcs as compared to a3v0 due to not computing CC initially (I propose) */ 30 | 31 | template 32 | class SP3V0 : public kmeans::BaseSparseElkan{ 33 | 34 | protected: 35 | TFloat * const get_lowers(){ 36 | return this->elkan_lowers_base.get(); 37 | } 38 | 39 | TFloat * const get_upbs(){ 40 | return this->elkan_upper_base.get(); 41 | } 42 | 43 | TFloat * const get_delta_C(){ 44 | return this->elkan_delta_C.get(); 45 | } 46 | 47 | std::function update_3v0_L_lowers_upper_where_changes_ati(){ 48 | //TODO : neaten up and move out 49 | 50 | return [this](TInt ti){ 51 | TInt data0 = (ti*this->ndata)/this->nthreads; 52 | TInt data1 = ((ti + 1)*this->ndata)/this->nthreads; 53 | 54 | TInt ndcalcs_local = 0; 55 | kmeans::sparse_update_L_lowers_upper_where_changes_3v0(this->ncentroids, this->dimension, data0, data1, *this->ptrdata, this->get_C(), this->get_data_l22s() + data0, this->get_C_l22s(), this->get_delta_C(), this->where_label_changes[ti], ndcalcs_local, this->get_L() + data0, this->get_lowers() + data0*this->ncentroids, this->get_upbs() + data0); 56 | this->ndcalcs_X += ndcalcs_local; 57 | }; 58 | } 59 | 60 | std::function update_S_H_from_where_changes_ati(){ 61 | return [this](TInt ti){ 62 | if (ti == 0){ 63 | sparse::update_S_H_from_label_changes(*this->ptrdata, this->where_label_changes, this->get_sums(), this->get_counts(), this->nchanges); 64 | } 65 | }; 66 | } 67 | 68 | 69 | 70 | public: 71 | typedef kmeans::BaseSparseElkan EB; 72 | template 73 | SP3V0(Args&&... args): EB(std::forward(args)...) 
74 | 75 | 76 | { 77 | this->setalgname("SP3V0"); 78 | this->elkan_delta_C.reset(new TFloat [this->getncentroids()]); 79 | } 80 | virtual ~SP3V0(){} 81 | 82 | virtual TInt get_approximate_memory_requirement(){ 83 | return EB::get_approximate_memory_requirement() + 84 | sizeof(TFloat)*this->getncentroids(); // delta_C 85 | } 86 | 87 | virtual void verbose_write_additional(){ 88 | this->EB_verbose_write_additional(); 89 | /* anything else to add ? */ 90 | } 91 | 92 | virtual void set_initialisation_tasks(){ 93 | /* all Elkan variants have same initialisation tasks */ 94 | this->ElkBase_set_initialisation_tasks(); 95 | } 96 | 97 | virtual void set_C_tasks(){ 98 | this->C_tasks = { 99 | arrutilv2::update_C_C_l22s_delta_C_from_SH_ati(this->getnthreads(), this->getncentroids(), this->getdimension(), this->get_sums(), this->get_counts(), this->get_C(), this->get_C_l22s(), this->get_delta_C(), this->ndcalcs_notX) 100 | }; 101 | } 102 | 103 | virtual void set_X_tasks(){ 104 | this->X_tasks = { 105 | this->update_3v0_L_lowers_upper_where_changes_ati(), 106 | this->update_S_H_from_where_changes_ati() 107 | }; 108 | } 109 | }; 110 | 111 | } 112 | 113 | #endif 114 | -------------------------------------------------------------------------------- /src/sparseinitialise.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 
14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 19 | */ 20 | 21 | 22 | #ifndef SPARSEINITIALISE_H 23 | #define SPARSEINITIALISE_H 24 | 25 | 26 | #include 27 | #include 28 | namespace kmeans{ 29 | namespace sparseinit{ 30 | 31 | //get copyindices guaranteeing that all distinct. 32 | template 33 | std::tuple, std::vector > get_initialisation_indices(TInt ncentroids, const sparse::SparseData & data, TInt data0 = 0, TInt data1 = 0){ 34 | 35 | if (data1 == 0){ 36 | data1 = data.ndata; 37 | } 38 | 39 | 40 | 41 | 42 | TInt ndata_range = data1 - data0; 43 | std::vector initialisation_indices (ncentroids); 44 | std::unique_ptr C(new TFloat [ncentroids*data.dimension]); 45 | 46 | TInt nattempts = 0; 47 | TInt currentindex = 0; 48 | 49 | 50 | 51 | while (currentindex < ncentroids && nattempts < 5*ncentroids){ 52 | TInt proposal = data0 + rand() % ndata_range; 53 | 54 | bool rejected = false; 55 | for (TInt ci = 0; ci < currentindex; ++ci){ 56 | TFloat l22_diff = sparse::get_l22( 57 | 58 | data.starts[initialisation_indices[ci] + 1] - data.starts[initialisation_indices[ci]], 59 | data.indices.data() + data.starts[initialisation_indices[ci]], 60 | data.values.data() + data.starts[initialisation_indices[ci]], 61 | 62 | data.starts[proposal + 1] - data.starts[proposal], 63 | data.indices.data() + data.starts[proposal], 64 | data.values.data() + data.starts[proposal] 65 | 66 | ); 67 | 68 | if (l22_diff < 1e-5){ 69 | rejected = true; 70 | break; 71 | } 72 | } 73 | ++nattempts; 74 | if (rejected == false){ 75 | sparse::todense::zero_and_copy(proposal, data, C.get() + currentindex*data.dimension); 76 | initialisation_indices[currentindex] = proposal; 77 | ++currentindex; 78 | } 79 | else{ 80 | } 81 | } 82 | 83 | if (currentindex != ncentroids){ 84 | throw std::runtime_error("Tried to find a set of 
distinct datapoints, but failed (nattempts/ncentroids = 5)"); 85 | } 86 | 87 | 88 | return std::make_tuple (std::move(C), std::move(initialisation_indices)); 89 | } 90 | 91 | 92 | 93 | template 94 | std::tuple, std::unique_ptr, std::unique_ptr, TFloat > 95 | get_kmeanspp_initialisation(TInt ncentroids, const sparse::SparseData & data, TInt ind0, TInt ind1){ 96 | throw std::runtime_error("sparse kmeans ++ not yet implemented. Look for inspiration in dense version. Probably common code to be extracted."); 97 | 98 | 99 | return std::make_tuple (std::unique_ptr {}, 100 | std::unique_ptr {}, 101 | std::unique_ptr {}, 102 | TFloat {}); 103 | 104 | } 105 | 106 | 107 | } 108 | } 109 | #endif 110 | 111 | -------------------------------------------------------------------------------- /src/sparseminibatch.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 
19 | */ 20 | 21 | #ifndef PLL_SPARSEMINIBATCHKMEANS_H 22 | #define PLL_SPARSEMINIBATCHKMEANS_H 23 | 24 | #include "basesparseminibatch.h" 25 | 26 | namespace kmeans{ 27 | template 28 | class SparseMiniBatch : public kmeans::BaseSparseMiniBatch{ 29 | 30 | private: 31 | 32 | virtual void post_L_adjust_S_H() override final{ 33 | 34 | if (this->round < this->mba.nsubrounds){ 35 | TInt data0 = this->mba.batchsize*(this->round%this->mba.nsubrounds); 36 | TInt data1 = std::min(data0 + this->mba.batchsize, this->ndata); 37 | sparse::increment_S_H(data0, data1, *this->ptrdata, this->get_L(), this->get_sums(), this->get_counts()); 38 | } 39 | 40 | else{ 41 | sparse::update_S_H_from_label_changes(*this->ptrdata, this->where_label_changes, this->get_sums(), this->get_counts()); 42 | } 43 | } 44 | 45 | 46 | public: 47 | 48 | 49 | template 50 | SparseMiniBatch(Args&&... args): kmeans::BaseSparseMiniBatch (std::forward(args)...){ 51 | this->algname = "sparse mini batch"; 52 | } 53 | 54 | virtual ~SparseMiniBatch(){}; 55 | 56 | }; 57 | 58 | } 59 | 60 | #endif 61 | -------------------------------------------------------------------------------- /src/sparsesimple.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See file 18 | COPYING for more details. 19 | */ 20 | 21 | #ifndef PLL_SPARSESIMPLE_H 22 | #define PLL_SPARSESIMPLE_H 23 | 24 | #include "basesparseexact.h" 25 | 26 | namespace kmeans{ 27 | 28 | template 29 | class SparseSimple : public kmeans::BaseSparseExact { 30 | 31 | private: 32 | 33 | public: 34 | template 35 | SparseSimple(Args&&... args): kmeans::BaseSparseExact (std::forward(args)...) { 36 | this->setalgname("sparse-simple-kmeans"); 37 | } 38 | 39 | virtual ~SparseSimple(){}; 40 | 41 | protected: 42 | virtual void set_initialisation_tasks(){ 43 | auto init_tasks_A = this->makeset_C_C_l22s_L_inds0_mati(); 44 | auto init_task_B = 45 | [this](TInt ti){ 46 | if (ti == 0){ 47 | this->set_S_H(static_cast(0), this->ndata); 48 | } 49 | }; 50 | this->initialisation_tasks = std::move(init_tasks_A); 51 | this->initialisation_tasks.push_back(std::move(init_task_B)); 52 | } 53 | 54 | 55 | virtual void set_X_tasks() override final{ 56 | 57 | this->X_tasks = { 58 | 59 | 60 | //experiments show that this is were the majority of the time is spent 61 | sparse::update_L_label_changes_ati(this->nthreads, *this->ptrdata, this->ncentroids, this->get_C(), this->get_data_l22s(), this->get_C_l22s(), this->get_L(), this->ndcalcs_X, this->where_label_changes), 62 | 63 | 64 | [this](TInt ti){ 65 | if (ti == 0){ 66 | sparse::update_S_H_from_label_changes(*this->ptrdata, this->where_label_changes, this->get_sums(), this->get_counts(), this->nchanges); 67 | } 68 | }, 69 | 70 | [this](TInt ti){ 71 | if (ti == 0){ 72 | } 73 | } 74 | 75 | }; 76 | } 77 | 78 | virtual void set_C_tasks(){ 79 | this->C_tasks = { 80 | arrutilv2::update_C_C_l22s_from_SH_ati(this->nthreads, this->ncentroids, this->dimension, this->get_sums(), this->get_counts(), this->get_C(), this->get_C_l22s()) 81 | }; 82 | } 83 | 84 | 85 | virtual void verbose_write_additional(){ 86 | //TODO 87 | } 88 | 89 | }; 90 | } 91 | 92 | 93 | #endif 94 | -------------------------------------------------------------------------------- 
/src/sparsestandardminibatch.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 19 | */ 20 | 21 | #ifndef PLL_SPARSESTANDARDMINIBATCHKMEANS_H 22 | #define PLL_SPARSESTANDARDMINIBATCHKMEANS_H 23 | 24 | #include "basesparseminibatch.h" 25 | 26 | namespace kmeans{ 27 | template 28 | class SparseStandardMiniBatch : public kmeans::BaseSparseMiniBatch{ 29 | 30 | private: 31 | 32 | virtual void post_L_adjust_S_H() override final{ 33 | //just update S and H by adding data which changed 34 | 35 | //TODO : these could be class variables as same calculated here as in update_L_label_changes. 36 | TInt data0 = this->mba.batchsize*(this->round%this->mba.nsubrounds); 37 | TInt data1 = std::min(data0 + this->mba.batchsize, this->ndata); 38 | 39 | sparse::increment_S_H(data0, data1, *this->ptrdata, this->get_L(), this->get_sums(), this->get_counts()); 40 | 41 | } 42 | 43 | public: 44 | 45 | 46 | template 47 | SparseStandardMiniBatch(Args&&... 
args): kmeans::BaseSparseMiniBatch (std::forward(args)...){ 48 | 49 | this->algname = "sparse standard mini batch"; 50 | } 51 | 52 | virtual ~SparseStandardMiniBatch(){}; 53 | 54 | }; 55 | 56 | } 57 | 58 | //extern template class kmeans::SparseStandardMiniBatch; 59 | //extern template class kmeans::SparseStandardMiniBatch; 60 | 61 | #endif 62 | 63 | -------------------------------------------------------------------------------- /src/standardminibatch.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 19 | */ 20 | 21 | #ifndef PLL_STANDARDMINIBATCHKMEANS_H 22 | #define PLL_STANDARDMINIBATCHKMEANS_H 23 | 24 | #include "basesimpleminibatch.h" 25 | 26 | namespace kmeans{ 27 | template 28 | class StandardMiniBatch : public kmeans::BaseSimpleMiniBatch{ 29 | 30 | private: 31 | 32 | virtual void update_L_S_H(TInt x0, TInt x1, TInt ti) override final{ 33 | this->update_L_S_H_batch_increment_only(x0, x1, ti); 34 | } 35 | 36 | 37 | 38 | public: 39 | 40 | 41 | template 42 | StandardMiniBatch(Args&&... args): kmeans::BaseSimpleMiniBatch (std::forward(args)...) 
{ 43 | this->setalgname("Standard Mini Batch Kmeans"); 44 | } 45 | 46 | virtual ~StandardMiniBatch(){}; 47 | 48 | }; 49 | 50 | 51 | } 52 | 53 | //extern template class kmeans::StandardMiniBatch; 54 | //extern template class kmeans::StandardMiniBatch; 55 | 56 | #endif 57 | 58 | 59 | -------------------------------------------------------------------------------- /src/stringutilbase.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 19 | */ 20 | 21 | #include "stringutilbase.h" 22 | #include 23 | #include 24 | namespace stringutil{ 25 | //split the string tosplit by delim. With x appearances of delim in tosplit, the returned vector will have length x + 1 (even if appearances at the start, end, contiguous. 
26 | std::vector split(const std::string & tosplit, const std::string & delim){ 27 | 28 | std::vector spv; //vector to return 29 | if (delim.length() > tosplit.length()){ 30 | return spv; 31 | } 32 | 33 | 34 | std::vector splitposstarts {0}; 35 | std::vector splitposends; 36 | 37 | for (size_t x = 0; x < tosplit.length() - delim.length() + 1; ++x){ 38 | auto res = std::mismatch(delim.begin(), delim.end(), tosplit.begin() + x); 39 | if (res.first == delim.end()){ 40 | splitposends.push_back(x); 41 | splitposstarts.push_back(x + delim.length()); 42 | 43 | } 44 | } 45 | 46 | splitposends.push_back(tosplit.length()); 47 | 48 | for (unsigned i = 0; i < splitposends.size(); ++i){ 49 | spv.push_back(tosplit.substr(splitposstarts[i], splitposends[i] - splitposstarts[i] )); 50 | } 51 | 52 | return spv; 53 | } 54 | 55 | bool isws(const char & c){ 56 | return (c == ' ' || c == '\t' || c == '\n'); 57 | } 58 | 59 | std::vector split(const std::string & tosplit){ 60 | 61 | std::vector spv2; 62 | 63 | unsigned it = 0; 64 | 65 | while (it != tosplit.size()){ 66 | while (isws(tosplit[it]) and it != tosplit.size()){ 67 | ++it; 68 | } 69 | unsigned start = it; 70 | 71 | while (!isws(tosplit[it]) and it != tosplit.size()){ 72 | ++it; 73 | } 74 | unsigned end = it; 75 | 76 | if (!isws(tosplit[end -1])){ 77 | spv2.push_back(tosplit.substr(start, end - start)); 78 | } 79 | } 80 | 81 | 82 | 83 | 84 | return spv2; 85 | } 86 | 87 | 88 | std::string getdirfromfn(const std::string & fn){ 89 | auto morcels = split(fn, "/"); 90 | 91 | if (morcels[0].compare("") != 0){ 92 | throw std::runtime_error("The string passed to getdirfromfn is not a valid path as there is no leading / ."); 93 | } 94 | 95 | std::string dir = "/"; 96 | 97 | for (unsigned i = 1; i < morcels.size() - 1; ++i){ 98 | dir = dir + morcels[i] + "/"; 99 | } 100 | return dir; 101 | } 102 | 103 | 104 | } 105 | -------------------------------------------------------------------------------- /src/stringutilbase.h: 
-------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 19 | */ 20 | 21 | #include 22 | #include 23 | 24 | namespace stringutil{ 25 | //split the string tosplit by delim. With x appearances of delim in tosplit, the returned vector will have length x + 1 (even if appearances at the start, end, contiguous. 26 | std::vector split(const std::string & tosplit, const std::string & delim); 27 | 28 | //split on whitespaces 29 | std::vector split(const std::string & tosplit); 30 | 31 | 32 | std::string getdirfromfn(const std::string & fn); 33 | } 34 | -------------------------------------------------------------------------------- /src/stringutilclustering.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 
10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 19 | */ 20 | 21 | #ifndef ENDOFROUNDSTRING_H 22 | #define ENDOFROUNDSTRING_H 23 | 24 | #include 25 | 26 | 27 | 28 | namespace stringutil{ 29 | namespace clustering{ 30 | namespace helper{ 31 | std::string getstars(); 32 | } 33 | 34 | 35 | 36 | namespace pll{ 37 | 38 | 39 | namespace exact{ 40 | 41 | std::string getstartsummary_v1(std::string algname, size_t memory_usage, float mse, float val_mse); 42 | std::string getstartsummary_v2(std::string algname, size_t memory_usage, float mse, float val_mse); 43 | std::string getroundsummary_v1(size_t roundchanges); 44 | std::string getroundsummary_v2(size_t round, size_t roundchanges, size_t ncalcs_X, size_t ncalcs, size_t t_total, float mse, float val_mse); 45 | std::string getfinalsummary_v1(size_t round, size_t roundchanges, size_t ncalcs_X, size_t ncalcs, size_t t_total, float mse, size_t n_empty_clusters, float val_mse); 46 | std::string getfinalsummary_v2(size_t round, size_t roundchanges, size_t ncalcs_X, size_t ncalcs, size_t t_total, float mse, size_t n_empty_clusters, float val_mse); 47 | 48 | } 49 | 50 | namespace minibatch{ 51 | std::string getstartsummary_v1(std::string algname, size_t memory_usage, float val_mse); 52 | std::string getstartsummary_v2(std::string algname, size_t memory_usage, float val_mse); 53 | 54 | std::string getroundsummary_v1(size_t round, size_t nsubrounds, size_t roundchanges); 55 | std::string getroundsummary_v2(size_t round, size_t nsubrounds, size_t subround, size_t roundchanges, size_t ncalcs_X, size_t ncalcs, size_t t_total, float mse, float val_mse); 56 
| 57 | std::string getfinalsummary_v1(size_t round, size_t nsubrounds, size_t roundchanges, size_t ncalcs_X, size_t ncalcs, size_t t_total, float mse, size_t n_empty_clusters, float val_mse); 58 | std::string getfinalsummary_v2(size_t round, size_t nsubrounds, size_t roundchanges, size_t ncalcs_X, size_t ncalcs, size_t t_total, float mse, size_t n_empty_clusters, float val_mse); 59 | 60 | 61 | 62 | } 63 | 64 | 65 | namespace growbatch{ 66 | 67 | 68 | std::string getstartsummary_v1(std::string algname, size_t memory_usage, float val_mse); 69 | std::string getstartsummary_v2(std::string algname, size_t memory_usage, float val_mse); 70 | 71 | std::string getroundsummary_v1(size_t roundchanges, bool didgrow); 72 | std::string getroundsummary_v2(size_t round, size_t nactive, float d_C__by__d_AB, size_t roundchanges, size_t ncalcs_X, size_t ncalcs, size_t t_total, float mse, float val_mse); 73 | 74 | std::string getfinalsummary_v1(size_t round, size_t ncalcs_X, size_t ncalcs, size_t t_total, float mse, size_t n_empty_clusters, float val_mse); 75 | std::string getfinalsummary_v2(size_t round, size_t ncalcs_X, size_t ncalcs, size_t t_total, float mse, size_t n_empty_clusters, float val_mse); 76 | 77 | } 78 | 79 | 80 | 81 | 82 | } //end pll 83 | 84 | 85 | 86 | 87 | } 88 | } 89 | 90 | 91 | 92 | 93 | #endif 94 | -------------------------------------------------------------------------------- /src/stringutilfile.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. 
See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 19 | */ 20 | 21 | #include "stringutilfile.h" 22 | 23 | 24 | #include 25 | #include 26 | #include 27 | 28 | #include "stringutilbase.h" 29 | 30 | 31 | #include 32 | 33 | #include 34 | 35 | namespace stringutilfile{ 36 | 37 | 38 | //stolen from http://stackoverflow.com/questions/2844817/how-do-i-check-if-a-c-string-is-an-int 39 | inline bool is_integer(const std::string & s){ 40 | if (s.empty() || ((!isdigit(s[0])) && (s[0] != '-') && (s[0] != '+'))){ 41 | return false; 42 | } 43 | 44 | char * p; 45 | 46 | strtol(s.c_str(), &p, 10); 47 | 48 | return (*p == 0); 49 | } 50 | 51 | bool file_has_2int_header(const std::string & filename){ 52 | std::ifstream dfile(filename, std::ios_base::in); 53 | std::string line; 54 | if (!dfile.is_open()){ 55 | throw std::runtime_error(std::string("The file ") + filename + " probably does not exist. Cannot determine whether the file has the 2 int header or not, as the file does not seem to exist." 
); 56 | } 57 | std::getline(dfile, line); 58 | auto bob = stringutil::split(line); 59 | 60 | /* first determine that it contains 2 nuggets: */ 61 | if (bob.size() != 2){ 62 | //"file does not have 2 frags, it has : " << bob.size() << " frags " << std::endl; 63 | return false; 64 | } 65 | /* next test that they are indeed integers */ 66 | if (is_integer(bob[0]) and is_integer(bob[1])){ 67 | return true; 68 | } 69 | 70 | //"fail due to a non int in header : " << is_integer(bob[0]) << " " << is_integer(bob[1]) << std::endl; 71 | //"fail due to a non int in header : |" << bob[0] << "| |" << bob[1] << "|" << std::endl; 72 | 73 | return false; 74 | } 75 | 76 | } 77 | -------------------------------------------------------------------------------- /src/stringutilfile.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 
19 | */ 20 | 21 | #ifndef STRINGUTILFILE_H 22 | #define STRINGUTILFILE_H 23 | 24 | #include 25 | namespace stringutilfile{ 26 | 27 | 28 | 29 | 30 | bool file_has_2int_header(const std::string & filename); 31 | 32 | 33 | 34 | 35 | } 36 | #endif 37 | -------------------------------------------------------------------------------- /src/txtdatasets.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/ 3 | Written by James Newling 4 | All rights reserved. 5 | 6 | eakmeans is a library for exact and approximate k-means written in C++ and 7 | Python. This file is part of eakmeans. See file COPYING for more details. 8 | 9 | This file is part of eakmeans. 10 | 11 | eakmeans is free software: you can redistribute it and/or modify 12 | it under the terms of the 3-Clause BSD Licence. See 13 | https://opensource.org/licenses/BSD-3-Clause for more details. 14 | 15 | eakmeans is distributed in the hope that it will be useful, 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file 18 | COPYING for more details. 19 | */ 20 | 21 | #ifndef TXTDATASETS_H 22 | #define TXTDATASETS_H 23 | 24 | namespace datasets{ 25 | 26 | static std::string sparse_data_dir("/idiap/temp/jnewling/data/sparsedata/rcv1rcv/"); 27 | 28 | 29 | class TxtDataset{ 30 | public: 31 | std::string name; 32 | unsigned nd; 33 | unsigned dim; 34 | std::string datapath_dim; 35 | std::string datapath_dimless; 36 | TxtDataset(){}; //why do I need this? 
37 | TxtDataset(const std::string & name, unsigned nd, unsigned dim):name(name), nd(nd), dim(dim){ 38 | datapath_dim = "/idiap/temp/jnewling/data/txtdata/normalised/" + name + "_" + std::to_string(nd) + "_" + std::to_string(dim) + "_cnormed.txt"; 39 | datapath_dimless = "/idiap/temp/jnewling/data/txtdata/normalised/" + name + "_" + std::to_string(nd) + "_" + std::to_string(dim) + "_cnormed_dimless.txt"; 40 | } 41 | }; 42 | 43 | class TrainTestDataset{ //TODO replace TrainTestDataset everywhere with TrainTestDataset. 44 | public: 45 | 46 | std::string name; 47 | 48 | std::string datapath_train_dim; 49 | std::string datapath_train_dimless; 50 | std::string datapath_test_dim; 51 | std::string datapath_test_dimless; 52 | 53 | TrainTestDataset(){}; 54 | 55 | //, nd(nd), dim(dim) , unsigned nd, unsigned dim 56 | //, const std::string & datapath_dim, const std::string & datapath_dimless 57 | // unsigned nd; 58 | // unsigned dim; 59 | 60 | TrainTestDataset(const std::string & name, 61 | std::string rootdir = "/idiap/temp/jnewling/data/sparsedata/trainandtest/" 62 | ): name(name){ 63 | datapath_train_dim = rootdir + name + "_train_withdims.txt"; 64 | datapath_train_dimless = rootdir + name + "_train_dimless.txt"; 65 | datapath_test_dim = rootdir + name + "_test_withdims.txt"; 66 | datapath_test_dimless = rootdir + name + "_test_dimless.txt"; 67 | } 68 | }; 69 | 70 | std::vector sparse_datasets{ 71 | 72 | {"truercv"}, 73 | {"truercvos"}, 74 | {"rcv"}, //558700 , 0, "None", "/idiap/temp/jnewling/data/sparsedata/rcv1rcv/all_shuffled.txt"}, 75 | {"nips"}, //1500 , 0, "None", "/idiap/temp/jnewling/data/sparsedata/bagofwords/nips.txt"}, 76 | {"nytimes"},// 299751 , 102661, "None", "/idiap/temp/jnewling/data/sparsedata/bagofwords/nytimes.txt"} 77 | {"randdim5"}, 78 | {"randdim6"}, 79 | {"rand4", "/idiap/temp/jnewling/data/densedata/trainandtest/"}, 80 | {"infimnist", "/idiap/temp/jnewling/data/densedata/trainandtest/"}, 81 | {"infimnist28by28", 
"/idiap/temp/jnewling/data/densedata/trainandtest/"}, 82 | {"covtype", "/idiap/temp/jnewling/data/densedata/trainandtest/"}, 83 | {"kddcup98", "/idiap/temp/jnewling/data/densedata/trainandtest/"}, 84 | {"stl10", "/idiap/temp/jnewling/data/densedata/trainandtest/"} 85 | }; 86 | 87 | std::vector txt_datasets { 88 | 89 | {"tsn", 200000, 4}, 90 | {"conflongdemo", 164860, 3}, 91 | {"skinseg", 200000, 4}, 92 | {"wcomp", 165630, 15}, 93 | {"kegg", 65550, 28}, 94 | 95 | {"miniboone", 130060, 50}, 96 | {"covtype", 581012, 55}, 97 | {"gassensor", 13910, 128}, 98 | {"uscensus", 2458285, 68}, 99 | {"colormoments", 68040,9}, 100 | 101 | {"ldfpads", 164850, 3}, 102 | {"kddcup98", 95000, 310}, 103 | {"kddcup04bio", 145750, 74}, 104 | {"egeod", 5580, 31099}, 105 | {"mnist50", 60000, 50}, 106 | 107 | {"house16H", 22780, 17}, 108 | {"mv", 40760, 11}, 109 | {"europe", 169300, 2}, 110 | {"birch3", 100000, 2}, 111 | {"mnist", 60000, 784}, 112 | 113 | {"stl10", 1000000, 108}, 114 | {"random", 1000000, 30}, 115 | {"random2", 1000000, 2}, 116 | {"small", 1000, 2}, 117 | 118 | }; 119 | 120 | 121 | 122 | } 123 | 124 | 125 | #endif 126 | --------------------------------------------------------------------------------