├── .gitignore
├── COPYING
├── Makefile
├── README.md
├── addheaders.py
├── cythonsrc
    └── kmeans.pyx
├── examples
    ├── dense_200_5_header.txt
    ├── dense_200_5_noheader.txt
    ├── examples.py
    ├── linking_example
    │   ├── README
    │   └── main.cpp
    ├── sparse_randdim10_test_header.txt
    ├── sparse_randdim10_test_noheader.txt
    ├── sparse_randdim10_train_header.txt
    ├── sparse_randdim10_train_noheader.txt
    └── testing_BF.py
├── setup.py
└── src
    ├── BaseGrowBatch.h
    ├── BaseGrowBatchMse.h
    ├── BaseGrowBatchPartitional.h
    ├── BaseSparseGrowBatch.h
    ├── BaseSparseGrowBatchMse.h
    ├── GBMse3v1.h
    ├── GBMseSimple.h
    ├── GBPSimple.h
    ├── SparseGBMse3v1.h
    ├── SparseGBMseSimple.h
    ├── YY17v2.h
    ├── YY17v3.h
    ├── YY17v5.h
    ├── YY17v6.h
    ├── YY21v3.h
    ├── YY21v4.h
    ├── YY21v5.h
    ├── alg_X_selkSN.h
    ├── arrutilv2copy.h
    ├── arrutilv2discrete.h
    ├── arrutilv2l0.h
    ├── arrutilv2l0blasless.h
    ├── arrutilv2l0withblas.h
    ├── arrutilv2l1.h
    ├── arrutilv2l2.h
    ├── arrutilv2l3.h
    ├── arrutilv2minmax.h
    ├── arrutilv2mse.h
    ├── barrierutil.cpp
    ├── barrierutil.h
    ├── baseYY.h
    ├── baseYYMNS.h
    ├── baseYYMSN.h
    ├── baseYYSMN.h
    ├── basecluster.h
    ├── basedensecentroidkmeans.h
    ├── baseelkan.h
    ├── baseelkanminibatch.h
    ├── baseexact.h
    ├── baseexponion.h
    ├── basehamerly.h
    ├── basekmeans.h
    ├── baseminibatch.h
    ├── basesimpleexact.h
    ├── basesimpleminibatch.h
    ├── basesparseelkan.h
    ├── basesparseexact.h
    ├── basesparsekmeans.h
    ├── basesparseminibatch.h
    ├── blastemplates.cpp
    ├── blastemplates.h
    ├── elkan3v0.h
    ├── elkan4v2.h
    ├── elkan5v1.h
    ├── elkan6v0.h
    ├── exactsimplebatch.h
    ├── growbatchapp.h
    ├── hamerly11v0.h
    ├── hamerly12v6.h
    ├── hamerly12v7.h
    ├── hamerly13v0.h
    ├── initialise2.h
    ├── main.cpp
    ├── mb3v0.h
    ├── minibatch.h
    ├── minibatchapp.h
    ├── optionsutil.cpp
    ├── optionsutil.h
    ├── pllcluster.h
    ├── pllkmeansfuncs.cpp
    ├── pllkmeansfuncs.hpp
    ├── pllkmeansfuncs_nonvoid.h
    ├── pllkmeansfuncs_void.h
    ├── processingfilename.py
    ├── randomarray.h
    ├── randomsparse.h
    ├── sample.h
    ├── simple1.h
    ├── simplest.h
    ├── sortutil.h
    ├── sparsedatasets.h
    ├── sparseelkan3v0.h
    ├── sparseinitialise.h
    ├── sparseminibatch.h
    ├── sparsesimple.h
    ├── sparsestandardminibatch.h
    ├── sparseutil.h
    ├── standardminibatch.h
    ├── stringutilbase.cpp
    ├── stringutilbase.h
    ├── stringutilclustering.cpp
    ├── stringutilclustering.h
    ├── stringutilfile.cpp
    ├── stringutilfile.h
    ├── templatedbarrierutil.h
    └── txtdatasets.h


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.o
 2 | *.so
 3 | *.swp
 4 | *.pyc
 5 | *.pyxbldc
 6 | *.bbl
 7 | *.blg
 8 | *.log
 9 | *bin*
10 | build/*
11 | *junk*
12 | *eakmeans/*
13 | python/batch*
14 | python/other*
15 | bin
16 | build
17 | cythonsrc/kmeans.cpp
18 | lib
19 | obj
20 | 
21 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # 
 2 | # Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
 3 | # Written by James Newling <james.newling@gmail.com>
 4 | # All rights reserved.
 5 | # 
 6 | # eakmeans is a library for exact and approximate k-means written in C++ and
 7 | # Python. This file is part of eakmeans. See file COPYING for more details.
 8 | # 
 9 | # This file is part of eakmeans.
10 | # 
11 | # eakmeans is free software: you can redistribute it and/or modify
12 | # it under the terms of the 3-Clause BSD Licence. See
13 | # https://opensource.org/licenses/BSD-3-Clause for more details.
14 | # 
15 | # eakmeans is distributed in the hope that it will be useful,
16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
18 | # COPYING for more details.
19 | # 
##########################################################
# Build configuration
#
# USEBLAS    : YES to build against OpenBLAS, anything else for the blasless build.
# LIBBLASDIR : directory holding the OpenBLAS library
#              (alternative used by the author: /home/james/openblas/lib)
# INCBLASDIR : directory holding the OpenBLAS headers
#              (alternative used by the author: /home/james/openblas/include)
#
# Keep comments on their own lines: whitespace between a value and a trailing
# `#` comment becomes part of the value, and both directories are exported to
# the environment of `python setup.py`, which would then see a path ending in
# spaces.
USEBLAS = YES
export LIBBLASDIR = /idiap/user/jnewling/openblas/lib
export INCBLASDIR = /idiap/user/jnewling/openblas/include
##########################################################

# Remove a half-written target when its recipe fails.
.DELETE_ON_ERROR:

CXX := g++
CXXFLAGS := -std=c++11 -O3 -Wall -pedantic -fPIC
# alternative linker setting: g++ -Wl,--no-as-needed
LINKER := g++
LFLAGS := -lpthread

# Conditional assignments are indented with spaces (never tabs) so that make
# can not mistake them for recipe lines.
ifeq ($(strip $(USEBLAS)),YES)
  LFLAGS := $(LFLAGS) -lopenblas -L$(LIBBLASDIR)
  CXXFLAGS := $(CXXFLAGS) -D WITHBLAS
  INCLUDEPATHS = -I$(INCBLASDIR)
  TARGET := withblaskmeans
  LIBNAME := libwithblaskmeans
  # Exported (with an empty value) so that the python build can detect a blas build.
  export WITHBLAS
else
  INCLUDEPATHS =
  TARGET := blaslesskmeans
  LIBNAME := libblaslesskmeans
endif


SOURCES := $(wildcard src/*.cpp)
# Every object is rebuilt when any header changes (.h and .hpp alike).
HEADERS := $(wildcard src/*.h src/*.hpp)
OBJECTS := $(SOURCES:src/%.cpp=obj/%.o)
# The shared library contains everything except the command-line entry point.
OBJECTS_FORLIB := $(filter-out obj/main.o,$(OBJECTS))

# These names are commands, not files. `lib` in particular is also the name of
# the output directory, so it must be phony to be rebuilt reliably.
.PHONY: all main lib pythonkmeans clean remove

all : main lib pythonkmeans


# (EX) command-line executable
main : bin/$(TARGET)

bin/$(TARGET) : $(OBJECTS)
	@mkdir -p $(@D)
	$(LINKER) -o $@ $^ $(LFLAGS)
	@echo "Linking for main of $(TARGET) done!"

# (LIB) shared library
lib : lib/$(LIBNAME).so

lib/$(LIBNAME).so : $(OBJECTS_FORLIB)
	@mkdir -p $(@D)
	$(CXX) -shared $^ -o $@
	@echo "Shared library $(LIBNAME) made!"

# (PY) python extension; it links against the shared library in lib/, so the
# library has to be built first (matters for `make -j`).
pythonkmeans : lib
	python setup.py build_ext -b lib

obj/%.o : src/%.cpp $(HEADERS)
	@mkdir -p $(@D)
	$(CXX) -c $(CXXFLAGS) $(INCLUDEPATHS) $< -o $@
	@echo "compiled "$<" successfully!"


# Remove object files only.
clean:
	rm -f $(OBJECTS)
	@echo "cleanup done!"

# Remove every generated file and directory.
remove: clean
	rm -f cythonsrc/kmeans.cpp
	rm -rf build bin lib obj
	@echo "should be 100% clean!"
88 | 
89 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | WHAT
 2 | ====
 3 | Implementations of fast exact k-means algorithms as described in http://arxiv.org/abs/1602.02514 and implementations of turbo-charged mini-batch k-means as described in http://arxiv.org/pdf/1602.02934
 4 | 
 5 | for interfaces
 6 | - (LIB) Shared library with accompanying C++ header file
 8 | - (EX) Command-line executable
 8 | - (PY) Python library
 9 | 
10 | 
11 | REQUIREMENTS
12 | ============
13 | Minimal installation requirements:
14 | - C++ compiler supporting C++11
15 | - Linux operating system
16 | 
17 | Optional but recommended:
18 | - BLAS implementation, we recommend this one : http://www.openblas.net/
19 | 
20 | Specific to Python library:
21 | - Python and Cython
22 | 
23 | 
24 | CONFIGURATION
25 | =============
26 | In `Makefile`, set `USEBLAS` to either `NO` or `YES`.
27 | If `USEBLAS = YES`, then also set `LIBBLASDIR` and `INCBLASDIR` (unless the blas paths will be found automatically).
28 | 
29 | 
30 | BUILDING
31 | ========
32 | - For (LIB) and (EX) and (PY) : `make all`
33 | - For (EX) : `make main`
34 | - For (LIB) : `make lib`
35 | 
36 | USING
37 | =====
38 | (EX) If successfully installed, you should find an executable in directory bin
39 | Run the executable with -h flag to see the options
40 | 
41 | (LIB) You need to add lib directory to your LD_LIBRARY_PATH : put the following line in your `~/.bashrc` file:
42 | ```
43 | export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/path/to/kmeans/lib
44 | ```
45 | (PY) If successfully installed, you should be able to `import kmeans` when in directory lib.
46 | To use from a different directory,
47 | 
48 | (a) as per (LIB), and
49 | 
50 | (b) add the path to lib to your python path, either by:
51 | ```
52 | export PYTHONPATH=${PYTHONPATH}:/path/to/kmeans/lib
53 | ```
54 | or directly in your python script :
55 | ```
56 | import sys
57 | sys.path.insert(0,'/path/to/kmeans/lib')
58 | ```
59 | Example use is found in `examples/examples.py`
60 | 
61 | 
62 | 
63 | DOESN'T WORK?
64 | =============
65 | Please contact James Newling at <james.newling@gmail.com>
66 | 


--------------------------------------------------------------------------------
/addheaders.py:
--------------------------------------------------------------------------------
  1 | # 
  2 | # Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
  3 | # Written by James Newling <james.newling@gmail.com>
  4 | # All rights reserved.
  5 | # 
  6 | # eakmeans is a library for exact and approximate k-means written in C++ and
  7 | # Python. This file is part of eakmeans. See file COPYING for more details.
  8 | # 
  9 | # This file is part of eakmeans.
 10 | # 
 11 | # eakmeans is free software: you can redistribute it and/or modify
 12 | # it under the terms of the 3-Clause BSD Licence. See
 13 | # https://opensource.org/licenses/BSD-3-Clause for more details.
 14 | # 
 15 | # eakmeans is distributed in the hope that it will be useful,
 16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
 17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
 18 | # COPYING for more details.
 19 | # 
 20 | from IPython.core.debugger import Tracer
 21 | 
 22 | 
 23 | old_rawheader = r"""Copyright (c) 2015 Idiap Research Institute, http://www.idiap.ch/
 24 | Written by James Newling <jnewling@idiap.ch>
 25 | 
 26 | eakmeans is a library for exact and approximate k-means written in C++ and Python. This file is part of eakmeans. eakmeans is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License version 3 as published by the Free Software Foundation. eakmeans is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with eakmeans. If not, see <http://www.gnu.org/licenses/>.
 27 | 
 28 | """
 29 | 
 30 | old_hashheader = r"""#Copyright (c) 2015 Idiap Research Institute, http://www.idiap.ch/
 31 | #Written by James Newling <jnewling@idiap.ch>
 32 | 
 33 | #eakmeans is a library for exact and approximate k-means written in C++ and Python. This file is part of eakmeans. eakmeans is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License version 3 as published by the Free Software Foundation. eakmeans is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with eakmeans. If not, see <http://www.gnu.org/licenses/>.
 34 | 
 35 | """
 36 | 
 37 | old_cppheader = r"""/*
 38 | %s
 39 | */
 40 | 
 41 | """%(old_rawheader, )
 42 | 
 43 | new_rawheader = r"""
 44 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
 45 | Written by James Newling <james.newling@gmail.com>
 46 | All rights reserved.
 47 | 
 48 | eakmeans is a library for exact and approximate k-means written in C++ and
 49 | Python. This file is part of eakmeans. See file COPYING for more details.
 50 | 
 51 | This file is part of eakmeans.
 52 | 
 53 | eakmeans is free software: you can redistribute it and/or modify
 54 | it under the terms of the 3-Clause BSD Licence. See
 55 | https://opensource.org/licenses/BSD-3-Clause for more details.
 56 | 
 57 | eakmeans is distributed in the hope that it will be useful,
 58 | but WITHOUT ANY WARRANTY; without even the implied warranty of
 59 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
 60 | COPYING for more details.
 61 | """
 62 | 
 63 | new_hashheader = '\n'.join(['# ' + l for l in new_rawheader.split('\n')]) + '\n'
 64 | new_cppheader = "/*%s*/\n\n" % new_rawheader
 65 | 
 66 | import os
 67 | import sys
 68 | 
 69 | import commands
 70 | 
 71 | 
 72 | hfiles = commands.getstatusoutput("find . -name \"*.h\"")[1].split("\n")
 73 | cppfiles = commands.getstatusoutput("find . -name \"*.cpp\"")[1].split("\n")
 74 | hppfiles = commands.getstatusoutput("find . -name \"*.hpp\"")[1].split("\n")
 75 | cppheaderable = hfiles + cppfiles + hppfiles
 76 | 
 77 | makefiles = commands.getstatusoutput("find . -name \"Makefile\"")[1].split("\n")
 78 | pyfiles =  commands.getstatusoutput("find . -name \"*.py\"")[1].split("\n")
 79 | pyxfiles = commands.getstatusoutput("find . -name \"*.pyx\"")[1].split("\n")
 80 | pyxbldfiles = commands.getstatusoutput("find . -name \"*.pyxbld\"")[1].split("\n")
 81 | hashheaderable = makefiles + pyfiles + pyxfiles + pyxbldfiles
 82 | 
 83 | 
 84 | for files, old_header, new_header in zip(
 85 |         [cppheaderable, hashheaderable],
 86 |         [old_cppheader, old_hashheader],
 87 |         [new_cppheader, new_hashheader]
 88 |         ):
 89 |     for fn in files:
 90 | 
 91 |         if fn:
 92 |             sys.stdout.write("headering " + fn +"...")
 93 | 
 94 |             filly = open(fn, "r")
 95 |             lines = filly.read()
 96 |             filly.close()
 97 | 
 98 |             if lines.startswith(old_header):
 99 |                 lines = lines[len(old_header):]
100 | 
101 |             if lines.startswith(new_header):
102 |                 # already with header, skip this file
103 |                 sys.stdout.write(" already done, skip.\n")
104 |                 continue
105 | 
106 |             filly = open(fn, "w")
107 |             filly.write(new_header)
108 |             filly.write(lines)
109 |             filly.close()
110 |             sys.stdout.write(" done.\n")
111 | 


--------------------------------------------------------------------------------
/examples/examples.py:
--------------------------------------------------------------------------------
  1 | # 
  2 | # Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
  3 | # Written by James Newling <james.newling@gmail.com>
  4 | # All rights reserved.
  5 | # 
  6 | # eakmeans is a library for exact and approximate k-means written in C++ and
  7 | # Python. This file is part of eakmeans. See file COPYING for more details.
  8 | # 
  9 | # This file is part of eakmeans.
 10 | # 
 11 | # eakmeans is free software: you can redistribute it and/or modify
 12 | # it under the terms of the 3-Clause BSD Licence. See
 13 | # https://opensource.org/licenses/BSD-3-Clause for more details.
 14 | # 
 15 | # eakmeans is distributed in the hope that it will be useful,
 16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
 17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
 18 | # COPYING for more details.
 19 | # 
 20 | import numpy as np
 21 | import numpy.random as npr
 22 | import time
 23 | from IPython.core.debugger import Tracer
 24 | 
 25 | import sys
 26 | sys.path.insert(0, "../lib")
 27 | 
 28 | import kmeans
 29 | reload(kmeans)
 30 | 
 31 | 
 32 | def example_1(ndata = 1e4, dimension = 100, dtype = np.float64):
 33 | 	"""
 34 | 	basic use : cluster random data
 35 | 	"""
 36 | 	data = npr.randn(ndata, dimension).astype(dtype)
 37 | 	clustering = kmeans.get_clustering(X = data, n_clusters = 60, algorithm = 'auto', verbose = 2)
 38 | 	
 39 | 
 40 | def example_2():
 41 | 	"""
 42 | 	compare algorithms which may be good in low-d 
 43 | 	ham, ann, expSN, expNS, syinSN, syinNS, yin,
 44 | 	on dataset ldfpads.txt (~160000 points in 3 dimensions)
 45 | 	"""
 46 | 	data = np.loadtxt('ldfpads.txt')
 47 | 	print "Data shape : ", data.shape
 48 | 	seed = npr.randint(100000)
 49 | 	algs = ['ham','ann', 'exp-sn', 'exp-ns', 'syin-sn', 'syin-ns', 'yin']
 50 | 
 51 | 	times = dict.fromkeys(algs)
 52 | 	for alg in algs:
 53 | 		clustering = kmeans.get_clustering(X = data, n_clusters = 1000, algorithm = alg, verbose = 1, n_threads = 1, seed = seed)
 54 | 		times[alg] = clustering['duration']
 55 | 		
 56 | 	
 57 | 	print "TIMES:"
 58 | 	for alg in algs:
 59 | 		print alg, " : ", times[alg]
 60 | 		
 61 | 
 62 | def example_3():
 63 | 	"""
 64 | 	compare selkNS to standard algorithm on random data. selkNS is faster, but not by as much as when the data has structure.
 65 | 	"""	
 66 | 	algs = ['sta','selk-ns']
 67 | 	times = dict.fromkeys(algs)
 68 | 	data = npr.randn(50000, 25).astype(np.float64)
 69 | 	seed = 1011
 70 | 	for alg in algs:
 71 | 		clustering = kmeans.get_clustering(X = data, n_clusters = 1000, algorithm = alg, verbose = 1, n_threads = 1, seed = seed)
 72 | 		times[alg] = clustering['duration']
 73 | 	
 74 | 
 75 | 	print "TIMES:"
 76 | 	for alg in algs:
 77 | 		print alg, " : ", times[alg]
 78 | 		
 79 | 
 80 | def example_4():
 81 | 	"""
 82 | 	compare to scikitlearn implementation of kmeans
 83 | 	"""
 84 | 
 85 | 	import sklearn.cluster as skc
 86 | 	import time
 87 | 	
 88 | 	ndata = 50000
 89 | 	dimension = 10
 90 | 	ncentroids = 1000
 91 | 	data = npr.randn(ndata, dimension).astype(np.float64)
 92 | 
 93 | 	centroids0 = data[0:ncentroids, :]
 94 | 
 95 | 	t0 = time.time()
 96 | 	kmeans.get_clustering(X = data, init = centroids0, n_clusters = ncentroids, algorithm = 'auto', verbose = 1, n_threads = 1)
 97 | 	t1 = time.time()
 98 | 
 99 | 	sklearner = skc.k_means(X = data, n_clusters = ncentroids, max_iter = 1000, n_init = 1, init = centroids0, precompute_distances = False, verbose = True, n_jobs = 1, return_n_iter = True, tol = 0.0)
100 | 	t2 = time.time()	
101 | 	
102 | 	kmeans_time = t1 - t0
103 | 	sklearner_time = t2 - t1
104 | 	
105 | 	print "sklearn : ", sklearner_time, " s"
106 | 	print "this kmeans: ",  kmeans_time, " s"
107 | 			
108 | 	
109 | 
110 | 


--------------------------------------------------------------------------------
/examples/linking_example/README:
--------------------------------------------------------------------------------
1 | at terminal (assuming you've installed with blas and that openblas .so file is in ../../openblas/lib): 
2 | g++ -std=c++11 -I../../src/ -L../../lib/ -lwithblaskmeans -L../../openblas/lib -lopenblas main.cpp -o xme
3 | 
4 | at terminal (assuming no blas)
5 | g++ -std=c++11 -I../../src/ -L../../lib/ -lblaslesskmeans  main.cpp -o xme
6 | 


--------------------------------------------------------------------------------
/examples/linking_example/main.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
 3 | Written by James Newling <james.newling@gmail.com>
 4 | All rights reserved.
 5 | 
 6 | eakmeans is a library for exact and approximate k-means written in C++ and
 7 | Python. This file is part of eakmeans. See file COPYING for more details.
 8 | 
 9 | This file is part of eakmeans.
10 | 
11 | eakmeans is free software: you can redistribute it and/or modify
12 | it under the terms of the 3-Clause BSD Licence. See
13 | https://opensource.org/licenses/BSD-3-Clause for more details.
14 | 
15 | eakmeans is distributed in the hope that it will be useful,
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
18 | COPYING for more details.
19 | */
20 | 
21 | #include <vector>
22 | #include <algorithm>
23 | #include <numeric>
24 | #include <string>
25 | #include <iostream>
26 | 
27 | #include "pllkmeansfuncs_nonvoid.h"
28 | 
29 | 
30 | 
31 | int main(){
32 |   
33 |   
34 |   /* full list of algorithms can be found in pllcluster.h, for exact k-means on dense data, you'll probably want one of:
35 |    * exp-ns (exponion - ns), selk-ns (simplified elkan - ns), syin-ns (simplified yinyang - ns) 
36 |    * See ICML paper Fast K-Means with Accurate Bounds, or --help flag in executable or python function string for more info.
37 |    * */
38 |   std::string algorithm = "exp-ns";
39 |   
40 |   size_t nthreads = 2;
41 |   
42 |   /* we make some data */
43 |   size_t ndata = 10000;
44 |   size_t dimension = 3;
45 |   std::vector<float> v_data (ndata*dimension, 0);
46 |   for (size_t i = 0; i < ndata*dimension; ++i){
47 |     v_data[i]= static_cast<float> (rand() % 100);
48 |   }
49 |   
50 |   size_t ncentroids = 100;
51 |   int cout_verbosity = 2;
52 |   std::string initialisation_method = "from_indices";
53 |   const float * const C_init = nullptr;
54 |   std::vector<size_t> v_data_indices_init_from(ncentroids);
55 |   std::iota(v_data_indices_init_from.begin(), v_data_indices_init_from.end(), 0);
56 |   bool setseed = true;
57 |   size_t seed = 1011;
58 |   float maxtime = 1000;
59 |   size_t maxrounds = 1000;
60 |   size_t minibatchsize = 0;
61 |   bool capture_verbose = false;
62 |   
63 |    
64 |   std::cout << "entering solveiolessf" << std::endl;  
65 |   /* the double version is sloveiolessd : see pllkmeansfuncs_nonvoid */
66 |   auto results = cluster::solveiolessf(algorithm, nthreads, ndata, dimension, v_data.data(), ncentroids, cout_verbosity, initialisation_method, C_init,  v_data_indices_init_from.data(),  setseed,  seed,  maxtime, maxrounds, minibatchsize,  capture_verbose);
67 |   
68 |   /* return : C, L, inds0, duration, niterations, mse */
69 |   float * C = std::get<0> (results).get();
70 |   size_t * labels = std::get<1> (results).get();
71 |   size_t * starting_indices_returned = std::get<2> (results).get();
72 |   size_t duration = std::get<3> (results);
73 |   size_t niterations = std::get<4> (results);
74 |   double mse = std::get<5> (results);
75 | 
76 |   std::vector<size_t> counts (ncentroids, 0);
77 |   for (size_t i = 0; i < ndata; ++i){
78 |     ++counts[labels[i]];
79 |   }
80 |   
81 |   std::cout << "- -- -  - -  --- -  -  -- - - -   --- -  -- -   -- -" << std::endl;
82 |   for (size_t k = 0; k < ncentroids; ++k){
83 |     std::cout << "in cluster " << k << " : " << counts[k] << std::endl;
84 |   }
85 |   
86 |   
87 |   
88 |   return 0;
89 | }
90 | 


--------------------------------------------------------------------------------
/examples/sparse_randdim10_test_header.txt:
--------------------------------------------------------------------------------
  1 | 111	10
  2 | 1011 9:0.755 
  3 | 1011 1:0.320 2:0.460 3:0.967 5:0.100 6:0.220 9:0.715 
  4 | 1011 0:0.011 9:0.611 
  5 | 1011 9:0.255 
  6 | 1011 2:0.697 9:0.682 
  7 | 1011 0:0.931 7:0.491 
  8 | 1011 6:0.010 
  9 | 1011 1:0.640 6:0.322 
 10 | 1011 1:0.366 6:0.709 
 11 | 1011 0:0.936 1:0.414 2:0.654 4:0.929 7:0.702 
 12 | 1011 1:0.880 3:0.165 5:0.494 6:0.692 
 13 | 1011 
 14 | 1011 6:0.008 
 15 | 1011 4:0.987 6:0.359 8:0.113 9:0.761 
 16 | 1011 2:0.943 4:0.716 8:0.634 
 17 | 1011 8:0.731 
 18 | 1011 0:0.530 1:0.030 2:0.942 3:0.878 5:0.513 7:0.095 
 19 | 1011 0:0.491 1:0.870 4:0.815 7:0.284 
 20 | 1011 0:0.666 4:0.733 6:0.609 7:0.489 9:0.349 
 21 | 1011 2:0.999 6:0.325 8:0.723 9:0.678 
 22 | 1011 7:0.255 
 23 | 1011 7:0.856 9:0.930 
 24 | 1011 1:0.803 6:0.708 
 25 | 1011 1:0.417 2:0.404 6:0.027 7:0.173 
 26 | 1011 5:0.511 
 27 | 1011 3:0.499 8:0.876 
 28 | 1011 0:0.043 1:0.264 2:0.826 3:0.573 4:0.363 8:0.519 9:0.084 
 29 | 1011 2:0.028 5:0.154 7:0.536 8:0.608 
 30 | 1011 0:0.040 1:0.238 4:0.952 5:0.778 
 31 | 1011 1:0.656 3:0.249 7:0.108 
 32 | 1011 4:0.213 8:0.725 
 33 | 1011 0:0.344 1:0.433 7:0.283 8:0.285 
 34 | 1011 8:0.067 
 35 | 1011 1:0.973 4:0.139 9:0.723 
 36 | 1011 0:0.043 1:0.388 5:0.124 6:0.573 7:0.596 9:0.165 
 37 | 1011 0:0.315 2:0.591 5:0.538 
 38 | 1011 0:0.330 1:0.809 2:0.074 9:0.903 
 39 | 1011 1:0.156 4:0.825 9:0.044 
 40 | 1011 
 41 | 1011 0:0.270 3:0.628 6:0.894 
 42 | 1011 0:0.058 1:0.704 2:0.842 6:0.832 8:0.174 
 43 | 1011 1:0.354 6:0.908 8:0.270 
 44 | 1011 1:0.347 2:0.626 5:0.563 6:0.400 
 45 | 1011 7:0.980 8:0.952 
 46 | 1011 5:0.661 
 47 | 1011 9:0.724 
 48 | 1011 0:0.384 5:0.911 7:0.833 8:0.915 
 49 | 1011 0:0.356 3:0.769 5:0.979 
 50 | 1011 1:0.586 2:0.190 3:0.247 
 51 | 1011 2:0.930 4:0.232 8:0.013 
 52 | 1011 3:0.538 8:0.104 9:0.174 
 53 | 1011 0:0.848 1:0.635 
 54 | 1011 0:0.250 1:0.015 
 55 | 1011 1:0.595 8:0.139 
 56 | 1011 3:0.533 7:0.093 8:0.528 
 57 | 1011 7:0.075 8:0.506 
 58 | 1011 
 59 | 1011 2:0.466 5:0.185 8:0.512 
 60 | 1011 9:0.440 
 61 | 1011 1:0.753 3:0.261 4:0.789 
 62 | 1011 1:0.385 7:0.465 
 63 | 1011 0:0.347 2:0.726 5:0.607 6:0.092 8:0.040 
 64 | 1011 1:0.871 6:0.311 9:0.989 
 65 | 1011 2:0.418 5:0.284 
 66 | 1011 0:0.539 1:0.159 6:0.408 
 67 | 1011 2:0.868 7:0.617 8:0.665 
 68 | 1011 0:0.967 5:0.060 
 69 | 1011 1:0.747 5:0.966 
 70 | 1011 5:0.803 
 71 | 1011 2:0.377 6:0.680 7:0.792 
 72 | 1011 6:0.926 9:0.216 
 73 | 1011 0:0.231 8:0.242 
 74 | 1011 0:0.657 2:0.512 4:0.570 7:0.819 9:0.829 
 75 | 1011 2:0.878 5:0.987 6:0.953 7:0.911 
 76 | 1011 0:0.146 1:0.978 2:0.558 7:0.530 9:0.136 
 77 | 1011 0:0.761 1:0.050 7:0.457 
 78 | 1011 5:0.200 6:0.192 
 79 | 1011 1:0.202 2:0.186 4:0.164 6:0.825 8:0.454 
 80 | 1011 0:0.899 5:0.719 8:0.176 
 81 | 1011 2:0.314 6:0.202 
 82 | 1011 3:0.094 5:0.591 
 83 | 1011 0:0.619 9:0.779 
 84 | 1011 1:0.611 3:0.585 5:0.882 8:0.574 9:0.181 
 85 | 1011 1:0.886 3:0.533 4:0.444 5:0.858 
 86 | 1011 0:0.132 2:0.439 3:0.364 5:0.838 8:0.088 
 87 | 1011 2:0.079 5:0.700 6:0.640 9:0.356 
 88 | 1011 2:0.138 3:0.282 5:0.645 6:0.398 
 89 | 1011 0:0.574 1:0.966 4:0.902 5:0.042 
 90 | 1011 2:0.331 6:0.843 
 91 | 1011 0:0.352 7:0.352 8:0.579 
 92 | 1011 9:0.097 
 93 | 1011 3:0.262 7:0.072 8:0.406 
 94 | 1011 3:0.634 6:0.095 
 95 | 1011 1:0.152 6:0.888 
 96 | 1011 0:0.821 1:0.378 3:0.625 4:0.361 
 97 | 1011 0:0.326 3:0.770 7:0.207 
 98 | 1011 3:0.085 6:0.393 
 99 | 1011 1:0.691 2:0.729 9:0.957 
100 | 1011 3:0.045 4:0.328 5:0.278 8:0.502 
101 | 1011 5:0.452 7:0.430 
102 | 1011 1:0.564 2:0.152 3:0.118 5:0.315 9:0.056 
103 | 1011 3:0.637 4:0.803 
104 | 1011 0:0.182 1:0.892 3:0.879 7:0.470 9:0.557 
105 | 1011 0:0.695 7:0.190 
106 | 1011 2:0.124 6:0.271 
107 | 1011 3:0.226 5:0.708 
108 | 1011 1:0.005 2:0.096 5:0.967 
109 | 1011 1:0.678 5:0.601 
110 | 1011 0:0.020 2:0.391 4:0.480 
111 | 1011 4:0.746 9:0.505 
112 | 1011 1:0.734 4:0.862 5:0.305 7:0.928 
113 | 


--------------------------------------------------------------------------------
/examples/sparse_randdim10_test_noheader.txt:
--------------------------------------------------------------------------------
  1 | 1011 9:0.755 
  2 | 1011 1:0.320 2:0.460 3:0.967 5:0.100 6:0.220 9:0.715 
  3 | 1011 0:0.011 9:0.611 
  4 | 1011 9:0.255 
  5 | 1011 2:0.697 9:0.682 
  6 | 1011 0:0.931 7:0.491 
  7 | 1011 6:0.010 
  8 | 1011 1:0.640 6:0.322 
  9 | 1011 1:0.366 6:0.709 
 10 | 1011 0:0.936 1:0.414 2:0.654 4:0.929 7:0.702 
 11 | 1011 1:0.880 3:0.165 5:0.494 6:0.692 
 12 | 1011 
 13 | 1011 6:0.008 
 14 | 1011 4:0.987 6:0.359 8:0.113 9:0.761 
 15 | 1011 2:0.943 4:0.716 8:0.634 
 16 | 1011 8:0.731 
 17 | 1011 0:0.530 1:0.030 2:0.942 3:0.878 5:0.513 7:0.095 
 18 | 1011 0:0.491 1:0.870 4:0.815 7:0.284 
 19 | 1011 0:0.666 4:0.733 6:0.609 7:0.489 9:0.349 
 20 | 1011 2:0.999 6:0.325 8:0.723 9:0.678 
 21 | 1011 7:0.255 
 22 | 1011 7:0.856 9:0.930 
 23 | 1011 1:0.803 6:0.708 
 24 | 1011 1:0.417 2:0.404 6:0.027 7:0.173 
 25 | 1011 5:0.511 
 26 | 1011 3:0.499 8:0.876 
 27 | 1011 0:0.043 1:0.264 2:0.826 3:0.573 4:0.363 8:0.519 9:0.084 
 28 | 1011 2:0.028 5:0.154 7:0.536 8:0.608 
 29 | 1011 0:0.040 1:0.238 4:0.952 5:0.778 
 30 | 1011 1:0.656 3:0.249 7:0.108 
 31 | 1011 4:0.213 8:0.725 
 32 | 1011 0:0.344 1:0.433 7:0.283 8:0.285 
 33 | 1011 8:0.067 
 34 | 1011 1:0.973 4:0.139 9:0.723 
 35 | 1011 0:0.043 1:0.388 5:0.124 6:0.573 7:0.596 9:0.165 
 36 | 1011 0:0.315 2:0.591 5:0.538 
 37 | 1011 0:0.330 1:0.809 2:0.074 9:0.903 
 38 | 1011 1:0.156 4:0.825 9:0.044 
 39 | 1011 
 40 | 1011 0:0.270 3:0.628 6:0.894 
 41 | 1011 0:0.058 1:0.704 2:0.842 6:0.832 8:0.174 
 42 | 1011 1:0.354 6:0.908 8:0.270 
 43 | 1011 1:0.347 2:0.626 5:0.563 6:0.400 
 44 | 1011 7:0.980 8:0.952 
 45 | 1011 5:0.661 
 46 | 1011 9:0.724 
 47 | 1011 0:0.384 5:0.911 7:0.833 8:0.915 
 48 | 1011 0:0.356 3:0.769 5:0.979 
 49 | 1011 1:0.586 2:0.190 3:0.247 
 50 | 1011 2:0.930 4:0.232 8:0.013 
 51 | 1011 3:0.538 8:0.104 9:0.174 
 52 | 1011 0:0.848 1:0.635 
 53 | 1011 0:0.250 1:0.015 
 54 | 1011 1:0.595 8:0.139 
 55 | 1011 3:0.533 7:0.093 8:0.528 
 56 | 1011 7:0.075 8:0.506 
 57 | 1011 
 58 | 1011 2:0.466 5:0.185 8:0.512 
 59 | 1011 9:0.440 
 60 | 1011 1:0.753 3:0.261 4:0.789 
 61 | 1011 1:0.385 7:0.465 
 62 | 1011 0:0.347 2:0.726 5:0.607 6:0.092 8:0.040 
 63 | 1011 1:0.871 6:0.311 9:0.989 
 64 | 1011 2:0.418 5:0.284 
 65 | 1011 0:0.539 1:0.159 6:0.408 
 66 | 1011 2:0.868 7:0.617 8:0.665 
 67 | 1011 0:0.967 5:0.060 
 68 | 1011 1:0.747 5:0.966 
 69 | 1011 5:0.803 
 70 | 1011 2:0.377 6:0.680 7:0.792 
 71 | 1011 6:0.926 9:0.216 
 72 | 1011 0:0.231 8:0.242 
 73 | 1011 0:0.657 2:0.512 4:0.570 7:0.819 9:0.829 
 74 | 1011 2:0.878 5:0.987 6:0.953 7:0.911 
 75 | 1011 0:0.146 1:0.978 2:0.558 7:0.530 9:0.136 
 76 | 1011 0:0.761 1:0.050 7:0.457 
 77 | 1011 5:0.200 6:0.192 
 78 | 1011 1:0.202 2:0.186 4:0.164 6:0.825 8:0.454 
 79 | 1011 0:0.899 5:0.719 8:0.176 
 80 | 1011 2:0.314 6:0.202 
 81 | 1011 3:0.094 5:0.591 
 82 | 1011 0:0.619 9:0.779 
 83 | 1011 1:0.611 3:0.585 5:0.882 8:0.574 9:0.181 
 84 | 1011 1:0.886 3:0.533 4:0.444 5:0.858 
 85 | 1011 0:0.132 2:0.439 3:0.364 5:0.838 8:0.088 
 86 | 1011 2:0.079 5:0.700 6:0.640 9:0.356 
 87 | 1011 2:0.138 3:0.282 5:0.645 6:0.398 
 88 | 1011 0:0.574 1:0.966 4:0.902 5:0.042 
 89 | 1011 2:0.331 6:0.843 
 90 | 1011 0:0.352 7:0.352 8:0.579 
 91 | 1011 9:0.097 
 92 | 1011 3:0.262 7:0.072 8:0.406 
 93 | 1011 3:0.634 6:0.095 
 94 | 1011 1:0.152 6:0.888 
 95 | 1011 0:0.821 1:0.378 3:0.625 4:0.361 
 96 | 1011 0:0.326 3:0.770 7:0.207 
 97 | 1011 3:0.085 6:0.393 
 98 | 1011 1:0.691 2:0.729 9:0.957 
 99 | 1011 3:0.045 4:0.328 5:0.278 8:0.502 
100 | 1011 5:0.452 7:0.430 
101 | 1011 1:0.564 2:0.152 3:0.118 5:0.315 9:0.056 
102 | 1011 3:0.637 4:0.803 
103 | 1011 0:0.182 1:0.892 3:0.879 7:0.470 9:0.557 
104 | 1011 0:0.695 7:0.190 
105 | 1011 2:0.124 6:0.271 
106 | 1011 3:0.226 5:0.708 
107 | 1011 1:0.005 2:0.096 5:0.967 
108 | 1011 1:0.678 5:0.601 
109 | 1011 0:0.020 2:0.391 4:0.480 
110 | 1011 4:0.746 9:0.505 
111 | 1011 1:0.734 4:0.862 5:0.305 7:0.928 
112 | 


--------------------------------------------------------------------------------
/examples/testing_BF.py:
--------------------------------------------------------------------------------
 1 | # 
 2 | # Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
 3 | # Written by James Newling <james.newling@gmail.com>
 4 | # All rights reserved.
 5 | # 
 6 | # eakmeans is a library for exact and approximate k-means written in C++ and
 7 | # Python. This file is part of eakmeans. See file COPYING for more details.
 8 | # 
 9 | # This file is part of eakmeans.
10 | # 
11 | # eakmeans is free software: you can redistribute it and/or modify
12 | # it under the terms of the 3-Clause BSD Licence. See
13 | # https://opensource.org/licenses/BSD-3-Clause for more details.
14 | # 
15 | # eakmeans is distributed in the hope that it will be useful,
16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
18 | # COPYING for more details.
19 | # 
20 | import kmeans
21 | 
22 | import numpy as np
23 | import numpy.random as npr
24 | 
# Draw the seed for the clustering run before numpy's generator is re-seeded
# with the fixed value below.
old_seed = npr.randint(100000)

# problem size
ndata, dimension, n_clusters = 10000, 300, 10

# The data (and C0) are generated from a fixed seed ...
npr.seed(1012)
X = npr.randn(ndata, dimension)
C0 = 1.001*npr.randn(n_clusters, dimension)

# ... after which the generator is re-seeded with the value drawn at the top.
npr.seed(old_seed)

# Cluster with init = "BF". C0 is not passed to the call.
bla = kmeans.get_clustering(
	X = X,
	n_clusters = n_clusters,
	init = "BF",
	verbose = 1,
	seed = old_seed)
38 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | # 
 2 | # Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
 3 | # Written by James Newling <james.newling@gmail.com>
 4 | # All rights reserved.
 5 | # 
 6 | # eakmeans is a library for exact and approximate k-means written in C++ and
 7 | # Python. This file is part of eakmeans. See file COPYING for more details.
 8 | # 
 9 | # This file is part of eakmeans.
10 | # 
11 | # eakmeans is free software: you can redistribute it and/or modify
12 | # it under the terms of the 3-Clause BSD Licence. See
13 | # https://opensource.org/licenses/BSD-3-Clause for more details.
14 | # 
15 | # eakmeans is distributed in the hope that it will be useful,
16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
18 | # COPYING for more details.
19 | # 
20 | 
21 | 
22 | from distutils.core import Extension, setup
23 | from Cython.Build import cythonize
24 | 
25 | import os
26 | 
27 | #X_library_dir = "./"
28 | openblaslibdir = os.environ["LIBBLASDIR"] #/idiap/user/jnewling/openblas/lib"
29 | 	
30 | #LIBBLASDIR	
31 | 
32 | libname = "kmeans"
33 | 	
34 | if "WITHBLAS" not in  os.environ.keys():
35 | 	librariestouse = ["blaslesskmeans"]
36 | 	print "will build the python library pkmeans without blas (building with blas will make it faster)"
37 | 
38 | else:
39 | 	librariestouse = ["withblaskmeans", "openblas"]
40 | 	print "will build the python library pkmeans with blas (good choice)"
41 | 
42 | #TODO : sort out libname vs "kmeans" below
43 | 
44 | ########## Using Cython directly ###################################
45 | ext = Extension(libname, sources = [os.path.abspath("cythonsrc/kmeans.pyx")], include_dirs = ["cythonsrc", "src"], library_dirs = [openblaslibdir, "lib"], libraries = librariestouse, language = "c++")
46 | setup(name = libname, ext_modules = cythonize(ext))
47 | 
48 | 
49 | 
50 | 
51 | 
52 | 
53 | 
54 | 
55 | ############ Using precompiled cpp file (no need for Cython) ##########
56 | #ext = Extension("kmeans", sources = [os.path.abspath("./precythonised/kmeans.cpp")], include_dirs = [X_include_dir], library_dirs = [X_library_dir, openblaslibdir], libraries = librariestouse, language = "c++")
57 | #setup(name = "kmeans", ext_modules = [ext])
58 | 
59 | 
60 | 
61 | 
62 | 
63 | #if hostname == "goudurix12":
64 | 	#X_include_dir = "/idiap/home/jnewling/libraries/utility/pllkmeans/include"
65 | 	#X_library_dir = "/idiap/user/jnewling/own/templib"
66 | 	#openblaslibdir = "/idiap/user/jnewling/openblas/lib"
67 | 
68 | #else:
69 | 	#X_include_dir = "/home/james/libraries/utility/pllkmeans/include"
70 | 	#X_library_dir = "/home/james/oak/own/templib"
71 | 	#openblaslibdir = "/home/james/oak/openblas/lib"
72 | 


--------------------------------------------------------------------------------
/src/BaseGrowBatch.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
  3 | Written by James Newling <james.newling@gmail.com>
  4 | All rights reserved.
  5 | 
  6 | eakmeans is a library for exact and approximate k-means written in C++ and
  7 | Python. This file is part of eakmeans. See file COPYING for more details.
  8 | 
  9 | This file is part of eakmeans.
 10 | 
 11 | eakmeans is free software: you can redistribute it and/or modify
 12 | it under the terms of the 3-Clause BSD Licence. See
 13 | https://opensource.org/licenses/BSD-3-Clause for more details.
 14 | 
 15 | eakmeans is distributed in the hope that it will be useful,
 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
 18 | COPYING for more details.
 19 | */
 20 | 
 21 | #ifndef BASEGROWBATCH_H
 22 | #define BASEGROWBATCH_H
 23 | 
 24 | #include "basekmeans.h"
 25 | 
 26 | #include "growbatchapp.h"
 27 | 
 28 | 
 29 | namespace kmeans{
 30 | template <typename TInt, typename TFloat>
 31 | class BaseGrowBatch : public kmeans::BaseKmeans<TInt, TFloat>{
 32 | 	//Base for the dense grow-batch k-means classes: owns the GBApp state (gba) and leaves task construction to subclasses.
 33 | 	private:
 34 | 		virtual void endroundupdate() override final{ //complete when (no label changes and all data active) or a time/round limit is hit; then reset nchanges and advance round
 35 | 			this->iscomplete = (this->nchanges == 0 && (this->gba.ndata_active == this->ndata)) || (this->duration > this->maxtime) || (this->round >= this->maxrounds);
 36 | 			this->nchanges = 0;
 37 | 			++this->round;
 38 | 		}
 39 | 
 40 | 	protected:	
 41 | 		//Grow-batch state; handed to BGB_constructor_helper and set_summaries_growbatch, and read for ndata_active and delta_C.
 42 | 		growbatchapp::GBApp<TInt, TFloat> gba;
 43 | 		//For the data used in an X update, some algorithms 
 44 | 		//do full on data x centroid multiplications, how much 
 45 | 		//data can be used per thread in a full data x centroid
 46 | 		// product?
 47 | 		TInt maxpermultiplyblock;	
 48 | 		//Pure virtuals to be implemented by subclasses.
 49 | 		virtual void set_X_tasks() = 0;
 50 | 		virtual void set_C_tasks() = 0;
 51 | 		//Raw pointer to gba.delta_C.
 52 | 		inline TFloat * const get_delta_C(){
 53 | 			return this->gba.delta_C.get();
 54 | 		}
 55 | 		
 56 | 		virtual void set_initialisation_tasks() = 0;		
 57 | 		//Sets the algorithm name and sizes maxpermultiplyblock as max(1, ndata*dimension / (2*ncentroids*nthreads)).
 58 | 		void BGB_constructor_helper_densebits(){
 59 | 
 60 | 			//stuff specific to dense goes here.
 61 | 			
 62 | 			this->setalgname("Dense Base Grow Batch");
 63 | 			
 64 | 			this->maxpermultiplyblock =
 65 | 			std::max(static_cast<TInt> (1),
 66 | 			static_cast<TInt> ((this->getndata() * this->getdimension())/(2 * this->getncentroids() * this->nthreads)));			
 67 | 		}
 68 | 		
 69 | 		//Forwards X_updater and args to arrutilv2::pll_update_L_etc, offsetting dsums by ti*dimension*ncentroids and dcounts by ti*ncentroids.
 70 | 		//A c&p from minibatch base. Not as code reducing as the baseexact version, but easier to understand
 71 | 		template <typename Function, typename... Args>
 72 | 		void gb_pll_principal_X(const Function & X_updater, TInt ti, Args&&... args){
 73 | 	
 74 | 			arrutilv2::pll_update_L_etc(
 75 | 			//The compulsory parameters to pll_update_L_etc,
 76 | 			X_updater, 
 77 | 			this->ncentroids, this->dimension, this->get_sums(), this->get_dsums() + ti*this->dimension*this->ncentroids, this->get_counts(), this->get_dcounts() + ti*this->ncentroids, this->nchanges, this->ndcalcs_X, this->work_mutex
 78 | 			//The additional parameters to pll_update_L_etc with correct offset
 79 | 			, std::forward<Args>(args)...);
 80 | 		}
 81 | 		
 82 | 		
 83 | 	private:
 84 | 		virtual void set_summaries() override final{
 85 | 			this->set_summaries_growbatch(this->gba);
 86 | 		}
 87 | 
 88 | 		//Sets mse to -1 while some data is still inactive, otherwise to arrutilv2::getmeanl22at over all ndata samples.
 89 | 		//Note that BaseGrowBatchMse overrides this.
 90 | 		virtual void set_mse() override {
 91 | 			//if not all data is active, refuse to compute the mse
 92 | 			if (this->gba.ndata_active != this->ndata){
 93 | 				this->mse = -1;
 94 | 			}
 95 | 			
 96 | 			else{
 97 | 				this->mse = arrutilv2::getmeanl22at(this->ncentroids, this->dimension, this->get_C(), this->ndata, this->data, this->get_L(), this->get_C_l22s(), this->get_data_l22s());
 98 | 			}
 99 | 		}
100 | 		
101 | 	
102 | 	
103 | 		
104 | 		
105 | 		
106 | 
107 | 	public:
108 | 		//batchsize0 is handed to BGB_constructor_helper together with gba; all remaining arguments are forwarded to BaseKmeans.
109 | 		template<typename... Args>
110 | 		BaseGrowBatch(TInt batchsize0, Args&&... args): kmeans::BaseKmeans<TInt, TFloat> (std::forward<Args>(args)...)		
111 | 		{
112 | 			this->BGB_constructor_helper(batchsize0, this->gba);
113 | 			this->BGB_constructor_helper_densebits();
114 | 		}
115 | 		virtual ~BaseGrowBatch(){};
116 | };
117 | }
118 | 
119 | #endif
120 | 


--------------------------------------------------------------------------------
/src/BaseGrowBatchMse.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
 3 | Written by James Newling <james.newling@gmail.com>
 4 | All rights reserved.
 5 | 
 6 | eakmeans is a library for exact and approximate k-means written in C++ and
 7 | Python. This file is part of eakmeans. See file COPYING for more details.
 8 | 
 9 | This file is part of eakmeans.
10 | 
11 | eakmeans is free software: you can redistribute it and/or modify
12 | it under the terms of the 3-Clause BSD Licence. See
13 | https://opensource.org/licenses/BSD-3-Clause for more details.
14 | 
15 | eakmeans is distributed in the hope that it will be useful,
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
18 | COPYING for more details.
19 | */
20 | 
21 | #ifndef BASEGROWBATCHMSE_H
22 | #define BASEGROWBATCHMSE_H
23 | 
24 | #include "BaseGrowBatch.h"
25 | #include "growbatchapp.h"
26 | 
27 | #include <algorithm>
28 | #include <functional>
29 | namespace kmeans{
30 | 	
31 | template <typename TInt, typename TFloat>
32 | /* A type of GrowBatch, so the batch size doubles when determined to be appropriate.
33 |  * Specifically, we monitor 
34 |  * (1) mse per cluster, 
35 |  * (2) delta_C per cluster 
36 |  * and if the *median* of mse/delta_C > *1*, double (while possible). See the basecluster function for details (it lives in basecluster so that sparse can use it as well).
37 |  * */
38 |  
39 | class BaseGrowBatchMse : public kmeans::BaseGrowBatch<TInt, TFloat>{
40 | 	//set_mse here replaces the BaseGrowBatch version and delegates to gbmse_set_mse with gba and gbmseapp.
41 | 	private:		
42 | 		virtual void set_mse() override final {
43 | 			this->gbmse_set_mse(this->gba, this->gbmseapp);
44 | 		}
45 | 		
46 | 	protected:
47 | 		//MSE-related state; this class reads its dn and mse_by_cluster members.
48 | 		growbatchapp::GBMseApp<TInt, TFloat> gbmseapp;
49 | 		//Raw pointer to gbmseapp.dn.
50 | 		TFloat * const get_dn(){
51 | 			return this->gbmseapp.dn.get();
52 | 		}
53 | 		//The doubling decision is delegated to gbmse_should_double, given gba and the mse_by_cluster array.
54 | 		virtual bool should_double() override final{
55 | 			return this->gbmse_should_double(this->gba, this->gbmseapp.mse_by_cluster.data());
56 | 		}
57 | 		//For samples [x0, x1): fills L and dn via arrutilv2::set_rargminmins and adds its distance-calculation count to ndcalcs_X.
58 | 		virtual void set_L_dn(TInt x0, TInt x1) override final {
59 | 			TInt local_ndcalcs = 0;			
60 | 			arrutilv2::set_rargminmins(x1 - x0, this->dimension, this->data + x0*this->dimension, this->ncentroids, this->get_C(), this->get_data_l22s() + x0, this->get_C_l22s(), this->get_L() + x0, this->get_dn() + x0, local_ndcalcs);
61 | 			this->ndcalcs_X += local_ndcalcs;
62 | 		}
63 | 			
64 | 		virtual void set_initialisation_tasks() = 0;
65 | 		
66 | 		virtual void set_X_tasks() override final{
67 | 			/* using function as defined in basedensecentroidkmeans */
68 | 			this->X_tasks = this->bgbmse_update_L_dn_etc_S_H_batch_switch_mati(this->gba); 		
69 | 		}
70 | 		
71 | 		virtual void set_C_tasks() = 0;
72 | 		
73 | 		public:
74 | 			//batchsize0 and args are forwarded to BaseGrowBatch; afterwards BGBM_constructor_helper is given gbmseapp.
75 | 			template<typename... Args>
76 | 			BaseGrowBatchMse(TInt batchsize0, Args&&... args): kmeans::BaseGrowBatch<TInt, TFloat> (batchsize0, std::forward<Args>(args)...)		
77 | 			{
78 | 				this->BGBM_constructor_helper(this->gbmseapp);			
79 | 			}
80 | 				
81 | 			virtual ~BaseGrowBatchMse(){};
82 | 
83 | };
84 | }
85 | 
86 | #endif
87 | 


--------------------------------------------------------------------------------
/src/BaseSparseGrowBatch.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
  3 | Written by James Newling <james.newling@gmail.com>
  4 | All rights reserved.
  5 | 
  6 | eakmeans is a library for exact and approximate k-means written in C++ and
  7 | Python. This file is part of eakmeans. See file COPYING for more details.
  8 | 
  9 | This file is part of eakmeans.
 10 | 
 11 | eakmeans is free software: you can redistribute it and/or modify
 12 | it under the terms of the 3-Clause BSD Licence. See
 13 | https://opensource.org/licenses/BSD-3-Clause for more details.
 14 | 
 15 | eakmeans is distributed in the hope that it will be useful,
 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
 18 | COPYING for more details.
 19 | */
 20 | 
 21 | //Based blindly on BaseGrowBatch.h
 22 | 
 23 | #ifndef BASESPARSEGROWBATCH_H
 24 | #define BASESPARSEGROWBATCH_H
 25 | 
 26 | #include "basesparsekmeans.h"
 27 | #include "growbatchapp.h"
 28 | 
 29 | 
 30 | namespace kmeans{
 31 | template <typename TInt, typename TFloat>
 32 | class BaseSparseGrowBatch : public kmeans::BaseSparseKmeans<TInt, TFloat>{
 33 | 	//Base for the sparse grow-batch k-means classes: owns the GBApp state (gba) and builds the X-phase tasks; subclasses supply the per-range hooks.
 34 | 	private: 	
 35 | 		//Complete when (no label changes and all data active) or a time/round limit is hit; then reset nchanges and advance round.
 36 | 		virtual void endroundupdate() override final{
 37 | 			this->iscomplete = (this->nchanges == 0 && (this->gba.ndata_active == this->ndata)) || (this->duration > this->maxtime) || (this->round >= this->maxrounds);
 38 | 			this->nchanges = 0;
 39 | 			++this->round;
 40 | 		}
 41 | 		//Hooks called by the X tasks below for thread ti on samples [x0, x1): the first on the previously active range, the second on the newly activated tail.
 42 | 		virtual void sgb_update_L_etc(TInt x0, TInt x1, TInt ti) = 0;
 43 | 		virtual void sgb_set_L_etc(TInt x0, TInt x1, TInt ti) = 0;
 44 | 
 45 | 	protected:	
 46 | 		//Grow-batch state; handed to BGB_constructor_helper and set_summaries_growbatch, and read for ndata_active, ndata_active_previous and delta_C.
 47 | 		growbatchapp::GBApp<TInt, TFloat> gba;
 48 | 
 49 | 		virtual void set_C_tasks() = 0;
 50 | 		//Raw pointer to gba.delta_C.
 51 | 		inline TFloat * const get_delta_C(){
 52 | 			return this->gba.delta_C.get();
 53 | 		}
 54 | 		
 55 | 		virtual void set_initialisation_tasks() = 0;		
 56 | 
 57 | 		void BGB_constructor_helper_sparsebits(){
 58 | 			this->setalgname("Sparse Base Grow Batch");
 59 | 		}
 60 | 		
 61 | 		
 62 | 		//This may be a bit premature : hoping that my assumption of one task per update round is ~accurate.
 63 | 		virtual void set_X_tasks() override final{ 
 64 | 			this->X_tasks = this->sgb_update_L_dn_etc_S_H_mati(); 		
 65 | 		}
 66 | 
 67 | 		//Returns four tasks taking a thread index ti: (1) sgb_update_L_etc on an even split of [0, ndata_active_previous), (2) thread 0 applies where_label_changes to sums/counts, (3) sgb_set_L_etc on an even split of the new tail, (4) thread 0 adds the tail to sums/counts.
 68 | 		std::vector<std::function<void(TInt)> > sgb_update_L_dn_etc_S_H_mati(){
 69 | 		std::vector<std::function<void(TInt)> > tasks = {};
 70 | 			tasks.emplace_back (
 71 | 				//update L and dn of data used in previous round			
 72 | 				[this](TInt ti){
 73 | 					TInt x0 = (ti*this->gba.ndata_active_previous)/this->nthreads;
 74 | 					TInt x1 = ((ti+1)*this->gba.ndata_active_previous)/this->nthreads;
 75 | 					
 76 | 					this->sgb_update_L_etc(x0, x1, ti);
 77 | 					
 78 | 				}
 79 | 			);
 80 | 			
 81 | 			tasks.emplace_back(
 82 | 				[this](TInt ti){
 83 | 					if (ti == 0){
 84 | 						sparse::update_S_H_from_label_changes(*this->ptrdata, this->where_label_changes, this->get_sums(), this->get_counts(), this->nchanges);	 
 85 | 					}
 86 | 				}
 87 | 			);
 88 | 			//NOTE(review): the next task does nchanges += on every thread without taking a lock here - confirm nchanges is safe for concurrent updates.
 89 | 			tasks.emplace_back(
 90 | 				[this](TInt ti){
 91 | 				//set L and dn of unused 
 92 | 					if (this->gba.ndata_active != this->gba.ndata_active_previous){
 93 | 						TInt ndata_tail = this->gba.ndata_active - this->gba.ndata_active_previous;
 94 | 						TInt x0 =  this->gba.ndata_active_previous + (ti*ndata_tail)/this->nthreads;
 95 | 						TInt x1 =  this->gba.ndata_active_previous + ((ti + 1)*ndata_tail)/this->nthreads;
 96 | 						this->sgb_set_L_etc(x0, x1, ti);
 97 | 						this->nchanges += x1 - x0;
 98 | 					} 
 99 | 				}
100 | 			);
101 | 			
102 | 			tasks.emplace_back(
103 | 				[this](TInt ti){
104 | 					if (ti == 0){
105 | 						if (this->gba.ndata_active != this->gba.ndata_active_previous){
106 | 							sparse::increment_S_H(this->gba.ndata_active_previous, 
107 | 							this->gba.ndata_active, 
108 | 							*this->ptrdata, 
109 | 							this->get_L(), 
110 | 							this->get_sums(), 
111 | 							this->get_counts());
112 | 						}
113 | 					}
114 | 				}
115 | 			);
116 | 			return tasks;
117 | 		}
118 | 		
119 | 		
120 | 	private:
121 | 		virtual void set_summaries() override final{
122 | 			this->set_summaries_growbatch(this->gba);
123 | 		}
124 | 		//Sets mse to -1 while some data is still inactive, otherwise to getmeanl22at().
125 | 		virtual void set_mse() override {
126 | 			if (this->gba.ndata_active != this->ndata){
127 | 				this->mse = -1;
128 | 			}
129 | 			
130 | 			else{
131 | 				this->mse = this->getmeanl22at();
132 | 			}
133 | 		}
134 | 		
135 | 	
136 | 	
137 | 		
138 | 		
139 | 		
140 | 
141 | 	public:
142 | 		//batchsize0 is handed to BGB_constructor_helper together with gba; all remaining arguments are forwarded to BaseSparseKmeans.
143 | 		template<typename... Args>
144 | 		BaseSparseGrowBatch(TInt batchsize0, Args&&... args): kmeans::BaseSparseKmeans<TInt, TFloat> (std::forward<Args>(args)...)
145 | 		{
146 | 			this->BGB_constructor_helper(batchsize0, this->gba);
147 | 			this->BGB_constructor_helper_sparsebits();
148 | 
149 | 		}
150 | 		virtual ~BaseSparseGrowBatch(){};
151 | };
152 | }
153 | 
154 | #endif
155 | 


--------------------------------------------------------------------------------
/src/BaseSparseGrowBatchMse.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
 3 | Written by James Newling <james.newling@gmail.com>
 4 | All rights reserved.
 5 | 
 6 | eakmeans is a library for exact and approximate k-means written in C++ and
 7 | Python. This file is part of eakmeans. See file COPYING for more details.
 8 | 
 9 | This file is part of eakmeans.
10 | 
11 | eakmeans is free software: you can redistribute it and/or modify
12 | it under the terms of the 3-Clause BSD Licence. See
13 | https://opensource.org/licenses/BSD-3-Clause for more details.
14 | 
15 | eakmeans is distributed in the hope that it will be useful,
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
18 | COPYING for more details.
19 | */
20 | 
21 | //based blindly on BaseGrowBatchMse.h, almost identical. 
22 | 
23 | #ifndef BASESPARSEGROWBATCHMSE_H
24 | #define BASESPARSEGROWBATCHMSE_H
25 | 
26 | #include "BaseSparseGrowBatch.h"
27 | #include "growbatchapp.h"
28 | 
29 | #include <algorithm>
30 | #include <functional>
31 | namespace kmeans{
32 | 	
33 | template <typename TInt, typename TFloat>
34 | /* A type of GrowBatch, so the batch size doubles when determined to be appropriate.
35 |  * Specifically, we monitor 
36 |  * (1) mse per cluster, 
37 |  * (2) delta_C per cluster 
38 |  * and if the *median* of mse/delta_C > *1*, double (while possible). See the basecluster function for details (it lives in basecluster so that sparse can use it as well).
39 |  * */
40 |  
41 | class BaseSparseGrowBatchMse : public kmeans::BaseSparseGrowBatch<TInt, TFloat>{
42 | 	//set_mse here replaces the BaseSparseGrowBatch version and delegates to gbmse_set_mse with gba and gbmseapp.
43 | 	private:		
44 | 		virtual void set_mse() override final {
45 | 			this->gbmse_set_mse(this->gba, this->gbmseapp);
46 | 		}
47 | 		
48 | 		
49 | 		
50 | 		
51 | 		
52 | 	protected:
53 | 		//MSE-related state; this class reads its dn and mse_by_cluster members.
54 | 		growbatchapp::GBMseApp<TInt, TFloat> gbmseapp;
55 | 		//Raw pointer to gbmseapp.dn.
56 | 		TFloat * const get_dn(){
57 | 			return this->gbmseapp.dn.get();
58 | 		}
59 | 		//The doubling decision is delegated to gbmse_should_double, given gba and the mse_by_cluster array.
60 | 		virtual bool should_double() override final{
61 | 			return this->gbmse_should_double(this->gba, this->gbmseapp.mse_by_cluster.data());
62 | 		}
63 | 		//Calls sparse::set_L_dn with the range x0, x1 and un-offset array pointers, then adds (x1 - x0)*ncentroids to ndcalcs_X.
64 | 		virtual void set_L_dn(TInt x0, TInt x1) override final {
65 | 			sparse::set_L_dn(*this->ptrdata, x0, x1, this->ncentroids, this->get_C(), 
66 | 			this->get_data_l22s(), this->get_C_l22s(), this->get_L(), this->get_dn());
67 | 			this->ndcalcs_X += (x1 - x0)*this->ncentroids;
68 | 		}
69 | 			
70 | 		virtual void set_initialisation_tasks() = 0;
71 | 				
72 | 		virtual void set_C_tasks() = 0;
73 | 		
74 | 		public:
75 | 			//batchsize0 and args are forwarded to BaseSparseGrowBatch; afterwards BGBM_constructor_helper is given gbmseapp.
76 | 			template<typename... Args>
77 | 			BaseSparseGrowBatchMse(TInt batchsize0, Args&&... args): kmeans::BaseSparseGrowBatch<TInt, TFloat> (batchsize0, std::forward<Args>(args)...)		
78 | 			{
79 | 				this->BGBM_constructor_helper(this->gbmseapp);			
80 | 			}
81 | 				
82 | 			virtual ~BaseSparseGrowBatchMse(){};
83 | 
84 | };
85 | }
86 | 
87 | #endif
88 | 


--------------------------------------------------------------------------------
/src/GBMse3v1.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
  3 | Written by James Newling <james.newling@gmail.com>
  4 | All rights reserved.
  5 | 
  6 | eakmeans is a library for exact and approximate k-means written in C++ and
  7 | Python. This file is part of eakmeans. See file COPYING for more details.
  8 | 
  9 | This file is part of eakmeans.
 10 | 
 11 | eakmeans is free software: you can redistribute it and/or modify
 12 | it under the terms of the 3-Clause BSD Licence. See
 13 | https://opensource.org/licenses/BSD-3-Clause for more details.
 14 | 
 15 | eakmeans is distributed in the hope that it will be useful,
 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
 18 | COPYING for more details.
 19 | */
 20 | 
 21 | #ifndef GBMse3v1_H
 22 | #define GBMse3v1_H
 23 | 
 24 | 
 25 | 
 26 | #include "BaseGrowBatchMse.h"
 27 | #include "alg_X_selkSN.h"
 28 | #include "arrutilv2l3.h"
 29 | 
 30 | namespace kmeans{
 31 | 	
 32 | //The updating with 3v1 is similar to that of 3v0, but takes advantage of the fact that upper bounds are always tight. (dn is always the distance to the nearest centroid).
 33 | 	
 34 | template <typename TInt, typename TFloat>
 35 |  
 36 | class GBMse3v1 : public kmeans::BaseGrowBatchMse<TInt, TFloat>{
 37 | 	//Dense grow-batch k-means that keeps a "lowers" array with ncentroids entries per sample (elkan_lowers_base) alongside L and dn.
 38 | 	private:		
 39 | 		//For samples [x0, x1) on thread ti: runs update_L_lowers_upper_S_H_3v1 through gb_pll_principal_X, with per-sample arrays offset by x0.
 40 | 		virtual void update_already_used(TInt x0, TInt x1, TInt ti) override final{
 41 | 			
 42 | 			this->gb_pll_principal_X(
 43 | 			kmeans::update_L_lowers_upper_S_H_3v1<TInt, TFloat>, 
 44 | 			ti,
 45 | 			x1 - x0,
 46 | 			this->data + x0*this->dimension,
 47 | 			this->get_C(),
 48 | 			this->get_data_l22s() + x0, 
 49 | 			this->get_C_l22s(), 
 50 | 			this->gba.delta_C.get(), 
 51 | 			this->get_L() + x0, 
 52 | 			this->get_lowers() + x0*this->ncentroids, 
 53 | 			this->get_dn() + x0, 
 54 | 			this->round);
 55 | 			
 56 | 		}
 57 | 		//For samples [x0, x1) on thread ti: calls arrutilv2::set_L_lowers_dn_and_increment_S_H, then adds (x1 - x0)*ncentroids to ndcalcs_X.
 58 | 		virtual void update_unused(TInt x0, TInt x1, TInt ti) override final{
 59 | 					
 60 | 			arrutilv2::set_L_lowers_dn_and_increment_S_H(x1 - x0, this->dimension, this->data + this->dimension*x0, this->ncentroids, this->get_C(), this->get_data_l22s() + x0, this->get_C_l22s(), this->get_L() + x0, this->get_lowers() + x0*this->ncentroids, this->get_dn() + x0, this->get_dsums() + ti*this->dimension*this->ncentroids, this->get_dcounts() + ti*this->ncentroids, this->get_sums(), this->get_counts(), this->nchanges, this->work_mutex);
 61 | 			this->ndcalcs_X += (x1 - x0)*this->ncentroids;
 62 | 		}
 63 | 
 64 | 	
 65 | 	protected:
 66 | 		//Raw pointer to elkan_lowers_base.
 67 | 		TFloat * const get_lowers(){
 68 | 			return this->elkan_lowers_base.get();
 69 | 		}
 70 | 		//Appends the tasks from bgbmse_makeset_C_C_l22s_L_lowers_dn_inds0_mati, then the base_set_S_H_ati task for the range 0 .. gba.ndata_active.
 71 | 		virtual void set_initialisation_tasks() override final{
 72 | 			
 73 | 			auto init_tasks_A = this->bgbmse_makeset_C_C_l22s_L_lowers_dn_inds0_mati(this->gba);
 74 | 			
 75 | 			auto init_task_B = this->base_set_S_H_ati(static_cast<TInt>(0), this->gba.ndata_active);	
 76 | 			this->initialisation_tasks.insert(this->initialisation_tasks.end(), init_tasks_A.begin(), init_tasks_A.end());
 77 | 			this->initialisation_tasks.push_back(init_task_B);
 78 | 		}
 79 | 		//Rebuilds C_tasks as three tasks, in this order: update_C_C_l22s_delta_C_from_SH_ati, set_mse_sse_by_cluster_ati, update_ndata_active_ati.
 80 | 		virtual void set_C_tasks() override final{
 81 | 			this->C_tasks = {};
 82 | 			this->C_tasks.push_back(	
 83 | 				arrutilv2::update_C_C_l22s_delta_C_from_SH_ati(this->nthreads, this->ncentroids, this->dimension, this->get_sums(), this->get_counts(), this->get_C(), this->get_C_l22s(), this->gba.delta_C.get(), this->ndcalcs_notX)
 84 | 			);
 85 | 			this->C_tasks.push_back(
 86 | 				this->set_mse_sse_by_cluster_ati(this->gba, this->gbmseapp)
 87 | 			);				
 88 | 			this->C_tasks.push_back(
 89 | 				this->update_ndata_active_ati(this->gba)
 90 | 			);		
 91 | 		}
 92 | 		//For samples [x0, x1): fills lowers, L and dn via arrutilv2::set_rrl2ss_argminmins. ndcalcs_X is not touched here.
 93 | 		virtual void set_L_lowers_dn(TInt x0, TInt x1) override final{
 94 | 						
 95 | 			arrutilv2::set_rrl2ss_argminmins<TInt, TFloat>(x1 - x0, this->dimension, 
 96 | 			this->data + x0*this->dimension, this->ncentroids, 
 97 | 			this->get_C(), this->get_data_l22s() + x0, this->get_C_l22s(), 
 98 | 			this->elkan_lowers_base.get() + x0*this->ncentroids, 
 99 | 			this->get_L() + x0, 
100 | 			this->gbmseapp.dn.get() + x0
101 | 			
102 | 			);
103 | 		}
104 | 
105 | 	public:
106 | 		//Forwards batchsize0 and args to BaseGrowBatchMse, then calls assignmemory_elkan_lowers and sets the algorithm name.
107 | 		template<typename... Args>
108 | 		GBMse3v1(TInt batchsize0, Args&&... args): kmeans::BaseGrowBatchMse<TInt, TFloat> (batchsize0, std::forward<Args>(args)...)		
109 | 		{
110 | 			this->assignmemory_elkan_lowers(); 
111 | 			this->algname = "GBMse Elkan 3v1";
112 | 		}
113 | 			
114 | 		virtual ~GBMse3v1(){};
115 | 
116 | };
117 | 
118 | }
119 | 
120 | #endif
121 | 


--------------------------------------------------------------------------------
/src/GBMseSimple.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
 3 | Written by James Newling <james.newling@gmail.com>
 4 | All rights reserved.
 5 | 
 6 | eakmeans is a library for exact and approximate k-means written in C++ and
 7 | Python. This file is part of eakmeans. See file COPYING for more details.
 8 | 
 9 | This file is part of eakmeans.
10 | 
11 | eakmeans is free software: you can redistribute it and/or modify
12 | it under the terms of the 3-Clause BSD Licence. See
13 | https://opensource.org/licenses/BSD-3-Clause for more details.
14 | 
15 | eakmeans is distributed in the hope that it will be useful,
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
18 | COPYING for more details.
19 | */
20 | 
21 | #ifndef GBMSESIMPLE_H
22 | #define GBMSESIMPLE_H
23 | 
24 | #include "BaseGrowBatchMse.h"
25 | 
26 | namespace kmeans{
27 | 	
28 | template <typename TInt, typename TFloat>
29 |  
30 | class GBMseSimple : public kmeans::BaseGrowBatchMse<TInt, TFloat>{
31 | 	//Dense grow-batch k-means without a lowers array: both update hooks go through the arrutilv2 batch routines, using maxpermultiplyblock.
32 | 	private:		
33 | 		//For samples [x0, x1) on thread ti: calls arrutilv2::update_L_dn_S_H_batch, then adds (x1 - x0)*ncentroids to ndcalcs_X.
34 | 		virtual void update_already_used(TInt x0, TInt x1, TInt ti) override final{
35 | 			arrutilv2::update_L_dn_S_H_batch(x1 - x0, this->maxpermultiplyblock, this->dimension, this->data + x0*this->dimension, this->ncentroids, this->get_C(), this->get_data_l22s() + x0, this->get_C_l22s(), this->get_L() + x0, this->get_dn() + x0, this->get_dsums() + ti*this->dimension*this->ncentroids, this->get_dcounts() + ti*this->ncentroids, this->get_sums(), this->get_counts(), this->nchanges, this->work_mutex);			
36 | 			this->ndcalcs_X += (x1 - x0)*this->ncentroids;
37 | 		}
38 | 		//Same argument layout as update_already_used, but calls the _increment_only variant of the batch routine.
39 | 		virtual void update_unused(TInt x0, TInt x1, TInt ti) override final{
40 | 			arrutilv2::update_L_dn_S_H_batch_increment_only(x1 - x0, this->maxpermultiplyblock, this->dimension, this->data + this->dimension*x0, this->ncentroids, this->get_C(), this->get_data_l22s() + x0, this->get_C_l22s(), this->get_L() + x0, this->get_dn() + x0, this->get_dsums() + ti*this->dimension*this->ncentroids, this->get_dcounts() + ti*this->ncentroids, this->get_sums(), this->get_counts(), this->nchanges, this->work_mutex);
41 | 			this->ndcalcs_X += (x1 - x0)*this->ncentroids;
42 | 		}
43 | 
44 | 	
45 | 	protected:
46 | 		//Appends the tasks from bgbmse_makeset_C_C_l22s_L_dn_inds0_mati, then the base_set_S_H_ati task for the range 0 .. gba.ndata_active.
47 | 		virtual void set_initialisation_tasks() override final{
48 | 			auto init_tasks_A = this->bgbmse_makeset_C_C_l22s_L_dn_inds0_mati(this->gba);
49 | 			auto init_task_B = this->base_set_S_H_ati(static_cast<TInt>(0), this->gba.ndata_active);	
50 | 			this->initialisation_tasks.insert(this->initialisation_tasks.end(), init_tasks_A.begin(), init_tasks_A.end());
51 | 			this->initialisation_tasks.push_back(init_task_B);
52 | 		}
53 | 		//Rebuilds C_tasks as three tasks, in this order: update_C_C_l22s_delta_C_from_SH_ati, set_mse_sse_by_cluster_ati, update_ndata_active_ati.
54 | 		virtual void set_C_tasks() override final{
55 | 			this->C_tasks = {};
56 | 			this->C_tasks.push_back(			
57 | 				arrutilv2::update_C_C_l22s_delta_C_from_SH_ati(this->nthreads, this->ncentroids, this->dimension, this->get_sums(), this->get_counts(), this->get_C(), this->get_C_l22s(), this->gba.delta_C.get(), this->ndcalcs_notX)
58 | 			);
59 | 			this->C_tasks.push_back(
60 | 				this->set_mse_sse_by_cluster_ati(this->gba, this->gbmseapp)
61 | 			);				
62 | 			this->C_tasks.push_back(
63 | 				this->update_ndata_active_ati(this->gba)
64 | 			);		
65 | 		}
66 | 
67 | 	public:
68 | 		//Forwards batchsize0 and args to BaseGrowBatchMse, then sets the algorithm name.
69 | 		template<typename... Args>
70 | 		GBMseSimple(TInt batchsize0, Args&&... args): kmeans::BaseGrowBatchMse<TInt, TFloat> (batchsize0, std::forward<Args>(args)...)		
71 | 		{
72 | 			this->algname = "GBMse Simple Dense"; 
73 | 		}
74 | 			
75 | 		virtual ~GBMseSimple(){};
76 | 
77 | };
78 | 
79 | 
80 | }
81 | 
82 | #endif
83 | 


--------------------------------------------------------------------------------
/src/SparseGBMse3v1.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
  3 | Written by James Newling <james.newling@gmail.com>
  4 | All rights reserved.
  5 | 
  6 | eakmeans is a library for exact and approximate k-means written in C++ and
  7 | Python. This file is part of eakmeans. See file COPYING for more details.
  8 | 
  9 | This file is part of eakmeans.
 10 | 
 11 | eakmeans is free software: you can redistribute it and/or modify
 12 | it under the terms of the 3-Clause BSD Licence. See
 13 | https://opensource.org/licenses/BSD-3-Clause for more details.
 14 | 
 15 | eakmeans is distributed in the hope that it will be useful,
 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
 18 | COPYING for more details.
 19 | */
 20 | 
 21 | #ifndef SPARSEGBMSE3V0_H
 22 | #define SPARSEGBMSE3V0_H
 23 | 
 24 | #include "BaseSparseGrowBatchMse.h"
 25 | 
 26 | #include "alg_X_selkSN.h"
 27 | 
 28 | namespace kmeans{
 29 | 	
 30 | template <typename TInt, typename TFloat>
 31 |  
 32 | class SparseGBMse3v1 : public kmeans::BaseSparseGrowBatchMse<TInt, TFloat>{
 33 | 	
 34 | 	private:		
 35 | 
 36 | 		//updates L, dn, this->mba.nchanges_on_batch[ti], ...
 37 | 		virtual void sgb_update_L_etc(TInt x0, TInt x1, TInt ti){
 38 | 			
 39 | 			this->where_label_changes[ti].clear(); //index, old, new.
 40 | 
 41 | 			TInt ndcalcs_local = 0;
 42 | 			kmeans::sparse_update_L_lowers_upper_where_changes_3v1(this->ncentroids, this->dimension, x0, x1, *this->ptrdata, this->get_C(), this->get_data_l22s() + x0, this->get_C_l22s(), this->gba.delta_C.get(), this->where_label_changes[ti], ndcalcs_local, this->get_L() + x0, this->get_lowers() + x0*this->ncentroids, this->get_dn() + x0);
 43 | 			
 44 | 			std::lock_guard<std::mutex> gluk(this->work_mutex);
 45 | 			this->ndcalcs_X += ndcalcs_local;
 46 | 		}
 47 | 
 48 | 		//sets L, dn, ...		
 49 | 		virtual void sgb_set_L_etc(TInt x0, TInt x1, TInt ti){
 50 | 
 51 | 			sparse::set_L_lowers_dn(*this->ptrdata, x0, x1, this->ncentroids, this->get_C(), this->get_data_l22s(), this->get_C_l22s(), this->get_L(), this->get_lowers(), this->get_dn());
 52 | 
 53 | 			std::lock_guard<std::mutex> gluk(this->work_mutex);
 54 | 			this->ndcalcs_X += this->ncentroids*(x1 - x0);			
 55 | 		}
 56 | 
 57 | 	
 58 | 
 59 | 	
 60 | 	protected:
 61 | 	
 62 | 		TFloat * const get_lowers(){
 63 | 			return this->elkan_lowers_base.get();
 64 | 		}
 65 | 	
 66 | 	
 67 | 	
 68 | 		virtual void set_initialisation_tasks() override final{
 69 | 			auto init_tasks_A = this->bgbmse_makeset_C_C_l22s_L_lowers_dn_inds0_mati(this->gba);		
 70 | 			auto init_task_B = this->base_set_S_H_ati(static_cast<TInt>(0), this->gba.ndata_active);	
 71 | 			this->initialisation_tasks.insert(this->initialisation_tasks.end(), init_tasks_A.begin(), init_tasks_A.end());
 72 | 			this->initialisation_tasks.push_back(init_task_B);
 73 | 		}
 74 | 		
 75 | 		virtual void set_C_tasks() override final{
 76 | 			this->C_tasks = {};
 77 | 			this->C_tasks.push_back(	
 78 | 				arrutilv2::update_C_C_l22s_delta_C_from_SH_ati(this->nthreads, this->ncentroids, this->dimension, this->get_sums(), this->get_counts(), this->get_C(), this->get_C_l22s(), this->gba.delta_C.get(), this->ndcalcs_notX)
 79 | 			);
 80 | 			this->C_tasks.push_back(
 81 | 				this->set_mse_sse_by_cluster_ati(this->gba, this->gbmseapp)
 82 | 			);				
 83 | 			this->C_tasks.push_back(
 84 | 				this->update_ndata_active_ati(this->gba)
 85 | 			);
 86 | 		}
 87 | 			
 88 | 			//this->C_tasks.push_back(
 89 | 				//[this](TInt ti){
 90 | 					//if (ti == 0){
 91 | 						//std::cout << "\n----------------------------------------------\n";
 92 | 						////TInt mincount = 100000;
 93 | 						//for (TInt ci = 0; ci < this->ncentroids; ++ci){
 94 | 							//std::cout << this->get_counts()[ci] << " ";
 95 | 							////if (mincount > this->get_counts()[ci]){
 96 | 								////mincount = this->get_counts()[ci];
 97 | 							////}
 98 | 						//}
 99 | 						//std::cout << "\n----------------------------------------------\n";
100 | 
101 | 						
102 | 					//}
103 | 				//}
104 | 			//);		
105 | 		
106 | 		
107 | 		virtual void set_L_lowers_dn(TInt x0, TInt x1) override final{
108 | 			//Thin wrapper : forwards x0, x1 and this object's data, centroid, label, lower-bound and dn arrays to sparse::set_L_lowers_dn.
109 | 			////TODO : do I need to increment ndcalcs_X?
110 | 			sparse::set_L_lowers_dn(*this->ptrdata, x0, x1, this->ncentroids, this->get_C(), this->get_data_l22s(), this->get_C_l22s(), this->get_L(), this->get_lowers(), this->get_dn());
111 | 
112 | 
113 | 
114 | 			//commented-out call, presumably the dense-data counterpart of the call above -- TODO confirm :
115 | 			//arrutilv2::set_rrl2ss_argminmins<TInt, TFloat>(x1 - x0, this->dimension, 
116 | 			//this->data + x0*this->dimension, this->ncentroids, 
117 | 			//this->get_C(), this->get_data_l22s() + x0, this->get_C_l22s(), 
118 | 			//this->elkan_lowers_base.get() + x0*this->ncentroids, 
119 | 			//this->get_L() + x0, 
120 | 			//this->gbmseapp.dn.get() + x0
121 | 			
122 | 			//);
123 | 		}
124 | 
125 | 	public:
126 | 
127 | 		template<typename... Args> //batchsize0 and all further arguments are handed on to the base class constructor (the latter via std::forward)
128 | 		SparseGBMse3v1(TInt batchsize0, Args&&... args): kmeans::BaseSparseGrowBatchMse<TInt, TFloat> (batchsize0, std::forward<Args>(args)...)		
129 | 		{
130 | 			this->assignmemory_elkan_lowers(); //presumably allocates the lower-bound storage read through get_lowers() -- TODO confirm
131 | 			this->algname = "GBMse 3v0 Sparse (turbocharged-rho)"; //NOTE(review): the label says 3v0 while the class is 3v1 -- confirm which is intended
132 | 		}
133 | 			
134 | 		virtual ~SparseGBMse3v1(){}; //virtual destructor with an empty body
135 | 
136 | };
137 | 
138 | 
139 | }
140 | 
141 | #endif
142 | 


--------------------------------------------------------------------------------
/src/SparseGBMseSimple.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
  3 | Written by James Newling <james.newling@gmail.com>
  4 | All rights reserved.
  5 | 
  6 | eakmeans is a library for exact and approximate k-means written in C++ and
  7 | Python. This file is part of eakmeans. See file COPYING for more details.
  8 | 
  9 | This file is part of eakmeans.
 10 | 
 11 | eakmeans is free software: you can redistribute it and/or modify
 12 | it under the terms of the 3-Clause BSD Licence. See
 13 | https://opensource.org/licenses/BSD-3-Clause for more details.
 14 | 
 15 | eakmeans is distributed in the hope that it will be useful,
 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
 18 | COPYING for more details.
 19 | */
 20 | 
 21 | //blindly based on GBMseSimple, again very similar.
 22 | 
 23 | #ifndef SPARSEGBMSESIMPLE_H
 24 | #define SPARSEGBMSESIMPLE_H
 25 | 
 26 | #include "BaseSparseGrowBatchMse.h"
 27 | 
 28 | namespace kmeans{
 29 | 	
 30 | template <typename TInt, typename TFloat>
 31 |  //Sparse-data GBMse variant in which every label (re)computation is counted as ncentroids distance calculations per sample.
 32 | class SparseGBMseSimple : public kmeans::BaseSparseGrowBatchMse<TInt, TFloat>{
 33 | 	
 34 | 	private:		
 35 | 
 36 | 		//updates L and dn for the range given by x0, x1 (via sparse::update_L_dn, which is also handed where_label_changes[ti]), then adds to ndcalcs_X
 37 | 		virtual void sgb_update_L_etc(TInt x0, TInt x1, TInt ti){
 38 | 			//the change list of thread ti is emptied before the update call receives it
 39 | 			this->where_label_changes[ti].clear(); //index, old, new.
 40 | 			sparse::update_L_dn(*this->ptrdata, x0, x1, this->ncentroids, this->get_C(), this->get_data_l22s(), this->get_C_l22s(), this->get_L(), this->get_dn(), this->where_label_changes[ti]);
 41 | 			//the lock is taken only now : the update above runs unlocked, only the shared counter update is serialised
 42 | 			std::lock_guard<std::mutex> gluk(this->work_mutex);
 43 | 			//this->nchanges += this->where_label_changes[ti].size();
 44 | 			this->ndcalcs_X += this->ncentroids*(x1 - x0);			
 45 | 		}
 46 | 
 47 | 		//sets L, dn for the range given by x0, x1 (via sparse::set_L_dn), then adds to ndcalcs_X under work_mutex. ti is not used.
 48 | 		virtual void sgb_set_L_etc(TInt x0, TInt x1, TInt ti){
 49 | 
 50 | 			sparse::set_L_dn(*this->ptrdata, x0, x1, this->ncentroids, this->get_C(), this->get_data_l22s(), this->get_C_l22s(), this->get_L(), this->get_dn());
 51 | 
 52 | 			std::lock_guard<std::mutex> gluk(this->work_mutex);
 53 | 				this->ndcalcs_X += this->ncentroids*(x1 - x0);			
 54 | 		}
 55 | 
 56 | 	
 57 | 
 58 | 	
 59 | 	protected:
 60 | 		//appends to initialisation_tasks : first the tasks returned by bgbmse_makeset_C_C_l22s_L_dn_inds0_mati, then the single task from base_set_S_H_ati
 61 | 		virtual void set_initialisation_tasks() override final{
 62 | 			auto init_tasks_A = this->bgbmse_makeset_C_C_l22s_L_dn_inds0_mati(this->gba);
 63 | 			auto init_task_B = this->base_set_S_H_ati(static_cast<TInt>(0), this->gba.ndata_active);	
 64 | 			this->initialisation_tasks.insert(this->initialisation_tasks.end(), init_tasks_A.begin(), init_tasks_A.end());
 65 | 			this->initialisation_tasks.push_back(init_task_B);
 66 | 		}
 67 | 		//rebuilds C_tasks from scratch : three tasks are pushed, in the order written below
 68 | 		virtual void set_C_tasks() override final{
 69 | 			this->C_tasks = {};
 70 | 			
 71 | 			
 72 | 			//we use this codefrag to confirm that S and H are correctly set. The condition is always false, so the task is never added as written.
 73 | 			if (true == false){
 74 | 				this->C_tasks.push_back(
 75 | 					[this](TInt ti){
 76 | 						if (ti == 0){
 77 | 							sparse::todense::set_S_H(*this->ptrdata, static_cast<TInt> (0), this->gba.ndata_active, this->ncentroids, this->get_L(), this->get_sums(), this->get_counts());
 78 | 						}
 79 | 					}
 80 | 				);
 81 | 			}
 82 | 
 83 | 
 84 | 
 85 | 			this->C_tasks.push_back(			
 86 | 				arrutilv2::update_C_C_l22s_delta_C_from_SH_ati(this->nthreads, this->ncentroids, this->dimension, this->get_sums(), this->get_counts(), this->get_C(), this->get_C_l22s(), this->gba.delta_C.get(), this->ndcalcs_notX)
 87 | 			);
 88 | 			this->C_tasks.push_back(
 89 | 				this->set_mse_sse_by_cluster_ati(this->gba, this->gbmseapp)
 90 | 			);				
 91 | 			this->C_tasks.push_back(
 92 | 				this->update_ndata_active_ati(this->gba)
 93 | 			);
 94 | 			
 95 | 		}
 96 | 
 97 | 	public:
 98 | 		//batchsize0 and all further arguments are handed on to the base class constructor; only algname is set here
 99 | 		template<typename... Args>
100 | 		SparseGBMseSimple(TInt batchsize0, Args&&... args): kmeans::BaseSparseGrowBatchMse<TInt, TFloat> (batchsize0, std::forward<Args>(args)...)		
101 | 		{
102 | 			this->algname = "GBMse Simple Sparse"; 
103 | 		}
104 | 			
105 | 		virtual ~SparseGBMseSimple(){};
106 | 
107 | };
108 | 
109 | 
110 | }
111 | 
112 | #endif
113 | 


--------------------------------------------------------------------------------
/src/YY17v2.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
  3 | Written by James Newling <james.newling@gmail.com>
  4 | All rights reserved.
  5 | 
  6 | eakmeans is a library for exact and approximate k-means written in C++ and
  7 | Python. This file is part of eakmeans. See file COPYING for more details.
  8 | 
  9 | This file is part of eakmeans.
 10 | 
 11 | eakmeans is free software: you can redistribute it and/or modify
 12 | it under the terms of the 3-Clause BSD Licence. See
 13 | https://opensource.org/licenses/BSD-3-Clause for more details.
 14 | 
 15 | eakmeans is distributed in the hope that it will be useful,
 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
 18 | COPYING for more details.
 19 | */
 20 | 
 21 | #ifndef PLL_ELKANKMEANS_17V2_H
 22 | #define PLL_ELKANKMEANS_17V2_H
 23 | 
 24 | #include "baseYYSMN.h" //for initial clustering of the centroids
 25 | 
 26 | 
 27 | #include <fstream>
 28 | #include <sstream>
 29 | #include <iostream>
 30 | #include <string>
 31 | #include <functional>
 32 | 
 33 | namespace kmeans{
 34 | 	
 35 | 
 36 | template <typename TInt, typename TFloat>
 37 | /* the same as a17v2.h : no delta_C test as per yy, so if group bound fails all distances calculated. 
 38 |  * a few percent slower, strange as most pxx are faster than axx.
 39 |  * */
 40 | void update_L_glowers_upb_S_H_17v2(TInt ncentroids, TInt dimension, TFloat * const S, TInt * const H, TInt & nchanges, TInt & ndcalcs, TInt ndata, const TFloat * const data, const TFloat * const C, const TFloat * const data_l22s,  const TFloat * const  C_l22s, const TFloat * const delta_C, const TFloat * const  delta_G,  TInt * const  L, TInt ngroups, const TInt * const groupparts, const TInt * const  groupsizes, TInt * const  group, TFloat * const glowers, TFloat * const upb, const TInt & round){
 41 | 	//One pass over samples 0 .. ndata-1 updating upb, glowers, L and group and, for each sample whose label changes, nchanges, H and S. The parameters delta_C and round are not referenced in this body.
 42 | 
 43 | 	std::unique_ptr<TFloat []> distances( new TFloat [ncentroids] );	//scratch buffer, refilled for every group that has to be examined
 44 | 	TInt group_nearest_index;
 45 | 	TFloat group_nearest;
 46 | 	TFloat group_second_nearest;
 47 | 
 48 | 	for (TInt i = 0; i < ndata; ++i){
 49 | 		arrutilv2::set_l2(dimension, data + i*dimension, C + L[i]*dimension, data_l22s[i], C_l22s[L[i]], upb[i] , ndcalcs ); //presumably sets upb[i] to the distance from sample i to its currently assigned centroid -- TODO confirm against arrutilv2::set_l2
 50 | 		TInt label_before = L[i];	
 51 | 			//every group lower bound of sample i is reduced by that group's delta_G entry
 52 | 		for (TInt gi = 0; gi < ngroups; ++gi){
 53 | 			glowers[i*ngroups + gi] -= delta_G[gi];
 54 | 		}
 55 | 		//only groups whose lower bound is now below upb[i] are examined; for such a group set_rl2s is called for all groupsizes[gi] of its centroids
 56 | 		for (TInt gi = 0; gi < ngroups; ++gi){
 57 | 			if (glowers[i*ngroups + gi] < upb[i]){
 58 | 				arrutilv2::set_rl2s(dimension, data + i*dimension, groupsizes[gi], C + groupparts[gi]*dimension, data_l22s[i], C_l22s + groupparts[gi], distances.get(), ndcalcs);
 59 | 				//case : gi is not the group currently recorded for sample i
 60 | 				if (gi != group[i]){
 61 | 					arrutilv2::set_argminmin2(groupsizes[gi], distances.get(), group_nearest_index, group_nearest, group_second_nearest);
 62 | 					group_nearest_index += groupparts[gi];								//shift by groupparts[gi] : index within the group -> index into C
 63 | 					if (group_nearest < upb[i]){
 64 | 						if (gi < group[i]){
 65 | 							glowers[i*ngroups + group[i]] = std::min(upb[i], glowers[i*ngroups + group[i]]);
 66 | 						}
 67 | 						else{
 68 | 							glowers[i*ngroups + group[i]] = upb[i];
 69 | 						}
 70 | 						glowers[i*ngroups + gi] = group_second_nearest;
 71 | 						L[i] = group_nearest_index;
 72 | 						group[i] = gi;
 73 | 						upb[i] = group_nearest;
 74 | 					}
 75 | 					//no centroid of gi beats upb[i] : only the lower bound of gi is reset
 76 | 					else{
 77 | 						glowers[i*ngroups + gi] = group_nearest;
 78 | 					}
 79 | 				}
 80 | 				//case : gi is the group currently recorded for sample i : L[i], upb[i] and the lower bound of gi are all reset from the fresh distances
 81 | 				else{
 82 | 					arrutilv2::set_argminmin2(groupsizes[gi], distances.get(), L[i], upb[i], glowers[i*ngroups + gi]);
 83 | 					L[i] += groupparts[gi];
 84 | 				}
 85 | 			}
 86 | 		}
 87 | 		//a changed label moves sample i from the count (H) and sum (S) entries of the old centroid to those of the new one
 88 | 		if (L[i] != label_before){ 
 89 | 			++nchanges;
 90 | 			++H[L[i]];
 91 | 			--H[label_before];
 92 | 			arrutilv2::addto(dimension, data + i*dimension, S + dimension*L[i]);
 93 | 			arrutilv2::subtractfrom(dimension, data + i*dimension, S + dimension*label_before);
 94 | 		}
 95 | 	}
 96 | }
 97 | 
 98 | 
 99 | 
100 | 
101 | 
102 | template <typename TInt, typename TFloat>
103 | class P17V2 : public kmeans::BaseYYSMN<TInt, TFloat>{
104 | 	//Algorithm class whose single X task wraps update_L_glowers_upb_S_H_17v2; initialisation and C tasks come from base class / arrutilv2 factories.
105 | 
106 | 	protected:	
107 | 		//returns the per-thread task : thread ti is given the arrays offset by x0 = (ti*ndata)/nthreads
108 | 		std::function<void(TInt)> update_L_glowers_upb_S_H_17v2_ati(){
109 | 			return [this](TInt ti){
110 | 				TInt x0 = (ti*this->getndata())/this->getnthreads();
111 | 				this->pll_principal_X(update_L_glowers_upb_S_H_17v2<TInt, TFloat>, ti, 
112 | 				this->get_delta_C(), this->get_delta_G(), this->get_L() + x0, this->get_ngroups(), this->get_groupparts(), this->get_groupsizes(), this->get_group() + x0, this->get_glowers() + x0*this->get_ngroups(), this->get_upb() + x0, this->round); //presumably pll_principal_X supplies the leading arguments (ncentroids ... C_l22s) itself -- TODO confirm
113 | 			};
114 | 		}
115 | 		
116 | 	public:
117 | 		typedef kmeans::BaseYYSMN<TInt, TFloat> BC;
118 | 		template<typename... Args>
119 | 		P17V2(Args&&... args): BC(std::forward<Args>(args)...)
120 | 		//all constructor arguments are forwarded to the base class; only the algorithm name is set here
121 | 		{
122 | 			this->setalgname("p17v2");
123 | 		}
124 | 		
125 | 		virtual ~P17V2(){}
126 | 		//writes a fixed "not implemented" notice to the verbose file
127 | 		virtual void verbose_write_additional(){
128 | 			this->get_verbose_file() << "\n\n ..not implemented down to 17v2..\n\n";
129 | 		}
130 | 		//delegates entirely to yinyang_initialisation_tasks()
131 | 		virtual void set_initialisation_tasks(){
132 | 			this->yinyang_initialisation_tasks();
133 | 		}
134 | 		
135 | 		virtual void set_C_tasks(){
136 | 			//two tasks, in this order : the update_C_C_l22s_delta_C_from_SH task, then the update_delta_G_from_delta_C task
137 | 			this->C_tasks = {
138 | 				arrutilv2::update_C_C_l22s_delta_C_from_SH_ati(this->getnthreads(), this->getncentroids(), this->getdimension(), this->get_sums(), this->get_counts(), this->get_C(), this->get_C_l22s(), this->get_delta_C(), this->ndcalcs_notX),
139 | 
140 | 				arrutilv2::update_delta_G_from_delta_C_ati(this->getncentroids(), this->get_delta_C(), this->get_ngroups(), this->get_groupparts(), this->get_groupsizes(), this->get_delta_G())
141 | 			};
142 | 		}
143 | 		
144 | 		virtual void set_X_tasks(){
145 | 			//a single task : the per-thread label / bound update defined above
146 | 			this->X_tasks = {
147 | 				this->update_L_glowers_upb_S_H_17v2_ati()
148 | 			};
149 | 		}
150 | };
151 | 
152 | }
153 | 
154 | #endif
155 | 
156 | 


--------------------------------------------------------------------------------
/src/arrutilv2copy.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
  3 | Written by James Newling <james.newling@gmail.com>
  4 | All rights reserved.
  5 | 
  6 | eakmeans is a library for exact and approximate k-means written in C++ and
  7 | Python. This file is part of eakmeans. See file COPYING for more details.
  8 | 
  9 | This file is part of eakmeans.
 10 | 
 11 | eakmeans is free software: you can redistribute it and/or modify
 12 | it under the terms of the 3-Clause BSD Licence. See
 13 | https://opensource.org/licenses/BSD-3-Clause for more details.
 14 | 
 15 | eakmeans is distributed in the hope that it will be useful,
 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
 18 | COPYING for more details.
 19 | */
 20 | 
 21 | #ifndef ARRUTILV2COPY_H
 22 | #define ARRUTILV2COPY_H
 23 | 
 24 | #include <exception>
 25 | #include <stdexcept>
 26 | #include <cstring>
 27 | #include <vector>
 28 | #include <numeric>
 29 | #include <memory>
 30 | 
 31 | 
 32 | namespace arrutilv2{
 33 | 	
 34 | template <typename TSize, typename DType> /* gather rows : row k of data_to (k in [0, ndata_to)) becomes a byte copy of row indices[k] of data_from, each row being dimension elements long */
 35 | void copyatindices(TSize ndata_from, TSize ndata_to, TSize dimension, const DType * const data_from, DType * const data_to, const TSize * const indices){
 36 | 	DType * destination = data_to; //ndata_from is not used here : indices are not range checked
 37 | 	for (TSize k = 0; k < ndata_to; ++k, destination += dimension){
 38 | 		std::memcpy(destination, data_from + indices[k]*dimension, dimension*sizeof(DType));
 39 | 	}
 40 | }
 41 | 
 42 | 
 43 | /* calls copyatindices, but first checks that every index is in range ( < ndata_from) and that no index occurs twice; otherwise std::runtime_error is thrown before anything is copied */
 44 | template <typename TSize, typename DType>
 45 | void copyatuniqueindices(TSize ndata_from, TSize ndata_to, TSize dimension, const DType * const data_from, DType * const data_to, const TSize * const indices){
 46 | 	//used[k] is true once the value k has been met in indices
 47 | 	std::vector<bool> used (ndata_from,false);
 48 | 	TSize index;
 49 | 	for (TSize i = 0; i < ndata_to; ++i){
 50 | 		index = indices[i];
 51 | 		if (index >= ndata_from){
 52 | 			throw std::runtime_error("index of the data to copy (" + std::to_string(index) + ") seems to be out of range in copyatuniqueindices (" + std::to_string(ndata_from) + "). In other words, the following is true, causing the throw " + std::to_string(index) + " >= " + std::to_string(ndata_from));
 53 | 		}
 54 | 		
 55 | 		else if (used[index] == true){
 56 | 			throw std::runtime_error("index to copy (" +std::to_string(index) +") seems to have already been copied. Possible cause: indices to be copied are not unique, which contradicts assumptions in this function");
 57 | 		}
 58 | 		
 59 | 		else{
 60 | 			used[index] = true; //mark the index value itself, not the loop position i : marking i missed duplicates and could write past the end of used
 61 | 		}
 62 | 	}
 63 | 	copyatindices(ndata_from, ndata_to, dimension, data_from, data_to, indices);
 64 | }
 65 | 	
 66 | 
 67 | 
 68 | template <typename TSize, typename DType> //std::vector convenience overload : ndata_to is taken to be indices.size()
 69 | void copyatindices(TSize ndata_from, TSize dimension, const DType * const data_from, DType * const data_to, const std::vector<TSize> & indices){
 70 | 	TSize ndata_to = indices.size();
 71 | 	copyatindices(ndata_from, ndata_to, dimension, data_from, data_to, indices.data()); //delegates to the raw-pointer overload
 72 | }
 73 | 
 74 | 
 75 | template <typename TSize, typename DType> //std::vector convenience overload : ndata_to is taken to be indices.size()
 76 | void copyatuniqueindices(TSize ndata_from, TSize dimension, const DType * const data_from, DType * const data_to, const std::vector<TSize> & indices){
 77 | 	TSize ndata_to = indices.size();
 78 | 	copyatuniqueindices(ndata_from, ndata_to, dimension, data_from, data_to, indices.data()); //delegates to the raw-pointer overload
 79 | }
 80 | 
 81 | //untested. Allocates dimension*indices.size() elements, fills them through copyatuniqueindices and returns the owning pointer.
 82 | template <typename TSize, typename DType>
 83 | std::unique_ptr<DType [] > getatuniqueindices(TSize ndata_from, TSize dimension, const DType * const data_from, const std::vector<TSize> & indices){
 84 | 	std::unique_ptr<DType [] > data_to (new DType[dimension*indices.size()]);
 85 | 	copyatuniqueindices(ndata_from, dimension, data_from, data_to.get(), indices);
 86 | 	return data_to;
 87 | }
 88 | 
 89 | template <typename TSize, class T> //returns a newly allocated array holding a byte copy of the first n elements owned by uptr
 90 | std::unique_ptr<T []> copy_uptrarr_to_uptrarr(TSize n, const std::unique_ptr<T []> & uptr){
 91 | 	T * const duplicate = new T[n];
 92 | 	std::memcpy(duplicate, uptr.get(), n*sizeof(T));
 93 | 	return std::unique_ptr<T []> (duplicate); //ownership passes to the returned pointer
 94 | }
 95 | 
 96 | 
 97 | template <typename TSize, class T> //returns a newly allocated, owned array holding a byte copy of ptrarr[0 .. n-1]
 98 | std::unique_ptr<T []> copy_ptrarr_to_uptrarr(TSize n, const T * const ptrarr){
 99 | 	std::unique_ptr<T []> owned (new T[n]);
100 | 	T * const destination = owned.get(); std::memcpy(destination, ptrarr, n*sizeof(T));
101 | 	return owned;
102 | }
103 | 
104 | 
105 | template <typename TNumber, typename TSize> //returns a newly allocated array of npts elements, each equal to defval
106 | std::unique_ptr<TNumber []> get_initialised_uptrarr(TSize npts, TNumber defval){
107 | 	std::unique_ptr<TNumber []> filled (new TNumber [npts]);
108 | 	for (TSize k = 0; k < npts; ++k){ filled[k] = defval; }
109 | 	return filled; 
110 | }
111 | 
112 | 
113 | template <typename TInt> //returns a newly allocated array with entry k equal to offset + vals[k], for k in [0, n)
114 | std::unique_ptr < TInt []> get_with_offset(TInt n, const TInt * const vals, TInt offset){
115 | 	std::unique_ptr < TInt []> shifted (new TInt [n]);
116 | 	for (TInt k = n; k > 0; --k){ //filled back to front; the entries are independent of one another
117 | 		shifted[k - 1] = offset + vals[k - 1];
118 | 	}
119 | 	return shifted;
120 | }
121 | 
122 | 
123 | }
124 | 
125 | #endif
126 | 


--------------------------------------------------------------------------------
/src/arrutilv2discrete.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
  3 | Written by James Newling <james.newling@gmail.com>
  4 | All rights reserved.
  5 | 
  6 | eakmeans is a library for exact and approximate k-means written in C++ and
  7 | Python. This file is part of eakmeans. See file COPYING for more details.
  8 | 
  9 | This file is part of eakmeans.
 10 | 
 11 | eakmeans is free software: you can redistribute it and/or modify
 12 | it under the terms of the 3-Clause BSD Licence. See
 13 | https://opensource.org/licenses/BSD-3-Clause for more details.
 14 | 
 15 | eakmeans is distributed in the hope that it will be useful,
 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
 18 | COPYING for more details.
 19 | */
 20 | 
 21 | #ifndef ARRUTILV2DISCRETE_H
 22 | #define ARRUTILV2DISCRETE_H
 23 | 
 24 | #include <memory>
 25 | #include <vector>
 26 | #include <limits>
 27 | 
 28 | /* histograms, where non zero, where above threshold etc. 
 29 |  * templates should :
 30 |  * -- have no typename TFloat (better off in arrutilv2lxxx)*/
 31 | 
 32 | //TODO : think about when to return std::unique_ptr< []> and when to return std::vector
 33 | 
 34 | namespace arrutilv2{
 35 |  
 36 | template <typename TInt, typename TLabel> /* one histogram per column of the nrows x ncols row-major array labels : entry [v*ncols + c] of the result counts the rows whose label in column c is v. Labels are assumed to lie in [0, range) (not checked). */
 37 | std::unique_ptr<TLabel []> get_vhisto(TInt nrows, TInt ncols, TInt range, const TLabel * const labels){
 38 | 	std::unique_ptr<TLabel []> vhisto (new TLabel [range*ncols]); //element type agrees with the return type (it was TInt, which only compiled when TInt and TLabel coincided)
 39 | 	std::fill_n(vhisto.get(), range*ncols, 0);
 40 | 	for (TInt r = 0; r < nrows; ++r){
 41 | 		for (TInt c = 0; c < ncols; ++ c){
 42 | 			++vhisto[ncols*labels[r*ncols + c] + c];
 43 | 		}
 44 | 	}
 45 | 	return vhisto; 
 46 | }
 47 | 
 48 | template <typename TInt, typename TLabel> //returns, in increasing order, the positions i in [0, N) at which X[i] > 0
 49 | std::vector<TInt> get_where_nonzero(TInt N, const TLabel * const X){
 50 | 	std::vector<TInt> hits;
 51 | 	for (TInt pos = 0; pos < N; ++pos){
 52 | 		//anything that is not strictly positive is passed over
 53 | 		if (!(X[pos] > 0)){ continue; }
 54 | 		hits.push_back(pos);
 55 | 	}
 56 | 	return hits;
 57 | }
 58 | 
 59 | 
 60 | template <typename TInt, typename TLabel> //returns, in increasing order, the positions i in [0, N) at which X[i] > threshold
 61 | std::vector<TInt> get_where_above_threshold(TInt N, const TLabel * const X, TInt threshold){
 62 | 	std::vector<TInt> above;
 63 | 	TInt pos = 0;
 64 | 	while (pos < N){
 65 | 		if (X[pos] > threshold){ above.push_back(pos); }
 66 | 		++pos;
 67 | 	}
 68 | 	return above;
 69 | }
 70 | 
 71 | 
 72 | 
 73 | 
 74 | template <typename TInt> //returns range counts : entry v is the number of i in [0, N) with L[i] == v. Every L[i] is assumed to lie in [0, range) (not checked).
 75 | std::unique_ptr<TInt []> gethistogram(TInt N, TInt range, const TInt * const L){
 76 | 	//the trailing () value-initialises the array, so every count starts at zero
 77 | 	std::unique_ptr<TInt []> counts(new TInt[range]()); 
 78 | 	for (TInt seen = 0; seen < N; ++seen){
 79 | 		counts[L[seen]] += 1;
 80 | 	}
 81 | 	return counts;
 82 | }
 83 | 
 84 | /* set max in partitions where partitions are ~equal : partition p covers vals[(p*npts)/npartitions .. ((p+1)*npts)/npartitions - 1]; a partition with no elements is given std::numeric_limits<TNumber>::lowest() */
 85 | template <typename TInt, typename TNumber>
 86 | void set_maxinpartition(TInt npts, TInt npartitions, const TNumber * const vals, TNumber * const maxinpartition){
 87 | 	TInt p_end = 0;
 88 | 	TInt p_start;
 89 | 	for (TInt p = 0; p < npartitions; ++p){
 90 | 		p_start = p_end;
 91 | 		p_end = ((p+1)*npts)/npartitions;
 92 | 		maxinpartition[p] = std::numeric_limits<TNumber>::lowest(); //lowest(), not min() : for floating point types min() is the smallest positive value, which is wrong for all-negative partitions
 93 | 		for (TInt ci = p_start ; ci < p_end; ++ci){
 94 | 			if (vals[ci] >  maxinpartition[p]){
 95 | 				maxinpartition[p] = vals[ci];
 96 | 			}
 97 | 		}
 98 | 	}
 99 | }
100 | 
101 | /* same as inline void set_minexclusionnocheck in arrutilv2l1.h : toset becomes the smallest of vals[0 .. nvals-1] with position excl left out (and stays at the largest TNumber if nothing is left) */
102 | template <typename TInt, typename TNumber>
103 | void set_min_excluding(TInt nvals, const TNumber * const vals, TInt excl, TNumber & toset){
104 | 	toset = std::numeric_limits<TNumber>::max();
105 | 	TInt pos = 0;
106 | 	while (pos < excl){ //the positions before the excluded one
107 | 		if (vals[pos] < toset){ toset = vals[pos]; }
108 | 		++pos;
109 | 	}
110 | 	//the positions after the excluded one
111 | 	pos = excl + 1;
112 | 	while (pos < nvals){
113 | 		if (vals[pos] < toset){ toset = vals[pos]; }
114 | 		++pos;
115 | 	}
116 | }
117 | 
118 | template <typename TSize, typename TInt> //returns arr[0] + ... + arr[size-1]; for size 0 the result is 0 and arr is not read
119 | TInt get_sum_int_array(TSize size, TInt * arr){
120 | 	TInt sum = 0; //start from zero, not from arr[0] : arr[0] does not exist when size is 0
121 | 	for (TSize i = 0; i < size; ++i){
122 | 		sum += arr[i];
123 | 	}
124 | 	return sum;
125 | }
126 | 
127 | template <typename TSize, typename TInt>
128 | bool get_sum_int_array_iszero(TSize size, const TInt * const arr){
129 | 	//Scans arr[0 .. size-1] in order, stopping as soon as the running sum is non-zero.
130 | 	//For arrays without negative entries the result is true exactly when every entry is zero.
131 | 	//Note : with entries of mixed sign, a non-zero running sum gives false even if the complete sum would be zero.
132 | 	TInt sum = 0; //start from zero, not from arr[0] : arr[0] does not exist when size is 0
133 | 	TSize i = 0;
134 | 	while (sum == 0 && i < size){
135 | 		sum += arr[i];
136 | 		++i;
137 | 	}
138 | 	//true : the running sum was zero all the way to the end (always so for an empty array)
139 | 	//false : the scan stopped early on a non-zero running sum
140 | 	return sum == 0;
141 | }
142 | 
143 | 
144 | template <typename TIntArray, typename TInt> //element-wise : to[k] becomes to[k] + to_add[k] for k in [0, N)
145 | void integraladdto(TInt N, const TIntArray * const to_add, TIntArray * const to){
146 | 	for (TInt k = 0; k < N; ++k){
147 | 		to[k] = to[k] + to_add[k];
148 | 	}
149 | }
150 | 
151 | 
152 | template <typename TInt> /* npts integers running from i0 to i1 : entry k is i0 + k*(i1 - i0)/(npts - 1) in integer arithmetic, the last entry is exactly i1. npts = 1 gives {i1}, npts = 0 gives an empty vector. */
153 | std::vector<TInt> intlinspace(TInt i0, TInt i1, TInt npts){
154 | 	std::vector<TInt> linspaced (npts, i1);
155 | 	for (TInt k = 0; k + 1 < npts; ++k){ //k + 1 < npts rather than k < npts - 1 : the latter wraps around for unsigned TInt when npts is 0
156 | 		linspaced[k] = i0 + (k*(i1 - i0)/(npts - 1));
157 | 	}
158 | 	
159 | 	return linspaced;
160 | }
161 | 
162 | 
163 | template<typename TSize, typename TLabel>
164 | void make_balanced(TSize minclustersize, TSize ndata, TLabel * const L, TSize nclusters, TSize * const groupsizes){
165 | 	//While the smallest cluster has fewer than minclustersize members, the first sample in L labelled with the largest cluster is relabelled to the smallest one, and groupsizes is adjusted to match. Throws std::runtime_error if this cannot be done.
166 | 	TLabel argminc, argmaxc;
167 | 	TSize minc, maxc;
168 | 	arrutilv2::set_argminmin(nclusters, groupsizes, argminc, minc);
169 | 	arrutilv2::set_argmaxmax(nclusters, groupsizes, argmaxc, maxc);
170 | 	
171 | 	while (minc < minclustersize){
172 | 		if (maxc <= minclustersize){
173 | 			throw std::runtime_error("In get contig by cluster 3, trying to balance clusters. minc < minclustersize, so balancing required. But maxc <= minclustersize, so balancing will cause another hole to appear, it will be impossible to fill all the holes there are simply not enough plugs");
174 | 		}
175 | 		TLabel * const donor = std::find(L, L + ndata, argmaxc); //first sample currently carrying the label of the largest cluster
176 | 		if (donor == L + ndata){ //nothing found : groupsizes and L disagree, and writing through donor would be out of bounds
177 | 			throw std::runtime_error("In make_balanced, no sample carries the label of the largest cluster. Probable cause: groupsizes is not consistent with the labels L");
178 | 		}
179 | 		*donor = argminc;
180 | 		++groupsizes[argminc];
181 | 		--groupsizes[argmaxc];
182 | 		arrutilv2::set_argminmin(nclusters, groupsizes, argminc, minc);
183 | 		arrutilv2::set_argmaxmax(nclusters, groupsizes, argmaxc, maxc);
184 | 	}
185 | }
186 | 
187 | 
188 | }
189 | 
190 | #endif
191 | 


--------------------------------------------------------------------------------
/src/arrutilv2l0.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
 3 | Written by James Newling <james.newling@gmail.com>
 4 | All rights reserved.
 5 | 
 6 | eakmeans is a library for exact and approximate k-means written in C++ and
 7 | Python. This file is part of eakmeans. See file COPYING for more details.
 8 | 
 9 | This file is part of eakmeans.
10 | 
11 | eakmeans is free software: you can redistribute it and/or modify
12 | it under the terms of the 3-Clause BSD Licence. See
13 | https://opensource.org/licenses/BSD-3-Clause for more details.
14 | 
15 | eakmeans is distributed in the hope that it will be useful,
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
18 | COPYING for more details.
19 | */
20 | 
21 | #ifndef ARRUTILV2L0_H
22 | #define ARRUTILV2L0_H
23 | 
24 | #include <memory>
25 | #include <exception>
26 | #include <stdexcept>
27 | #include <cmath>
28 | #include <algorithm>
29 | #include <cstring>
30 | #include <iostream>
31 | #include <set>
32 | #include <chrono>
33 | #include <map>
34 | #include <limits>
35 | 
36 | /* Herein all things doable directly by blas
37 |  * Herein all distances, distances squared, mins, maxs, combos thereof (which if using blas would be different)
38 |  * 
39 |  * rules of functions to make easier to use / remember:
40 |  * (1) dimensions of arrays must appear before arrays, but as late as possible
41 |  * (2) functions which return must be getxxx
42 |  * (3) functions which set_ must be set_xxx (or subtractfrom , addto , update , something obvious)
43 |  * (4) thing(s) being set_ should come as late as possible (excluding flag like parameters, background increment parameters etc.) without violating above rules 
44 |  * (5) if array being set_ is dimension d, there should be d trailing 's' to function name
45 |  * (6) if operation is on 1-D and 2-D array, should have r/c somewhere telling whether row or column
46 |  * (7) if operation on 2-D and 2-D array should have rr/rc/cr/cc as above (unless a flag like bool asrow)
47 |  * (8) nrows before ncols in parameter list
48 |  * for [TFloat = double, TInt = unsigned] autogeneration of functions to arrutilv2.cpp is done by python function 
49 |  * */
50 | 
51 | #ifdef WITHBLAS
52 | #include "arrutilv2l0withblas.h"
53 | #else
54 | #include "arrutilv2l0blasless.h"
55 | #endif
56 | 
57 | #endif
58 | 
59 | 


--------------------------------------------------------------------------------
/src/arrutilv2l0blasless.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
  3 | Written by James Newling <james.newling@gmail.com>
  4 | All rights reserved.
  5 | 
  6 | eakmeans is a library for exact and approximate k-means written in C++ and
  7 | Python. This file is part of eakmeans. See file COPYING for more details.
  8 | 
  9 | This file is part of eakmeans.
 10 | 
 11 | eakmeans is free software: you can redistribute it and/or modify
 12 | it under the terms of the 3-Clause BSD Licence. See
 13 | https://opensource.org/licenses/BSD-3-Clause for more details.
 14 | 
 15 | eakmeans is distributed in the hope that it will be useful,
 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
 18 | COPYING for more details.
 19 | */
 20 | 
 21 | #ifndef ARRUTILV2L0BLASLESS_H
 22 | #define ARRUTILV2L0BLASLESS_H
 23 | 
 24 | namespace arrutilv2{
 25 | 
 26 | inline void proxy_openblas_set_num_threads(int nthreads){
 27 | //deliberately empty; presumably a stand-in for the WITHBLAS version so that callers compile without BLAS -- TODO confirm
 28 | }
 29 | 
 30 | 
 31 | template <typename TInt, typename TFloat> /* for each row r of the nrows x ncols row-major matrix A, argmaxabss[r] is set to the column holding the entry of largest absolute value (the first such column on ties) */
 32 | void set_rargmaxabs(TInt nrows, TInt ncols, const TFloat * const A,  TInt * argmaxabss){
 33 | 	TFloat absval;
 34 | 	TFloat vmax;
 35 | 	for (TInt r = 0; r < nrows; ++r){
 36 | 		argmaxabss[r] = 0;
 37 | 		vmax = std::abs(A[r*ncols]); //absolute value here too : seeded with the signed entry, a large negative first entry lost to any later entry
 38 | 		for (TInt c = 1; c < ncols; ++c){
 39 | 			absval = std::abs(A[r*ncols + c]);
 40 | 			if (absval > vmax){
 41 | 				argmaxabss[r] = c;
 42 | 				vmax = absval;
 43 | 			}
 44 | 		}
 45 | 	}
 46 | }
 47 | 
 48 | template <typename TInt, typename TFloat, typename TScaleFloatType> //multiplies toscale[0 .. N-1] in place by factor, after converting factor to TFloat once
 49 | void scale(TInt N, TScaleFloatType factor, TFloat * const toscale){
 50 | 	const TFloat multiplier = static_cast<TFloat> (factor);
 51 | 	for (TInt k = 0; k < N; ++k){
 52 | 		toscale[k] = toscale[k]*multiplier;
 53 | 	}
 54 | }
 55 | 
 56 | 
 57 | 
 58 | template <typename TInt, typename TFloat> //adds scale*row (ncols entries) to each of the nrows rows of the row-major matrix toupdate
 59 | void rank1rowupdate(TInt ncols,  const TFloat * const row, TFloat scale, TInt nrows, TFloat * const toupdate){
 60 | 	std::unique_ptr<TFloat []> increment (new TFloat [ncols]); //scale*row, computed once up front
 61 | 	for (TInt c = 0; c < ncols; ++c){ increment[c] = row[c]*scale; }
 62 | 	//target walks through toupdate in row-major order
 63 | 	TFloat * target = toupdate;
 64 | 	for (TInt r = 0; r < nrows; ++r){
 65 | 		for (TInt c = 0; c < ncols; ++c, ++target){
 66 | 			*target += increment[c];
 67 | 		}
 68 | 	}
 69 | }		
 70 | 
 71 | template <typename TInt, typename TFloat> //l22 is set to the sum of squares of a[0 .. ndata-1], accumulated directly in the output
 72 | inline void set_l22(const TInt & ndata, const TFloat * const a, TFloat & l22){
 73 | 	l22 = 0;
 74 | 	for (const TFloat * entry = a; entry < a + ndata; ++entry){
 75 | 		l22 += (*entry)*(*entry);
 76 | 	}
 77 | }
 78 | 
 79 | template <typename TInt, typename TFloat> //l22 is set to a_l22 + b_l22 - 2 a.b, i.e. |a - b|^2 when a_l22 and b_l22 are the squared norms of a and b
 80 | inline void set_l22(const TInt & dimension, const TFloat * const a, const TFloat * const b, const TFloat & a_l22, const TFloat & b_l22, TFloat & l22){
 81 | 	l22 = 0; //first the inner product a.b, accumulated directly in the output ...
 82 | 	for (TInt k = 0; k < dimension; ++k){
 83 | 		l22 = l22 + a[k]*b[k];
 84 | 	}
 85 | 	l22 = -2*l22; //... then scaled by -2, with a_l22 and b_l22 added in that order
 86 | 	l22 = l22 + a_l22;
 87 | 	l22 = l22 + b_l22;
 88 | }	
 89 | 
 90 | template <typename TInt, typename TFloat> //sum is set to a[0] + ... + a[ndata-1], accumulated directly in the output
 91 | inline void set_sum(const TInt & ndata, const TFloat * const a, TFloat & sum){
 92 | 	sum = 0;
 93 | 	for (const TFloat * entry = a; entry < a + ndata; ++entry){
 94 | 		sum += *entry;
 95 | 	}
 96 | }
 97 | 
 98 | template <typename TInt, typename TFloat> //sums of squares of the nrows x ncols row-major matrix A, taken along rows (byrow) or along columns (otherwise)
 99 | void set_l22s(TInt nrows, TInt ncols, const TFloat * const A, TFloat * const l22s, bool byrow){
100 | 	if (byrow){
101 | 		//one value per row : l22s has nrows entries
102 | 		for (TInt r = 0; r < nrows; ++r){
103 | 			const TFloat * const row = A + r*ncols;
104 | 			l22s[r] = 0;
105 | 			for (TInt c = 0; c < ncols; ++c){ l22s[r] += row[c]*row[c]; }
106 | 		}
107 | 		return;
108 | 	}
109 | 	//one value per column : l22s has ncols entries, accumulated row by row
110 | 	for (TInt c = 0; c < ncols; ++c){ l22s[c] = 0; }
111 | 	for (TInt r = 0; r < nrows; ++r){
112 | 		const TFloat * const row = A + r*ncols;
113 | 		for (TInt c = 0; c < ncols; ++c){
114 | 			l22s[c] += row[c]*row[c];
115 | 		}
116 | 	}
117 | }
118 | 
119 | 
120 | 
121 | 
122 | template <typename TInt, typename TFloat> //totals of the nrows x ncols row-major matrix A, taken along rows (byrow) or along columns (otherwise)
123 | void set_sums(TInt nrows, TInt ncols, const TFloat * const A, TFloat * const sums, bool byrow){
124 | 	if (byrow){
125 | 		for (TInt r = 0; r < nrows; ++r){ //one total per row : sums has nrows entries
126 | 			const TFloat * const row = A + r*ncols;
127 | 			sums[r] = 0;
128 | 			for (TInt c = 0; c < ncols; ++c){ sums[r] += row[c]; }
129 | 		}
130 | 		return;
131 | 	}
132 | 	//one total per column : sums has ncols entries, accumulated row by row
133 | 	for (TInt c = 0; c < ncols; ++c){ sums[c] = 0; }
134 | 	for (TInt r = 0; r < nrows; ++r){
135 | 		const TFloat * const row = A + r*ncols;
136 | 		for (TInt c = 0; c < ncols; ++c){
137 | 			sums[c] += row[c];
138 | 		}
139 | 	}
140 | }
141 | 
142 | /* v : 1-D of size ncols, v_l22s being its squared l2 norm
143 | * B : nrows x ncols (row-major), B_l22s[i] being the squared l2 norm of row i
144 | * l22s[i] : |v - B[i]|_2 squared, obtained as v_l22s + B_l22s[i] - 2 v.B[i]
145 | * */
146 | template <typename TInt, typename TFloat>
147 | inline void set_rl22s(const TInt & ncols, const TFloat * const v, const TInt & nrows, const TFloat * const B, const TFloat & v_l22s, const TFloat * const B_l22s, TFloat * const l22s){
148 | 	const TFloat * row = B;
149 | 	for (TInt r = 0; r < nrows; ++r, row += ncols){
150 | 		TFloat & out = l22s[r];
151 | 		out = 0;
152 | 		for (TInt c = 0; c < ncols; ++c){ out += row[c]*v[c]; }
153 | 		out *= -2;
154 | 		out += v_l22s;
155 | 		out += B_l22s[r];
156 | 	}
157 | }
158 | 
159 | 
160 | /* A : nrowsA x ncols, A_l22s[i] passed as the v_l22s argument of set_rl22s for row i
161 | * B : nrowsB x ncols, B_l22s passed on unchanged
162 | * l22ss[i*nrowsB + j] : whatever set_rl22s computes for (A[i], B[j]) -- the squared distance |A[i] - B[j]|_2^2 when the l22s inputs are squared norms -- for 0 <= i < nrowsA   0 <= j < nrowsB 
163 | *  */
164 | 
165 | template <typename TInt, typename TFloat>
166 | void set_rrl22ss(TInt nrowsA, TInt ncols, const TFloat * const A, TInt nrowsB, const TFloat * const B, const TFloat * const A_l22s, const TFloat * const B_l22s, TFloat * const l22ss){
167 | 	for (TInt r = 0; r < nrowsA; ++r){
168 | 		set_rl22s(ncols, A + r*ncols, nrowsB, B, A_l22s[r], B_l22s, l22ss + r*nrowsB); //row r of the output, nrowsB entries
169 | 	}
170 | }
171 | 
172 | 
173 | template <typename TInt, typename TFloat> //element-wise : from[k] becomes from[k] - tosubtract[k] for k in [0, N)
174 | void subtractfrom(TInt N, const TFloat * const tosubtract, TFloat * const from){
175 | 	for (TInt k = 0; k < N; ++k){
176 | 		from[k] = from[k] - tosubtract[k];
177 | 	}
178 | }	
179 | 
180 | template <typename TInt, typename TFloat> //element-wise : to[k] becomes to[k] + toadd[k] for k in [0, N)
181 | void addto(TInt N, const TFloat * const toadd, TFloat * const to){
182 | 	for (TInt k = 0; k < N; ++k){
183 | 		to[k] = to[k] + toadd[k];
184 | 	}
185 | }
186 | 
187 | 
188 | 
189 | 
190 | 
191 | }
192 | 
193 | #endif
194 | 
195 | 
196 | 
197 | 
198 | 


--------------------------------------------------------------------------------
/src/arrutilv2mse.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
  3 | Written by James Newling <james.newling@gmail.com>
  4 | All rights reserved.
  5 | 
  6 | eakmeans is a library for exact and approximate k-means written in C++ and
  7 | Python. This file is part of eakmeans. See file COPYING for more details.
  8 | 
  9 | This file is part of eakmeans.
 10 | 
 11 | eakmeans is free software: you can redistribute it and/or modify
 12 | it under the terms of the 3-Clause BSD Licence. See
 13 | https://opensource.org/licenses/BSD-3-Clause for more details.
 14 | 
 15 | eakmeans is distributed in the hope that it will be useful,
 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
 18 | COPYING for more details.
 19 | */
 20 | 
 21 | #ifndef ARRUTILV2MSE_H
 22 | #define ARRUTILV2MSE_H
 23 | 
#include <algorithm>
#include <cmath>
#include <memory>
#include <stdexcept>
 26 | 
 27 | 
 28 | 
 29 | namespace arrutilv2{
 30 | 
 31 | template <typename TInt, typename TSize, typename TFloat>
 32 | TFloat get_mse(TSize ndata, TInt ncentroids, const TFloat * const distances2, const TInt * const labels){
 33 | 	TSize s_ncentroids = static_cast<TSize> (ncentroids);
 34 | 	TFloat suml2s = 0;
 35 | 	TFloat d2;
 36 | 	for (TSize i = 0; i < ndata; ++i){
 37 | 		d2 = distances2[i*s_ncentroids + labels[i]];
 38 | 		if (d2 > 0){
 39 | 			suml2s += std::sqrt(d2);
 40 | 		}
 41 | 		else if (d2 <-1e-5){
 42 | 			throw std::runtime_error("negative value in get_mse of magnitude less than 1e-5. Probable cause: user has provided negative distance squared value");
 43 | 		}
 44 | 		else{
 45 | 			//assumed rounding error (provide warning?)
 46 | 		}
 47 | 	}
 48 | 	
 49 | 	return suml2s/static_cast<TFloat> (ndata);
 50 | }
 51 | 
 52 | template <typename TSize, typename TFloat>
 53 | TFloat get_mse(TSize ndata, TFloat * distances){
 54 | 		TFloat mse;
 55 | 		set_row_sum_squares(static_cast<TSize>(1), ndata, distances, &mse);
 56 | 		mse/=ndata;
 57 | 		return mse;
 58 | }
 59 | 
 60 | 
 61 | //[ 0.5 * ridgeterm * sum (count - mean count)^2 ] / ndata
 62 | template <typename TSize, typename TFloat>
 63 | TFloat get_meanridge(TFloat ridgeterm, TSize ncentroids, TSize * const counts){
 64 | 	
 65 | 	auto ndata = 0;
 66 | 	for (TSize si = 0; si < ncentroids; ++si){
 67 | 		ndata += counts[si];
 68 | 	}//std::accumulate(counts, counts + ncentroids);
 69 | 	
 70 | 	TFloat meancount = static_cast<TFloat> (ndata ) / static_cast<TFloat > (ncentroids);
 71 | 	
 72 | 	TFloat ridge_penalty = 0.;
 73 | 	for (TSize ci = 0; ci < ncentroids; ++ ci){
 74 | 		ridge_penalty += (counts[ci] - meancount)*(counts[ci] - meancount);
 75 | 	}
 76 | 	//TFloat ridge_penalty = static_cast<TFloat> (ridge_penalty_st);
 77 | 	ridge_penalty *= ridgeterm/2.;
 78 | 	ridge_penalty /= static_cast<TFloat> (ndata);
 79 | 	return ridge_penalty;
 80 | }
 81 | 
 82 | 
 83 | 
 84 | template <typename TInt, typename TFloat>
 85 | TFloat getmeanl22at(TInt ncentroids, TInt dimension, const TFloat * const centroids, TInt ndata, const TFloat * const data, const TInt * const labels, const TFloat * const centroid_l22s, const TFloat * const data_l22s){		
 86 | 	TFloat sum_variances = 0;
 87 | 	for (TInt i = 0; i < ndata; ++ i){
 88 | 		TFloat variance = 0;		
 89 | 		for (TInt d = 0; d < dimension; ++ d){
 90 | 			variance += data[i*dimension + d]*centroids[labels[i]*dimension + d];
 91 | 		}
 92 | 		variance *= -2;
 93 | 		variance += data_l22s[i];
 94 | 		variance += centroid_l22s[labels[i]];
 95 | 		sum_variances += variance;
 96 | 	}
 97 | 	TFloat variance_estimate = sum_variances/static_cast<TFloat>(ndata);
 98 | 	return variance_estimate;
 99 | }
100 | 
/* mse + mean ridge error :
 * the value written by set_row_sum_squares for the single row `distances`,
 * divided by ndata, plus get_meanridge(ridgeterm, ncentroids, counts). */
template <typename TSize, typename TFloat>
TFloat get_mse_ridge(TSize ndata, TFloat * distances, TFloat ridgeterm, TSize ncentroids, TSize * const counts){
	TFloat mean_square;
	set_row_sum_squares(static_cast<TSize>(1), ndata, distances, &mean_square);
	mean_square /= ndata;
	return mean_square + get_meanridge(ridgeterm, ncentroids, counts);
}
110 | 
111 | 
112 | 
113 | 
114 | 
/* Sum over all samples of the smallest entry in that sample's row of
 * squared distances to the centroids, computed nperbatch samples at a time
 * so that the scratch buffer holds only nperbatch x ncentroids values.
 *
 * data          : ndata x dimension, row-major
 * centroids     : ncentroids x dimension, row-major
 * data_l22s     : squared l2 norm of each sample
 * centroid_l22s : squared l2 norm of each centroid
 * ndcalcs       : NOTE(review): never read or written in this function -
 *                 check whether callers expect it to be updated.
 *
 * nperbatch is used as a divisor, so it must be non-zero.
 */
template <typename TInt, typename TFloat> 
TFloat get_sse_batchwise(TInt ndata, TInt nperbatch, TInt dimension, const TFloat * const data, TInt ncentroids, const TFloat * const centroids, const TFloat * const data_l22s, const TFloat * const centroid_l22s, TInt & ndcalcs){
	
	const TInt n_whole = ndata/nperbatch;
	const TInt n_tail = ndata - n_whole*nperbatch;
	std::unique_ptr<TFloat []> scratch (new TFloat [nperbatch*ncentroids]);
	TFloat * const d2 = scratch.get();
	
	TFloat total = 0;
	//batches 0 .. n_whole-1 are full, batch n_whole is the (possibly empty) tail
	for (TInt b = 0; b <= n_whole; ++b){
		const TInt rows = (b < n_whole) ? nperbatch : n_tail;
		const TInt first = b*nperbatch;
		set_rrl22ss(rows, dimension, data + first*dimension, ncentroids, centroids, data_l22s + first, centroid_l22s, d2);
		for (TInt r = 0; r < rows; ++r){
			total += *std::min_element(d2 + r*ncentroids, d2 + (r + 1)*ncentroids);
		}
	}
	
	return total;
}
138 | 
139 | 
140 | 
141 | 		
142 | }
143 | 
144 | 
145 | #endif
146 | 


--------------------------------------------------------------------------------
/src/barrierutil.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
  3 | Written by James Newling <james.newling@gmail.com>
  4 | All rights reserved.
  5 | 
  6 | eakmeans is a library for exact and approximate k-means written in C++ and
  7 | Python. This file is part of eakmeans. See file COPYING for more details.
  8 | 
  9 | This file is part of eakmeans.
 10 | 
 11 | eakmeans is free software: you can redistribute it and/or modify
 12 | it under the terms of the 3-Clause BSD Licence. See
 13 | https://opensource.org/licenses/BSD-3-Clause for more details.
 14 | 
 15 | eakmeans is distributed in the hope that it will be useful,
 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
 18 | COPYING for more details.
 19 | */
 20 | 
 21 | #ifndef BARRIERUTIL_H
 22 | #define BARRIERUTIL_H
 23 | 
 24 | #include <thread>
 25 | #include <mutex> 
 26 | #include <vector>
 27 | #include <functional>
 28 | #include <condition_variable>
 29 | #include <chrono>
 30 | 
 31 | namespace stdthreadutil{
 32 | 	
 33 | 
/* btask for `barriered task'
 * used for the situation where
 * (1) several threads perform the same task
 * (2) when all have finished, one thread performs a finishing task
 * (3) all threads are released
 * The definition is not in this header. */
 
void btask(
/* identity of thread, not used in function but left as parameter as potentially useful for debugging */
const size_t & ti, 
/* number of threads performing the task */
const size_t & nthreads, 
/* reference to number of threads which have completed the task */
size_t & completions, 
/* when work of this thread is complete, use the following tools (shared by all workers on this task) to notify the others */
std::mutex & workend_mutex,
std::condition_variable & condvar, 
/* task to perform, and task to perform at end if this thread finishes last */
const std::function<void()> & task, 
const std::function<void()> & endtask);
 53 | 
 54 | 
/* btasks for `barriered tasks'
 * used in the situation where several threads of equal status perform
 for (# tasks) { do task | do end task if last to complete | }
 *
 * section_tasks take a size_t argument, sectionend_tasks take none.
 * The three leading vectors presumably hold one entry per section,
 * playing the roles of completions / workend_mutex / condvar in btask -
 * confirm against the definition.
 * NOTE(review): declared inline, but no definition is visible in this header.
 */
inline void btasks(
const size_t & ti,
const size_t & nthreads,
std::vector<size_t> & section_completions,
std::vector<std::mutex> & sectionend_mutexes,
std::vector<std::condition_variable> & section_condvars, 
const std::vector<std::function<void(size_t)>> & section_tasks,
const std::vector<std::function<void()>> &  sectionend_tasks
);
 68 | 
/* btask_rbtasks for `barriered task then repeat barriered tasks' 
 * used in the situation where 
 * several threads of equal status perform
 * do initialisation task | do initialisation end task if last to complete | 
 * while (condition is true) for (# tasks) do task | do end task if last to complete |
 *
 * getiscomplete supplies the loop condition; presumably the loop stops once
 * it returns true - confirm against the definition.
 */
void btask_rbtasks(
size_t ti, 
size_t nthreads,
std::vector<size_t> & section_completions, 
std::vector<std::mutex> & sectionend_mutexes, 
std::vector<std::condition_variable> & section_condvars, 
const std::function<void(size_t)> & initialisation_task,
const std::function<void()> & initialisationend_task,  
const std::vector<std::function<void(size_t)>> & section_tasks,
const std::vector<std::function<void()>> &  sectionend_tasks, 
const std::function<bool()> & getiscomplete);
 86 | 
/* barriered tasks (initialisation) then while condition repeat barriered tasks.
 * Same parameter layout as btask_rbtasks, except that the initialisation step
 * is a vector of tasks with a matching vector of end tasks. */
void btasks_rbtasks(
size_t ti, 
size_t nthreads,
std::vector<size_t> & x_completions, 
std::vector<std::mutex> & xend_mutexes, 
std::vector<std::condition_variable> & x_condvars, 
const std::vector< std::function<void(size_t)>> & initialisation_tasks,
const std::vector< std::function<void()>> & initialisationend_tasks,  
const std::vector< std::function<void(size_t)>> & section_tasks,
const std::vector< std::function<void()>> &  sectionend_tasks, 
const std::function<bool()> & getiscomplete);
 99 | 
/* launch barriered tasks
 * Takes only the thread count and the task lists; unlike btasks, the caller
 * passes no completion counters, mutexes or condition variables.
 * The meaning of the int return value is not stated in this header. */
int launch_btasks(
size_t nthreads, 
const std::vector<std::function<void(size_t)>> & section_tasks,
const std::vector<std::function<void()>> & sectionend_tasks
);
106 | 
107 | 
/* Launcher with the task parameters of btasks_rbtasks (vectors of
 * initialisation tasks, section tasks, their end tasks, getiscomplete)
 * plus a closing_task.
 * Presumably closing_task runs once after the repeated sections end -
 * confirm against the definition. The meaning of the int return value is
 * not stated in this header. */
int launch_btasks_rbtasks(
size_t nthreads, 
const std::vector<std::function<void(size_t)>> & initialisation_tasks,
const std::vector<std::function<void()>> & initialisationend_tasks,  
const std::vector<std::function<void(size_t)>> & section_tasks,
const std::vector<std::function<void()>> & sectionend_tasks, 
const std::function<bool()> & getiscomplete,
const std::function<void()> & closing_task 
);
117 | 
118 | 
119 | 
/* Launcher with the task parameters of btask_rbtasks (a single
 * initialisation task and end task, then vectors of section tasks and end
 * tasks, getiscomplete) plus a closing_task.
 * Presumably closing_task runs once after the repeated sections end -
 * confirm against the definition. The meaning of the int return value is
 * not stated in this header. */
int launch_btask_rbtasks(
size_t nthreads, 
const std::function<void(size_t)> & initialisation_task,
const std::function<void()> & initialisationend_task,  
const std::vector<std::function<void(size_t)>> & section_tasks,
const std::vector<std::function<void()>> & sectionend_tasks, 
const std::function<bool()> & getiscomplete,
const std::function<void()> & closing_task 
);
129 | 
130 | 
131 | }
132 | 
133 | 
134 | #endif
135 | 


--------------------------------------------------------------------------------
/src/baseYYMSN.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
  3 | Written by James Newling <james.newling@gmail.com>
  4 | All rights reserved.
  5 | 
  6 | eakmeans is a library for exact and approximate k-means written in C++ and
  7 | Python. This file is part of eakmeans. See file COPYING for more details.
  8 | 
  9 | This file is part of eakmeans.
 10 | 
 11 | eakmeans is free software: you can redistribute it and/or modify
 12 | it under the terms of the 3-Clause BSD Licence. See
 13 | https://opensource.org/licenses/BSD-3-Clause for more details.
 14 | 
 15 | eakmeans is distributed in the hope that it will be useful,
 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
 18 | COPYING for more details.
 19 | */
 20 | 
 21 | // UNDER CONSTRUCTION.
 22 | 
 23 | #ifndef PLL_PLLYINYANGMSNBASEKMEANS_H
 24 | #define PLL_PLLYINYANGMSNBASEKMEANS_H
 25 | 
 26 | #include "baseYY.h"
 27 | 
 28 | #include <fstream>
 29 | #include <sstream>
 30 | #include <iostream>
 31 | #include <string>
 32 | 
 33 | namespace kmeans{
 34 | 
 35 | 
 36 | template <typename TInt, typename TFloat>
 37 | /* " max sum norm" for lower bounds versions inherit from here */ 
 38 | class YYMSNBase : public kmeans::BaseYY<TInt, TFloat>{
 39 | 	
 40 | 	private:
 41 | 		std::unique_ptr<TFloat []> delta_C;
 42 | 		std::unique_ptr<TFloat []> u_delta_G;
 43 | 
 44 | 	protected:	
 45 | 		TFloat * const get_glowers(){
 46 | 			return this->get_glowers_base();
 47 | 		}
 48 | 		
 49 | 		TFloat * const get_upb(){
 50 | 			return this->get_upb_base();
 51 | 		}
 52 | 		
 53 | 		TFloat * const get_delta_C(){
 54 | 			return delta_C.get();
 55 | 		}
 56 | 		
 57 | 		TFloat * const get_u_delta_G(){
 58 | 			return u_delta_G.get();
 59 | 		}
 60 | 		
 61 | 
 62 | 		
 63 | 	public:
 64 | 		typedef kmeans::BaseYY<TInt, TFloat> YYB;
 65 | 		template<typename... Args>
 66 | 		YYMSNBase(Args&&... args): YYB(std::forward<Args>(args)...), 
 67 | 		delta_C{ new TFloat [this->getncentroids()] },
 68 | 		delta_G{ new TFloat [this->get_ngroups()] }
 69 | 		
 70 | 		{
 71 | 			this->setalgname("YYMSNBase");
 72 | 		}
 73 | 		
 74 | 		virtual ~YYMSNBase(){}
 75 | 
 76 | 
 77 | 		virtual TInt get_approximate_memory_requirement(){
 78 | 			return YYB::get_approximate_memory_requirement() + 
 79 | 			sizeof(TFloat)*(
 80 | 				this->getncentroids() + //delta_C
 81 | 				this->get_ngroups()); //delta_G
 82 | 		}
 83 | 
 84 | 		virtual void verbose_write_additional(){
 85 | 			this->get_verbose_file() << "\n\n ..not implemented down to YYMSNBase..\n\n";
 86 | 		}
 87 | 
 88 | 		
 89 | 		
 90 | 		
 91 | 		virtual void set_initialisation_tasks() = 0;
 92 | 
 93 | 		
 94 | 		virtual void set_C_tasks() = 0;
 95 | 		virtual void set_X_tasks() = 0;
 96 | };
 97 | 
 98 | }
 99 | 
100 | #endif
101 | 
102 | 


--------------------------------------------------------------------------------
/src/baseYYSMN.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
  3 | Written by James Newling <james.newling@gmail.com>
  4 | All rights reserved.
  5 | 
  6 | eakmeans is a library for exact and approximate k-means written in C++ and
  7 | Python. This file is part of eakmeans. See file COPYING for more details.
  8 | 
  9 | This file is part of eakmeans.
 10 | 
 11 | eakmeans is free software: you can redistribute it and/or modify
 12 | it under the terms of the 3-Clause BSD Licence. See
 13 | https://opensource.org/licenses/BSD-3-Clause for more details.
 14 | 
 15 | eakmeans is distributed in the hope that it will be useful,
 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
 18 | COPYING for more details.
 19 | */
 20 | 
 21 | #ifndef PLL_PLLYINYANGSMNBASEKMEANS_H
 22 | #define PLL_PLLYINYANGSMNBASEKMEANS_H
 23 | 
 24 | #include "baseYY.h"
 25 | 
 26 | #include <fstream>
 27 | #include <sstream>
 28 | #include <iostream>
 29 | #include <string>
 30 | 
 31 | namespace kmeans{
 32 | 
 33 | 
 34 | template <typename TInt, typename TFloat>
 35 | /* "sum max norm" for lower bounds versions inherit from here */ 
 36 | class BaseYYSMN : public kmeans::BaseYY<TInt, TFloat>{
 37 | 	
 38 | 	private:
 39 | 		std::unique_ptr<TFloat []> delta_C;
 40 | 		std::unique_ptr<TFloat []> delta_G;
 41 | 
 42 | 	protected:	
 43 | 		TFloat * const get_glowers(){
 44 | 			return this->get_glowers_base();
 45 | 		}
 46 | 		
 47 | 		TFloat * const get_upb(){
 48 | 			return this->get_upb_base();
 49 | 		}
 50 | 		
 51 | 		TFloat * const get_delta_C(){
 52 | 			return delta_C.get();
 53 | 		}
 54 | 		
 55 | 		TFloat * const get_delta_G(){
 56 | 			return delta_G.get();
 57 | 		}
 58 | 		
 59 | 
 60 | 		
 61 | 	public:
 62 | 		typedef kmeans::BaseYY<TInt, TFloat> YYB;
 63 | 		template<typename... Args>
 64 | 		BaseYYSMN(Args&&... args): YYB(std::forward<Args>(args)...), 
 65 | 		delta_C{ new TFloat [this->getncentroids()] },
 66 | 		delta_G{ new TFloat [this->get_ngroups()] }
 67 | 		
 68 | 		{
 69 | 			this->setalgname("BaseYYSMN");
 70 | 		}
 71 | 		
 72 | 		virtual ~BaseYYSMN(){}
 73 | 
 74 | 
 75 | 		virtual TInt get_approximate_memory_requirement(){
 76 | 			return YYB::get_approximate_memory_requirement() + 
 77 | 			sizeof(TFloat)*(
 78 | 				this->getncentroids() + //delta_C
 79 | 				this->get_ngroups()); //delta_G
 80 | 		}
 81 | 
 82 | 		virtual void verbose_write_additional(){
 83 | 			this->get_verbose_file() << "\n\n ..not implemented down to BaseYYSMN..\n\n";
 84 | 		}
 85 | 
 86 | 		
 87 | 		
 88 | 		
 89 | 		virtual void set_initialisation_tasks() = 0;
 90 | 
 91 | 		
 92 | 		virtual void set_C_tasks() = 0;
 93 | 		virtual void set_X_tasks() = 0;
 94 | };
 95 | 
 96 | }
 97 | 
 98 | #endif
 99 | 
100 | 


--------------------------------------------------------------------------------
/src/baseelkan.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
 3 | Written by James Newling <james.newling@gmail.com>
 4 | All rights reserved.
 5 | 
 6 | eakmeans is a library for exact and approximate k-means written in C++ and
 7 | Python. This file is part of eakmeans. See file COPYING for more details.
 8 | 
 9 | This file is part of eakmeans.
10 | 
11 | eakmeans is free software: you can redistribute it and/or modify
12 | it under the terms of the 3-Clause BSD Licence. See
13 | https://opensource.org/licenses/BSD-3-Clause for more details.
14 | 
15 | eakmeans is distributed in the hope that it will be useful,
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
18 | COPYING for more details.
19 | */
20 | 
21 | #ifndef PLL_BASEELKANKMEANS__H
22 | #define PLL_BASEELKANKMEANS__H
23 | 
24 | namespace kmeans{
25 | 
26 | template <typename TInt, typename TFloat>
27 | class BaseElkan : public kmeans::BaseExact<TInt, TFloat>{
28 | 		
29 | 	protected:
30 | 			
31 | 		
32 | 	public:
33 | 		typedef kmeans::BaseExact<TInt, TFloat> BC;
34 | 		template<typename... Args>
35 | 		BaseElkan(Args&&... args): BC(std::forward<Args>(args)...)
36 | 		
37 | 		{
38 | 			this->assignmemory_elkan_upper_lowers();
39 | 			this->setalgname("elkan base");
40 | 		}
41 | 		
42 | 		virtual ~BaseElkan(){}
43 | 
44 | 		virtual void verbose_write_additional() override {}
45 | 		virtual void set_initialisation_tasks() = 0;
46 | 		virtual void set_C_tasks() = 0;
47 | 		virtual void set_X_tasks() = 0;
48 | 		
49 | 		virtual TInt get_approximate_memory_requirement(){
50 | 			return BC::get_approximate_memory_requirement() + this->get_elkan_base_memory();
51 | 		}
52 | };
53 | 
54 | }
55 | 
56 | #endif
57 | 
58 | 
59 | 
60 | 


--------------------------------------------------------------------------------
/src/baseelkanminibatch.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
 3 | Written by James Newling <james.newling@gmail.com>
 4 | All rights reserved.
 5 | 
 6 | eakmeans is a library for exact and approximate k-means written in C++ and
 7 | Python. This file is part of eakmeans. See file COPYING for more details.
 8 | 
 9 | This file is part of eakmeans.
10 | 
11 | eakmeans is free software: you can redistribute it and/or modify
12 | it under the terms of the 3-Clause BSD Licence. See
13 | https://opensource.org/licenses/BSD-3-Clause for more details.
14 | 
15 | eakmeans is distributed in the hope that it will be useful,
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
18 | COPYING for more details.
19 | */
20 | 
21 | #ifndef PLL_BASEELKANMINIBATCHKMEANS_H
22 | #define PLL_BASEELKANMINIBATCHKMEANS_H
23 | 
24 | #include "baseminibatch.h"
25 | 
26 | namespace kmeans{
27 | template <typename TInt, typename TFloat>
28 | class BaseElkanMiniBatch : public kmeans::BaseMiniBatch<TInt, TFloat>{
29 | 
30 | 		
31 | 	protected:
32 | 			
33 | 		
34 | 	public:
35 | 		typedef kmeans::BaseMiniBatch<TInt, TFloat> BC;
36 | 		template<typename... Args>
37 | 		BaseElkanMiniBatch(Args&&... args): BC(std::forward<Args>(args)...)
38 | 		
39 | 		{
40 | 			this->assignmemory_elkan_upper_lowers();
41 | 			this->setalgname("elkan minibatch base");
42 | 		}
43 | 		
44 | 		virtual ~BaseElkanMiniBatch(){}
45 | 
46 | 		virtual void verbose_write_additional() override {}
47 | 		virtual void set_initialisation_tasks() = 0;
48 | 		virtual void set_C_tasks() = 0;
49 | 		virtual void set_X_tasks() = 0;
50 | 		
51 | 		virtual TInt get_approximate_memory_requirement(){
52 | 			return BC::get_approximate_memory_requirement() + this->get_elkan_base_memory();
53 | 		}
54 | };
55 | 
56 | }
57 | 
58 | #endif
59 | 
60 | 
61 | 
62 | 


--------------------------------------------------------------------------------
/src/baseexact.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
 3 | Written by James Newling <james.newling@gmail.com>
 4 | All rights reserved.
 5 | 
 6 | eakmeans is a library for exact and approximate k-means written in C++ and
 7 | Python. This file is part of eakmeans. See file COPYING for more details.
 8 | 
 9 | This file is part of eakmeans.
10 | 
11 | eakmeans is free software: you can redistribute it and/or modify
12 | it under the terms of the 3-Clause BSD Licence. See
13 | https://opensource.org/licenses/BSD-3-Clause for more details.
14 | 
15 | eakmeans is distributed in the hope that it will be useful,
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
18 | COPYING for more details.
19 | */
20 | 
21 | #ifndef PLL_BASEEXACTKMEANSTRUE_H
22 | #define PLL_BASEEXACTKMEANSTRUE_H
23 | 
24 | #include "basekmeans.h"
25 | 
26 | namespace kmeans{
27 | 
28 | template <typename TInt, typename TFloat>
29 | class BaseExact : public kmeans::BaseKmeans<TInt, TFloat> {
30 | 	
31 | 	private:
32 | 		virtual void set_summaries() {
33 | 			this->set_summaries_exact();
34 | 		}
35 | 		
36 | 		virtual void set_mse() override {
37 | 			this->mse = arrutilv2::getmeanl22at(this->ncentroids, this->dimension, this->get_C(), this->ndata, this->data, this->get_L(), this->get_C_l22s(), this->get_data_l22s());
38 | 		}
39 | 
40 | 	protected:
41 | 	
42 | 		virtual void set_initialisation_tasks() = 0;
43 | 		virtual void set_X_tasks() = 0;		
44 | 		virtual void set_C_tasks() = 0;
45 | 
46 | 		template <typename Function, typename... Args>
47 | 		void pll_principal_X(const Function & X_updater, TInt ti, Args&&... args){
48 | 			this->base_pll_principal_X(static_cast<TInt> (0), this->ndata, X_updater, ti, std::forward<Args>(args)...);
49 | 		}
50 | 		
51 | 		std::function<void(TInt)> set_L_ati(){
52 | 			return this->set_L_ati(0, this->ndata);
53 | 		}
54 | 	
55 | 	public:
56 | 		template<typename... Args>
57 | 		BaseExact(Args&&... args): kmeans::BaseKmeans<TInt, TFloat>(std::forward<Args>(args)...){}
58 | 		
59 | 		virtual ~BaseExact(){}
60 | 	
61 | };
62 | 
63 | }
64 | 
65 | #endif
66 | 


--------------------------------------------------------------------------------
/src/basehamerly.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
  3 | Written by James Newling <james.newling@gmail.com>
  4 | All rights reserved.
  5 | 
  6 | eakmeans is a library for exact and approximate k-means written in C++ and
  7 | Python. This file is part of eakmeans. See file COPYING for more details.
  8 | 
  9 | This file is part of eakmeans.
 10 | 
 11 | eakmeans is free software: you can redistribute it and/or modify
 12 | it under the terms of the 3-Clause BSD Licence. See
 13 | https://opensource.org/licenses/BSD-3-Clause for more details.
 14 | 
 15 | eakmeans is distributed in the hope that it will be useful,
 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
 18 | COPYING for more details.
 19 | */
 20 | 
 21 | #ifndef PLL_HAMERLYBASEKMEANS_H
 22 | #define PLL_HAMERLYBASEKMEANS_H
 23 | 
 24 | 
 25 | namespace kmeans{
 26 | 
 27 | template <typename TInt, typename TFloat>
 28 | class BaseHamerly : public kmeans::BaseExact<TInt, TFloat>{
 29 | 	
 30 | 	private:
 31 | 	
 32 | 		std::unique_ptr<TFloat []> CC;
 33 | 		std::unique_ptr<TFloat []> halfminCC;
 34 | 		std::unique_ptr<TFloat []> delta_C;
 35 | 		std::unique_ptr<TFloat []> lower_base;
 36 | 		std::unique_ptr<TFloat []> upper_base;
 37 | 		
 38 | 		
 39 | 		std::vector<std::function<void(TInt)> > makeset_C_C_l22s_L_inds0_lower_upper_mati(){
 40 | 		
 41 | 			std::vector<std::function<void(TInt)> > tasks;
 42 | 			
 43 | 			tasks = this->exact_makeset_C_C_l22s_inds0_mati();	
 44 | 			tasks.push_back(	
 45 | 				[this](TInt ti){
 46 | 					TInt local_ndcalcs = 0;
 47 | 					TInt x0 = (ti*this->getndata())/this->getnthreads();
 48 | 					TInt x1 = ((ti+1)*this->getndata())/this->getnthreads();	
 49 | 					arrutilv2::set_L2_dn(x1 - x0, this->getdimension(), this->getdata() + x0*this->getdimension(), this->getncentroids(), this->get_C(), this->get_data_l22s() + x0, this->get_C_l22s(), this->get_L() + x0, this->get_upper_base() + x0, this->get_lower_base() + x0, local_ndcalcs);
 50 | 					this->ndcalcs_notX += local_ndcalcs;
 51 | 				}
 52 | 			);
 53 | 			tasks.push_back(
 54 | 			//set starting mse. TODO : parallelise.	
 55 | 				[this](TInt ti){	
 56 | 					if (ti == 0 && this->get_initialisation_method().compare("kmeans++") != 0){ // && this->getfileptr()->is_open()
 57 | 						this->mse = 0;
 58 | 						for (TInt i = 0; i < this->getndata(); ++i){
 59 | 							this->mse += (this->get_upper_base()[i])*(this->get_upper_base()[i]);
 60 | 						}
 61 | 						this->mse /= static_cast<TFloat>(this->getndata());
 62 | 					}
 63 | 					
 64 | 				}
 65 | 			);
 66 | 
 67 | 			return tasks;	
 68 | 		}
 69 | 			
 70 | 			
 71 | 	protected:
 72 | 	
 73 | 		TFloat * const get_CC(){
 74 | 			return CC.get();
 75 | 		}
 76 | 		
 77 | 		TFloat * const get_halfminCC(){
 78 | 			return halfminCC.get();
 79 | 		}
 80 | 		
 81 | 		TFloat * const get_lower_base(){
 82 | 			return lower_base.get();
 83 | 		}
 84 | 		
 85 | 		TFloat * const get_upper_base(){
 86 | 			return upper_base.get();
 87 | 		}
 88 | 		
 89 | 		TFloat * const get_delta_C(){
 90 | 			return delta_C.get();
 91 | 		}
 92 | 	
 93 | 		std::vector<std::function<void(TInt)> > makeset_C_C_l22s_L_inds0_lower_upper_S_H_mati(){
 94 | 			auto init_tasks_A = this->makeset_C_C_l22s_L_inds0_lower_upper_mati();
 95 | 
 96 | 			
 97 | 			 
 98 | 			auto init_task_B = arrutilv2::set_S_H_ati(this->nthreads, this->ndata, this->dimension, this->data, this->ncentroids, this->get_C(), this->get_data_l22s(), this->get_C_l22s(), this->get_L(), this->get_dsums(), this->get_dcounts(), this->get_sums(), this->get_counts(), this->work_mutex);	
 99 | 
100 | 			auto initialisation_tasks = std::move(init_tasks_A);
101 | 			initialisation_tasks.push_back(std::move(init_task_B));
102 | 			return initialisation_tasks;
103 | 		}	
104 | 			
105 | 	public:
106 | 		template<typename... Args>
107 | 		BaseHamerly(Args&&... args): kmeans::BaseExact<TInt, TFloat>(std::forward<Args>(args)...), 
108 | 		
109 | 		CC{ new TFloat [this->getncentroids()*this->getncentroids()] },
110 | 		halfminCC{ new TFloat [this->getncentroids()] },
111 | 		delta_C{ new TFloat [this->getncentroids()] },
112 | 		lower_base{ new TFloat [this->getndata()]  },
113 | 		upper_base{ new TFloat [this->getndata()] }
114 | 		
115 | 		{
116 | 			this->setalgname("BaseHamerly");
117 | 		}
118 | 		virtual ~BaseHamerly(){}
119 | 
120 | 		virtual void verbose_write_additional(){
121 | 			kmeans::BaseExact<TInt, TFloat>::verbose_write_additional();
122 | 			this->get_verbose_file() << "\nlower_base:\n" << lower_base[0] << "\n";			
123 | 			this->get_verbose_file() << "\n\nupper_base:\n" << upper_base[0] << "\n";
124 | 			/* anything else to print ? */
125 | 		}
126 | 
127 | 		virtual void set_initialisation_tasks() = 0;
128 | 		virtual void set_C_tasks() = 0;
129 | 		virtual void set_X_tasks() = 0;
130 | 
131 | };
132 | 
133 | }
134 | 
135 | #endif
136 | 
137 | 
138 | 
139 | 


--------------------------------------------------------------------------------
/src/baseminibatch.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
  3 | Written by James Newling <james.newling@gmail.com>
  4 | All rights reserved.
  5 | 
  6 | eakmeans is a library for exact and approximate k-means written in C++ and
  7 | Python. This file is part of eakmeans. See file COPYING for more details.
  8 | 
  9 | This file is part of eakmeans.
 10 | 
 11 | eakmeans is free software: you can redistribute it and/or modify
 12 | it under the terms of the 3-Clause BSD Licence. See
 13 | https://opensource.org/licenses/BSD-3-Clause for more details.
 14 | 
 15 | eakmeans is distributed in the hope that it will be useful,
 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
 18 | COPYING for more details.
 19 | */
 20 | 
 21 | #ifndef PLL_BASEMINIBATCHKMEANS_H
 22 | #define PLL_BASEMINIBATCHKMEANS_H
 23 | 
 24 | #include "basekmeans.h"
 25 | #include "minibatchapp.h"
 26 | 
 27 | namespace kmeans{
 28 | template <typename TInt, typename TFloat>
 29 | class BaseMiniBatch : public kmeans::BaseKmeans<TInt, TFloat>{
 30 | 	//Base for mini-batch k-means on top of BaseKmeans: holds the batching state (mba) and leaves set_C_tasks to derived classes.
 31 | 	private:
 32 | 		//mse is obtained through minibatch_set_mse, which is handed the batching state
 33 | 		virtual void set_mse() override final {
 34 | 			this->minibatch_set_mse(this->mba);
 35 | 		}
 36 | 		//summaries likewise go through set_summaries_minibatch
 37 | 		virtual void set_summaries() override final {
 38 | 			this->set_summaries_minibatch(this->mba);
 39 | 		}
 40 | 	
 41 | 	protected:
 42 | 		//batching state, (re)assigned in constructor_helper from (batchsize, ndata)
 43 | 		minibatchapp::MiniBatchApp<TInt> mba;		
 44 | 		//set in constructor_helper to max(1, (ndata * dimension) / (2 * ncentroids * nthreads))
 45 | 		TInt maxpermultiplyblock;
 46 | 
 47 | 
 48 | 		//the centroid-update tasks are supplied by derived classes
 49 | 		virtual void set_C_tasks() = 0;
 50 | 
 51 | 		
 52 | 		//set S, H from first batch: base_set_S_H_ati is called with the bounds (0, mba.initialising_batch_size)
 53 | 		std::function<void(TInt)> set_S_H_ati(){
 54 | 			return this->base_set_S_H_ati(static_cast<TInt> (0), this->mba.initialising_batch_size);
 55 | 		}
 56 | 		
 57 | 		//Calls arrutilv2::pll_update_L_etc with X_updater, thread ti's slices of dsums and dcounts, the change counter of batch (subround + 1) % nsubrounds, and any extra args forwarded unchanged.
 58 | 		//Not as code reducing as the baseexact version,  but easier to understand
 59 | 		template <typename Function, typename... Args>
 60 | 		void mb_pll_principal_X(const Function & X_updater, TInt ti, Args&&... args){
 61 | 	
 62 | 			arrutilv2::pll_update_L_etc(
 63 | 			//The compulsory parameters to pll_update_L_etc,
 64 | 			X_updater, 
 65 | 			this->ncentroids, this->dimension, this->get_sums(), this->get_dsums() + ti*this->dimension*this->ncentroids, this->get_counts(), this->get_dcounts() + ti*this->ncentroids, this->mba.nchanges_on_batch[(this->mba.subround + 1)%this->mba.nsubrounds], this->ndcalcs_X, this->work_mutex
 66 | 			//The additional parameters to pll_update_L_etc with correct offset
 67 | 			, std::forward<Args>(args)...);
 68 | 		}
 69 | 
 70 | 	
 71 | 		
 72 | 	public:
 73 | 		void constructor_helper(const TInt & batchsize){
 74 | 			//(re)build the batching state for this batch size and data count, then name the algorithm
 75 | 			this->mba = minibatchapp::MiniBatchApp<TInt>(batchsize, this->ndata);
 76 | 			this->setalgname("Base Mini Batch Kmeans");
 77 | 				
 78 | 			this->maxpermultiplyblock = //10000000; 
 79 | 			std::max(static_cast<TInt> (1),
 80 | 			static_cast<TInt> ((this->getndata() * this->getdimension())/(2 * this->getncentroids() * this->nthreads)));
 81 | 			//NOTE(review): the quotient above divides by ncentroids * nthreads; this assumes both are nonzero -- confirm callers guarantee it
 82 | 			
 83 | 
 84 | 	
 85 | 			//TODO : move to summaries:
 86 | 			std::cout << "batchsize : " << batchsize << "  nsubrounds : " << this->mba.nsubrounds << "  lastbatchsize : " << this->mba.lastbatchsize << "   maxpermultiply : " << this->maxpermultiplyblock << "  initialising_batch_size : " << this->mba.initialising_batch_size << std::endl;
 87 | 
 88 | 		
 89 | 		}
 90 | 					
 91 | 		/* overly hungry, consult Meyers to see how I can prevent this. 
 92 | 		 * if non-standard constructor args, use variadic args ala Eli Bendersky. 
 93 | 		 * Note that these won't be initialised by extern template class, so changes here will require full remake
 94 | 		 * */ 
 95 | 		 template<typename... Args>
 96 | 		 BaseMiniBatch(TInt batchsize, Args&&... args): kmeans::BaseKmeans<TInt, TFloat> (std::forward<Args>(args)...){
 97 | 			 this->constructor_helper(batchsize);
 98 | 		 }
 99 | 		 		
100 | 		virtual ~BaseMiniBatch(){};
101 | 
102 | };
103 | 
104 | 
105 | }
106 | 
107 | 
108 | #endif
109 | 
110 | 


--------------------------------------------------------------------------------
/src/basesimpleexact.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
 3 | Written by James Newling <james.newling@gmail.com>
 4 | All rights reserved.
 5 | 
 6 | eakmeans is a library for exact and approximate k-means written in C++ and
 7 | Python. This file is part of eakmeans. See file COPYING for more details.
 8 | 
 9 | This file is part of eakmeans.
10 | 
11 | eakmeans is free software: you can redistribute it and/or modify
12 | it under the terms of the 3-Clause BSD Licence. See
13 | https://opensource.org/licenses/BSD-3-Clause for more details.
14 | 
15 | eakmeans is distributed in the hope that it will be useful,
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
18 | COPYING for more details.
19 | */
20 | 
21 | #ifndef PLL_SIMPLEKBASEKMEANS_H
22 | #define PLL_SIMPLEKBASEKMEANS_H
23 | 
24 | #include "baseexact.h"
25 | 
26 | 
27 | namespace kmeans{
28 | 
29 | //two simple versions inherit from this class : simplebatch (distances calculated in batches of data) and simple (memory light version)
30 | template <typename TInt, typename TFloat>
31 | class BaseSimpleExactKmeans : public kmeans::BaseExact<TInt, TFloat>{
32 | 	/* abstract base: set_X_tasks is left to the derived variants */
33 | 	public:
34 | 		/* perfect-forwards every constructor argument to BaseExact, then names the algorithm */
35 | 		template<typename... Args>
36 | 		BaseSimpleExactKmeans(Args&&... args): kmeans::BaseExact<TInt, TFloat> (std::forward<Args>(args)...) {
37 | 			this->setalgname("simple base");
38 | 		}
39 | 		virtual ~BaseSimpleExactKmeans(){};
40 | 	protected:
41 | 		/* initialisation tasks: those from makeset_C_C_l22s_L_inds0_mati, then the task from base_set_S_H_ati(0, ndata) appended last */
42 | 		virtual void set_initialisation_tasks(){
43 | 			auto leading_tasks = this->makeset_C_C_l22s_L_inds0_mati();
44 | 			auto sums_counts_task = this->base_set_S_H_ati(static_cast<TInt>(0), this->ndata);
45 | 			this->initialisation_tasks = std::move(leading_tasks);
46 | 			this->initialisation_tasks.push_back(std::move(sums_counts_task));
47 | 		}
48 | 
49 | 		/* the data-pass tasks are what the derived variants provide */
50 | 		virtual void set_X_tasks() = 0;
51 | 
52 | 		/* a single centroid task, built by arrutilv2::update_C_C_l22s_from_SH_ati */
53 | 		virtual void set_C_tasks(){
54 | 			this->C_tasks = {
55 | 				arrutilv2::update_C_C_l22s_from_SH_ati(
56 | 					this->nthreads, this->ncentroids, this->dimension,
57 | 					this->get_sums(), this->get_counts(),
58 | 					this->get_C(), this->get_C_l22s())
59 | 			};
60 | 		}
61 | };
62 | 
63 | }
64 | 
65 | #endif
66 | 
67 | 


--------------------------------------------------------------------------------
/src/basesimpleminibatch.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
  3 | Written by James Newling <james.newling@gmail.com>
  4 | All rights reserved.
  5 | 
  6 | eakmeans is a library for exact and approximate k-means written in C++ and
  7 | Python. This file is part of eakmeans. See file COPYING for more details.
  8 | 
  9 | This file is part of eakmeans.
 10 | 
 11 | eakmeans is free software: you can redistribute it and/or modify
 12 | it under the terms of the 3-Clause BSD Licence. See
 13 | https://opensource.org/licenses/BSD-3-Clause for more details.
 14 | 
 15 | eakmeans is distributed in the hope that it will be useful,
 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
 18 | COPYING for more details.
 19 | */
 20 | 
 21 | #ifndef PLL_BASESIMPLEMINIBATCHKMEANS_H
 22 | #define PLL_BASESIMPLEMINIBATCHKMEANS_H
 23 | 
 24 | #include "baseminibatch.h"
 25 | 
 26 | namespace kmeans{
 27 | template <typename TInt, typename TFloat>
 28 | class BaseSimpleMiniBatch : public kmeans::BaseMiniBatch<TInt, TFloat>{
 29 | 	//Dense mini-batch base: cuts the current batch into per-thread slices [x0, x1) and leaves the actual update (update_L_S_H) to derived classes.
 30 | 
 31 | 	private:
 32 | 		//derived-class worker for thread ti on samples [x0, x1) (absolute indices)
 33 | 		virtual void update_L_S_H(TInt x0, TInt x1, TInt ti) = 0;
 34 | 		
 35 | 		virtual std::function<void(TInt)> update_L_S_H_ati() override final{
 36 | 			//builds the per-thread task: select the batch, slice it for thread ti, run update_L_S_H, then add to ndcalcs_X
 37 | 			return [this](TInt ti){
 38 | 				//the batch to use this round (same for all threads)				
 39 | 				TInt ind0 = this->mba.batchsize*((this->mba.subround + 1) % this->mba.nsubrounds); //(this->round%this->mba.nsubrounds); //not this->mba.subround!
 40 | 				TInt ind1 = std::min(ind0 + this->mba.batchsize, this->ndata);
 41 | 				TInt thisbatchsize = ind1 - ind0;
 42 | 				
 43 | 				//absolute indices of data to process on this threads
 44 | 				TInt x0 = ind0 + (ti*thisbatchsize)/this->nthreads;
 45 | 				TInt x1 = ind0 + ((ti + 1)*thisbatchsize)/this->nthreads;
 46 | 			
 47 | 				this->update_L_S_H(x0, x1, ti);
 48 | 				//adds (slice size) * ncentroids to the ndcalcs_X counter
 49 | 				this->ndcalcs_X += (x1 - x0)*this->ncentroids;
 50 | 			};
 51 | 		}
 52 | 
 53 | 
 54 | 
 55 | 
 56 | 
 57 | 
 58 | 	protected:
 59 | 	
 60 | 		//ncentroids entries, allocated in the constructor; passed to the centroid task in set_C_tasks
 61 | 		std::unique_ptr<TFloat []> delta_C;	
 62 | 				
 63 | 		virtual void set_C_tasks() override final {
 64 | 			this->C_tasks = {
 65 | 				arrutilv2::update_C_C_l22s_delta_C_from_SH_ati(this->nthreads, this->ncentroids, this->dimension, this->get_sums(), this->get_counts(), this->get_C(), this->get_C_l22s(), this->delta_C.get(), this->ndcalcs_notX)
 66 | 			};
 67 | 		}
 68 | 		
 69 | 
 70 | 		//some initialisation, using the first batch if necessary
 71 | 		std::vector<std::function<void(TInt)> > makeset_C_C_l22s_L_inds0_mati(){
 72 | 			return this->minibatch_makeset_C_C_l22s_L_inds0_mati(this->mba);
 73 | 		}
 74 | 
 75 | 		
 76 | 		
 77 | 		//initialisation tasks: the makeset tasks, then the S, H task from set_S_H_ati appended last
 78 | 		virtual void set_initialisation_tasks() override final{
 79 | 			auto init_tasks_A = this->makeset_C_C_l22s_L_inds0_mati();
 80 | 			auto init_task_B = this->set_S_H_ati();
 81 | 			this->initialisation_tasks = std::move(init_tasks_A);
 82 | 			this->initialisation_tasks.push_back(std::move(init_task_B));
 83 | 		}
 84 | 		
 85 | 		
 86 | 
 87 | 		//X tasks, in list order: update_L_S_H_ati, then minibatch_subround_update
 88 | 		virtual void set_X_tasks() override final {
 89 | 			this->X_tasks = {
 90 | 				
 91 | 				//[this](TInt ti){
 92 | 					//std::cout << "\nnchanges_on_batch" << std::endl;
 93 | 					//for (TInt a = 0; a < this->mba.nsubrounds; ++a){
 94 | 						//std::cout << this->mba.nchanges_on_batch[a] << " ";
 95 | 					//}
 96 | 					//std::cout << std::endl;
 97 | 				//},
 98 | 				
 99 | 				this->update_L_S_H_ati(),
100 | 				this->minibatch_subround_update(this->mba)
101 | 			};
102 | 		}
103 | 		
104 | 		//the two wrappers below pass identical arguments (slice [x0, x1), thread ti's dsums/dcounts, change counter of batch (subround + 1) % nsubrounds) to two different arrutilv2 kernels
105 | 		void update_L_S_H_batch_increment_only(TInt x0, TInt  x1, TInt ti){
106 | 			arrutilv2::update_L_S_H_batch_increment_only(x1-x0, this->maxpermultiplyblock, this->dimension, this->data + x0*this->dimension, this->ncentroids, this->get_C(), this->get_data_l22s() + x0, this->get_C_l22s(), this->get_L() + x0, this->get_dsums() + ti*this->dimension*this->ncentroids, this->get_dcounts() + ti*this->ncentroids,  this->get_sums(), this->get_counts(), this->mba.nchanges_on_batch[(this->mba.subround + 1)%this->mba.nsubrounds], this->work_mutex);
107 | 		}
108 | 		
109 | 		
110 | 		void update_L_S_H_batch(TInt x0, TInt  x1,  TInt ti){
111 | 			arrutilv2::update_L_S_H_batch(x1-x0, this->maxpermultiplyblock, this->dimension, this->data + x0*this->dimension, this->ncentroids, this->get_C(), this->get_data_l22s() + x0, this->get_C_l22s(), this->get_L() + x0, this->get_dsums() + ti*this->dimension*this->ncentroids, this->get_dcounts() + ti*this->ncentroids,  this->get_sums(), this->get_counts(), this->mba.nchanges_on_batch[(this->mba.subround + 1)%this->mba.nsubrounds], this->work_mutex);
112 | 		}
113 | 	
114 | 
115 | 
116 | 		
117 | 		
118 | 	public:
119 | 	//forwards all constructor arguments to BaseMiniBatch, then allocates delta_C with one TFloat per centroid
120 | 	template<typename... Args>
121 | 	BaseSimpleMiniBatch(Args&&... args): kmeans::BaseMiniBatch<TInt, TFloat> (std::forward<Args>(args)...){
122 | 	 			this->delta_C = std::unique_ptr<TFloat []> (new TFloat [this->ncentroids]);
123 | 	}
124 | 		
125 | 	virtual ~BaseSimpleMiniBatch(){};
126 | 	
127 | };
128 | }
129 | 
130 | #endif
131 | 


--------------------------------------------------------------------------------
/src/basesparseelkan.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
 3 | Written by James Newling <james.newling@gmail.com>
 4 | All rights reserved.
 5 | 
 6 | eakmeans is a library for exact and approximate k-means written in C++ and
 7 | Python. This file is part of eakmeans. See file COPYING for more details.
 8 | 
 9 | This file is part of eakmeans.
10 | 
11 | eakmeans is free software: you can redistribute it and/or modify
12 | it under the terms of the 3-Clause BSD Licence. See
13 | https://opensource.org/licenses/BSD-3-Clause for more details.
14 | 
15 | eakmeans is distributed in the hope that it will be useful,
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
18 | COPYING for more details.
19 | */
20 | 
21 | #ifndef PLL_BASESPARSEELKANKMEANS__H
22 | #define PLL_BASESPARSEELKANKMEANS__H
23 | 
24 | #include "basesparseexact.h"
25 | #include "sparseutil.h"
26 | 
27 | namespace kmeans{
28 | 
29 | template <typename TInt, typename TFloat>
30 | class BaseSparseElkan : public kmeans::BaseSparseExact<TInt, TFloat>{
31 | 	//Elkan base for sparse data: the constructor calls assignmemory_elkan_upper_lowers; the three task setters stay pure virtual.
32 | 	protected:
33 | 		//for each sample i in [x0, x1): one sparse::set_argminmin_rl2s call over that sample's stored entries, given L[i], elkan_upper_base[i] and row i of elkan_lowers_base (presumably its outputs -- confirm in sparseutil.h)
34 | 		virtual void set_upper_lowers_L(TInt x0, TInt x1) override final{ /* from basedensecentroidkmeans */
35 | 			for (TInt i = x0; i < x1; ++i){								
36 | 				sparse::set_argminmin_rl2s(this->ptrdata->starts[i+1] - this->ptrdata->starts[i], this->ptrdata->indices.data() + this->ptrdata->starts[i],  this->ptrdata->values.data() + this->ptrdata->starts[i], this->ptrdata->dimension, this->ncentroids, this->get_C(), this->data_l22s[i], this->get_C_l22s(), this->L[i], this->elkan_upper_base[i], this->elkan_lowers_base.get() + i*this->ncentroids);
37 | 			}
38 | 			//adds ncentroids per processed sample to the ndcalcs_X counter
39 | 			this->ndcalcs_X += this->ncentroids*(x1 - x0);
40 | 		}
41 | 		
42 | 	public:
43 | 		typedef kmeans::BaseSparseExact<TInt, TFloat> BC;
44 | 		template<typename... Args>
45 | 		BaseSparseElkan(Args&&... args): BC(std::forward<Args>(args)...)
46 | 		//all constructor arguments are forwarded to BaseSparseExact
47 | 		{
48 | 			this->assignmemory_elkan_upper_lowers();
49 | 			this->setalgname("sparse elkan base");
50 | 		}
51 | 		
52 | 		virtual ~BaseSparseElkan(){}
53 | 		//overrides the inherited verbose_write_additional with an empty body
54 | 		virtual void verbose_write_additional() override {}
55 | 		virtual void set_initialisation_tasks() = 0;
56 | 		virtual void set_C_tasks() = 0;
57 | 		virtual void set_X_tasks() = 0;
58 | 		//base-class estimate plus get_elkan_base_memory()
59 | 		virtual TInt get_approximate_memory_requirement(){
60 | 			return BC::get_approximate_memory_requirement() + this->get_elkan_base_memory();
61 | 		}
62 | };
63 | 
64 | }
65 | 
66 | #endif
67 | 
68 | 
69 | 
70 | 


--------------------------------------------------------------------------------
/src/basesparseexact.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
 3 | Written by James Newling <james.newling@gmail.com>
 4 | All rights reserved.
 5 | 
 6 | eakmeans is a library for exact and approximate k-means written in C++ and
 7 | Python. This file is part of eakmeans. See file COPYING for more details.
 8 | 
 9 | This file is part of eakmeans.
10 | 
11 | eakmeans is free software: you can redistribute it and/or modify
12 | it under the terms of the 3-Clause BSD Licence. See
13 | https://opensource.org/licenses/BSD-3-Clause for more details.
14 | 
15 | eakmeans is distributed in the hope that it will be useful,
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
18 | COPYING for more details.
19 | */
20 | 
21 | #ifndef PLL_BASESPARSEEXACT_H
22 | #define PLL_BASESPARSEEXACT_H
23 | 
24 | #include "basesparsekmeans.h"
25 | #include <stdexcept>
26 | 
27 | namespace kmeans{
28 | 
29 | template <typename TInt, typename TFloat>
30 | class BaseSparseExact : public kmeans::BaseSparseKmeans<TInt, TFloat> {
31 | 	//Exact k-means base for sparse data: computes mse over all ndata samples; the three task setters stay pure virtual.
32 | 	private: 
33 | 		virtual void set_mse() override{
34 | 			TFloat sse = 0; //running sum of the per-sample terms below
35 | 			for (TInt i = 0; i < this->ndata; ++i){
36 | 				sse += //per sample: data_l22s[i] + C_l22s[label] - 2 * get_inner(sample i, centroid row of its label); a squared distance if the l22s arrays hold squared norms -- TODO confirm
37 | 				this->data_l22s[i] + this->C_l22s[this->L[i]]				
38 | 				-2.*sparse::get_inner(this->ptrdata->starts[i+1] - this->ptrdata->starts[i], 
39 | 				this->ptrdata->indices.data() + this->ptrdata->starts[i], 
40 | 				this->ptrdata->values.data() + this->ptrdata->starts[i],
41 | 				this->get_C() + this->dimension*this->L[i]);
42 | 			}
43 | 			this->mse =  sse / static_cast<TFloat> (this->ndata); //mean of the summed terms over all samples
44 | 		}
45 | 
46 | 		
47 | 	public:
48 | 		template<typename... Args>
49 | 		BaseSparseExact(Args&&... args): kmeans::BaseSparseKmeans<TInt, TFloat> (std::forward<Args>(args)...) {
50 | 			this->setalgname("base-sparse-exact-kmeans");
51 | 		}
52 | 		
53 | 		virtual ~BaseSparseExact(){};
54 | 	
55 | 	protected:
56 | 		virtual void set_initialisation_tasks() = 0;
57 | 		virtual void set_X_tasks() = 0;		
58 | 		virtual void set_C_tasks() = 0;
59 | 		//summaries go through set_summaries_exact
60 | 		virtual void set_summaries(){
61 | 			this->set_summaries_exact();
62 | 		}
63 | 		
64 | 
65 | 
66 | 
67 | 		//A hack: the ati suffix suggests parallelisation, but there is none here; base_set_S_H_ati is given the bounds (0, ndata)
68 | 		std::function<void(TInt)> set_S_H_ati(){
69 | 			return this->base_set_S_H_ati(static_cast<TInt>(0), this->ndata);
70 | 		}
71 | 
72 | 		//default implementation throws std::runtime_error; derived classes are expected to override it
73 | 		virtual void verbose_write_additional(){
74 | 			throw std::runtime_error("verbose_write_additional needs implementing in basesparseexact");
75 | 		}
76 | 
77 | 	
78 | 
79 | };
80 | } 
81 | 
82 | 
83 | #endif
84 | 


--------------------------------------------------------------------------------
/src/basesparseminibatch.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
  3 | Written by James Newling <james.newling@gmail.com>
  4 | All rights reserved.
  5 | 
  6 | eakmeans is a library for exact and approximate k-means written in C++ and
  7 | Python. This file is part of eakmeans. See file COPYING for more details.
  8 | 
  9 | This file is part of eakmeans.
 10 | 
 11 | eakmeans is free software: you can redistribute it and/or modify
 12 | it under the terms of the 3-Clause BSD Licence. See
 13 | https://opensource.org/licenses/BSD-3-Clause for more details.
 14 | 
 15 | eakmeans is distributed in the hope that it will be useful,
 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
 18 | COPYING for more details.
 19 | */
 20 | 
 21 | #ifndef PLL_BASESPARSEMINIBATCHKMEANS_H
 22 | #define PLL_BASESPARSEMINIBATCHKMEANS_H
 23 | 
 24 | #include "basesparsekmeans.h"
 25 | #include "minibatchapp.h"
 26 | 
 27 | namespace kmeans{
 28 | template <typename TInt, typename TFloat>
 29 | class BaseSparseMiniBatch : public kmeans::BaseSparseKmeans<TInt, TFloat>{
 30 | 	//Mini-batch base for sparse data: relabels one batch per round in parallel, then lets the derived class adjust S, H (post_L_adjust_S_H) on thread 0.
 31 | 	private:
 32 | 
 33 | 
 34 | 
 35 | 		//different versions for sparsestandardminibatch and sparseminibatch (my version, where not just a naive add)
 36 | 		virtual void post_L_adjust_S_H() = 0;
 37 | 		
 38 | 		//update L, label_changes in pll on batch specified by round.
 39 | 		virtual std::function<void(TInt)> update_L_label_changes_ati(){
 40 | 			return [this](TInt ti){
 41 | 				//the batch is [data0, data1) with data0 = batchsize * (round % nsubrounds); thread ti takes the slice [x0, x1)
 42 | 				//NOTE(review): the batch is chosen from this->round, but the change counter below is indexed by mba.subround -- confirm the two always refer to the same batch
 43 | 				TInt data0 = this->mba.batchsize*(this->round%this->mba.nsubrounds);
 44 | 				TInt data1 = std::min(data0 + this->mba.batchsize, this->ndata);
 45 | 				TInt ndata_batch = data1 - data0;
 46 | 				TInt x0 = data0 + (ti*ndata_batch)/this->nthreads;
 47 | 				TInt x1 = data0 + ((ti+1)*ndata_batch)/this->nthreads;
 48 | 				
 49 | 				//std::cout << "\nupdating in [ " << x0 << ", " << x1 << " ] " << std::endl;
 50 | 				
 51 | 				//this thread's change list is emptied, then handed to sparse::update_L (presumably filled with one entry per relabelled sample -- confirm in sparseutil.h)
 52 | 				this->where_label_changes[ti].clear(); //index, old, new.
 53 | 				sparse::update_L(*this->ptrdata, x0, x1, this->ncentroids, this->get_C(), this->get_data_l22s(), this->get_C_l22s(), this->get_L(), this->where_label_changes[ti]);
 54 | 				this->ndcalcs_X += this->ncentroids*(x1 - x0);
 55 | 				//the per-batch change counter is updated while holding work_mutex
 56 | 				std::lock_guard<std::mutex> gluk(this->work_mutex);
 57 | 				this->mba.nchanges_on_batch[this->mba.subround] += this->where_label_changes[ti].size();
 58 | 			};
 59 | 		}
 60 | 	
 61 | 		//mse and summaries both go through the minibatch_* helpers, which are handed the batching state
 62 | 		virtual void set_mse() override final {
 63 | 			this->minibatch_set_mse(this->mba);
 64 | 		}
 65 | 	
 66 | 		virtual void set_summaries() override final {
 67 | 			this->set_summaries_minibatch(this->mba);
 68 | 		}
 69 | 	
 70 | 	protected:
 71 | 		//batching state, (re)assigned in constructor_helper from (batchsize, ndata)
 72 | 		minibatchapp::MiniBatchApp<TInt> mba;		
 73 | 				
 74 | 		virtual void set_C_tasks() override final {
 75 | 			this->C_tasks = {
 76 | 				arrutilv2::update_C_C_l22s_from_SH_ati(this->nthreads, this->ncentroids, this->dimension, this->get_sums(), this->get_counts(), this->get_C(), this->get_C_l22s())
 77 | 			};
 78 | 		}
 79 | 		
 80 | 		//set S, H from first batch: only thread 0 does anything, calling set_S_H(0, mba.initialising_batch_size)
 81 | 		std::function<void(TInt)> set_S_H_from_initial_batch_ati(){
 82 | 			return [this](TInt ti){
 83 | 				if (ti == 0){
 84 | 					this->set_S_H(static_cast<TInt>(0), this->mba.initialising_batch_size);
 85 | 				}
 86 | 			};
 87 | 		}
 88 | 		
 89 | 
 90 | 		std::vector<std::function<void(TInt)> > makeset_C_C_l22s_L_inds0_mati(){
 91 | 			return this->minibatch_makeset_C_C_l22s_L_inds0_mati(this->mba);
 92 | 		}
 93 | 
 94 | 		virtual void set_initialisation_tasks(){
 95 | 			auto init_tasks_A = this->makeset_C_C_l22s_L_inds0_mati();
 96 | 			auto init_task_B = this->set_S_H_from_initial_batch_ati();
 97 | 			this->initialisation_tasks = std::move(init_tasks_A);
 98 | 			//NOTE(review): the next task is an empty lambda placed between the makeset tasks and the S, H task; its purpose is not visible from this file
 99 | 			this->initialisation_tasks.push_back([this](TInt ti)
100 | 			{
101 | 
102 | 			});
103 | 			
104 | 			this->initialisation_tasks.push_back(std::move(init_task_B));
105 | 		}
106 | 		
107 | 		
108 | 
109 | 		//X tasks, in list order: relabel the batch, post_L_adjust_S_H on thread 0 only, then minibatch_subround_update
110 | 		virtual void set_X_tasks(){
111 | 			this->X_tasks = {
112 | 				this->update_L_label_changes_ati(),
113 | 				[this](TInt ti){
114 | 					if (ti == 0){
115 | 						this->post_L_adjust_S_H();
116 | 					}
117 | 				},
118 | 				this->minibatch_subround_update(this->mba)
119 | 			};
120 | 		}	
121 | 	
122 | 		
123 | 	public:
124 | 		void constructor_helper(const TInt & batchsize){
125 | 			//(re)build the batching state from (batchsize, ndata) and name the algorithm
126 | 			
127 | 			this->mba = minibatchapp::MiniBatchApp<TInt>(batchsize, this->ndata);			
128 | 			this->setalgname("Base Sparse Mini Batch Kmeans");
129 | 				
130 | 		}
131 | 		//batchsize is consumed here; all remaining arguments are forwarded to BaseSparseKmeans
132 | 		template<typename... Args>
133 | 		BaseSparseMiniBatch(TInt batchsize, Args&&... args): kmeans::BaseSparseKmeans<TInt, TFloat> (std::forward<Args>(args)...){
134 | 			
135 | 			this->constructor_helper(batchsize);
136 | 		}
137 | 		 		
138 | 		virtual ~BaseSparseMiniBatch(){};
139 | 
140 | };
141 | 
142 | 
143 | }
144 | 
145 | 
146 | #endif
147 | 
148 | 


--------------------------------------------------------------------------------
/src/elkan3v0.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
 3 | Written by James Newling <james.newling@gmail.com>
 4 | All rights reserved.
 5 | 
 6 | eakmeans is a library for exact and approximate k-means written in C++ and
 7 | Python. This file is part of eakmeans. See file COPYING for more details.
 8 | 
 9 | This file is part of eakmeans.
10 | 
11 | eakmeans is free software: you can redistribute it and/or modify
12 | it under the terms of the 3-Clause BSD Licence. See
13 | https://opensource.org/licenses/BSD-3-Clause for more details.
14 | 
15 | eakmeans is distributed in the hope that it will be useful,
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
18 | COPYING for more details.
19 | */
20 | 
21 | #ifndef PLL_ELKANKMEANS_3V0_H
22 | #define PLL_ELKANKMEANS_3V0_H
23 | 
24 | #include "baseelkan.h"
25 | #include "alg_X_selkSN.h"
26 | 
27 | namespace kmeans{
28 | 
29 | /* discrepancy in ndcalcs as compared to a3v0 due to not computing CC initially (I propose) */
30 | 
31 | template <typename TInt, typename TFloat>
32 | class P3V0 : public kmeans::BaseElkan<TInt, TFloat>{
33 | 	/* Elkan variant "p3v0": adds a delta_C array (ncentroids entries) that is passed to both the centroid task and the X task. */
34 | 	protected:
35 | 		/* raw-pointer accessors; the return type is plain TFloat * because a top-level const on a pointer returned by value is ignored */
36 | 		TFloat * get_lowers(){
37 | 			return this->elkan_lowers_base.get();
38 | 		}
39 | 		TFloat * get_upbs(){
40 | 			return this->elkan_upper_base.get();
41 | 		}
42 | 		TFloat * get_delta_C(){
43 | 			return this->elkan_delta_C.get();
44 | 		}
45 | 		
46 | 		/* per-thread X task: x0 = ti*ndata/nthreads; L, lowers (row stride ncentroids) and upbs are passed offset to sample x0 */
47 | 		std::function<void(TInt)> update_L_lowers_upper_S_H_3v0_ati(){
48 | 			return [this](TInt ti){
49 | 				TInt x0 = (ti*this->getndata())/this->getnthreads();
50 | 				this->pll_principal_X(update_L_lowers_upper_S_H_3v0<TInt, TFloat>, ti, this->get_delta_C(), this->get_L() + x0,  this->get_lowers() + x0*this->getncentroids(), this->get_upbs() + x0, this->round);
51 | 			};
52 | 		}
53 | 		
54 | 		
55 | 	public:
56 | 		typedef kmeans::BaseElkan<TInt, TFloat> EB;
57 | 		/* forwards all constructor arguments to BaseElkan, then names the algorithm and allocates delta_C */
58 | 		template<typename... Args>
59 | 		P3V0(Args&&... args): EB(std::forward<Args>(args)...)
60 | 		{
61 | 			this->setalgname("p3v0");
62 | 			this->elkan_delta_C.reset(new TFloat [this->getncentroids()]);
63 | 		}
64 | 		virtual ~P3V0(){}
65 | 
66 | 		/* base-class estimate plus the ncentroids TFloats of delta_C */
67 | 		virtual TInt get_approximate_memory_requirement(){
68 | 			return EB::get_approximate_memory_requirement() + 
69 | 			sizeof(TFloat)*this->getncentroids(); // delta_C  
70 | 		}
71 | 
72 | 		virtual void verbose_write_additional(){
73 | 			this->EB_verbose_write_additional();
74 | 			/* anything else to add ? */
75 | 		}
76 | 
77 | 		virtual void set_initialisation_tasks(){
78 | 			/* all Elkan variants have same initialisation tasks */
79 | 			this->ElkBase_set_initialisation_tasks();
80 | 		}
81 | 	
82 | 		virtual void set_C_tasks(){
83 | 			this->C_tasks = {
84 | 				arrutilv2::update_C_C_l22s_delta_C_from_SH_ati(this->getnthreads(), this->getncentroids(), this->getdimension(), this->get_sums(), this->get_counts(), this->get_C(), this->get_C_l22s(), this->get_delta_C(), this->ndcalcs_notX)
85 | 			};
86 | 		}
87 | 		
88 | 		virtual void set_X_tasks(){
89 | 			this->X_tasks = {
90 | 				this->update_L_lowers_upper_S_H_3v0_ati()
91 | 			};
92 | 		}
93 | };
94 | 
95 | }
96 | 
97 | #endif
98 | 


--------------------------------------------------------------------------------
/src/elkan5v1.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
  3 | Written by James Newling <james.newling@gmail.com>
  4 | All rights reserved.
  5 | 
  6 | eakmeans is a library for exact and approximate k-means written in C++ and
  7 | Python. This file is part of eakmeans. See file COPYING for more details.
  8 | 
  9 | This file is part of eakmeans.
 10 | 
 11 | eakmeans is free software: you can redistribute it and/or modify
 12 | it under the terms of the 3-Clause BSD Licence. See
 13 | https://opensource.org/licenses/BSD-3-Clause for more details.
 14 | 
 15 | eakmeans is distributed in the hope that it will be useful,
 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
 18 | COPYING for more details.
 19 | */
 20 | 
 21 | #ifndef PLL_ELKANKMEANS_5V1_H
 22 | #define PLL_ELKANKMEANS_5V1_H
 23 | 
 24 | #include "elkan3v0.h"
 25 | 
 26 | namespace kmeans{
 27 | /* Per-thread label update for the "5v1" Elkan variant (handed to pll_principal_X by P5V1), run over ndata samples.
 28 |  * Zeroes nchanges and ndcalcs, then updates L, lowers and upbs in place and moves every relabelled sample between rows of S and H. round is not used. */
 29 | template <typename TInt, typename TFloat>
 30 | void update_L_lowers_upbs_S_H_5v1(TInt ncentroids, TInt dimension, TFloat * const S, TInt * const H , TInt & nchanges, TInt &ndcalcs, 
 31 | TInt ndata, const TFloat * const data, const TFloat * const C, const TFloat * const data_l22s, const TFloat * const C_l22s, const TFloat * const CC, const TFloat * const halfminCC, const TFloat * const delta_C, TInt * const L, TFloat * const lowers, TFloat * const upbs,  const TInt & round){
 32 | 	
 33 | 	/* both counters are reset here, so on return they describe this call only */
 34 | 	nchanges = 0;
 35 | 	ndcalcs = 0; 
 36 | 
 37 | 	/* experiments with this in or out the loop show that it makes little difference (@test1) */
 38 | 	arrutilv2::rank1rowupdate(ncentroids, delta_C, static_cast<TFloat>(-1.), ndata, lowers);
 39 | 	/* NOTE(review): presumably the call above applies -1 * delta_C to every sample's row of lowers -- confirm in arrutilv2 */
 40 | 	
 41 | 	for (TInt i = 0; i < ndata; ++i){
 42 | 		/* (@test1) */
 43 | 		/* raise the upper bound by delta_C of the assigned centroid; candidates are examined only if it then exceeds halfminCC[L[i]] */
 44 | 		upbs[i] += delta_C[L[i]];
 45 | 		if (halfminCC[L[i]] < upbs[i]){
 46 | 		TInt label_before = L[i];
 47 | 			TInt ci = 0;
 48 | 			while (ci < ncentroids){ //phase 1: scan up to the first other centroid that passes both bound tests
 49 | 				if ((L[i] != ci) && (upbs[i] > lowers[i*ncentroids + ci]) && (upbs[i] > 0.5*CC[ci*ncentroids + L[i]])){
 50 | 					arrutilv2::set_l2(dimension, data + i*dimension, C + L[i]*dimension, data_l22s[i], C_l22s[L[i]], upbs[i], ndcalcs); //refresh upbs[i] against the assigned centroid (presumably an exact distance -- confirm set_l2)
 51 | 					lowers[i*ncentroids + L[i]] = upbs[i];
 52 | 					if ((upbs[i] > lowers[i*ncentroids + ci]) && (upbs[i] > 0.5*CC[ci*ncentroids + L[i]])){ //same two tests again, now with the refreshed upbs[i]
 53 | 						arrutilv2::set_l2(dimension, data + i*dimension, C + ci*dimension, data_l22s[i], C_l22s[ci], lowers[i*ncentroids + ci], ndcalcs);
 54 | 						if (upbs[i] > lowers[i*ncentroids + ci]){ //ci is closer: adopt it as the label and take its value as the new upper bound
 55 | 							upbs[i] = lowers[i*ncentroids + ci];
 56 | 							L[i] = ci;
 57 | 						}
 58 | 					}
 59 | 					++ci;
 60 | 					break; //upbs[i] has been refreshed once; phase 2 continues from the next centroid
 61 | 				}
 62 | 				++ci;
 63 | 			}
 64 | 			while (ci < ncentroids){ //phase 2: remaining centroids, without refreshing upbs[i] against the assigned centroid
 65 | 				if ((upbs[i] > lowers[i*ncentroids + ci]) && (upbs[i] > 0.5*CC[ci*ncentroids + L[i]])){ // (L[i] != ci) && 
 66 | 					arrutilv2::set_l2(dimension, data + i*dimension, C + ci*dimension, data_l22s[i], C_l22s[ci], lowers[i*ncentroids + ci], ndcalcs);
 67 | 					if (upbs[i] > lowers[i*ncentroids + ci]){
 68 | 						upbs[i] = lowers[i*ncentroids + ci];
 69 | 						L[i] = ci;
 70 | 					}
 71 | 				}
 72 | 				++ci;
 73 | 			}
 74 | 			if (L[i] != label_before){ //relabelled: move the sample from its old row of S, H to the new one
 75 | 				++nchanges;
 76 | 				++H[L[i]];
 77 | 				--H[label_before];
 78 | 				arrutilv2::addto(dimension, data + i*dimension, S + dimension*L[i]);
 79 | 				arrutilv2::subtractfrom(dimension, data + i*dimension, S + dimension*label_before);
 80 | 			}
 81 | 		}
 82 | 	}
 83 | }
 84 | 
 85 | 
 86 | 
 87 | template <typename TInt, typename TFloat>
 88 | class P5V1 : public P3V0<TInt, TFloat>{
 89 | 	/* "p5v1": P3V0 plus an ncentroids x ncentroids array CC and a per-centroid array halfminCC,
 90 | 	 * both passed to update_CC_halfminCC_ati in the centroid tasks and to update_L_lowers_upbs_S_H_5v1 in the X task. */
 91 | 
 92 | 	private:
 93 | 		std::unique_ptr<TFloat []> CC;
 94 | 		std::unique_ptr<TFloat []> halfminCC;	
 95 | 		
 96 | 	protected:
 97 | 		/* raw-pointer accessors; the return type is plain TFloat * because a top-level const on a pointer returned by value is ignored */
 98 | 		TFloat * get_CC(){
 99 | 			return CC.get();
100 | 		}
101 | 		TFloat * get_halfminCC(){
102 | 			return halfminCC.get();
103 | 		}
104 | 		
105 | 		/* per-thread X task: x0 = ti*ndata/nthreads; L, lowers (row stride ncentroids) and upbs are passed offset to sample x0 */
106 | 		std::function<void(TInt)> update_L_lowers_upbs_S_H_5v1_ati(){
107 | 			return [this](TInt ti){
108 | 				TInt x0 = (ti*this->getndata())/this->getnthreads();
109 | 				this->pll_principal_X(update_L_lowers_upbs_S_H_5v1<TInt, TFloat>, ti, this->get_CC(), this->get_halfminCC(), this->get_delta_C(), this->get_L() + x0,  this->get_lowers() + x0*this->getncentroids(), this->get_upbs() + x0, this->round);
110 | 			};
111 | 		}
112 | 		
113 | 	public:
114 | 		typedef kmeans::P3V0<TInt, TFloat> PC; 
115 | 		template<typename... Args>
116 | 		P5V1(Args&&... args): PC(std::forward<Args>(args)...), 
117 | 		CC{ new TFloat [this->getncentroids()*this->getncentroids()]  },
118 | 		halfminCC{ new TFloat [this->getncentroids()] }
119 | 		{
120 | 			this->setalgname("p5v1");
121 | 		}
122 | 		
123 | 		virtual ~P5V1(){}
124 | 
125 | 		/* P3V0's estimate plus the two arrays allocated in the constructor above, which it does not count */
126 | 		virtual TInt get_approximate_memory_requirement() override {
127 | 			return PC::get_approximate_memory_requirement() + 
128 | 			sizeof(TFloat)*(this->getncentroids()*this->getncentroids() + this->getncentroids()); // CC + halfminCC
129 | 		}
130 | 
131 | 		virtual void verbose_write_additional() override {
132 | 			PC::verbose_write_additional();
133 | 			/* do I want to write CC as well ? */
134 | 		}
135 | 
136 | 		virtual void set_initialisation_tasks() override {
137 | 			this->initialisation_tasks = this->exact_makeset_C_C_l22s_L_inds0_lowers_upper_S_H_mati();
138 | 
139 | 			/* note : CC and halfminCC don't need to be set at this point. C and C_l22s must be set so that S&H can be set above. Maybe with smarter initialisations or initialisations with kmeans++ */
140 | 		}
141 | 		
142 | 		virtual void set_C_tasks() override {
143 | 			this->C_tasks = {
144 | 				arrutilv2::update_C_C_l22s_delta_C_from_SH_ati(this->getnthreads(), this->getncentroids(), this->getdimension(), this->get_sums(), this->get_counts(), this->get_C(), this->get_C_l22s(), this->get_delta_C(), this->ndcalcs_notX),
145 | 			
146 | 				arrutilv2::update_CC_halfminCC_ati(this->getnthreads(), this->getncentroids(), this->getdimension(), this->get_C(), this->get_C_l22s(), this->get_CC(), this->get_halfminCC(), this->ndcalcs_notX)
147 | 			};
148 | 		}
149 | 		
150 | 		virtual void set_X_tasks() override {
151 | 			this->X_tasks = {
152 | 				this->update_L_lowers_upbs_S_H_5v1_ati()
153 | 			};
154 | 		}
155 | };
156 | 
157 | 
158 | }
159 | 
160 | #endif
161 | 


--------------------------------------------------------------------------------
/src/exactsimplebatch.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
 3 | Written by James Newling <james.newling@gmail.com>
 4 | All rights reserved.
 5 | 
 6 | eakmeans is a library for exact and approximate k-means written in C++ and
 7 | Python. This file is part of eakmeans. See file COPYING for more details.
 8 | 
 9 | This file is part of eakmeans.
10 | 
11 | eakmeans is free software: you can redistribute it and/or modify
12 | it under the terms of the 3-Clause BSD Licence. See
13 | https://opensource.org/licenses/BSD-3-Clause for more details.
14 | 
15 | eakmeans is distributed in the hope that it will be useful,
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
18 | COPYING for more details.
19 | */
20 | 
21 | #ifndef PLL_EXACTSIMPLEBATCHKMEANS_H
22 | #define PLL_EXACTSIMPLEBATCHKMEANS_H
23 | 
24 | #include "basesimpleexact.h"
25 | 
26 | 
27 | 
28 | namespace kmeans{
29 | 
30 | template <typename TInt, typename TFloat>
31 | class SimpleExactBatchKmeans : public kmeans::BaseSimpleExactKmeans<TInt, TFloat>{
32 | 	/* simple exact k-means whose X task is built by arrutilv2::update_L_S_H_batch_ati with a block size of nperbatch */
33 | 	private:
34 | 		TInt nperbatch;
35 | 
36 | 	public:
37 | 		/* forwards all constructor arguments to BaseSimpleExactKmeans, then names the algorithm and fixes nperbatch */
38 | 		template<typename... Args>
39 | 		SimpleExactBatchKmeans(Args&&... args): kmeans::BaseSimpleExactKmeans<TInt, TFloat> (std::forward<Args>(args)...) {
40 | 			this->setalgname("Exact Simple Batch K-Means");
41 | 			/* chosen so that the batch step does not cause memory in assignment to exceed half the memory of the data itself; never below 1 */
42 | 			const auto data_elements = this->getndata() * this->getdimension();
43 | 			const auto divisor = 2 * this->getncentroids() * this->nthreads;
44 | 			this->nperbatch = std::max(static_cast<TInt> (1), static_cast<TInt> (data_elements / divisor));
45 | 		}
46 | 
47 | 		virtual ~SimpleExactBatchKmeans(){};
48 | 
49 | 		TInt get_nperbatch(){
50 | 			return nperbatch;
51 | 		}
52 | 
53 | 	protected:
54 | 		/* callee argument order: nthreads, ndata, nperbatch, dimension, data, ncentroids, C, data_l22s, C_l22s, L, dsums, dcounts, sums, counts, nchanges, work_mutex, ndcalcs */
55 | 		virtual void set_X_tasks(){
56 | 			this->X_tasks = {
57 | 				arrutilv2::update_L_S_H_batch_ati(
58 | 					this->getnthreads(), this->getndata(), this->nperbatch, this->getdimension(), this->getdata(),
59 | 					this->getncentroids(), this->get_C(), this->get_data_l22s(), this->get_C_l22s(), this->get_L(),
60 | 					this->get_dsums(), this->get_dcounts(), this->get_sums(), this->get_counts(),
61 | 					this->nchanges, this->work_mutex, this->ndcalcs_X)
62 | 			};
63 | 		}
64 | };
65 | 	
66 | }
67 | 
68 | 
69 | 
70 | #endif
71 | 
72 | 
73 | 
74 | //extern template class kmeans::SimpleExactBatchKmeans<size_t, double>;
75 | //extern template class kmeans::SimpleExactBatchKmeans<size_t, float>;
76 | 
77 | 
78 | 
79 | 


--------------------------------------------------------------------------------
/src/growbatchapp.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
 3 | Written by James Newling <james.newling@gmail.com>
 4 | All rights reserved.
 5 | 
 6 | eakmeans is a library for exact and approximate k-means written in C++ and
 7 | Python. This file is part of eakmeans. See file COPYING for more details.
 8 | 
 9 | This file is part of eakmeans.
10 | 
11 | eakmeans is free software: you can redistribute it and/or modify
12 | it under the terms of the 3-Clause BSD Licence. See
13 | https://opensource.org/licenses/BSD-3-Clause for more details.
14 | 
15 | eakmeans is distributed in the hope that it will be useful,
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
18 | COPYING for more details.
19 | */
20 | 
21 | #ifndef GBAPP_H
22 | #define GBAPP_H
23 | 
#include <memory>
#include <vector> /* GBMseApp holds std::vector members, so this header must include it itself */

namespace growbatchapp{

/* Plain bundle of the state used by the "grow batch" algorithms. It only declares
 * the fields; nothing here initialises them, so the owner must set every field
 * before reading it. */
template <typename TInt, typename TFloat>
class GBApp {
	public:

		/* amount of data which is active. Initially determined by user, thereafter grows by factor of growthfactor when necessary */
		TInt ndata_active;

		/* Hacky variable for printing purposes */
		TFloat d_C__over__d_AB;

		/* will be 2 */
		TFloat growthfactor;

		/* definition depends on class, for Grow Batch Partitional: if \|C_A - C_B\|_2 > threshold * \|C_{t} - C_{t-1}\|, then grow by growthfactor. will be 1.0 */
		TFloat threshold;

		/* amount of data which was active in previous round. Either ndata_active or ndata_active/2. */
		TInt ndata_active_previous;

		/* used to determine if expansion should take place (and maybe other things) */
		std::unique_ptr<TFloat []> delta_C;

};

/* Additional state for the MSE-based grow batch variants. */
template <typename TInt, typename TFloat>
class GBMseApp {
	public:
		/* per-cluster SSE / MSE accumulators (names suggest sum / mean of squared errors; filled by the owning algorithm) */
		std::vector<TFloat> sse_by_cluster;
		std::vector<TFloat> mse_by_cluster;
		/* distance to nearest. TODO: check that this is not the squared quantity. */
		std::unique_ptr<TFloat []> dn;
};

}
63 | 
64 | 
65 | 
66 | 
67 | 
68 | 
69 | #endif
70 | 		
71 | 


--------------------------------------------------------------------------------
/src/hamerly11v0.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
  3 | Written by James Newling <james.newling@gmail.com>
  4 | All rights reserved.
  5 | 
  6 | eakmeans is a library for exact and approximate k-means written in C++ and
  7 | Python. This file is part of eakmeans. See file COPYING for more details.
  8 | 
  9 | This file is part of eakmeans.
 10 | 
 11 | eakmeans is free software: you can redistribute it and/or modify
 12 | it under the terms of the 3-Clause BSD Licence. See
 13 | https://opensource.org/licenses/BSD-3-Clause for more details.
 14 | 
 15 | eakmeans is distributed in the hope that it will be useful,
 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
 18 | COPYING for more details.
 19 | */
 20 | 
 21 | #ifndef PLL_HAMERLYKMEANS_11V0_H
 22 | #define PLL_HAMERLYKMEANS_11V0_H
 23 | 
 24 | #include "basehamerly.h"
 25 | 
 26 | namespace kmeans{
 27 | 
 28 | 
 29 | 
 30 | 
 31 | 
 32 | template <typename TInt, typename TFloat>
 33 | void update_L_lower_upper_S_H_11v0(TInt ncentroids, TInt dimension, TFloat * const S, TInt * const H , TInt & nchanges, TInt &ndcalcs, 
 34 | TInt ndata, const TFloat * const data, const TFloat * const C,  const TFloat * const data_l22s, const TFloat * const C_l22s, const TFloat * const CC, const TFloat * const halfminCC, const TFloat * const delta_C,   TInt * const L, TFloat * const lower, TFloat * const upper,  const TInt & round){
 35 | 	
 36 | 	nchanges = 0;
 37 | 	ndcalcs = 0; 
 38 | 
 39 | 
 40 | 	TFloat m;
 41 | 	TInt oldlabel;
 42 | 	std::unique_ptr<TFloat []> distances (new TFloat [ncentroids]);
 43 | 	
 44 | 	//TODO: Hamerly checks that the label of data point is not max-mover (if fail, add second biggest budge). 
 45 | 	TFloat max_deltaC_previous_round;
 46 | 	TInt index_max_deltaC_previous_round;
 47 | 	arrutilv2::set_argmaxmax(ncentroids, delta_C, index_max_deltaC_previous_round, max_deltaC_previous_round);
 48 | 	for (TInt i = 0; i < ndata; ++i){
 49 | 		lower[i] -= max_deltaC_previous_round;
 50 | 		upper[i] += max_deltaC_previous_round;	
 51 | 		m = std::max(halfminCC[L[i]], lower[i]);
 52 | 		if (upper[i] > m){
 53 | 			arrutilv2::set_l2(dimension, data + i*dimension,  C + L[i]*dimension, data_l22s[i], C_l22s[L[i]], upper[i], ndcalcs);		
 54 | 			if (upper[i] > m){
 55 | 				oldlabel = L[i];
 56 | 				arrutilv2::set_rl2s(dimension, data + i*dimension, ncentroids, C, data_l22s[i], C_l22s, distances.get(), ndcalcs);
 57 | 				arrutilv2::set_argminmin2nocheck(ncentroids, distances.get(), L[i], upper[i], lower[i]);
 58 | 				if (L[i] != oldlabel){
 59 | 					++nchanges;
 60 | 					++H[L[i]];
 61 | 					--H[oldlabel];
 62 | 					arrutilv2::addto(dimension, data + i*dimension, S + dimension*L[i]);
 63 | 					arrutilv2::subtractfrom(dimension, data + i*dimension, S + dimension*oldlabel);
 64 | 				}
 65 | 			}
 66 | 		}
 67 | 	}
 68 | }
 69 | 
 70 | 
 71 | 
 72 | 
/* the discrepancy in ndcalcs with a11v0 is due to a11v0 performing the CC computation before the first round */
 74 | 
 75 | template <typename TInt, typename TFloat>
 76 | class P11V0 : public kmeans::BaseHamerly<TInt, TFloat>{
 77 | 	
 78 | 	private:
 79 | 		
 80 | 			
 81 | 	protected:
 82 | 	
 83 | 		TFloat * const get_lower(){
 84 | 			return this->get_lower_base();
 85 | 		}
 86 | 		
 87 | 		TFloat * const get_upper(){
 88 | 			return this->get_upper_base();
 89 | 		}
 90 | 			
 91 | 		std::function<void(TInt)> update_L_lower_upper_S_H_11v0_ati(){
 92 | 			return [this](TInt ti){
 93 | 				TInt x0 = (ti*this->getndata())/this->getnthreads();
 94 | 				
 95 | 				this->pll_principal_X(update_L_lower_upper_S_H_11v0<TInt, TFloat>, ti, this->get_CC(), this->get_halfminCC(), this->get_delta_C(), this->get_L() + x0,  this->get_lower() + x0, this->get_upper() + x0, this->round);
 96 | 			};
 97 | 		}
 98 | 		
 99 | 	public:
100 | 		typedef kmeans::BaseHamerly<TInt, TFloat> BH;
101 | 		template<typename... Args>
102 | 		P11V0(Args&&... args): BH(std::forward<Args>(args)...)
103 | 		
104 | 		{
105 | 			this->setalgname("p11v0");
106 | 		}
107 | 		virtual ~P11V0(){}
108 | 
109 | 		virtual void verbose_write_additional(){
110 | 			BH::verbose_write_additional();
111 | 			/* anything else to print ? */
112 | 		}
113 | 		
114 | 
115 | 		virtual void set_initialisation_tasks(){
116 | 			
117 | 			
118 | 			this->initialisation_tasks = this->makeset_C_C_l22s_L_inds0_lower_upper_S_H_mati();
119 | 				
120 | 			/* note : CC and halfminCC don't need to be set at this point. C and C_l22s must be set so that S&H can be set above. Maybe with smarter initialisations or initialisations with kmeans++ */
121 | 			
122 | 			}
123 | 		
124 | 		virtual void set_C_tasks(){
125 | 			this->C_tasks = {
126 | 				arrutilv2::update_C_C_l22s_delta_C_from_SH_ati(this->getnthreads(), this->getncentroids(), this->getdimension(), this->get_sums(), this->get_counts(), this->get_C(), this->get_C_l22s(), this->get_delta_C(), this->ndcalcs_notX),
127 | 
128 | 				arrutilv2::update_CC_halfminCC_ati(this->getnthreads(), this->getncentroids(), this->getdimension(), this->get_C(), this->get_C_l22s(), this->get_CC(), this->get_halfminCC(), this->ndcalcs_notX)
129 | 			};
130 | 		}
131 | 		
132 | 		virtual void set_X_tasks(){
133 | 					
134 | 			this->X_tasks = {
135 | 
136 | 			
137 | 			this->update_L_lower_upper_S_H_11v0_ati()	
138 | 			};
139 | 		}
140 | };
141 | 
142 | }
143 | 
144 | #endif
145 | 
146 | 


--------------------------------------------------------------------------------
/src/minibatch.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
 3 | Written by James Newling <james.newling@gmail.com>
 4 | All rights reserved.
 5 | 
 6 | eakmeans is a library for exact and approximate k-means written in C++ and
 7 | Python. This file is part of eakmeans. See file COPYING for more details.
 8 | 
 9 | This file is part of eakmeans.
10 | 
11 | eakmeans is free software: you can redistribute it and/or modify
12 | it under the terms of the 3-Clause BSD Licence. See
13 | https://opensource.org/licenses/BSD-3-Clause for more details.
14 | 
15 | eakmeans is distributed in the hope that it will be useful,
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
18 | COPYING for more details.
19 | */
20 | 
21 | #ifndef PLL_MINIBATCHKMEANS_H
22 | #define PLL_MINIBATCHKMEANS_H
23 | 
24 | #include "basesimpleminibatch.h"
25 | 
26 | namespace kmeans{
27 | template <typename TInt, typename TFloat>
28 | //Like D Sculley, but instead of just adding newly labeled data to centroids, if the data has already been used first remove it from the centroid it was assigned to previously. This breaks the 1/t convergence to the local minimum 
29 | class MiniBatch : public kmeans::BaseSimpleMiniBatch<TInt, TFloat>{
30 | 	
31 | 	private:
32 | 	
33 | 		virtual void update_L_S_H(TInt x0, TInt x1, TInt ti) override final{
34 | 			
35 | 
36 | 			if (this->round < this->mba.nsubrounds){
37 | 				this->update_L_S_H_batch_increment_only(x0, x1, ti);
38 | 			}
39 | 			
40 | 			else{
41 | 				this->update_L_S_H_batch(x0, x1, ti);
42 | 			}
43 | 		}
44 | 		
45 | 	public:
46 | 		
47 | 		
48 | 		template<typename... Args>
49 | 		MiniBatch(Args&&... args): kmeans::BaseSimpleMiniBatch<TInt, TFloat> (std::forward<Args>(args)...) {
50 | 			this->setalgname("(Improved) Mini Batch Kmeans");
51 | 		}		
52 | 		
53 | 
54 | 		virtual ~MiniBatch(){};
55 | 
56 | };
57 | 
58 | 
59 | }
60 | 
61 | 
62 | #endif
63 | 
64 | 
65 | 
66 | 


--------------------------------------------------------------------------------
/src/minibatchapp.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
 3 | Written by James Newling <james.newling@gmail.com>
 4 | All rights reserved.
 5 | 
 6 | eakmeans is a library for exact and approximate k-means written in C++ and
 7 | Python. This file is part of eakmeans. See file COPYING for more details.
 8 | 
 9 | This file is part of eakmeans.
10 | 
11 | eakmeans is free software: you can redistribute it and/or modify
12 | it under the terms of the 3-Clause BSD Licence. See
13 | https://opensource.org/licenses/BSD-3-Clause for more details.
14 | 
15 | eakmeans is distributed in the hope that it will be useful,
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
18 | COPYING for more details.
19 | */
20 | 
21 | #ifndef MINIBATCHAPP_H
22 | #define MINIBATCHAPP_H
23 | 
#include <algorithm> /* std::min */
#include <vector>

namespace minibatchapp{

/* Bookkeeping for splitting `ndata` samples into consecutive mini-batches.
 *
 * A default-constructed object has every counter set to zero and an empty
 * nchanges_on_batch. The two-argument constructor derives all fields from
 * (batchsize, ndata). */
template <typename TInt>
class MiniBatchApp{
	public:
		//number of data used for each centroid update
		TInt batchsize = 0;
		//number of centroid updates in one complete round of data
		TInt nsubrounds = 0;
		//this round mod nsubrounds
		TInt subround = 0;
		//the amount of data used in the nsubround'th update, the other updates use batchsize datapoints
		TInt lastbatchsize = 0;
		//The size of the first batch.
		TInt initialising_batch_size = 0;
		//Number of changes on each batch since previous time that batch processed.
		std::vector<TInt> nchanges_on_batch;

		MiniBatchApp() = default;

		/* batchsize : samples per batch, must be non-zero (it is used as a divisor).
		 * ndata     : total number of samples.
		 *
		 * nsubrounds is ceil(ndata / batchsize); lastbatchsize is what remains for the
		 * final batch once the other nsubrounds - 1 batches have taken batchsize each. */
		MiniBatchApp(TInt batchsize, TInt ndata){
			this->batchsize = batchsize;

			const bool divides_evenly = (ndata % batchsize == 0);
			this->nsubrounds = divides_evenly ? ndata/batchsize : 1 + ndata/batchsize;

			this->lastbatchsize = ndata - batchsize*(this->nsubrounds - 1);
			this->subround = 0;

			/* the first batch cannot be larger than the data set */
			this->initialising_batch_size = std::min(this->batchsize, ndata);
			this->nchanges_on_batch = std::vector<TInt> (this->nsubrounds, 0);
		}
};

//template <typename TInt, typename TFloat>
//void set_summaries_minibatch(cluster::BaseCluster<TInt, TFloat> & basecluster, const minibatchapp::MiniBatchApp<TInt> & mba){

}
72 | 
73 | 
74 | 	
75 | 	
76 | #endif
77 | 		
78 | 


--------------------------------------------------------------------------------
/src/optionsutil.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
  3 | Written by James Newling <james.newling@gmail.com>
  4 | All rights reserved.
  5 | 
  6 | eakmeans is a library for exact and approximate k-means written in C++ and
  7 | Python. This file is part of eakmeans. See file COPYING for more details.
  8 | 
  9 | This file is part of eakmeans.
 10 | 
 11 | eakmeans is free software: you can redistribute it and/or modify
 12 | it under the terms of the 3-Clause BSD Licence. See
 13 | https://opensource.org/licenses/BSD-3-Clause for more details.
 14 | 
 15 | eakmeans is distributed in the hope that it will be useful,
 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
 18 | COPYING for more details.
 19 | */
 20 | 
 21 | #include <utility>
 22 | #include <string>
 23 | #include <map>
 24 | #include <iostream>
 25 | #include <iomanip>
 26 | #include <exception>
 27 | #include "optionsutil.h"
 28 | namespace optionsutil{
 29 | 
 30 | Option::Option(std::string fn, std::string sn, std::string desc, std::string tp, std::string dv): fullname(std::move(fn)), shortname(std::move(sn)), description(std::move(desc)), type(std::move(tp)), defval(std::move(dv)), isset(false) {
 31 | 	if (defval.compare("") == 0){
 32 | 		definition = "--" + fullname + " -" + shortname + "  "  + type;
 33 | 	}
 34 | 	else{
 35 | 		definition = "--" + fullname + " -" + shortname + "  "  + type + "  (" + defval + ")  ";
 36 | 	}
 37 | }
 38 | 		
/* Default constructor : always throws std::logic_error, an Option must be built with the
 * five-argument constructor. Presumably it is only declared so that Option can be used
 * where a default constructor must exist (e.g. std::map::operator[]) -- TODO confirm. */
Option::Option(){
	throw std::logic_error("Default constructor for Option called, this should never happen");
}
 42 | 
 43 | 		
 44 | void Option::print(unsigned tab1, unsigned tab2){
 45 | 	unsigned width = tab2 - tab1;
 46 | 	unsigned margin = 2;
 47 | 	
 48 | 	std::cout << definition;
 49 | 	if (definition.size() > tab1-margin){
 50 | 		 std::cout << " \n";
 51 | 		 std::cout << std::setw(tab1) << " ";
 52 | 	}
 53 | 	else{
 54 | 		std::cout << std::setw(tab1 - definition.size()) << " ";
 55 | 	}
 56 | 	
 57 | 	
 58 | 	unsigned fragi = 0;
 59 | 	//unsigned currenti = 0;
 60 | 	//std::string nextline("");
 61 | 	//while (currenti < description.size()){
 62 | 		//nextline = description.substr(currenti, width);
 63 | 		//if (nextline.find("\0") != std::string::npos) {
 64 | 		
 65 | 		//}
 66 | 	//}
 67 | 	
 68 | 	while(fragi < description.size()/width){
 69 | 		
 70 | 		//\033
 71 | 		
 72 | 		//if (description.substr(fragi*width, width).find("\\0") != std::string::npos){
 73 | 			//std::cout << "-------------------------------------------------------------" << std::endl;
 74 | 		//}
 75 | 		std::cout << description.substr(fragi*width, width) << " \n";
 76 | 		std::cout << std::setw(tab1) << " ";
 77 | 		++fragi;
 78 | 	}
 79 | 	std::cout << description.substr(fragi*width) << " \n" << std::endl;
 80 | }
 81 | 
 82 | /* To do : is it possible to have the following class using variadic templates:
 83 |  * Options<int, float, std::string> options;
 84 |  * options.add("name1", anint)
 85 |  * options.add("name2", astring)
 86 |  * ...
 87 |  * ?
 88 |  * */
 89 | 				
 90 | void Options::add(Option && o){
 91 | 	fullname.emplace(std::make_pair(o.shortname, o.fullname));
 92 | 	options.emplace(std::make_pair(o.fullname, std::move(o)));
 93 | }
 94 | 
 95 | void Options::add(std::string fn, std::string sn, std::string desc, std::string type, std::string defval){
 96 | 	fullname.emplace(std::make_pair(sn, fn));
 97 | 	options.emplace(std::make_pair(fn, Option(fn, std::move(sn), std::move(desc), std::move(type), std::move(defval))));
 98 | }
 99 | 
100 | void Options::print(unsigned tab1, unsigned tab2){
101 | 	
102 | 	//std::cout << "\n-------------------------------------------------\n";
103 | 	std::cout << "\nThe options are of the form,\n--full_option_name  -abridged_name  type  (default) \n\n";
104 | 	
105 | 	
106 | 	std::cout << std::endl;
107 | 	for (auto & p : options){
108 | 		p.second.print(tab1, tab2);
109 | 	}
110 | 	
111 | 	unsigned fragi = 0;
112 | 	while(fragi < tail.size()/tab2){
113 | 		std::cout << tail.substr(fragi*tab2, tab2) << "\n";
114 | 		++fragi;
115 | 	}
116 | 	std::cout << tail.substr(fragi*tab2) <<"\n" << std::endl;
117 | 	
118 | }
119 | 
120 | }
121 | 


--------------------------------------------------------------------------------
/src/optionsutil.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
 3 | Written by James Newling <james.newling@gmail.com>
 4 | All rights reserved.
 5 | 
 6 | eakmeans is a library for exact and approximate k-means written in C++ and
 7 | Python. This file is part of eakmeans. See file COPYING for more details.
 8 | 
 9 | This file is part of eakmeans.
10 | 
11 | eakmeans is free software: you can redistribute it and/or modify
12 | it under the terms of the 3-Clause BSD Licence. See
13 | https://opensource.org/licenses/BSD-3-Clause for more details.
14 | 
15 | eakmeans is distributed in the hope that it will be useful,
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
18 | COPYING for more details.
19 | */
20 | 
#ifndef OPTIONSUTIL_H
#define OPTIONSUTIL_H
/* Include guard : without it, including this header twice in one translation unit
 * redefines Option and Options. */

#include <string>
#include <map>

namespace optionsutil{

/* A single command line option : its names, description, type and default value. */
class Option{
	public:
		std::string fullname;
		std::string shortname;
		std::string description;
		std::string type;
		std::string defval;
		/* set to false by the five-argument constructor */
		bool isset;

		Option(std::string fn, std::string sn, std::string desc, std::string tp, std::string dv);
		/* declared, but its definition always throws std::logic_error */
		Option();
		/* prints the option to std::cout, with the description between columns tab1 and tab2 */
		void print(unsigned tab1, unsigned tab2);

	private:
		/* "--fullname -shortname  type  (default)", built by the constructor */
		std::string definition;
};

/* A collection of options. */
class Options{
	public:
		/* full name -> option */
		std::map<std::string, Option> options;
		/* short name -> full name */
		std::map<std::string, std::string> fullname;
		/* free text printed after all the options by print() */
		std::string tail;
		void add(Option && o);
		void add(std::string fn, std::string sn, std::string desc, std::string type, std::string defval);
		void print(unsigned tab1 = 40, unsigned tab2 = 85);
};

}

#endif
54 | 
55 | 


--------------------------------------------------------------------------------
/src/pllkmeansfuncs.hpp:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
 3 | Written by James Newling <james.newling@gmail.com>
 4 | All rights reserved.
 5 | 
 6 | eakmeans is a library for exact and approximate k-means written in C++ and
 7 | Python. This file is part of eakmeans. See file COPYING for more details.
 8 | 
 9 | This file is part of eakmeans.
10 | 
11 | eakmeans is free software: you can redistribute it and/or modify
12 | it under the terms of the 3-Clause BSD Licence. See
13 | https://opensource.org/licenses/BSD-3-Clause for more details.
14 | 
15 | eakmeans is distributed in the hope that it will be useful,
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
18 | COPYING for more details.
19 | */
20 | 
21 | #ifndef PLLKMEANSFUNCS_HPP
22 | #define PLLKMEANSFUNCS_HPP
23 | 
24 | #include <sstream>
25 | 
26 | 
27 | #include "pllcluster.h"
28 | 
29 | 
30 | namespace cluster{
31 | 	
32 | 	//boilerplate
33 | 	template <typename TFloat>
34 | 	std::tuple<std::unique_ptr<TFloat []>, std::unique_ptr<size_t []>, std::unique_ptr<size_t []>, size_t, size_t, TFloat, std::string>
35 | 	
36 | 	solveioless(const std::string & algorithm, size_t nthreads, size_t ndata, size_t dimension, const TFloat * const data, size_t ncentroids, int cout_verbosity, const std::string & initialisation_method, const TFloat * const C_init, const size_t * const data_indices_init_from, bool setseed, size_t seed, TFloat maxtime, size_t maxrounds, size_t minibatchsize, size_t nvaldata, const TFloat * const valdata, size_t valperiod, bool captureverbose){
37 | 		
38 |     
39 | 	
40 | 		std::stringstream buffer;		
41 | 		auto cout_buff = std::cout.rdbuf();
42 |     
43 |     if (captureverbose == true){
44 |       auto bizzle = buffer.rdbuf();
45 | 			std::cout.rdbuf(bizzle);	      
46 | 		}	
47 |     
48 | 		std::ofstream nowhere;
49 | 		
50 | 		
51 | 		//I assume cmse not wanted
52 | 		size_t cmserate = 0;
53 | 
54 | 
55 | 		TFloat gbphi = 1e-3; //ooph......
56 |     
57 |     
58 | 		auto pretro = solve6<'d', size_t, TFloat>(algorithm, minibatchsize, nthreads, ndata, dimension, data, ncentroids, cout_verbosity, 0,  nowhere, initialisation_method, C_init, data_indices_init_from, setseed, seed, maxtime, maxrounds, "", nvaldata, valdata, valperiod, "", cmserate, gbphi);
59 | 		
60 | 
61 | 		std::string text;
62 | 		
63 | 		if (captureverbose == true){
64 | 			text = buffer.str();
65 | 			std::cout.rdbuf(cout_buff);
66 | 		}
67 | 		
68 | 		else{
69 | 			text = "captureverbose was false, so nothing here";
70 | 		}
71 | 		
72 | 
73 | 		auto retro = std::move(std::tuple_cat(std::move(pretro), std::make_tuple(text)));//, std::make_tuple<std::string>("bwerlk"));		
74 | 
75 | 		return retro;
76 | 	//}	
77 | 	}	
78 | }
79 | 
80 | #endif
81 | 


--------------------------------------------------------------------------------
/src/pllkmeansfuncs_nonvoid.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
 3 | Written by James Newling <james.newling@gmail.com>
 4 | All rights reserved.
 5 | 
 6 | eakmeans is a library for exact and approximate k-means written in C++ and
 7 | Python. This file is part of eakmeans. See file COPYING for more details.
 8 | 
 9 | This file is part of eakmeans.
10 | 
11 | eakmeans is free software: you can redistribute it and/or modify
12 | it under the terms of the 3-Clause BSD Licence. See
13 | https://opensource.org/licenses/BSD-3-Clause for more details.
14 | 
15 | eakmeans is distributed in the hope that it will be useful,
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
18 | COPYING for more details.
19 | */
20 | 
21 | #ifndef PLLKMEANSFUNCS_H
22 | #define PLLKMEANSFUNCS_H
23 | 
24 | 
25 | #include <string>
26 | #include <memory>
27 | #include <tuple>
28 | #include <vector>
29 | 
30 | 
31 | namespace cluster{
32 | 
33 | 
	/* useful for direct use in C++ code */
	/* return : C, L, inds0, duration, niterations, mse, followed by a std::string (seventh
	 * tuple element; presumably the captured verbose output requested with captureverbose --
	 * confirm against the definition). The f / d suffix selects float / double data. */
	std::tuple<std::unique_ptr<float []>, std::unique_ptr<size_t[]>, std::unique_ptr<size_t[]>, size_t, size_t, float, std::string>
	solveiolessf(const std::string & algorithm, size_t nthreads, size_t ndata, size_t dimension, const float * const data, size_t ncentroids, int cout_verbosity, const std::string & initialisation_method, const float * const C_init, const size_t * const data_indices_init_from, bool setseed, size_t seed, float maxtime, size_t maxrounds, size_t minibatchsize, bool captureverbose);
	
	/* double precision counterpart of solveiolessf : same parameters and return layout */
	std::tuple<std::unique_ptr<double []>, std::unique_ptr<size_t[]>, std::unique_ptr<size_t[]>, size_t, size_t, double, std::string>
	solveiolessd(const std::string & algorithm, size_t nthreads, size_t ndata, size_t dimension, const double * const data, size_t ncentroids, int cout_verbosity, const std::string & initialisation_method, const double * const C_init, const size_t * const data_indices_init_from, bool setseed, size_t seed, double maxtime, size_t maxrounds, size_t minibatchsize, bool captureverbose);
41 | 	
42 | 	
43 | }
44 | 
45 | #endif
46 | 


--------------------------------------------------------------------------------
/src/pllkmeansfuncs_void.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
 3 | Written by James Newling <james.newling@gmail.com>
 4 | All rights reserved.
 5 | 
 6 | eakmeans is a library for exact and approximate k-means written in C++ and
 7 | Python. This file is part of eakmeans. See file COPYING for more details.
 8 | 
 9 | This file is part of eakmeans.
10 | 
11 | eakmeans is free software: you can redistribute it and/or modify
12 | it under the terms of the 3-Clause BSD Licence. See
13 | https://opensource.org/licenses/BSD-3-Clause for more details.
14 | 
15 | eakmeans is distributed in the hope that it will be useful,
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
18 | COPYING for more details.
19 | */
20 | 
21 | #ifndef PLLKMEANSVOIDFUNCS_H
22 | #define PLLKMEANSVOIDFUNCS_H
23 | 
24 | namespace cluster {
25 | 	
26 | 	
27 | 	
28 | 
	/* As per the nonvoid versions, but C, L, inds0, duration, niterations and mse are set in place through the output arguments. The arrays should be allocated with the right dimensions before the call. (Useful for Cython, so that there is no messing around with smart pointers, although apparently that is straightforward...) */
	/* Compared with the nonvoid declarations, these also take nvaldata, valdata and valperiod, and a `verbosestring` output argument next to captureverbose. */
	void v_solveiolessf(const std::string & algorithm, size_t nthreads, size_t ndata, size_t dimension, const float * const data, size_t ncentroids, int cout_verbosity, const std::string & initialisation_method, const float * const C_init, const size_t * const data_indices_init_from, bool setseed, size_t seed, float maxtime, size_t maxrounds, float * const C, size_t * const L, size_t * const inds0, size_t & duration, size_t &  niterations, float & mse, size_t minibatchsize, size_t nvaldata, const float * const valdata, size_t valperiod  , bool captureverbose, std::string & verbosestring);


	/* double precision counterpart of v_solveiolessf : same parameter list with double in place of float */
	void v_solveiolessd(
	const std::string & algorithm, size_t nthreads, size_t ndata, size_t dimension, const double * const data, size_t ncentroids, int cout_verbosity, const std::string & initialisation_method, const double * const C_init, const size_t * const data_indices_init_from, bool setseed, size_t seed, double maxtime, size_t maxrounds, double * const C, size_t * const L, size_t * const inds0,size_t & duration, size_t & niterations, double & mse, size_t minibatchsize, size_t nvaldata, const double * const valdata, size_t valperiod  , bool captureverbose, std::string & verbosestring);		
35 | 
36 | 
	/* functions used in kmeans executable */
	/* solvewrited : double precision version. Input and output are given as file names (the *fn / *dir arguments); the meaning of each file is defined by the implementation, which is not in this header. */
	void solvewrited(
	const std::string & algorithm, 
	bool issparse, 
	size_t nruns, 
	size_t nthreads, 
	int cout_verbosity, 
	int file_verbosity, 
	const std::string & datainfn, 
	const std::string & coutfn, 
	const std::string & loutfn,  
	const std::string & ioutfn, 
	const std::string & soutfn, 
	const std::string & voutfn, 
	const std::string & moutfn, 
	const std::string & moutdir,  
	const std::string & cinf, 
	const std::string & ind0fn, 
	const std::string & init0, 
	bool setseed, 
	size_t seed, 
	size_t ncentroids, 
	size_t maxiter, 
	double maxtime, 
	const std::string & valinfn, 
	size_t valperiod, 
	size_t minibatchsize, 
	std::string & cmsewritefn, 
	size_t cmserate, // 27th parameter
	double gbphi
	);
	
	/* solvewritef : single precision version, same parameter list as solvewrited. NOTE(review): gbphi is float here but maxtime is still double -- confirm that this is intended. */
	void solvewritef(const std::string & algorithm, bool issparse,  size_t nruns, size_t nthreads, int cout_verbosity, int file_verbosity, const std::string & datainfn, const std::string & coutfn, const std::string & loutfn,  const std::string & ioutfn, const std::string & soutfn, const std::string & voutfn, const std::string & moutfn, const std::string & moutdir,  const std::string & cinf, const std::string & ind0fn, const std::string & init0, bool setseed, size_t seed, size_t ncentroids, size_t maxiter, double maxtime, const std::string & valinfn, size_t valperiod, size_t minibatchsize, std::string & cmsewritefn, size_t cmserate, float gbphi);
70 | 	
71 | 	
72 | 	
73 | 
74 | }
75 | 
76 | #endif
77 | 


--------------------------------------------------------------------------------
/src/processingfilename.py:
--------------------------------------------------------------------------------
 1 | # 
 2 | # Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
 3 | # Written by James Newling <james.newling@gmail.com>
 4 | # All rights reserved.
 5 | # 
 6 | # eakmeans is a library for exact and approximate k-means written in C++ and
 7 | # Python. This file is part of eakmeans. See file COPYING for more details.
 8 | # 
 9 | # This file is part of eakmeans.
10 | # 
11 | # eakmeans is free software: you can redistribute it and/or modify
12 | # it under the terms of the 3-Clause BSD Licence. See
13 | # https://opensource.org/licenses/BSD-3-Clause for more details.
14 | # 
15 | # eakmeans is distributed in the hope that it will be useful,
16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
18 | # COPYING for more details.
19 | # 
import fnmatch
import os
import shutil
import sys


def find_hpp_files(root):
	"""Return the paths of all regular ``*.hpp`` files below ``root``.

	Replaces the former ``find <root> -name "*.hpp" -type f`` shell call, which
	relied on the Python 2 only ``commands`` module. Symbolic links are left out,
	as ``-type f`` did, and an empty list is returned when nothing matches.
	"""
	found = []
	for dirpath, _dirnames, filenames in os.walk(root):
		for filename in fnmatch.filter(filenames, "*.hpp"):
			path = os.path.join(dirpath, filename)
			if os.path.isfile(path) and not os.path.islink(path):
				found.append(path)
	return found


# Copy every .hpp file found anywhere below the parent directory into the
# current directory, keeping only its base name.
names = find_hpp_files("..")

bobs = []
for n in names:
	destination = os.path.basename(n)
	# ".." contains the current directory, so files that are already here are
	# found as well; copying a file onto itself makes shutil.copy raise.
	if os.path.exists(destination) and os.path.samefile(n, destination):
		continue
	shutil.copy(n, destination)
30 | 	
31 | 	#if "whileprototying" not in n and "test" not in n and "experiments" not in n and "junk" not in n:
32 | 		#if "util" in n and "main" in n:
33 | 			#pass
34 | 			##print "-------->  ", n  
35 | 		#else:
36 | 			#bobs.append(n)
37 | 
38 | #for b in bobs:
39 | 	#shutil.copy(b, 	b.split("/")[-1])
40 | 	#print b
41 | 
42 | 		#print n
43 | 
44 | 	#bobs.append(n.split("/")[-1])
45 | #bobs.sort()
46 | 
47 | #for b in bobs:
48 | 	#print b
49 | ##for n in names:
50 | 	##if "arrutilv2l0" not in n and "kmeansstandalone" not in n:
51 | 		##shutil.copy(n, 	n.split("/")[-1])
52 | 


--------------------------------------------------------------------------------
/src/randomarray.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
  3 | Written by James Newling <james.newling@gmail.com>
  4 | All rights reserved.
  5 | 
  6 | eakmeans is a library for exact and approximate k-means written in C++ and
  7 | Python. This file is part of eakmeans. See file COPYING for more details.
  8 | 
  9 | This file is part of eakmeans.
 10 | 
 11 | eakmeans is free software: you can redistribute it and/or modify
 12 | it under the terms of the 3-Clause BSD Licence. See
 13 | https://opensource.org/licenses/BSD-3-Clause for more details.
 14 | 
 15 | eakmeans is distributed in the hope that it will be useful,
 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
 18 | COPYING for more details.
 19 | */
 20 | 
 21 | #ifndef RANDOMARRAY_H
 22 | #define RANDOMARRAY_H
 23 | 
 24 | 
 25 | #include <exception>
 26 | #include <stdexcept>
 27 | #include <cstdlib>
 28 | #include <iostream>
 29 | #include <random>
 30 | #include <memory>
 31 | 
 32 | namespace randomutil{ 
 33 | namespace randomarray{
 34 | 	
 35 | template <typename IntType, typename SizeType>
 36 | void filluniform_int(SizeType size_tofill, IntType * tofill, IntType lower, IntType upper){
 37 | 	IntType range = upper - lower;
 38 | 	for (SizeType i = 0; i < size_tofill; ++ i){
 39 | 		tofill[i] = lower + rand() % range;
 40 | 	}
 41 | }
 42 | 
 43 | template <typename FloatType, typename SizeType>
 44 | void filluniform_float(SizeType size_tofill, FloatType * tofill, FloatType lower, FloatType upper){
 45 | 	FloatType range = upper - lower;
 46 | 	for (SizeType i = 0; i < size_tofill; ++ i){
 47 | 		tofill[i] = lower + range * (static_cast <FloatType> (rand()) / static_cast <FloatType> (RAND_MAX));
 48 | 	}
 49 | }
 50 | 
/* filluniform : fill tofill[0 .. size_tofill) with random values between lower and upper.
 * The generic declaration below has no definition in this header; the work is done by the
 * overloads for float, double, unsigned and int which follow. */
template <typename NumberType, typename SizeType>
void filluniform(SizeType size_tofill, NumberType * tofill, NumberType lower, NumberType upper);

/* float : forwards to filluniform_float */
template<typename SizeType>
void filluniform(SizeType size_tofill, float * tofill, float lower, float upper){
	filluniform_float(size_tofill, tofill, lower, upper);
}

/* double : forwards to filluniform_float */
template<typename SizeType>
void filluniform(SizeType size_tofill, double * tofill, double lower, double upper){
	filluniform_float(size_tofill, tofill, lower, upper);
}


/* unsigned : forwards to filluniform_int */
template<typename SizeType>
void filluniform(SizeType size_tofill, unsigned * tofill, unsigned lower, unsigned upper){
	filluniform_int(size_tofill, tofill, lower, upper);
}

/* int : forwards to filluniform_int */
template<typename SizeType>
void filluniform(SizeType size_tofill, int * tofill, int lower, int upper){
	filluniform_int(size_tofill, tofill, lower, upper);
}
 74 | 
 75 | /* fill tofill with values chosen from options uniformly at random */
 76 | template <typename NumberType, typename SizeType, typename Container>
 77 | void filluniform(SizeType size_tofill, NumberType * tofill, Container && options){
 78 | 	unsigned n_options = options.size();
 79 | 	std::vector<decltype(n_options)> option_numbers (size_tofill);
 80 | 	filluniform(size_tofill, option_numbers.data(), static_cast<unsigned> (0), n_options);
 81 | 	for (SizeType i = 0; i < size_tofill; ++i){
 82 | 		tofill[i] = options[option_numbers[i]];
 83 | 	}
 84 | }
 85 | 
 86 | 
 87 | template <typename NumberType, typename SizeType, typename Container>
 88 | std::vector<NumberType> getuniform(SizeType N, Container && options){
 89 | 	std::vector<NumberType>  sampled (N,0);
 90 | 	filluniform(N, sampled.data(), options);
 91 | 	return sampled;
 92 | }
 93 | 
 94 | 
 95 | // untested function:
 96 | template <typename NumberType, typename SizeType>
 97 | std::vector<NumberType> getuniform(SizeType N, NumberType lower, NumberType upper){
 98 | 	std::vector<NumberType>  sampled (N,0);
 99 | 	filluniform(N, sampled.data(), lower, upper);
100 | 	return sampled;
101 | }
102 | 
// untested function:
/* As getuniform(N, lower, upper), but the N values are returned in a heap array owned by a unique_ptr. */
template <typename NumberType, typename SizeType>
std::unique_ptr<NumberType [] > getuniform_uptr(SizeType N, NumberType lower, NumberType upper){
	NumberType * const storage = new NumberType [N];
	// ownership is taken before anything else can throw
	std::unique_ptr<NumberType [] > owner (storage);
	filluniform(N, storage, lower, upper);
	return owner;
}
110 | 
111 | 
/* Return a vector of length ndraws of sorted vectors of distinct values from [0, N).
 *
 * For each draw, the number of values is sampled from a binomial distribution with parameters (N, p),
 * and that many distinct values are then chosen by rejection sampling with rand() % N.
 * The binomial engine is seeded with a single call to rand().
 *
 * Throws std::invalid_argument if p does not lie in [0, 1]. */
template <typename TInt, typename TFloat>
std::vector<std::vector<TInt>> get_p_sample(TInt ndraws, TFloat p, TInt N){
	
	if (!(p >= 0 && p <= 1)){
		throw std::invalid_argument("get_p_sample : p must lie in [0, 1]");
	}
	
	std::vector<std::vector<TInt>> samples (ndraws);
	
	//taken[v] is true while v is already part of the draw being built. Replaces a scan of all previously accepted values per proposal. 
	std::vector<bool> taken (N, false);
	
	std::default_random_engine generator(rand());
	std::binomial_distribution<TInt> distribution(N,p);
	for (TInt draw=0; draw<ndraws; ++draw) {
		//number of distinct values in [0, N) to insert into samples[draw]:
		TInt number = distribution(generator);
		std::vector<TInt> & sample = samples[draw];
		sample.resize(number);
		TInt i = 0;
		while (i < number){
			TInt proposal = rand()%N;
			//a proposal is good if it has not been accepted before in this draw:
			if (taken[proposal] == false){
				taken[proposal] = true;
				sample[i] = proposal;
				++i;
			}
		}
		//reset only the entries which were touched, ready for the next draw
		for (TInt j = 0; j < number; ++j){
			taken[sample[j]] = false;
		}
		std::sort(sample.begin(), sample.end());
	}
	
	return samples;
}
147 | 
148 | 
149 | 
150 | }
151 | }
152 | 
153 | namespace randomutil{ 
154 | namespace noise{
155 | 
/* Multiply each of data[0 .. ndata) by -1 independently, an element being flipped when
 * rand() / RAND_MAX (computed in float) is strictly less than switch_probability. */
template <typename NumberType, typename SizeType, typename ProbType>
void signflip(SizeType ndata, NumberType * const data, ProbType switch_probability){
	SizeType position = 0;
	while (position < ndata){
		// one rand() call per element
		const bool flip = (float(rand()) / float(RAND_MAX)) < switch_probability;
		if (flip){
			data[position]*=(-1);
		}
		++position;
	}
}
164 | 	
165 | }
166 | }	
167 | 
168 | #endif
169 | 


--------------------------------------------------------------------------------
/src/randomsparse.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
 3 | Written by James Newling <james.newling@gmail.com>
 4 | All rights reserved.
 5 | 
 6 | eakmeans is a library for exact and approximate k-means written in C++ and
 7 | Python. This file is part of eakmeans. See file COPYING for more details.
 8 | 
 9 | This file is part of eakmeans.
10 | 
11 | eakmeans is free software: you can redistribute it and/or modify
12 | it under the terms of the 3-Clause BSD Licence. See
13 | https://opensource.org/licenses/BSD-3-Clause for more details.
14 | 
15 | eakmeans is distributed in the hope that it will be useful,
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
18 | COPYING for more details.
19 | */
20 | 
21 | #ifndef RANDOMSPARSE_H
22 | #define RANDOMSPARSE_H
23 | 
24 | 
25 | #include <exception>
26 | #include <stdexcept>
27 | #include <cstdlib>
28 | #include <iostream>
29 | #include <random>
30 | #include <memory>
31 | #include "sparsedatasets.h"
32 | #include "randomarray.h"
33 | 
34 | namespace randomutil{ 
35 | namespace randomsparse{
36 | 
37 | //TODO : make function accept a random number generator. 
38 | template <typename TInt, typename TFloat>
39 | sparse::SparseData<TInt, TFloat> get_sparsedata(TInt ndata, TInt dimension, TFloat sparsity){
40 | 	
41 | 	
42 | 	std::vector<TFloat> values;
43 | 	std::vector<TInt> indices;
44 | 	std::vector<TInt> starts (1,0);
45 | 	std::vector<std::string> labels;
46 | 	
47 | 	
48 | 	//vector of vectors of indices
49 | 	auto vvinds = randomutil::randomarray::get_p_sample<TInt, TFloat>(ndata, sparsity, dimension);
50 | 	
51 | 	
52 | 	for (TInt i = 0; i < ndata; ++i){
53 | 		for (auto & index : vvinds[i]){
54 | 			indices.push_back(index);
55 | 			TFloat value = (static_cast <TFloat> (rand()) / static_cast <TFloat> (RAND_MAX));
56 | 			values.push_back(value);
57 | 		}
58 | 		//for (TInt j = 0; j < dimension; ++j){ //really slow way to do it! TODO: 
59 | 			//bool nonzero = (sparsity > (rand() / (RAND_MAX + 0.)));
60 | 			//if (nonzero == true){
61 | 				//TFloat value = (static_cast <TFloat> (rand()) / static_cast <TFloat> (RAND_MAX));
62 | 				//values.push_back(value);
63 | 				//indices.push_back(j);
64 | 			//}
65 | 		//}
66 | 		starts.push_back(indices.size());
67 | 		labels.push_back("1011"); //give everything label 0
68 | 	}
69 | 	
70 | 	return sparse::SparseData<TInt, TFloat>(std::move(values), std::move(indices), std::move(starts), std::move(labels));
71 | 	
72 | }
73 | 
74 | //TODO : make function accept a random number generator. 
75 | void write_sparsedata(unsigned ndata, unsigned dimension, double sparsity, const std::string & filename, bool dimheader = true){
76 | 	auto sd  = get_sparsedata<unsigned, double> (ndata, dimension, sparsity);
77 | 	sd.write(filename, dimheader); 
78 | }
79 | 
80 | void write_sparse_and_dense_data(unsigned ndata, unsigned dimension, double sparsity, const std::string & sparsefilename,  const std::string & densefilename){
81 | 	auto sd  = get_sparsedata<unsigned, double> (ndata, dimension, sparsity);
82 | 	sd.write(sparsefilename, true); 
83 | 	sd.write_dense(densefilename, true); 
84 | }
85 | 
86 | 
87 | }
88 | 
89 | }
90 | 
91 | 
92 | #endif
93 | 
94 | 


--------------------------------------------------------------------------------
/src/sample.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
  3 | Written by James Newling <james.newling@gmail.com>
  4 | All rights reserved.
  5 | 
  6 | eakmeans is a library for exact and approximate k-means written in C++ and
  7 | Python. This file is part of eakmeans. See file COPYING for more details.
  8 | 
  9 | This file is part of eakmeans.
 10 | 
 11 | eakmeans is free software: you can redistribute it and/or modify
 12 | it under the terms of the 3-Clause BSD Licence. See
 13 | https://opensource.org/licenses/BSD-3-Clause for more details.
 14 | 
 15 | eakmeans is distributed in the hope that it will be useful,
 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
 18 | COPYING for more details.
 19 | */
 20 | 
 21 | #ifndef SAMPLE_H
 22 | #define SAMPLE_H
 23 | 
 24 | #include <exception>
 25 | #include <stdexcept>
 26 | #include <cstdlib>
 27 | #include <iostream>
 28 | #include <random>
 29 | #include <algorithm>
 30 | 
 31 | 
 32 | 
 33 | namespace randomutil{ 
 34 | namespace sample{
 35 | 
/* Based on code at https://codegolf.stackexchange.com/questions/4772/random-sampling-without-replacement
 * but changed so that the upper bound is excluded.
 * TODO: should g be taken by universal reference? Clarify when universal references should be used,
 * see https://isocpp.org/blog/2012/11/universal-references-in-c11-scott-meyers */
 40 | 
 41 | //O(max - min) algorithm for almost uniform sampling.
 42 | 
 43 | template<typename OutputIterator, typename IntegerType, class URNG>
 44 | 
 45 | 
 46 | void range_no_replacement(OutputIterator out, IntegerType n, IntegerType min, IntegerType max, URNG && g){
 47 |   if (n < 0)
 48 |     throw std::runtime_error("negative sample size");
 49 |   if (max < min)
 50 |     throw std::runtime_error("invalid range");
 51 |   if (n > max-min+1)
 52 |     throw std::runtime_error("sample size larger than range");
 53 | 
 54 |   while (n>0)
 55 |   {
 56 |     double r = g()/(RAND_MAX+1.0);
 57 |     if (r*(max-min) < n)
 58 |     {
 59 |       *out++ = min;
 60 |       --n;
 61 |     }
 62 |     ++min;
 63 |   }
 64 | }
 65 | 
 66 | 
 67 | template<typename OutputIterator, typename IntegerType>
 68 | void range_no_replacement(OutputIterator out, IntegerType n, IntegerType min, IntegerType max){
 69 |   //std::minstd_rand0 generator (time(NULL));
 70 | 	//range_no_replacement(out, n, min, max, generator);
 71 |   range_no_replacement(out, n, min, max, rand);
 72 | }
 73 | 
 74 | 
 75 | 
 76 | 
 77 | template<typename IntegerType>
 78 | std::vector<IntegerType> get_range_no_replacement(IntegerType n, IntegerType min, IntegerType max){
 79 |   std::vector<IntegerType> samples(n);
 80 |   range_no_replacement(samples.data(), n, min, max);
 81 |   return samples;
 82 | }
 83 | 
 84 | 
 85 | 
 86 | 
 87 | template<typename IntegerType, class URNG>
 88 | std::vector<IntegerType> get_range_no_replacement(IntegerType n, IntegerType min, IntegerType max, URNG && g){
 89 |   std::vector<IntegerType> samples(n);
 90 |   range_no_replacement(samples.data(), n, min, max, g);
 91 |   return samples;
 92 | }
 93 | 
 94 | 
 95 | 
 96 | template<typename IntegerType, class URNG>
 97 | std::vector<IntegerType> get_permuted_range(IntegerType n, URNG && g){
 98 |   std::vector<IntegerType> shuffled(n, 0);
 99 |   std::iota(shuffled.begin(), shuffled.end(), 0);
100 |   std::random_shuffle(shuffled.begin(), shuffled.end(), [&g](IntegerType i){return g()%i;});
101 |   return shuffled;
102 | }
103 | 
// See http://en.cppreference.com/w/cpp/algorithm/random_shuffle for inspiration :)
/* Randomly permute, in place, the rows of the row-major nrows x ncols array data.
 *
 * Rows nrows - 1, nrows - 2, ..., 1 are visited in turn, row i being swapped with row rand() % (i + 1).
 * As the partner comes from rand(), rows with index above RAND_MAX are never chosen as partner.
 *
 * The loop index has type TInt (not int), so that row counts which do not fit in an int are handled,
 * and a row is never copied onto itself. Nothing is done if there are fewer than 2 rows. */
template<typename TInt, typename TFloat>
void inplace_shuffle_by_row(TInt nrows, TInt ncols, TFloat * const data){
  
  if (nrows < 2){
    return;
  }
  
  // offsets are computed in std::size_t to avoid overflow of i*ncols in a narrow TInt
  const std::size_t rowlength = static_cast<std::size_t>(ncols);
  
  for (TInt i = nrows-1; i > 0; --i) {
    TInt copyindex = rand()%(i+1);
    if (copyindex == i){
      continue;
    }
    TFloat * const row_i = data + static_cast<std::size_t>(i)*rowlength;
    TFloat * const row_copy = data + static_cast<std::size_t>(copyindex)*rowlength;
    std::swap_ranges(row_i, row_i + rowlength, row_copy);
  }
}
121 | 
122 | 
123 | 
124 | }
125 | }
126 | 
127 | 
128 | 
129 | #endif
130 | 


--------------------------------------------------------------------------------
/src/simple1.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
 3 | Written by James Newling <james.newling@gmail.com>
 4 | All rights reserved.
 5 | 
 6 | eakmeans is a library for exact and approximate k-means written in C++ and
 7 | Python. This file is part of eakmeans. See file COPYING for more details.
 8 | 
 9 | This file is part of eakmeans.
10 | 
11 | eakmeans is free software: you can redistribute it and/or modify
12 | it under the terms of the 3-Clause BSD Licence. See
13 | https://opensource.org/licenses/BSD-3-Clause for more details.
14 | 
15 | eakmeans is distributed in the hope that it will be useful,
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
18 | COPYING for more details.
19 | */
20 | 
21 | #ifndef PLL_SIMPLEKMEANS_H
22 | #define PLL_SIMPLEKMEANS_H
23 | 
24 | #include "basesimpleexact.h"
25 | 
26 | namespace kmeans{
27 | 
/* k-means variant built on BaseSimpleExactKmeans, of which it only specifies the X tasks
 * and the algorithm name ("simple kmeans"). */
template <typename TInt, typename TFloat>
class SimpleKmeans1 : public kmeans::BaseSimpleExactKmeans<TInt, TFloat>{


	protected: 
	
			/* X_tasks is set to a single task, the one returned by arrutilv2::update_L_S_H_ati.
			 * NOTE(review): judging by the name and arguments this updates the labels (L) together with
			 * the sums (S) and counts (H) -- confirm against arrutilv2. */
			virtual void set_X_tasks(){
		
			this->X_tasks = {
				
				arrutilv2::update_L_S_H_ati(this->nthreads, this->ndata, this->dimension, this->data, this->ncentroids, this->get_C(), this->get_data_l22s(), this->get_C_l22s(), this->get_L(), this->get_dsums(), this->get_dcounts(), this->get_sums(), this->get_counts(), this->nchanges, this->work_mutex, this->ndcalcs_X),
				
				
			
			};
			
		}

	public:		
		/* All constructor arguments are forwarded to the base class. */
		template<typename... Args>
		SimpleKmeans1(Args&&... args): kmeans::BaseSimpleExactKmeans<TInt, TFloat> (std::forward<Args>(args)...) {
			this->setalgname("simple kmeans");
		}		
		virtual ~SimpleKmeans1(){};
		
};
54 | 
55 | }
56 | 
57 | //extern template class kmeans::SimpleKmeans1<size_t, double>;
58 | //extern template class kmeans::SimpleKmeans1<size_t, float>;
59 | 
60 | #endif
61 | 


--------------------------------------------------------------------------------
/src/simplest.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
  3 | Written by James Newling <james.newling@gmail.com>
  4 | All rights reserved.
  5 | 
  6 | eakmeans is a library for exact and approximate k-means written in C++ and
  7 | Python. This file is part of eakmeans. See file COPYING for more details.
  8 | 
  9 | This file is part of eakmeans.
 10 | 
 11 | eakmeans is free software: you can redistribute it and/or modify
 12 | it under the terms of the 3-Clause BSD Licence. See
 13 | https://opensource.org/licenses/BSD-3-Clause for more details.
 14 | 
 15 | eakmeans is distributed in the hope that it will be useful,
 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
 18 | COPYING for more details.
 19 | */
 20 | 
 21 | #ifndef PLL_SIMPLESTKMEANS_H
 22 | #define PLL_SIMPLESTKMEANS_H
 23 | 
 24 | #include <limits>
 25 | 
 26 | #include "basekmeans.h"
 27 | 
 28 | namespace kmeans{
 29 | 
/* A direct implementation of the k-means iteration, with algorithm name "simplest" :
 * every distance from every datapoint to every centroid is computed in each round,
 * and all of the work of a round is done by the task with index 0. */
template <typename TInt, typename TFloat>
class SimplestKmeans : public kmeans::BaseExact<TInt, TFloat>{
	
		
	public:
		template<typename... Args>
		/* variadic args ala Eli Bendersky */
		SimplestKmeans(Args&&... args): kmeans::BaseExact<TInt, TFloat> (std::forward<Args>(args)...) {this->setalgname("simplest");}		
		virtual ~SimplestKmeans(){};
			
	protected:
		/* Initialisation tasks : those returned by BaseExact::makeset_C_C_l22s_L_inds0_mati,
		 * followed by the one returned by BaseExact::set_S_H_ati. */
		virtual void set_initialisation_tasks(){
			
			auto init_tasks_A = kmeans::BaseExact<TInt, TFloat>::makeset_C_C_l22s_L_inds0_mati();
			
			auto init_task_B = kmeans::BaseExact<TInt, TFloat>::set_S_H_ati();
			
			this->initialisation_tasks = std::move(init_tasks_A);
			this->initialisation_tasks.push_back(std::move(init_task_B));
			
		}

		/* X task : assign every datapoint to its nearest centroid.
		 * Data and centroids are indexed as row * dimension + column. */
		virtual void set_X_tasks(){
			//set Ls
			this->X_tasks = {
				[this](TInt ti){
					
					//only the task with index 0 does any work
					if (ti == 0){
						this->nchanges = 0;

						for (TInt i = 0; i < this->getndata(); ++i){
							TFloat best_distance = std::numeric_limits<TFloat>::max();
							TInt oldlabel = this->get_L()[i];
							for (TInt ci = 0; ci < this->getncentroids(); ++ci){
								//squared Euclidean distance between centroid ci and datapoint i
								TFloat distance2 = 0;
								for (TInt di = 0; di < this->getdimension(); ++di){
									TFloat diffy = this->get_C()[ci*this->getdimension() + di] - this->getdata()[i*this->getdimension() + di];
									distance2 += diffy*diffy;
								}
								TFloat distance = std::sqrt(std::max(static_cast<TFloat>(0), distance2));
								//with <= a tie is resolved in favour of the centroid with the larger index
								if (distance <= best_distance){
									best_distance = distance;
									this->get_L()[i] = ci;
								}
							}
							//nchanges counts the datapoints whose label differs from the previous round
							if (this->get_L()[i] != oldlabel){
								this->nchanges += 1;
							}
						}
						
						//one distance calculation per (datapoint, centroid) pair
						this->ndcalcs_X += this->ndata * this->ncentroids;
					}
				}
				
				
				
	

			};
		}
		
		/* C task : set every centroid to the mean of the datapoints currently assigned to it. */
		virtual void set_C_tasks(){
			//set C
			this->C_tasks = {
				
				
		
				[this](TInt ti){
					//only the task with index 0 does any work
					if (ti == 0){
						//per centroid : sum of assigned datapoints, and their number
						std::vector<TFloat> simplesums (this->getncentroids()*this->getdimension(),0);
						std::vector<TInt> simplecounts (this->getncentroids(),0);
						for (TInt i = 0; i < this->getndata(); ++i){
							for (TInt di = 0; di < this->getdimension(); ++di){
								simplesums[this->get_L()[i]*this->getdimension() + di] += this->getdata()[i*this->getdimension() + di];
							}
							simplecounts[this->get_L()[i]]+=1;
						}
						for (TInt ci = 0; ci < this->getncentroids(); ++ ci){
							for (TInt di = 0; di < this->getdimension(); ++di){
								//a centroid with no datapoints assigned is left where it is
								if (simplecounts[ci] != 0){
									this->get_C()[ci*this->getdimension() + di] = simplesums[ci*this->getdimension() + di] / static_cast<TFloat> (simplecounts[ci]);
								}
							}
						}
					}
				}
				
				


					
			};
		}
};
124 | 
125 | }
126 | 
127 | #endif
128 | 


--------------------------------------------------------------------------------
/src/sparseelkan3v0.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
  3 | Written by James Newling <james.newling@gmail.com>
  4 | All rights reserved.
  5 | 
  6 | eakmeans is a library for exact and approximate k-means written in C++ and
  7 | Python. This file is part of eakmeans. See file COPYING for more details.
  8 | 
  9 | This file is part of eakmeans.
 10 | 
 11 | eakmeans is free software: you can redistribute it and/or modify
 12 | it under the terms of the 3-Clause BSD Licence. See
 13 | https://opensource.org/licenses/BSD-3-Clause for more details.
 14 | 
 15 | eakmeans is distributed in the hope that it will be useful,
 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
 18 | COPYING for more details.
 19 | */
 20 | 
 21 | #ifndef PLL_SPARSEELKANKMEANS_3V0_H
 22 | #define PLL_SPARSEELKANKMEANS_3V0_H
 23 | 
 24 | #include "basesparseelkan.h"
 25 | #include "alg_X_selkSN.h"
 26 | 
 27 | namespace kmeans{
 28 | 
/* discrepancy in ndcalcs as compared to a3v0 due to not computing CC initially (I propose) */

/* Elkan-style k-means for sparse data, built on BaseSparseElkan, with algorithm name "SP3V0".
 * On top of the base class it allocates elkan_delta_C (one TFloat per centroid) and sets the C and X tasks. */
template <typename TInt, typename TFloat>
class SP3V0 : public kmeans::BaseSparseElkan<TInt, TFloat>{
				
	protected:
		/* raw pointer held by elkan_lowers_base ; indexed below with ncentroids entries per datapoint */
		TFloat * const get_lowers(){
			return this->elkan_lowers_base.get();
		}
		
		/* raw pointer held by elkan_upper_base ; indexed below with one entry per datapoint */
		TFloat * const get_upbs(){
			return this->elkan_upper_base.get();
		}
		
		/* raw pointer held by elkan_delta_C ; allocated in the constructor with one entry per centroid */
		TFloat * const get_delta_C(){
			return this->elkan_delta_C.get();
		}
		
		/* Return the task which, for thread index ti, processes the datapoints [data0, data1),
		 * an (almost) equal share of the ndata datapoints, with sparse_update_L_lowers_upper_where_changes_3v0.
		 * Pointers to per-datapoint arrays are offset by data0 before being passed on, and the label
		 * changes are recorded in where_label_changes[ti]. */
		std::function<void(TInt)> update_3v0_L_lowers_upper_where_changes_ati(){
			//TODO : neaten up and move out
			
			return [this](TInt ti){
				TInt data0 = (ti*this->ndata)/this->nthreads;
				TInt data1 = ((ti + 1)*this->ndata)/this->nthreads;

				//distance calculations are counted locally, then added to the shared counter once
				TInt ndcalcs_local = 0;
				kmeans::sparse_update_L_lowers_upper_where_changes_3v0<TInt, TFloat>(this->ncentroids, this->dimension, data0, data1, *this->ptrdata, this->get_C(), this->get_data_l22s() + data0, this->get_C_l22s(), this->get_delta_C(), this->where_label_changes[ti], ndcalcs_local, this->get_L() + data0, this->get_lowers()  + data0*this->ncentroids, this->get_upbs() + data0);
				// NOTE(review): this increment is made by every task index -- confirm that ndcalcs_X is safe to update concurrently
				this->ndcalcs_X += ndcalcs_local;
			};
		}
		
		/* Return the task which, for thread index 0 only, passes where_label_changes, the sums,
		 * the counts and nchanges to sparse::update_S_H_from_label_changes. */
		std::function<void(TInt)> update_S_H_from_where_changes_ati(){
			return [this](TInt ti){
					if (ti == 0){
						sparse::update_S_H_from_label_changes(*this->ptrdata, this->where_label_changes, this->get_sums(), this->get_counts(), this->nchanges);
					}
				};
		}
			
		
		
	public:
		typedef kmeans::BaseSparseElkan<TInt, TFloat> EB;
		/* All constructor arguments are forwarded to the base class. */
		template<typename... Args>
		SP3V0(Args&&... args): EB(std::forward<Args>(args)...)


		{
			this->setalgname("SP3V0");
			this->elkan_delta_C.reset(new TFloat [this->getncentroids()]);
		}
		virtual ~SP3V0(){}

		/* memory requirement reported by the base class, plus that of delta_C */
		virtual TInt get_approximate_memory_requirement(){
			return EB::get_approximate_memory_requirement() + 
			sizeof(TFloat)*this->getncentroids(); // delta_C  
		}

		virtual void verbose_write_additional(){
			this->EB_verbose_write_additional();
			/* anything else to add ? */
		}

		virtual void set_initialisation_tasks(){
			/* all Elkan variants have same initialisation tasks */
			this->ElkBase_set_initialisation_tasks();
		}
	
		/* C tasks : the single task returned by arrutilv2::update_C_C_l22s_delta_C_from_SH_ati */
		virtual void set_C_tasks(){
			this->C_tasks = {
				arrutilv2::update_C_C_l22s_delta_C_from_SH_ati(this->getnthreads(), this->getncentroids(), this->getdimension(), this->get_sums(), this->get_counts(), this->get_C(), this->get_C_l22s(), this->get_delta_C(), this->ndcalcs_notX)
			};
		}
		
		/* X tasks, in order : the per-thread update of labels and bounds, then the update of sums and counts from the recorded label changes */
		virtual void set_X_tasks(){
			this->X_tasks = {
				this->update_3v0_L_lowers_upper_where_changes_ati(),
				this->update_S_H_from_where_changes_ati()
			};
		}
};
110 | 
111 | }
112 | 
113 | #endif
114 | 


--------------------------------------------------------------------------------
/src/sparseinitialise.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
  3 | Written by James Newling <james.newling@gmail.com>
  4 | All rights reserved.
  5 | 
  6 | eakmeans is a library for exact and approximate k-means written in C++ and
  7 | Python. This file is part of eakmeans. See file COPYING for more details.
  8 | 
  9 | This file is part of eakmeans.
 10 | 
 11 | eakmeans is free software: you can redistribute it and/or modify
 12 | it under the terms of the 3-Clause BSD Licence. See
 13 | https://opensource.org/licenses/BSD-3-Clause for more details.
 14 | 
 15 | eakmeans is distributed in the hope that it will be useful,
 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
 18 | COPYING for more details.
 19 | */
 20 | 
 21 | 
 22 | #ifndef SPARSEINITIALISE_H
 23 | #define SPARSEINITIALISE_H
 24 | 
 25 | 
 26 | #include <memory>
 27 | #include <vector>
 28 | namespace kmeans{
 29 | namespace sparseinit{
 30 | 
 31 | //get copyindices guaranteeing that all distinct.
 32 | template <typename TFloat, typename TInt>
 33 | std::tuple<std::unique_ptr<TFloat []>, std::vector<TInt> > get_initialisation_indices(TInt ncentroids, const sparse::SparseData<TInt, TFloat> & data, TInt data0 = 0, TInt data1 = 0){
 34 | 	
 35 | 	if (data1 == 0){
 36 | 		data1 = data.ndata;
 37 | 	}
 38 | 
 39 | 
 40 | 	
 41 | 	
 42 | 	TInt ndata_range = data1 - data0;
 43 | 	std::vector<TInt> initialisation_indices (ncentroids);
 44 | 	std::unique_ptr<TFloat []> C(new TFloat [ncentroids*data.dimension]);
 45 | 	
 46 | 	TInt nattempts = 0;
 47 | 	TInt currentindex = 0;
 48 | 	
 49 | 
 50 | 			
 51 | 	while (currentindex < ncentroids && nattempts < 5*ncentroids){
 52 | 		TInt proposal = data0 + rand() % ndata_range;
 53 | 		
 54 | 		bool rejected = false;
 55 | 		for (TInt ci = 0; ci < currentindex; ++ci){
 56 | 			TFloat l22_diff = sparse::get_l22(
 57 | 
 58 | 				data.starts[initialisation_indices[ci] + 1] - data.starts[initialisation_indices[ci]],
 59 | 				data.indices.data() + data.starts[initialisation_indices[ci]],
 60 | 				data.values.data() + data.starts[initialisation_indices[ci]],
 61 | 
 62 | 				data.starts[proposal + 1] - data.starts[proposal],
 63 | 				data.indices.data() + data.starts[proposal],
 64 | 				data.values.data() + data.starts[proposal]
 65 | 				
 66 | 			);
 67 | 			
 68 | 			if (l22_diff < 1e-5){
 69 | 				rejected = true;
 70 | 				break;
 71 | 			}
 72 | 		}
 73 | 		++nattempts;
 74 | 		if (rejected == false){
 75 | 			sparse::todense::zero_and_copy(proposal, data, C.get() + currentindex*data.dimension);
 76 | 			initialisation_indices[currentindex] = proposal;
 77 | 			++currentindex;
 78 | 		}
 79 | 		else{
 80 | 		}
 81 | 	}
 82 | 
 83 | 	if (currentindex != ncentroids){
 84 | 		throw std::runtime_error("Tried to find a set of distinct datapoints, but failed (nattempts/ncentroids = 5)"); 
 85 | 	}
 86 | 
 87 | 	
 88 | 	return std::make_tuple (std::move(C), std::move(initialisation_indices));
 89 | }
 90 | 
 91 | 
 92 | 
 93 | template <typename TFloat, typename TInt>
 94 | std::tuple<std::unique_ptr<TFloat []>, std::unique_ptr<TFloat []>, std::unique_ptr<TInt []>, TFloat > 
 95 | get_kmeanspp_initialisation(TInt ncentroids, const sparse::SparseData<TInt, TFloat> & data, TInt ind0, TInt ind1){
 96 | 	throw std::runtime_error("sparse kmeans ++ not yet implemented. Look for inspiration in dense version. Probably common code to be extracted."); 
 97 | 	
 98 | 	
 99 | 	return std::make_tuple (std::unique_ptr<TFloat []> {}, 
100 | 	std::unique_ptr<TFloat []> {},
101 | 	std::unique_ptr<TInt []> {},
102 | 	TFloat {});
103 | 
104 | }
105 | 
106 | 
107 | }
108 | }
109 | #endif
110 | 
111 | 


--------------------------------------------------------------------------------
/src/sparseminibatch.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
 3 | Written by James Newling <james.newling@gmail.com>
 4 | All rights reserved.
 5 | 
 6 | eakmeans is a library for exact and approximate k-means written in C++ and
 7 | Python. This file is part of eakmeans. See file COPYING for more details.
 8 | 
 9 | This file is part of eakmeans.
10 | 
11 | eakmeans is free software: you can redistribute it and/or modify
12 | it under the terms of the 3-Clause BSD Licence. See
13 | https://opensource.org/licenses/BSD-3-Clause for more details.
14 | 
15 | eakmeans is distributed in the hope that it will be useful,
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
18 | COPYING for more details.
19 | */
20 | 
21 | #ifndef PLL_SPARSEMINIBATCHKMEANS_H
22 | #define PLL_SPARSEMINIBATCHKMEANS_H
23 | 
24 | #include "basesparseminibatch.h"
25 | 
26 | namespace kmeans{
/* Mini-batch k-means for sparse data, built on BaseSparseMiniBatch, with algorithm name "sparse mini batch".
 * It only specifies how the sums and counts are adjusted once the labels have been set. */
template <typename TInt, typename TFloat>
class SparseMiniBatch : public kmeans::BaseSparseMiniBatch<TInt, TFloat>{
	
	private:
				
		/* Adjust the sums (S) and counts (H) after the labels (L) have been updated.
		 * In the first mba.nsubrounds rounds, the datapoints [data0, data1) of the current batch are
		 * passed to sparse::increment_S_H. In all later rounds, where_label_changes is passed to
		 * sparse::update_S_H_from_label_changes instead. */
		virtual void post_L_adjust_S_H() override final{

			if (this->round < this->mba.nsubrounds){
				//round < nsubrounds in this branch, so the modulo leaves round unchanged
				TInt data0 = this->mba.batchsize*(this->round%this->mba.nsubrounds);
				//the final batch is cut short at ndata
				TInt data1 = std::min(data0 + this->mba.batchsize, this->ndata);
				sparse::increment_S_H(data0, data1, *this->ptrdata, this->get_L(), this->get_sums(), this->get_counts());
			}
			
			else{
				sparse::update_S_H_from_label_changes(*this->ptrdata, this->where_label_changes, this->get_sums(), this->get_counts());
			}						
		}
				
		
	public:
					
		
		/* All constructor arguments are forwarded to the base class. */
		template<typename... Args>
		SparseMiniBatch(Args&&... args): kmeans::BaseSparseMiniBatch<TInt, TFloat> (std::forward<Args>(args)...){
				this->algname = "sparse mini batch";
		}
		 		
		virtual ~SparseMiniBatch(){};

};
57 | 
58 | }
59 | 
60 | #endif
61 | 


--------------------------------------------------------------------------------
/src/sparsesimple.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
 3 | Written by James Newling <james.newling@gmail.com>
 4 | All rights reserved.
 5 | 
 6 | eakmeans is a library for exact and approximate k-means written in C++ and
 7 | Python. This file is part of eakmeans. See file COPYING for more details.
 8 | 
 9 | This file is part of eakmeans.
10 | 
11 | eakmeans is free software: you can redistribute it and/or modify
12 | it under the terms of the 3-Clause BSD Licence. See
13 | https://opensource.org/licenses/BSD-3-Clause for more details.
14 | 
15 | eakmeans is distributed in the hope that it will be useful,
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
18 | COPYING for more details.
19 | */
20 | 
21 | #ifndef PLL_SPARSESIMPLE_H
22 | #define PLL_SPARSESIMPLE_H
23 | 
24 | #include "basesparseexact.h"
25 | 
26 | namespace kmeans{
27 | 
28 | template <typename TInt, typename TFloat>
29 | class SparseSimple : public kmeans::BaseSparseExact<TInt, TFloat> {
30 | 	
31 | 	private: 
32 | 		
33 | 	public:
34 | 		template<typename... Args>
35 | 		SparseSimple(Args&&... args): kmeans::BaseSparseExact<TInt, TFloat> (std::forward<Args>(args)...) {
36 | 			this->setalgname("sparse-simple-kmeans");
37 | 		}
38 | 		
39 | 		virtual ~SparseSimple(){};
40 | 	
41 | 	protected:
42 | 		virtual void set_initialisation_tasks(){
43 | 			auto init_tasks_A = this->makeset_C_C_l22s_L_inds0_mati();
44 | 			auto init_task_B = 
45 | 				[this](TInt ti){
46 | 					if (ti == 0){
47 | 						this->set_S_H(static_cast<TInt>(0), this->ndata);
48 | 					}
49 | 				};
50 | 			this->initialisation_tasks = std::move(init_tasks_A);
51 | 			this->initialisation_tasks.push_back(std::move(init_task_B));
52 | 		}
53 | 		
54 | 			
55 | 		virtual void set_X_tasks() override final{
56 | 			
57 | 			this->X_tasks = {
58 | 				
59 | 				
60 | 				//experiments show that this is were the majority of the time is spent
61 | 				sparse::update_L_label_changes_ati(this->nthreads, *this->ptrdata, this->ncentroids, this->get_C(), this->get_data_l22s(), this->get_C_l22s(), this->get_L(), this->ndcalcs_X, this->where_label_changes),
62 | 
63 | 			
64 | 				[this](TInt ti){
65 | 					if (ti == 0){
66 | 						sparse::update_S_H_from_label_changes(*this->ptrdata, this->where_label_changes, this->get_sums(), this->get_counts(), this->nchanges);
67 | 					}
68 | 				},
69 | 				
70 | 				[this](TInt ti){
71 | 					if (ti == 0){
72 | 					}
73 | 				} 			
74 | 				
75 | 			};
76 | 		}
77 | 		
78 | 		virtual void set_C_tasks(){
79 | 			this->C_tasks = {
80 | 				arrutilv2::update_C_C_l22s_from_SH_ati(this->nthreads, this->ncentroids, this->dimension, this->get_sums(), this->get_counts(), this->get_C(), this->get_C_l22s())
81 | 			};
82 | 		}
83 | 		
84 | 		
85 | 		virtual void verbose_write_additional(){
86 | 				//TODO
87 | 		}
88 | 
89 | };
90 | } 
91 | 
92 | 
93 | #endif
94 | 


--------------------------------------------------------------------------------
/src/sparsestandardminibatch.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
 3 | Written by James Newling <james.newling@gmail.com>
 4 | All rights reserved.
 5 | 
 6 | eakmeans is a library for exact and approximate k-means written in C++ and
 7 | Python. This file is part of eakmeans. See file COPYING for more details.
 8 | 
 9 | This file is part of eakmeans.
10 | 
11 | eakmeans is free software: you can redistribute it and/or modify
12 | it under the terms of the 3-Clause BSD Licence. See
13 | https://opensource.org/licenses/BSD-3-Clause for more details.
14 | 
15 | eakmeans is distributed in the hope that it will be useful,
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
18 | COPYING for more details.
19 | */
20 | 
21 | #ifndef PLL_SPARSESTANDARDMINIBATCHKMEANS_H
22 | #define PLL_SPARSESTANDARDMINIBATCHKMEANS_H
23 | 
24 | #include "basesparseminibatch.h"
25 | 
26 | namespace kmeans{
27 | template <typename TInt, typename TFloat>
28 | class SparseStandardMiniBatch : public kmeans::BaseSparseMiniBatch<TInt, TFloat>{
29 | 	
30 | 	private:
31 | 				
32 | 		virtual void post_L_adjust_S_H() override final{
33 | 			//just update S and H by adding data which changed
34 | 			
35 | 			//TODO : these could be class variables as same calculated here as in update_L_label_changes.
36 | 			TInt data0 = this->mba.batchsize*(this->round%this->mba.nsubrounds);
37 | 			TInt data1 = std::min(data0 + this->mba.batchsize, this->ndata);
38 | 			
39 | 			sparse::increment_S_H(data0, data1, *this->ptrdata, this->get_L(), this->get_sums(), this->get_counts());
40 | 						
41 | 		}
42 | 		
43 | 	public:
44 | 					
45 | 		
46 | 		template<typename... Args>
47 | 		SparseStandardMiniBatch(Args&&... args): kmeans::BaseSparseMiniBatch<TInt, TFloat> (std::forward<Args>(args)...){
48 | 			
49 | 			this->algname = "sparse standard mini batch";
50 | 		}
51 | 		 		
52 | 		virtual ~SparseStandardMiniBatch(){};
53 | 
54 | };
55 | 
56 | }
57 | 
58 | //extern template class kmeans::SparseStandardMiniBatch<size_t, double>;
59 | //extern template class kmeans::SparseStandardMiniBatch<size_t, float>;
60 | 
61 | #endif
62 | 
63 | 


--------------------------------------------------------------------------------
/src/standardminibatch.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
 3 | Written by James Newling <james.newling@gmail.com>
 4 | All rights reserved.
 5 | 
 6 | eakmeans is a library for exact and approximate k-means written in C++ and
 7 | Python. This file is part of eakmeans. See file COPYING for more details.
 8 | 
 9 | This file is part of eakmeans.
10 | 
11 | eakmeans is free software: you can redistribute it and/or modify
12 | it under the terms of the 3-Clause BSD Licence. See
13 | https://opensource.org/licenses/BSD-3-Clause for more details.
14 | 
15 | eakmeans is distributed in the hope that it will be useful,
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
18 | COPYING for more details.
19 | */
20 | 
21 | #ifndef PLL_STANDARDMINIBATCHKMEANS_H
22 | #define PLL_STANDARDMINIBATCHKMEANS_H
23 | 
24 | #include "basesimpleminibatch.h"
25 | 
26 | namespace kmeans{
27 | template <typename TInt, typename TFloat>
28 | class StandardMiniBatch : public kmeans::BaseSimpleMiniBatch<TInt, TFloat>{
29 | 	
30 | 	private:
31 | 				
32 | 		virtual void update_L_S_H(TInt x0, TInt x1, TInt ti) override final{
33 | 			this->update_L_S_H_batch_increment_only(x0, x1, ti);
34 | 		}
35 | 	
36 | 
37 | 		
38 | 	public:
39 | 
40 | 
41 | 		template<typename... Args>
42 | 		StandardMiniBatch(Args&&... args): kmeans::BaseSimpleMiniBatch<TInt, TFloat> (std::forward<Args>(args)...) {
43 | 			this->setalgname("Standard Mini Batch Kmeans");
44 | 		}		
45 | 		 		
46 | 		virtual ~StandardMiniBatch(){};
47 | 
48 | };
49 | 
50 | 
51 | }
52 | 
53 | //extern template class kmeans::StandardMiniBatch<size_t, double>;
54 | //extern template class kmeans::StandardMiniBatch<size_t, float>;
55 | 
56 | #endif
57 | 
58 | 
59 | 


--------------------------------------------------------------------------------
/src/stringutilbase.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
  3 | Written by James Newling <james.newling@gmail.com>
  4 | All rights reserved.
  5 | 
  6 | eakmeans is a library for exact and approximate k-means written in C++ and
  7 | Python. This file is part of eakmeans. See file COPYING for more details.
  8 | 
  9 | This file is part of eakmeans.
 10 | 
 11 | eakmeans is free software: you can redistribute it and/or modify
 12 | it under the terms of the 3-Clause BSD Licence. See
 13 | https://opensource.org/licenses/BSD-3-Clause for more details.
 14 | 
 15 | eakmeans is distributed in the hope that it will be useful,
 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
 18 | COPYING for more details.
 19 | */
 20 | 
 21 | #include "stringutilbase.h"
 22 | #include <stdexcept>
 23 | #include <iostream>
 24 | namespace stringutil{
 25 | //split the string tosplit by delim. With x appearances of delim in tosplit, the returned vector will have length x + 1 (even if appearances at the start, end, contiguous.
 26 | std::vector<std::string> split(const std::string & tosplit, const std::string & delim){
 27 | 	
 28 | 	std::vector<std::string> spv; //vector to return
 29 | 	if (delim.length() > tosplit.length()){
 30 | 		return spv;
 31 | 	}
 32 | 
 33 | 
 34 | 	std::vector<size_t> splitposstarts {0};		
 35 | 	std::vector<size_t> splitposends;
 36 | 	
 37 | 	for (size_t x = 0; x <  tosplit.length() - delim.length() + 1; ++x){		
 38 | 		auto res = std::mismatch(delim.begin(), delim.end(), tosplit.begin() + x);
 39 | 		if (res.first == delim.end()){
 40 | 			splitposends.push_back(x);
 41 | 			splitposstarts.push_back(x + delim.length());
 42 | 
 43 | 		}
 44 | 	}
 45 | 	
 46 | 	splitposends.push_back(tosplit.length());
 47 | 
 48 | 	for (unsigned i = 0; i < splitposends.size(); ++i){
 49 | 		spv.push_back(tosplit.substr(splitposstarts[i], splitposends[i] - splitposstarts[i] ));
 50 | 	}
 51 | 	
 52 | 	return spv;
 53 | }
 54 | 
 55 | bool isws(const char & c){
 56 | 	return (c == ' ' ||  c == '\t' || c == '\n');
 57 | }
 58 | 
 59 | std::vector<std::string> split(const std::string & tosplit){
 60 | 	
 61 | 	std::vector<std::string> spv2;
 62 | 	
 63 | 	unsigned it = 0;	
 64 | 	
 65 | 	while (it != tosplit.size()){
 66 | 		while (isws(tosplit[it]) and it != tosplit.size()){
 67 | 			++it;
 68 | 		}
 69 | 		unsigned start = it;
 70 | 		
 71 | 		while (!isws(tosplit[it]) and it != tosplit.size()){
 72 | 			++it;
 73 | 		}
 74 | 		unsigned end = it;
 75 | 		
 76 | 		if (!isws(tosplit[end -1])){
 77 | 			spv2.push_back(tosplit.substr(start, end - start));
 78 | 		}
 79 | 	}
 80 | 		
 81 | 		
 82 | 		
 83 | 	
 84 | 	return spv2;
 85 | } 
 86 | 
 87 | 
 88 | std::string getdirfromfn(const std::string & fn){
 89 | 	auto morcels = split(fn, "/");
 90 | 
 91 | 	if (morcels[0].compare("") != 0){
 92 | 		throw std::runtime_error("The string passed to getdirfromfn is not a valid path as there is no leading / .");
 93 | 	}
 94 | 
 95 | 	std::string dir = "/";
 96 | 		
 97 | 	for (unsigned i = 1; i < morcels.size() - 1; ++i){
 98 | 		dir = dir + morcels[i] + "/";
 99 | 	}
100 | 	return dir;
101 | }
102 | 
103 | 
104 | }
105 | 


--------------------------------------------------------------------------------
/src/stringutilbase.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
 3 | Written by James Newling <james.newling@gmail.com>
 4 | All rights reserved.
 5 | 
 6 | eakmeans is a library for exact and approximate k-means written in C++ and
 7 | Python. This file is part of eakmeans. See file COPYING for more details.
 8 | 
 9 | This file is part of eakmeans.
10 | 
11 | eakmeans is free software: you can redistribute it and/or modify
12 | it under the terms of the 3-Clause BSD Licence. See
13 | https://opensource.org/licenses/BSD-3-Clause for more details.
14 | 
15 | eakmeans is distributed in the hope that it will be useful,
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
18 | COPYING for more details.
19 | */
20 | 
#ifndef STRINGUTILBASE_H
#define STRINGUTILBASE_H

#include <string>
#include <vector>

namespace stringutil{

//Split the string tosplit by delim. With x appearances of delim in tosplit, the returned vector
//will have length x + 1 (even if the appearances are at the start, at the end, or contiguous).
//If delim is longer than tosplit, an empty vector is returned.
std::vector<std::string> split(const std::string & tosplit, const std::string & delim);

//Split on whitespace : returns the non-empty tokens of tosplit, in order.
std::vector<std::string> split(const std::string & tosplit);

//Return the directory part (with trailing '/') of the absolute path fn.
//Throws std::runtime_error if fn has no leading '/'.
std::string getdirfromfn(const std::string & fn);

}

#endif
34 | 


--------------------------------------------------------------------------------
/src/stringutilclustering.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
 3 | Written by James Newling <james.newling@gmail.com>
 4 | All rights reserved.
 5 | 
 6 | eakmeans is a library for exact and approximate k-means written in C++ and
 7 | Python. This file is part of eakmeans. See file COPYING for more details.
 8 | 
 9 | This file is part of eakmeans.
10 | 
11 | eakmeans is free software: you can redistribute it and/or modify
12 | it under the terms of the 3-Clause BSD Licence. See
13 | https://opensource.org/licenses/BSD-3-Clause for more details.
14 | 
15 | eakmeans is distributed in the hope that it will be useful,
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
18 | COPYING for more details.
19 | */
20 | 
21 | #ifndef ENDOFROUNDSTRING_H
22 | #define ENDOFROUNDSTRING_H
23 | 
24 | #include <string>
25 | 
26 | 
27 | 
namespace stringutil{
namespace clustering{

namespace helper{
std::string getstars();
}

//Summary strings, one namespace per family of algorithm. Every function returns a std::string;
//each comes as a _v1 and a _v2 variant whose difference is not visible from these declarations.
namespace pll{

namespace exact{

//start of run
std::string getstartsummary_v1(std::string algname, size_t memory_usage, float mse, float val_mse);
std::string getstartsummary_v2(std::string algname, size_t memory_usage, float mse, float val_mse);

//one round
std::string getroundsummary_v1(size_t roundchanges);
std::string getroundsummary_v2(size_t round, size_t roundchanges, size_t ncalcs_X, size_t ncalcs, size_t t_total, float mse, float val_mse);

//end of run
std::string getfinalsummary_v1(size_t round, size_t roundchanges, size_t ncalcs_X, size_t ncalcs, size_t t_total, float mse, size_t n_empty_clusters, float val_mse);
std::string getfinalsummary_v2(size_t round, size_t roundchanges, size_t ncalcs_X, size_t ncalcs, size_t t_total, float mse, size_t n_empty_clusters, float val_mse);

} //end exact


namespace minibatch{

//start of run
std::string getstartsummary_v1(std::string algname, size_t memory_usage, float val_mse);
std::string getstartsummary_v2(std::string algname, size_t memory_usage, float val_mse);

//one round
std::string getroundsummary_v1(size_t round, size_t nsubrounds, size_t roundchanges);
std::string getroundsummary_v2(size_t round, size_t nsubrounds, size_t subround, size_t roundchanges, size_t ncalcs_X, size_t ncalcs, size_t t_total, float mse, float val_mse);

//end of run
std::string getfinalsummary_v1(size_t round, size_t nsubrounds, size_t roundchanges, size_t ncalcs_X, size_t ncalcs, size_t t_total, float mse, size_t n_empty_clusters, float val_mse);
std::string getfinalsummary_v2(size_t round, size_t nsubrounds, size_t roundchanges, size_t ncalcs_X, size_t ncalcs, size_t t_total, float mse, size_t n_empty_clusters, float val_mse);

} //end minibatch


namespace growbatch{

//start of run
std::string getstartsummary_v1(std::string algname, size_t memory_usage, float val_mse);
std::string getstartsummary_v2(std::string algname, size_t memory_usage, float val_mse);

//one round
std::string getroundsummary_v1(size_t roundchanges, bool didgrow);
std::string getroundsummary_v2(size_t round, size_t nactive, float d_C__by__d_AB, size_t roundchanges, size_t ncalcs_X, size_t ncalcs, size_t t_total, float mse, float val_mse);

//end of run
std::string getfinalsummary_v1(size_t round, size_t ncalcs_X, size_t ncalcs, size_t t_total, float mse, size_t n_empty_clusters, float val_mse);
std::string getfinalsummary_v2(size_t round, size_t ncalcs_X, size_t ncalcs, size_t t_total, float mse, size_t n_empty_clusters, float val_mse);

} //end growbatch

} //end pll

} //end clustering
} //end stringutil
89 | 
90 | 
91 | 
92 | 
93 | #endif
94 | 


--------------------------------------------------------------------------------
/src/stringutilfile.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
 3 | Written by James Newling <james.newling@gmail.com>
 4 | All rights reserved.
 5 | 
 6 | eakmeans is a library for exact and approximate k-means written in C++ and
 7 | Python. This file is part of eakmeans. See file COPYING for more details.
 8 | 
 9 | This file is part of eakmeans.
10 | 
11 | eakmeans is free software: you can redistribute it and/or modify
12 | it under the terms of the 3-Clause BSD Licence. See
13 | https://opensource.org/licenses/BSD-3-Clause for more details.
14 | 
15 | eakmeans is distributed in the hope that it will be useful,
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
18 | COPYING for more details.
19 | */
20 | 
21 | #include "stringutilfile.h"
22 | 
23 | 
24 | #include <fstream>
25 | #include <sstream>
26 | #include <stdexcept>
27 | 
28 | #include "stringutilbase.h"
29 | 
30 | 
31 | #include <cstdlib>
32 | 
33 | #include <iostream>
34 | 
35 | namespace stringutilfile{
36 | 	
37 | 
//Originally taken from http://stackoverflow.com/questions/2844817/how-do-i-check-if-a-c-string-is-an-int
//True if and only if s is an optional sign ('+' or '-') followed by one or more decimal digits,
//with nothing before or after. No whitespace is tolerated, and the magnitude is not checked
//against the range of any integer type.
//The characters are examined directly rather than through isdigit / strtol : isdigit is undefined
//for negative char values, and testing strtol's end pointer for a NUL accepted strings with an
//embedded '\0' followed by arbitrary characters.
inline bool is_integer(const std::string & s){
	if (s.empty()){
		return false;
	}

	std::string::size_type pos = (s[0] == '-' || s[0] == '+') ? 1 : 0;

	//a lone sign is not an integer
	if (pos == s.size()){
		return false;
	}

	for (; pos < s.size(); ++pos){
		if (s[pos] < '0' || s[pos] > '9'){
			return false;
		}
	}

	return true;
}
50 | 	
51 | bool file_has_2int_header(const std::string & filename){
52 | 	std::ifstream dfile(filename, std::ios_base::in);		
53 | 	std::string line;
54 | 	if (!dfile.is_open()){
55 | 		throw std::runtime_error(std::string("The file ") + filename + " probably does not exist. Cannot determine whether the file has the 2 int header or not, as the file does not seem to exist." );
56 | 	}
57 | 	std::getline(dfile, line);
58 | 	auto bob = stringutil::split(line);
59 | 	
60 | 	/* first determine that it contains 2 nuggets: */
61 | 	if (bob.size() != 2){
62 | 		//"file does not have 2 frags, it has :  " << bob.size() << " frags " << std::endl;
63 | 		return false;
64 | 	}
65 | 	/* next test that they are indeed integers */	
66 | 	if (is_integer(bob[0]) and is_integer(bob[1])){
67 | 		return true;
68 | 	}
69 | 
70 | 	//"fail due to a non int in header :  " <<  is_integer(bob[0]) << " " << is_integer(bob[1])  << std::endl;
71 | 	//"fail due to a non int in header :  |" <<  bob[0] << "|  |" << bob[1]  << "|" <<  std::endl;
72 | 	
73 | 	return false;
74 | }
75 | 
76 | }
77 | 


--------------------------------------------------------------------------------
/src/stringutilfile.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
 3 | Written by James Newling <james.newling@gmail.com>
 4 | All rights reserved.
 5 | 
 6 | eakmeans is a library for exact and approximate k-means written in C++ and
 7 | Python. This file is part of eakmeans. See file COPYING for more details.
 8 | 
 9 | This file is part of eakmeans.
10 | 
11 | eakmeans is free software: you can redistribute it and/or modify
12 | it under the terms of the 3-Clause BSD Licence. See
13 | https://opensource.org/licenses/BSD-3-Clause for more details.
14 | 
15 | eakmeans is distributed in the hope that it will be useful,
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
18 | COPYING for more details.
19 | */
20 | 
21 | #ifndef STRINGUTILFILE_H
22 | #define STRINGUTILFILE_H
23 | 
24 | #include <string>
namespace stringutilfile{

//Returns true if the first line of the file consists of exactly two whitespace separated integers.
//Throws std::runtime_error if the file cannot be opened.
bool file_has_2int_header(const std::string & filename);

}
36 | #endif
37 | 


--------------------------------------------------------------------------------
/src/txtdatasets.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright (c) 2015-2018 Idiap Research Institute, http://www.idiap.ch/
  3 | Written by James Newling <james.newling@gmail.com>
  4 | All rights reserved.
  5 | 
  6 | eakmeans is a library for exact and approximate k-means written in C++ and
  7 | Python. This file is part of eakmeans. See file COPYING for more details.
  8 | 
  9 | This file is part of eakmeans.
 10 | 
 11 | eakmeans is free software: you can redistribute it and/or modify
 12 | it under the terms of the 3-Clause BSD Licence. See
 13 | https://opensource.org/licenses/BSD-3-Clause for more details.
 14 | 
 15 | eakmeans is distributed in the hope that it will be useful,
 16 | but WITHOUT ANY WARRANTY; without even the implied warranty of
 17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See file
 18 | COPYING for more details.
 19 | */
 20 | 
 21 | #ifndef TXTDATASETS_H
 22 | #define TXTDATASETS_H
 23 | 
 24 | namespace datasets{
 25 | 
 26 | static std::string sparse_data_dir("/idiap/temp/jnewling/data/sparsedata/rcv1rcv/");
 27 | 
 28 | 
 29 | class TxtDataset{
 30 | 	public:
 31 | 		std::string name;
 32 | 		unsigned nd;
 33 | 		unsigned dim;
 34 | 		std::string datapath_dim;
 35 | 		std::string datapath_dimless;
 36 | 		TxtDataset(){}; //why do I need this?
 37 | 		TxtDataset(const std::string & name, unsigned nd, unsigned dim):name(name), nd(nd), dim(dim){
 38 | 		 datapath_dim = "/idiap/temp/jnewling/data/txtdata/normalised/" + name + "_" + std::to_string(nd) + "_" + std::to_string(dim) + "_cnormed.txt";
 39 | 		 datapath_dimless = "/idiap/temp/jnewling/data/txtdata/normalised/" + name + "_" + std::to_string(nd) + "_" + std::to_string(dim) + "_cnormed_dimless.txt";
 40 | 		}
 41 | };
 42 | 
 43 | class TrainTestDataset{ //TODO replace TrainTestDataset everywhere with TrainTestDataset. 
 44 | 	public:
 45 | 	
 46 | 		std::string name;
 47 | 				
 48 | 		std::string datapath_train_dim;
 49 | 		std::string datapath_train_dimless;
 50 | 		std::string datapath_test_dim;
 51 | 		std::string datapath_test_dimless;
 52 | 		
 53 | 		TrainTestDataset(){};
 54 | 		
 55 | 		//, nd(nd), dim(dim) , unsigned nd, unsigned dim
 56 | 		//, const std::string & datapath_dim, const std::string & datapath_dimless
 57 | 		//  unsigned nd;
 58 | 		//  unsigned dim;
 59 | 
 60 | 		TrainTestDataset(const std::string & name, 
 61 | 		std::string rootdir = "/idiap/temp/jnewling/data/sparsedata/trainandtest/"
 62 | 		): name(name){
 63 | 			datapath_train_dim = rootdir + name + "_train_withdims.txt";
 64 | 			datapath_train_dimless = rootdir + name + "_train_dimless.txt";
 65 |  			datapath_test_dim = rootdir + name + "_test_withdims.txt";
 66 | 			datapath_test_dimless = rootdir + name + "_test_dimless.txt";
 67 | 		}	
 68 | };
 69 | 
 70 | std::vector<TrainTestDataset> sparse_datasets{
 71 | 	
 72 | 	{"truercv"},
 73 | 	{"truercvos"},
 74 | 	{"rcv"}, //558700 , 0, "None", "/idiap/temp/jnewling/data/sparsedata/rcv1rcv/all_shuffled.txt"},
 75 | 	{"nips"}, //1500 , 0, "None", "/idiap/temp/jnewling/data/sparsedata/bagofwords/nips.txt"},
 76 | 	{"nytimes"},// 299751 , 102661, "None", "/idiap/temp/jnewling/data/sparsedata/bagofwords/nytimes.txt"}
 77 | 	{"randdim5"},
 78 | 	{"randdim6"},
 79 | 	{"rand4", "/idiap/temp/jnewling/data/densedata/trainandtest/"},
 80 | 	{"infimnist", "/idiap/temp/jnewling/data/densedata/trainandtest/"},
 81 | 	{"infimnist28by28", "/idiap/temp/jnewling/data/densedata/trainandtest/"},
 82 | 	{"covtype", "/idiap/temp/jnewling/data/densedata/trainandtest/"},	
 83 | 	{"kddcup98", "/idiap/temp/jnewling/data/densedata/trainandtest/"},
 84 | 	{"stl10", "/idiap/temp/jnewling/data/densedata/trainandtest/"}				
 85 | };
 86 | 
 87 | std::vector<TxtDataset> txt_datasets {
 88 | 	
 89 | 	{"tsn", 200000, 4},
 90 | 	{"conflongdemo", 164860, 3},
 91 | 	{"skinseg", 200000, 4},
 92 | 	{"wcomp", 165630, 15}, 
 93 | 	{"kegg", 65550, 28},
 94 | 	
 95 | 	{"miniboone", 130060, 50},
 96 | 	{"covtype", 581012, 55},
 97 | 	{"gassensor", 13910, 128},
 98 | 	{"uscensus", 2458285, 68}, 
 99 | 	{"colormoments", 68040,9},
100 | 	
101 | 	{"ldfpads", 164850, 3},
102 | 	{"kddcup98", 95000, 310},
103 | 	{"kddcup04bio", 145750, 74},
104 | 	{"egeod", 5580, 31099},
105 | 	{"mnist50", 60000, 50},
106 | 
107 | 	{"house16H", 22780, 17},
108 | 	{"mv", 40760, 11},
109 | 	{"europe", 169300, 2},
110 | 	{"birch3", 100000, 2},
111 | 	{"mnist", 60000, 784},
112 | 
113 | 	{"stl10", 1000000, 108},
114 | 	{"random", 1000000, 30},
115 | 	{"random2", 1000000, 2},
116 | 	{"small", 1000, 2},
117 | 
118 | };
119 | 
120 | 
121 | 
122 | }
123 | 
124 | 
125 | #endif
126 | 


--------------------------------------------------------------------------------