├── .gitmodules ├── LICENSE ├── Makefile ├── README.md ├── fftw-3.3.7.tar.gz ├── include ├── frp │ ├── compact.h │ ├── coresets.h │ ├── dci.h │ ├── dist.h │ ├── fhtgpu.h │ ├── frp.h │ ├── gpu.h │ ├── graph.h │ ├── ifc.h │ ├── jl.h │ ├── kama.h │ ├── kernel.h │ ├── linalg.h │ ├── lsh.h │ ├── mach.h │ ├── mm.h │ ├── parser.h │ ├── rand.h │ ├── sample.h │ ├── sdq.h │ ├── spectral.h │ ├── spinner.h │ ├── stackstruct.h │ └── util.h └── thirdparty │ └── fast_mutex.h ├── lib ├── fht_kernel0.cu ├── fht_kernel1.cu ├── fht_kernel2.cu ├── fht_kernel3.cu ├── fht_kernel4.cu ├── fht_kernel5.cu ├── fht_kernel6.cu ├── fht_kernel7.cu ├── fht_kernel8.cu └── fht_kernel9.cu ├── py ├── Makefile ├── frp.cpp ├── jl.py └── setup.py ├── scripts ├── autogen.py ├── ratio_err.py └── time_exp.py ├── src ├── aestest.cpp ├── dcitest.cpp ├── fhtest.cpp ├── graphtest.cpp ├── kernel_test.cpp ├── kernel_time.cpp ├── kstest.cpp ├── lshtest.cpp ├── mtest.cpp ├── multest.cpp ├── ojlt.cpp ├── parser.cpp ├── pcatest.cpp └── test_gs.cpp └── test └── testfht.cpp /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "FFHT"] 2 | path = FFHT 3 | url = https://github.com/FALCONN-LIB/FFHT 4 | branch = master 5 | [submodule "fastrange"] 6 | path = fastrange 7 | url = https://github.com/lemire/fastrange 8 | branch = master 9 | [submodule "blaze"] 10 | path = blaze 11 | url = https://bitbucket.org/blaze-lib/blaze.git 12 | branch = master 13 | [submodule "kspp"] 14 | path = kspp 15 | url = https://github.com/dnbaker/kspp 16 | branch = master 17 | [submodule "pybind11"] 18 | path = pybind11 19 | url = https://github.com/pybind/pybind11 20 | [submodule "sleef"] 21 | path = sleef 22 | url = https://github.com/shibatch/sleef 23 | [submodule "math"] 24 | path = boost/math 25 | url = https://github.com/boostorg/math 26 | [submodule "random"] 27 | path = boost/random 28 | url = https://github.com/boostorg/random 29 | [submodule "config"] 30 | path = boost/config 31 | url = https://github.com/boostorg/config 32 | [submodule "utility"] 33 | path = boost/utility 34 | url = https://github.com/boostorg/utility 35 | [submodule "assert"] 36 | path = boost/assert 37 | url = https://github.com/boostorg/assert 38 | [submodule "static_assert"] 39 | path = boost/static_assert 40 | url = https://github.com/boostorg/static_assert 41 | [submodule "integer"] 42 | path = boost/integer 43 | url = https://github.com/boostorg/integer 44 | [submodule "type_traits"] 45 | path = boost/type_traits 46 | url = https://github.com/boostorg/type_traits 47 | [submodule "mpl"] 48 | path = boost/mpl 49 | url = https://github.com/boostorg/mpl 50 | [submodule "core"] 51 | path = boost/core 52 | url = https://github.com/boostorg/core 53 | [submodule "preprocessor"] 54 | path = boost/preprocessor 55 | url = https://github.com/boostorg/preprocessor 56 | [submodule "exception"] 57 | path = boost/exception 58 | url = https://github.com/boostorg/exception 59 | [submodule "throw_exception"] 60 | path = boost/throw_exception 61 | url = https://github.com/boostorg/throw_exception 62 | [submodule "range"] 63 | path = boost/range 64 | url = https://github.com/boostorg/range 65 | [submodule "iterator"] 66 | path = boost/iterator 67 | url = https://github.com/boostorg/iterator 68 | [submodule "io"] 69 | path = boost/io 70 | url = https://github.com/boostorg/io 71 | [submodule "predef"] 72 | path = boost/predef 73 | url = https://github.com/boostorg/predef 74 | [submodule "concept_check"] 75 | path = boost/concept_check 76 | url = 
https://github.com/boostorg/concept_check 77 | [submodule "detail"] 78 | path = boost/detail 79 | url = https://github.com/boostorg/detail 80 | [submodule "lexical_cast"] 81 | path = boost/lexical_cast 82 | url = https://github.com/boostorg/lexical_cast 83 | [submodule "numeric_conversion"] 84 | path = boost/numeric_conversion 85 | url = https://github.com/boostorg/numeric_conversion 86 | [submodule "functional"] 87 | path = boost/functional 88 | url = https://github.com/boostorg/functional 89 | [submodule "array"] 90 | path = boost/array 91 | url = https://github.com/boostorg/array 92 | [submodule "container"] 93 | path = boost/container 94 | url = https://github.com/boostorg/container 95 | [submodule "move"] 96 | path = boost/move 97 | url = https://github.com/boostorg/move 98 | [submodule "thread"] 99 | path = boost/thread 100 | url = https://github.com/boostorg/thread 101 | [submodule "smart_ptr"] 102 | path = boost/smart_ptr 103 | url = https://github.com/boostorg/smart_ptr 104 | [submodule "vec"] 105 | path = vec 106 | url = https://github.com/dnbaker/vec 107 | [submodule "klib"] 108 | path = klib 109 | url = https://github.com/attractivechaos/klib 110 | [submodule "distmat"] 111 | path = distmat 112 | url = https://github.com/dnbaker/distmat 113 | [submodule "aesctr"] 114 | path = aesctr 115 | url = https://github.com/dnbaker/aesctr 116 | [submodule "flat_hash_map"] 117 | path = flat_hash_map 118 | url = https://github.com/skarupke/flat_hash_map 119 | [submodule "boost/multiprecision"] 120 | path = boost/multiprecision 121 | url = https://github.com/boostorg/multiprecision 122 | [submodule "sketch"] 123 | path = sketch 124 | url = https://github.com/dnbaker/sketch 125 | [submodule "clhash"] 126 | path = clhash 127 | url = https://github.com/lemire/clhash 128 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY=all tests clean obj python 2 | ifndef CXX 3 | CXX=g++ 4 | endif 5 | ifndef CC 6 | CC=gcc 7 | endif 8 | ifndef STD 9 | STD=c++17 10 | endif 11 | WARNINGS=-Wall -Wextra -Wno-char-subscripts \ 12 | -Wpointer-arith -Wwrite-strings -Wdisabled-optimization \ 13 | -Wformat -Wcast-align -Wno-unused-function -Wunused-variable -Wno-ignored-qualifiers -Wsuggest-attribute=const \ 14 | # -Wconversion -Werror -Wno-float-conversion 15 | DBG:= # -DNDEBUG 16 | OFLAG?=-O3 17 | OPT:= $(OFLAG) -funroll-loops -pipe -fno-strict-aliasing -march=native -fopenmp -DUSE_FASTRANGE \ 18 | -funsafe-math-optimizations -ftree-vectorize \ 19 | -DBOOST_NO_RTTI 20 | OS:=$(shell uname) 21 | 22 | EXTRA= 23 | BLAS_LINKING_FLAGS?= 24 | OPT:=$(OPT) $(FLAGS) 25 | XXFLAGS=-fno-rtti 26 | CBLASFILE?=cblas.h 27 | BLAZEFLAGS= -DBLAZE_BLAS_MODE=1 \ 28 | -DBLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION=1 -DBLAZE_BLAS_INCLUDE_FILE='"$(CBLASFILE)"' \ 29 | $(BLAS_LINKING_FLAGS) 30 | CXXFLAGS=$(OPT) $(XXFLAGS) -std=$(STD) $(WARNINGS) -DRADEM_LUT $(EXTRA) $(BLAZEFLAGS) 31 | CCFLAGS=$(OPT) -std=c11 $(WARNINGS) 32 | LIB=-lz -pthread -lfftw3 -lfftw3l -lfftw3f -lstdc++fs -lsleef -llapack 33 | LD=-L. 
-Lfftw-3.3.7/lib -Lvec/sleef/build/lib 34 | 35 | OBJS=$(patsubst %.cpp,%.o,$(wildcard lib/*.cpp)) clhash/clhash.o 36 | TEST_OBJS=$(patsubst %.cpp,%.o,$(wildcard test/*.cpp)) 37 | EXEC_OBJS=$(patsubst %.cpp,%.o,$(wildcard src/*.cpp)) $(patsubst %.cpp,%.fo,$(wildcard src/*.cpp)) 38 | 39 | EX=$(patsubst src/%.fo,%f,$(EXEC_OBJS)) $(patsubst src/%.o,%,$(EXEC_OBJS)) 40 | BOOST_DIRS=math config random utility assert static_assert \ 41 | integer type_traits mpl core preprocessor exception throw_exception \ 42 | range iterator io predef concept_check detail lexical_cast \ 43 | numeric_conversion functional array container move thread smart_ptr 44 | 45 | SAN=-fsanitize=address -fsanitize=undefined 46 | 47 | BOOST_INCS=$(patsubst %,-Iboost/%/include,$(BOOST_DIRS)) 48 | 49 | 50 | # If compiling with C++ < 17 and your compiler does not provide 51 | # Bessel functions with C++14, you must compile against boost. 52 | 53 | INCLUDE=-I. -Iinclude -Ivec/blaze -Ithirdparty -Irandom/include/\ 54 | -Ifftw-3.3.7/include -I vec/sleef/build/include/ $(BOOST_INCS) \ 55 | -I/usr/local/Cellar/zlib/1.2.11/include -Ifastrange -Idistmat -Iaesctr \ 56 | -Iinclude/frp -Iclhash/include 57 | 58 | ifdef BOOST_INCLUDE_PATH 59 | INCLUDE += -I$(BOOST_INCLUDE_PATH) 60 | endif 61 | 62 | OBJS:=$(OBJS) vec/sleef/build/include/sleef.h fht.o FFHT/fast_copy.o 63 | 64 | all: $(OBJS) $(EX) python 65 | print-% : ; @echo $* = $($*) 66 | 67 | obj: $(OBJS) $(EXEC_OBJS) 68 | 69 | HEADERS=$(wildcard include/frp/*.h) 70 | 71 | fht.o: FFHT/fht.c 72 | cd FFHT && make fht.o && cp fht.o .. 73 | 74 | 75 | 76 | test/%.o: test/%.cpp $(OBJS) 77 | $(CXX) $(CXXFLAGS) $(INCLUDE) $(LD) $(OBJS) -c $< -o $@ $(LIB) 78 | 79 | %.fo: %.cpp $(OBJS) 80 | $(CXX) $(CXXFLAGS) -DFLOAT_TYPE=float $(DBG) $(INCLUDE) $(LD) -c $< -o $@ $(LIB) 81 | 82 | %.o: %.cpp $(OBJS) 83 | $(CXX) $(CXXFLAGS) -DFLOAT_TYPE=double $(DBG) $(INCLUDE) $(LD) -c $< -o $@ $(LIB) 84 | 85 | %: src/%.cpp $(OBJS) fftw3.h $(HEADERS) 86 | $(CXX) $(CXXFLAGS) -DFLOAT_TYPE=double $(DBG) $(INCLUDE) $(LD) $(OBJS) $< -o $@ $(LIB) 87 | pcatest: src/pcatest.cpp $(OBJS) $(HEADERS) 88 | $(CXX) $(CXXFLAGS) -DFLOAT_TYPE=double $(DBG) $(INCLUDE) $(LD) $(OBJS) $< -o $@ $(LIB) 89 | dcitest: src/dcitest.cpp $(OBJS) $(HEADERS) 90 | $(CXX) $(CXXFLAGS) -DFLOAT_TYPE=double $(DBG) $(INCLUDE) $(LD) $(OBJS) $< -o $@ -lz -pthread -fopenmp -llapack -DTIME_ADDITIONS #$(SAN) 91 | dcitestf: src/dcitest.cpp $(OBJS) $(HEADERS) 92 | $(CXX) $(CXXFLAGS) -DFLOAT_TYPE=float $(DBG) $(INCLUDE) $(LD) $(OBJS) $< -o $@ -lz -pthread -fopenmp -llapack -DTIME_ADDITIONS #$(SAN) 93 | 94 | %f: src/%.cpp $(OBJS) fftw3.h 95 | $(CXX) $(CXXFLAGS) -DFLOAT_TYPE=float $(DBG) $(INCLUDE) $(LD) $(OBJS) $< -o $@ $(LIB) 96 | 97 | %.o: %.c 98 | $(CC) $(CCFLAGS) -Wno-sign-compare $(DBG) $(INCLUDE) $(LD) -c $< -o $@ $(LIB) 99 | 100 | %.o: FFHT/%.c $(OBJS) fftw3.h 101 | +cd FFHT && make $@ && cp $@ .. && cd .. 102 | 103 | clhash/clhash.o: 104 | cd clhash && make && cd .. 105 | 106 | fftw-3.3.7: fftw-3.3.7.tar.gz 107 | tar -zxvf fftw-3.3.7.tar.gz 108 | 109 | fftw-3.3.7.exist: fftw-3.3.7.tar.gz 110 | tar -zxvf fftw-3.3.7.tar.gz && touch fftw-3.3.7.exist 111 | 112 | PLATFORM_CONF_STR?=--enable-avx2 113 | 114 | fftw3.h: fftw-3.3.7/lib/libfftw3l.a fftw-3.3.7/lib/libfftw3.a fftw-3.3.7/lib/libfftw3f.a 115 | cp fftw-3.3.7/api/fftw3.h .
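# The variables above (OFLAG, PLATFORM_CONF_STR, CBLASFILE, ...) may be overridden
# on the command line; hypothetical example invocations against the targets
# defined in this Makefile:
#   make ojlt OFLAG=-O2
#   make fftw3.h PLATFORM_CONF_STR="--enable-avx512"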
116 | 117 | python: 118 | cd py && make 119 | 120 | fftw-3.3.7/lib/libfftw3.a: fftw-3.3.7.exist 121 | +cd fftw-3.3.7 &&\ 122 | ./configure $(PLATFORM_CONF_STR) --prefix=$$PWD && make && make install 123 | fftw-3.3.7/lib/libfftw3f.a: fftw-3.3.7.exist fftw-3.3.7/lib/libfftw3.a 124 | +cd fftw-3.3.7 &&\ 125 | ./configure $(PLATFORM_CONF_STR) --prefix=$$PWD --enable-single && make && make install 126 | fftw-3.3.7/lib/libfftw3l.a: fftw-3.3.7.exist fftw-3.3.7/lib/libfftw3f.a 127 | +cd fftw-3.3.7 &&\ 128 | ./configure --prefix=$$PWD --enable-long-double && make && make install && cp api/fftw3.h .. 129 | 130 | 131 | tests: clean unit 132 | 133 | unit: $(OBJS) $(TEST_OBJS) 134 | $(CXX) $(CXXFLAGS) $(INCLUDE) $(TEST_OBJS) $(LD) $(OBJS) -o $@ $(LIB) 135 | 136 | vec/sleef/build: vec/sleef 137 | mkdir -p vec/sleef/build 138 | 139 | vec/sleef/build/include/sleef.h: vec/sleef/build 140 | cd $< && cmake .. && make && cd ../.. 141 | 142 | sleef.h:vec/sleef/build/include/sleef.h 143 | cp vec/sleef/build/include/sleef.h sleef.h 144 | 145 | clean: 146 | +rm -f $(EXEC_OBJS) $(OBJS) $(EX) $(TEST_OBJS) fftw3.h unit lib/*o frp/src/*o && cd FFHT && make clean && cd .. 147 | 148 | mostlyclean: clean 149 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # frp: Fast Randomized Projections 2 | We use [Blaze](https://bitbucket.org/blaze-lib) for fast linear algebra, [Sleef](https://github.com/shibatch/sleef) for fast trigonometric operations, 3 | [Fast Fast-Hadamard Transform](https://github.com/dnbaker/FFHT) from FALCONN-LIB for the Fast Hadamard Transform, [FFTW3](http://fftw.org/) for the FFT, and [boost](https://github.com/boostorg) for 4 | special functions and random number generators. Only the required boost headers are provided as submodules, so no installation of boost is required. 5 | 6 | ## Contents 7 | 0. Orthogonal JL transform with linear space and linearithmic runtime 8 | 1. This is available through the `ojlt` executable, in C++ programs via include/frp/jl.h, and through Python bindings (see the usage sketch after this list). 9 | 2. Python bindings can be installed with `cd py && python3 setup.py install`. 10 | 1. Kernel projections 11 | 1. We support kernel approximation for the Gaussian kernel using Random Fourier Features, Orthogonal Random Features, Structured Orthogonal Random Features, and FastFood. 12 | 2. We recommend Structured Orthogonal Random Features: it has the highest accuracy in our experiments and can be hundreds of times faster while keeping a small memory footprint. 13 | 2. A type-generic SIMD interface (vec/vec.h), which abstracts operations so the compiler can use the widest vectors available, generically dispatching the fastest implementation for a given machine. 14 | 3. Utilities 15 | 1. PRNVector (PseudoRandom Number Vector), which provides access to random vectors with only constant memory requirements by generating values as needed instead of storing them explicitly. 16 | 2. Utilities for sampling and filling containers from distributions. 17 | 3. Acquiring cache sizes from the OS. 18 | 4. Linear algebra methods 19 | 1. Implementation of the Gram-Schmidt algorithm for orthogonalizing matrices. 20 | 2. PCA using full eigendecomposition for symmetric matrices. 21 | 3. Covariance matrix calculation. 22 | 5. Miscellaneous, related work 23 | 1. Dynamic Continuous Indexing for real-valued data 24 | 1. [Dynamic Continuous Indexing](https://arxiv.org/abs/1512.00442) 25 | 1. Tested 26 | 2. [Prioritized DCI](https://arxiv.org/abs/1703.00440) 27 | 2. Draft form.
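As a usage illustration, here is a minimal sketch of the orthogonal JL transform in C++. Names follow include/frp/jl.h (constructor `(from, to, seed)` and `transform(in, out)`); the exact template parameters are elided in this listing, so treat the bare `frp::OJLT` spelling as an assumption, and note the Makefile builds with `-DFLOAT_TYPE=double` by default:

```cpp
#include "blaze/Math.h"
#include "frp/jl.h"

int main() {
    blaze::DynamicVector<double> in(1024, 1.0), out(64);
    // Reduce 1024 dimensions (a power of two) to 64; the seed makes the
    // transform reproducible across runs.
    frp::OJLT jlt(/*from=*/1024, /*to=*/64, /*seed=*/137);
    jlt.transform(in, out); // out now holds the 64-dimensional embedding
}
```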
28 | 29 | ### Build instructions 30 | 31 | `make` should compile a variety of tests. 32 | We assume you're using BLAS for your linear algebra; to avoid that, modify the Makefile and remove the `-DBLAZE*` flags. 33 | 34 | To specify a different BLAS header file, use the CBLASFILE variable when compiling: 35 | ```bash 36 | make ojlt CBLASFILE=mkl_cblas.h 37 | # Or, use an environment variable 38 | export CBLASFILE=mkl_cblas.h && \ 39 | make ojlt 40 | ``` 41 | 42 | 43 | 44 | ## Commentary 45 | 46 | The initial design of this library was to implement methods from [https://arxiv.org/abs/1703.00864](https://arxiv.org/abs/1703.00864). The core transforms on which it is built are structured matrix-vector products accelerated by the Fast Hadamard Transform. These have applications in memory-efficient, accelerated Johnson-Lindenstrauss transforms, Gaussian kernel approximation for linearizing datasets, and FastFood/Adaptive Random Spinners. 47 | 48 | ## DCI/Prioritized DCI usage 49 | 50 | Notes: 51 | 52 | During construction, it may be advantageous to use a std::set to maintain sorted indexes (logarithmic update time), whereas at query time it's faster to use a contiguous array. 53 | We provide the cvt function, which copies the index but converts the sorted index type from what it used to be (usually a red-black tree) into the destination type, 54 | by default an always-sorted array. 55 | 56 | We suggest doing this for faster construction and faster queries. 57 | 58 | Additionally, we do not store any points, just references to them. 59 | 60 | When using a non-default container which supports lower_bound functionality, one needs both to use `std::less` as the comparator and to specialize the `has_lower_bound_mf` struct. 61 | -------------------------------------------------------------------------------- /fftw-3.3.7.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dnbaker/frp/394b427b60221a5dc215a90c58f9fb922b0c4737/fftw-3.3.7.tar.gz -------------------------------------------------------------------------------- /include/frp/compact.h: -------------------------------------------------------------------------------- 1 | #ifndef _GFRP_CRAD_H__ 2 | #define _GFRP_CRAD_H__ 3 | #include "frp/util.h" 4 | #include "frp/linalg.h" 5 | #include "frp/dist.h" 6 | #include "fastrange/fastrange.h" 7 | #include <ctime> 8 | 9 | namespace frp { 10 | 11 | /* 12 | // From https://arxiv.org/pdf/1702.08159.pdf 13 | 14 | FHT!!! 15 | https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Pseudocode 16 | http://fourier.eng.hmc.edu/e161/lectures/wht/node4.html 17 | Sliding windows http://www.ee.cuhk.edu.hk/~wlouyang/FWHT.htm 18 | 19 | Presentation http://c.csie.org/~itct/slide/DCT_larry.pdf 20 | 21 | Using renormalization makes the transform orthogonal, which is GOOD. 22 | Some authors further multiply the X0 term by 1/√2 and scale the resulting matrix by an overall factor of √(2/N) (with the corresponding change in DCT-III); this makes the DCT-II matrix orthogonal, but breaks the direct correspondence with a real-even DFT of half-shifted input.
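For reference, the unnormalized FWHT is just log2(n) in-place butterfly passes; a sketch
(this matches the butterflies in fht_kernel in fhtgpu.h, not the tuned FFHT code used here):
    for(step = 1; step < n; step <<= 1)
        for(i = 0; i < n; i += 2 * step)
            for(j = i; j < i + step; ++j)
                u = a[j], v = a[j + step], a[j] = u + v, a[j + step] = u - v;
Scaling by 1/sqrt(2) per pass (1/sqrt(n) overall) gives the orthonormal version discussed above.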
23 | 24 | 25 | Fast DCT https://unix4lyfe.org/dct-1d/ 26 | http://ieeexplore.ieee.org/document/558495/ 27 | 28 | 29 | 30 | F2F 31 | proceeds with vectorized sums and subtractions iteratively for the first n/2^k 32 | positions (where n is the length of the input vector and k is the iteration starting from 1) 33 | computing the intermediate operations of the Cooley-Tukey algorithm till a small Hadamard 34 | routine that fits in cache. Then the algorithm continues in the same way but starting from 35 | the smallest length and doubling on each iteration the input dimension until the whole 36 | FWHT is done in-place. 37 | */ 38 | 39 | #if __GNUC__ || __clang__ 40 | constexpr INLINE unsigned clz(unsigned long long x) { 41 | return __builtin_clzll(x); 42 | } 43 | constexpr INLINE unsigned clz(unsigned long x) { 44 | return __builtin_clzl(x); 45 | } 46 | constexpr INLINE unsigned clz(unsigned x) { 47 | return __builtin_clz(x); 48 | } 49 | #else 50 | 51 | #define clztbl(x, arg) do {\ 52 | switch(arg) {\ 53 | case 0: x += 4; break;\ 54 | case 1: x += 3; break;\ 55 | case 2: case 3: x += 2; break;\ 56 | case 4: case 5: case 6: case 7: x += 1; break;\ 57 | }} while(0) 58 | 59 | constexpr INLINE int clz_manual( uint32_t x ) 60 | { 61 | int n(0); 62 | if ((x & 0xFFFF0000) == 0) {n = 16; x <<= 16;} 63 | if ((x & 0xFF000000) == 0) {n += 8; x <<= 8;} 64 | if ((x & 0xF0000000) == 0) {n += 4; x <<= 4;} 65 | clztbl(n, x >> (32 - 4)); 66 | return n; 67 | } 68 | 69 | // Overload 70 | constexpr INLINE int clz_manual( uint64_t x ) 71 | { 72 | int n(0); 73 | if ((x & 0xFFFFFFFF00000000ull) == 0) {n = 32; x <<= 32;} 74 | if ((x & 0xFFFF000000000000ull) == 0) {n += 16; x <<= 16;} 75 | if ((x & 0xFF00000000000000ull) == 0) {n += 8; x <<= 8;} 76 | if ((x & 0xF000000000000000ull) == 0) {n += 4; x <<= 4;} 77 | clztbl(n, x >> (64 - 4)); 78 | return n; 79 | } 80 | #define clz(x) clz_manual(x) 81 | #endif 82 | 83 | template 84 | static constexpr INLINE unsigned ilog2(T x) noexcept { 85 | return sizeof(T) * CHAR_BIT - clz(x) - 1; 86 | } 87 | static constexpr unsigned log2_64(uint64_t x) {return ilog2(x);} 88 | 89 | class PRNRademacher { 90 | size_t n_; 91 | uint64_t seed_; 92 | public: 93 | using size_type = std::size_t; 94 | PRNRademacher(size_t n=0, uint64_t seed=0): n_(n), seed_(seed) {} 95 | auto size() const {return n_;} 96 | void resize(size_t newsize) {n_ = newsize;} 97 | void seed(uint64_t seed) {seed_ = seed;} 98 | 99 | template 100 | void apply(Container &c) const { 101 | wy::WyHash gen(seed_); 102 | uint64_t val(gen()); 103 | for(size_t i(0), e(c.size()); i < e; ++i) { 104 | if(unlikely((i & ((CHAR_BIT * sizeof(uint64_t)) - 1)) == 0)) 105 | val = gen(); 106 | c[i] *= val & 1 ? -1.: 1.; 107 | val >>= 1; 108 | } 109 | } 110 | 111 | template 112 | void apply(ArithType *c, size_t nitems=0) { 113 | wy::WyHash gen(seed_); 114 | uint64_t val; 115 | if(nitems == 0) nitems = n_; 116 | for(size_t i(0); i < nitems; ++i) { 117 | if(unlikely((i & ((CHAR_BIT * sizeof(uint64_t)) - 1)) == 0)) 118 | val = gen(); 119 | c[i] *= val & 1 ? 
-1.: 1.; 120 | val >>= 1; 121 | } 122 | } 123 | }; 124 | 125 | template<typename FT> 126 | class CachedRademacher { 127 | protected: 128 | size_t n_; 129 | uint64_t seed_; 130 | blaze::DynamicVector<FT> vec_; 131 | public: 132 | using size_type = std::size_t; 133 | CachedRademacher(size_t n, uint64_t seed=0): n_(n), seed_(seed), vec_(n) { 134 | this->seed(seed); 135 | } 136 | void resize(size_t newsz) { 137 | if(newsz < n_) { 138 | vec_.resize(newsz); 139 | n_ = newsz; 140 | return; 141 | } 142 | if(newsz > n_) { 143 | vec_.resize(newsz); 144 | n_ = newsz; 145 | seed(seed_); 146 | } 147 | } 148 | void seed(uint64_t seed) { 149 | wy::WyHash gen(seed); 150 | unsigned t = 64; 151 | auto v = gen(); 152 | for(size_t i = 0, e = n_; i < e; ++i) { 153 | vec_[i] = v & 1 ? -1.: 1.; 154 | v >>= 1; // Consume one bit per element. 155 | if(--t == 0) { 156 | v = gen(); 157 | t = 64; 158 | } 159 | } 160 | } 161 | auto size() const {return n_;} 162 | template<typename VT, bool TF> 163 | void apply(blaze::Vector<VT, TF> &c) const { 164 | ~c *= vec_; 165 | } 166 | void apply(FT *c) const { 167 | blaze::CustomVector<FT, blaze::unaligned, blaze::unpadded> cv(c, vec_.size()); 168 | apply(cv); 169 | } 170 | }; 171 | 172 | template<typename T=uint64_t> 173 | class CompactRademacherTemplate { 174 | T seed_; 175 | std::vector<T> data_; 176 | using FloatType = FLOAT_TYPE; 177 | 178 | static constexpr size_t NBITS = sizeof(T) * CHAR_BIT; 179 | static constexpr size_t SHIFT = log2_64(NBITS); 180 | static constexpr size_t BITMASK = NBITS - 1; 181 | 182 | public: 183 | using value_type = FloatType; 184 | using container_type = T; 185 | using size_type = size_t; 186 | // Constructors 187 | CompactRademacherTemplate(size_t n=0, uint64_t seed=std::time(nullptr)): seed_(seed), data_(n >> SHIFT) { 188 | if(n & (BITMASK)) 189 | std::fprintf(stderr, "Warning: n is not evenly divisible by the word size. (n: %zu). (bitmask: %zu)\n", n, BITMASK); 190 | randomize(seed_); 191 | } 192 | CompactRademacherTemplate(CompactRademacherTemplate &&other) = default; 193 | CompactRademacherTemplate(const CompactRademacherTemplate &other) = default; 194 | CompactRademacherTemplate &operator=(const CompactRademacherTemplate &o) = default; 195 | CompactRademacherTemplate &operator=(CompactRademacherTemplate &&o) = default; 196 | template<typename AsType> 197 | class CompactAs { 198 | static constexpr AsType values[2] {static_cast<AsType>(1), static_cast<AsType>(-1)}; 199 | const CompactRademacherTemplate &ref_; 200 | public: 201 | CompactAs(const CompactRademacherTemplate &ref): ref_(ref) {} 202 | AsType operator[](size_t index) const {return values[ref_.bool_idx(index)];} 203 | }; 204 | template<typename AsType> 205 | CompactAs<AsType> as_type() const { 206 | return CompactAs<AsType>(*this); 207 | } 208 | void seed(T seed) {seed_ = seed;} 209 | void resize(T new_size) { 210 | if(new_size != size()) { 211 | data_.resize(std::max(static_cast<T>(1), new_size >> SHIFT)); 212 | randomize(seed_); 213 | } 214 | } 215 | // For setting to random values 216 | auto *data() {return data_.data();} 217 | const auto *data() const {return data_.data();} 218 | // For use 219 | auto size() const {return data_.size() << SHIFT;} 220 | auto capacity() const {return data_.capacity() << SHIFT;} 221 | auto nwords() const {return data_.size();} 222 | auto nbytes() const {return size();} 223 | bool operator==(const CompactRademacherTemplate &other) const { 224 | if(size() != other.size()) return false; 225 | auto odata = other.data(); 226 | for(size_t i(0);i < data_.size(); ++i) 227 | if(data_[i] != odata[i]) 228 | return false; 229 | return true; 230 | } 231 | 232 | void randomize(uint64_t seed) { 233 | random_fill(reinterpret_cast<uint64_t *>(data_.data()), data_.size() * sizeof(T) / sizeof(uint64_t), seed); 234 | } 235 | void zero() {memset(data_.data(), 0, sizeof(T) * data_.size());} 236 | void reserve(size_t newsize) { 237 | data_.reserve(newsize >> SHIFT); 238 | } 239 | // Each word of data_ packs NBITS Rademacher signs; a set bit maps to +1, a clear bit to -1. 240 | INLINE int bool_idx(size_type idx) const {return !(data_[(idx >> SHIFT)] & (static_cast<T>(1) << (idx & BITMASK)));} 241 | 242 | FloatType operator[](size_type idx) const {return bool_idx(idx) ? FloatType(-1.): FloatType(1.);} 243 | template<typename InVector, typename OutVector> 244 | void apply(const InVector &in, OutVector &out) const { 245 | static_assert(is_same<decay_t<decltype(in[0])>, FloatType>::value, "Input vector should be the same type as this structure."); 246 | static_assert(is_same<decay_t<decltype(out[0])>, FloatType>::value, "Output vector should be the same type as this structure."); 247 | out = in; 248 | apply(out); 249 | } 250 | template<typename FloatType2> 251 | void apply(FloatType2 *vec) const { 252 | auto tmp(as_type<FloatType2>()); 253 | for(T i = 0; i < size(); ++i) vec[i] *= tmp[i]; 254 | } 255 | template<typename VectorType> 256 | void apply(VectorType &vec) const { 257 | //std::fprintf(stderr, "Applying %s vector of size %zu.\n", __PRETTY_FUNCTION__, vec.size()); 258 | auto tmp(as_type<decay_t<decltype(vec[0])>>()); 259 | if(vec.size() != size()) { 260 | if(vec.size() > size()) 261 | throw std::runtime_error("Vector is too large for this CompactRademacherTemplate."); 262 | std::fprintf(stderr, "Warning: vector (size %zu) is smaller than this CompactRademacherTemplate (size %zu); only the overlapping elements are affected. A following F*T may not behave as expected.\n", size_t(vec.size()), size_t(size())); 263 | } 264 | for(T i = 0, e(std::min(vec.size(), size())); i < e; ++i) { 265 | vec[i] *= tmp[i]; 266 | } 267 | } 268 | }; 269 | 270 | using CompactRademacher = CompactRademacherTemplate<uint64_t>; 271 | 272 | struct UnchangedRNGDistribution { 273 | template<typename RNG> 274 | auto operator()(RNG &rng) const {return rng();} 275 | void reset() {} 276 | }; 277 | 278 | struct Int2GaussianDistribution { 279 | template<typename RNG> 280 | auto operator()(RNG &rng) const {return random_gaussian_from_seed(rng());} 281 | void reset() {} 282 | }; 283 | 284 | template<typename RNG=aes::AesCtr<uint64_t>, typename Distribution=UnchangedRNGDistribution> 285 | class PRNVector { 286 | // Vector of random values generated 287 | const uint64_t seed_; 288 | uint64_t used_; 289 | uint64_t size_; 290 | RNG rng_; 291 | Distribution dist_; 292 | public: 293 | using ResultType = decay_t<decltype(std::declval<Distribution>()(std::declval<RNG &>()))>; 294 | private: 295 | ResultType val_; 296 | 297 | public: 298 | 299 | class PRNIterator { 300 | 301 | PRNVector *const ref_; 302 | public: 303 | auto operator*() const {return ref_->val_;} 304 | auto &operator ++() { 305 | inc(); 306 | return *this; 307 | } 308 | void inc() { 309 | ref_->gen(); 310 | ++ref_->used_; 311 | } 312 | void gen() {ref_->gen();} 313 | bool operator !=([[maybe_unused]] const PRNIterator &other) const { 314 | return ref_->used_ < ref_->size_; // Doesn't even access the other iterator. Only used for `while(it < end)`. 315 | } 316 | PRNIterator(PRNVector *prn_vec): ref_(prn_vec) {} 317 | }; 318 | 319 | template<typename... DistArgs> 320 | PRNVector(uint64_t size, uint64_t seed=0, DistArgs &&...args): 321 | seed_{seed}, used_{0}, size_{size}, rng_(seed_), dist_(std::forward<DistArgs>(args)...), val_(gen()) {} 322 | 323 | auto begin() { 324 | reset(); 325 | return PRNIterator(this); 326 | } 327 | ResultType gen() {return val_ = dist_(rng_);} 328 | void reset() { 329 | rng_.seed(seed_); 330 | dist_.reset(); 331 | used_ = 0; 332 | gen(); 333 | } 334 | auto end() { 335 | return PRNIterator(static_cast<PRNVector *>(nullptr)); 336 | } 337 | auto end() const { 338 | return PRNIterator(static_cast<PRNVector *>(nullptr)); 339 | } 340 | auto size() const {return size_;} 341 | void resize(size_t newsize) {size_ = newsize;} 342 | }; 343 | 344 | } // namespace frp 345 | 346 | 347 | #endif // #ifndef _GFRP_CRAD_H__ 348 | -------------------------------------------------------------------------------- /include/frp/coresets.h: -------------------------------------------------------------------------------- 1 | #ifndef _CORESETS_H__ 2 | #define _CORESETS_H__ 3 | #include "blaze/Math.h" 4 | 5 | namespace frp { 6 | inline namespace coresets { 7 | 8 | template<typename MatType, typename VectorType=blaze::DynamicVector<typename MatType::ElementType>> 9 | struct WeightedMatrix: std::pair<MatType, const VectorType *> { 10 | // first is the data matrix; second is an optional (possibly null) pointer to per-row weights. 11 | using matrix_type = MatType; 12 | using vector_type = VectorType; 13 | }; 14 | 15 | // General idea: generate importance weights, then sample rows (with replacement) to form the coreset. 16 | 17 | template<typename Mat> 18 | auto generate_lightweight_kmeans(const Mat &m) { 19 | // Method for k-means: lightweight-coreset importance, q(x) = .5 * (1/n + d(x, mean)^2 / total squared distance), with the weighted generalization when per-row weights are present. 20 | using FT = typename Mat::matrix_type::ElementType; 21 | static constexpr bool CSO = blaze::StorageOrder<typename Mat::matrix_type>::value; 22 | blaze::DynamicVector<FT> mean, importance; 23 | auto weights = m.second; 24 | if(weights) { 25 | if(weights->size() != m.first.rows()) throw 1; 26 | auto vit = weights->begin(); 27 | FT tsum = *vit++; 28 | mean = trans(row(m.first, 0)) * tsum; 29 | for(size_t i = 1; i < m.first.rows(); ++i) { 30 | auto rv = *vit++; 31 | mean += trans(row(m.first, i)) * rv; 32 | tsum += rv; 33 | } 34 | if(tsum == 0.) throw 2; // should never happen 35 | mean *= 1. / tsum; 36 | FT wnormsum = 0.; 37 | importance.resize(m.first.rows()); 38 | vit = weights->begin(); 39 | for(size_t i = 0; i < m.first.rows(); ++i) { 40 | auto diff = trans(row(m.first, i)) - mean; 41 | FT rdiffnorm = blaze::sum(diff * diff) * *vit++; 42 | importance[i] = rdiffnorm; 43 | wnormsum += rdiffnorm; 44 | } 45 | if(wnormsum) 46 | importance = ((importance / wnormsum) + 1. / tsum) * .5; 47 | else importance = (1. / tsum); // uniform assignment 48 | } else { 49 | const FT nr = m.first.rows(); 50 | mean = trans(blaze::mean<blaze::columnwise>(m.first)); 51 | FT wnormsum = 0.; 52 | importance.resize(m.first.rows()); 53 | for(size_t i = 0; i < m.first.rows(); ++i) { 54 | auto diff = trans(row(m.first, i)) - mean; 55 | FT rdiffnorm = blaze::sum(diff * diff); 56 | importance[i] = rdiffnorm; 57 | wnormsum += rdiffnorm; 58 | } 59 | if(wnormsum) 60 | importance = ((importance / wnormsum) + 1. / nr) * .5; 61 | else importance = (1. / nr); // uniform assignment 62 | } 63 | importance /= blaze::sum(importance); 64 | return importance; 65 | } 66 | 67 | // To get a core-set: pick a size (TODO: make function determining coreset size) 68 | // and then sample that many (with replacement) until you hit that size 69 | // Thoughts: indexing via coresets? 70 | 71 | } // coresets 72 | } // frp 73 | 74 | #endif 75 | -------------------------------------------------------------------------------- /include/frp/dist.h: -------------------------------------------------------------------------------- 1 | #ifndef _GFRP_DIST_H__ 2 | #define _GFRP_DIST_H__ 3 | #include <random> 4 | #include "frp/rand.h" 5 | #include "frp/linalg.h" 6 | #include "boost/random/normal_distribution.hpp" 7 | #include "boost/random.hpp" 8 | 9 | namespace frp { 10 | 11 | // Fill a matrix with distributions. Contains utilities for filling 12 | // vectors with C++ std distributions as well as Rademacher. 13 | 14 | template<typename Container, template<typename> typename Distribution, typename RNG=aes::AesCtr<uint64_t>, typename... 
DistArgs> 15 | void sample_fill(Container &con, uint64_t seed, DistArgs &&... args) { 16 | using FloatType = std::decay_t; 17 | RNG gen(seed); 18 | Distribution dist(std::forward(args)...); 19 | for(auto &el: con) el = dist(gen); 20 | } 21 | 22 | template typename Distribution, typename RNG=aes::AesCtr, typename... DistArgs> 23 | void sample_fill(blaze::DynamicMatrix &con, uint64_t seed, DistArgs &&... args) { 24 | #pragma omp parallel for 25 | for(size_t i = 0; i < con.rows(); ++i) { 26 | RNG gen(seed); 27 | gen.seed(gen() + i); 28 | thread_local Distribution dist(std::forward(args)...); 29 | for(size_t j(0); j < con.columns(); ++j) 30 | con(i, j) = dist(gen); 31 | } 32 | } 33 | 34 | 35 | template> 36 | void random_fill(uint64_t *data, uint64_t len, uint64_t seed=0) { 37 | for(RNG gen(seed); len; data[--len] = gen()); 38 | } 39 | 40 | #define DEFINE_DIST_FILL(type, name) \ 41 | template, typename...Args> \ 42 | void name##_fill(Container &con, uint64_t seed, Args &&... args) { \ 43 | sample_fill(con, seed, std::forward(args)...); \ 44 | }\ 45 | template, typename...Args> \ 46 | void name##_fill(blaze::DynamicMatrix &con, uint64_t seed, Args &&... args) { \ 47 | sample_fill(con, seed, std::forward(args)...); \ 48 | }\ 49 | struct name##_fill_struct {\ 50 | template, typename...Args>\ 51 | void operator()(Container &con, uint64_t seed, Args &&... args) const {\ 52 | name##_fill(con, seed, std::forward(args)...);\ 53 | }\ 54 | }; 55 | 56 | template 57 | class unit_normal: public boost::normal_distribution { 58 | public: 59 | void reset() {} 60 | }; 61 | 62 | enum DistributionType { 63 | NORMAL, 64 | UNIT_NORMAL, 65 | CAUCHY, 66 | CHI_SQUARED, 67 | LOGNORMAL, 68 | EXTREME_VALUE_DISTRIBUTION, 69 | WEIBULL, 70 | UNIFORM_REAL_DISTRIBUTION, 71 | NEGATIVE_BINOMIAL, 72 | EXPONENTIAL, 73 | EVD=EXTREME_VALUE_DISTRIBUTION, 74 | NB=NEGATIVE_BINOMIAL, 75 | EXP=EXPONENTIAL 76 | }; 77 | DEFINE_DIST_FILL(boost::normal_distribution, gaussian) 78 | DEFINE_DIST_FILL(unit_normal, unit_gaussian) 79 | DEFINE_DIST_FILL(boost::cauchy_distribution, cauchy) 80 | DEFINE_DIST_FILL(boost::random::chi_squared_distribution, chisq) 81 | DEFINE_DIST_FILL(boost::lognormal_distribution, lognormal) 82 | DEFINE_DIST_FILL(boost::random::extreme_value_distribution, extreme_value) 83 | DEFINE_DIST_FILL(boost::random::weibull_distribution, weibull) 84 | DEFINE_DIST_FILL(boost::random::uniform_real_distribution, uniform) 85 | DEFINE_DIST_FILL(std::negative_binomial_distribution, nb) 86 | DEFINE_DIST_FILL(std::exponential_distribution, exp) 87 | 88 | } // frp 89 | 90 | #endif // #ifndef _GFRP_DIST_H__ 91 | -------------------------------------------------------------------------------- /include/frp/fhtgpu.h: -------------------------------------------------------------------------------- 1 | #ifndef FRP_GPU_FHT_H 2 | #define FRP_GPU_FHT_H 3 | namespace frp { 4 | 5 | namespace detail { 6 | // Derived from WyHash 7 | static constexpr const uint64_t _wyp0=0xa0761d6478bd642full, _wyp1=0xe7037ed1a0b428dbull; 8 | 9 | template 10 | static constexpr inline T seedind2val(T ind, T seed) { 11 | uint64_t oldstate = ind; 12 | uint64_t newstart = ind * 6364136223846793005ULL + seed; 13 | uint32_t xorshifted = ((oldstate >> 18u) ^ oldstate) >> 27u; 14 | uint32_t rot = oldstate >> 59u; 15 | return (xorshifted >> rot) | (xorshifted << ((-rot) & 31)); 16 | } 17 | 18 | template 19 | static constexpr inline T seedind2val_lazy(T ind, T seed) { 20 | return ((ind ^ seed) * static_cast(6364136223846793005ULL)) ^ _wyp1; 21 | } 22 | 23 | 24 | // TODO: kernel 
fusion between fht and random diagonal matrix multiplication from fixed seeds. 25 | 26 | } // detail 27 | 28 | 29 | #ifdef __CUDACC__ 30 | template<typename T, typename T2> 31 | __global__ void grsfht_kernel(T *ptr, size_t l2, int nthreads, T theta, T2 *vals) { 32 | // Givens rotation hadamard product kernel; per-level angles come from vals (the theta argument is unused here). 33 | // This maps pretty well to the GPU 34 | int tid = blockIdx.x*blockDim.x + threadIdx.x; 35 | int n = 1 << l2; 36 | for(int i = 0; i < l2; ++i) { 37 | const T th = vals[i]; 38 | const T mc = cos(th), ms = sin(th); 39 | int s1 = 1 << i, s2 = s1 << 1; 40 | int nthreads_active = min(n >> (i + 1), nthreads); 41 | int npert = n / nthreads_active; 42 | if(tid < nthreads_active) { 43 | for(int j = tid * npert, e = j + npert; j != e; j += s2) { 44 | #pragma unroll 45 | for(size_t k = 0; k < s1; ++k) { 46 | auto u = ptr[j + k], v = ptr[j + k + s1]; 47 | ptr[j + k] = u * mc - v * ms, ptr[j + k + s1] = ms * u + mc * v; 48 | } 49 | } 50 | } 51 | __syncthreads(); 52 | } 53 | } 54 | 55 | template<typename T> 56 | __global__ void pfht_kernel(T *ptr, size_t l2, int nthreads, T theta) { 57 | // This maps pretty well to the GPU 58 | int tid = blockIdx.x*blockDim.x + threadIdx.x; 59 | int n = 1 << l2; 60 | T mc = cos(theta), ms = sin(theta); 61 | for(int i = 0; i < l2; ++i) { 62 | int s1 = 1 << i, s2 = s1 << 1; 63 | int nthreads_active = min(n >> (i + 1), nthreads); 64 | int npert = n / nthreads_active; 65 | if(tid < nthreads_active) { 66 | for(int j = tid * npert, e = j + npert; j != e; j += s2) { 67 | #pragma unroll 68 | for(size_t k = 0; k < s1; ++k) { 69 | auto u = ptr[j + k], v = ptr[j + k + s1]; 70 | ptr[j + k] = u * mc - v * ms, ptr[j + k + s1] = ms * u + mc * v; 71 | } 72 | } 73 | } 74 | __syncthreads(); 75 | } 76 | } 77 | 78 | template<typename T, bool renormalize> 79 | __global__ void fht_kernel(T *ptr, size_t l2, int nthreads) { 80 | // This maps pretty well to the GPU 81 | int tid = blockIdx.x*blockDim.x + threadIdx.x; 82 | int n = 1 << l2; 83 | for(int i = 0; i < l2; ++i) { 84 | int s1 = 1 << i, s2 = s1 << 1; 85 | int nthreads_active = min(n >> (i + 1), nthreads); 86 | int npert = n / nthreads_active; 87 | if(tid < nthreads_active) { 88 | for(int j = tid * npert, e = j + npert; j != e; j += s2) { 89 | #pragma unroll 90 | for(size_t k = 0; k < s1; ++k) { 91 | auto u = ptr[j + k], v = ptr[j + k + s1]; 92 | ptr[j + k] = u + v, ptr[j + k + s1] = u - v; 93 | } 94 | } 95 | } 96 | __syncthreads(); 97 | } 98 | if(renormalize) { 99 | T mult = 1. / pow(sqrt(2.), l2); 100 | int npert = n / nthreads; 101 | #pragma unroll 102 | for(int i = tid * npert, e = i + npert; i < e; ++i) { 103 | ptr[i] *= mult; 104 | } 105 | } 106 | } 107 | template<typename T> 108 | __global__ void rademacher_multiply(T *ptr, uint32_t *rvals, size_t l2, int nthreads) { 109 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 110 | size_t n = 1ull << l2; 111 | int per_thread = n / nthreads; 112 | int start_index = tid * per_thread, end = start_index + per_thread; 113 | for(int i = start_index / 32; i != end / 32; ++i) { 114 | auto rv = rvals[i]; 115 | int li = i * 32; 116 | #pragma unroll 117 | for(int j = 0; j < 32; ++j) { 118 | auto v = ptr[li + j]; 119 | ptr[li + j] = (rv >> j)& 1 ? 
-v: v; 120 | } 121 | } 122 | } 123 | template 124 | __global__ void radfht_kernel(T *ptr, uint32_t *rvals, size_t l2, int nthreads) { 125 | // Performs both 126 | int tid = blockIdx.x*blockDim.x + threadIdx.x; 127 | int n = 1 << l2; 128 | int per_thread = n / nthreads; 129 | int start_index = tid * per_thread, end = start_index + per_thread; 130 | for(int i = start_index / 32; i != end / 32; ++i) { 131 | auto rv = rvals[i]; 132 | int li = i * 32; 133 | #pragma unroll 134 | for(int j = 0; j < 32; ++j) { 135 | auto v = ptr[li + j]; 136 | ptr[li + j] = (rv >> j)& 1 ? -v: v; 137 | } 138 | } 139 | for(int i = 0; i < l2; ++i) { 140 | int s1 = 1 << i, s2 = s1 << 1; 141 | int nthreads_active = min(n >> (i + 1), nthreads); 142 | int npert = n / nthreads_active; 143 | if(tid < nthreads_active) { 144 | #pragma unroll 145 | for(int j = tid * npert, e = j + npert; j != e; j += s2) { 146 | #pragma unroll 147 | for(size_t k = 0; k < s1; ++k) { 148 | auto u = ptr[j + k], v = ptr[j + k + s1]; 149 | ptr[j + k] = u + v, ptr[j + k + s1] = u - v; 150 | } 151 | } 152 | } 153 | __syncthreads(); 154 | } 155 | if(renormalize) { 156 | T mult = 1. / pow(sqrt(2.), l2); 157 | int npert = n / nthreads; 158 | #pragma unroll 159 | for(int i = tid * npert, e = i + npert; i < e; ++i) { 160 | ptr[i] *= mult; 161 | } 162 | } 163 | } 164 | 165 | #endif /* #ifdef __CUDACC__ */ 166 | 167 | } // frp 168 | #endif /* FRP_GPU_FHT_H */ 169 | -------------------------------------------------------------------------------- /include/frp/frp.h: -------------------------------------------------------------------------------- 1 | #ifndef _GFRP_H__ 2 | #define _GFRP_H__ 3 | 4 | #include "frp/compact.h" 5 | #include "frp/dci.h" 6 | #include "frp/dist.h" 7 | #include "frp/fhtgpu.h" 8 | #include "frp/frp.h" 9 | #include "frp/gpu.h" 10 | #include "frp/ifc.h" 11 | #include "frp/jl.h" 12 | #include "frp/kernel.h" 13 | #include "frp/linalg.h" 14 | #include "frp/mach.h" 15 | #include "frp/parser.h" 16 | #include "frp/rand.h" 17 | #include "frp/sample.h" 18 | #include "frp/sdq.h" 19 | #include "frp/spinner.h" 20 | #include "frp/stackstruct.h" 21 | #include "frp/util.h" 22 | 23 | #endif 24 | 25 | 26 | -------------------------------------------------------------------------------- /include/frp/gpu.h: -------------------------------------------------------------------------------- 1 | #ifndef FRP_GPU_H 2 | #define FRP_GPU_H 3 | #include "fhtgpu.h" 4 | 5 | 6 | #endif /* FRP_GPU_H */ 7 | -------------------------------------------------------------------------------- /include/frp/graph.h: -------------------------------------------------------------------------------- 1 | #ifndef FROOPY_GRAPH_H__ 2 | #define FROOPY_GRAPH_H__ 3 | #include "./util.h" 4 | #include 5 | #include 6 | 7 | namespace frp { 8 | inline namespace graph { 9 | 10 | struct Emplacer { 11 | template class Container, typename Value, typename...CArgs> 12 | static auto emplace(Container &c, Value &&v) { 13 | c.emplace(std::move(v)); 14 | } 15 | template 16 | static auto emplace(std::vector &c, Value &&v) { 17 | c.emplace_back(std::move(v)); 18 | } 19 | template 20 | static auto emplace(std::deque &c, Value &&v) { 21 | c.emplace_back(std::move(v)); 22 | } 23 | }; 24 | 25 | // Representation 1: all nodes implicit 26 | template class EdgeContainer=std::set> 28 | class SparseGraph { 29 | public: 30 | using index_type = IndexType; 31 | using edge_type = std::pair; 32 | protected: 33 | index_type n_; 34 | EdgeContainer edges_; 35 | public: 36 | SparseGraph(index_type n=0): n_(n) { 37 | 38 | } 
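// Usage sketch (hypothetical; template arguments are elided in this listing):
//   SparseGraph g(/*n=*/4);   // nodes 0..3 are implicit
//   g.add(1, 3); g.add(3, 2); // undirected graphs keep edges with first <= second, so (3, 2) is stored as (2, 3)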
39 | void resize(index_type newn) { 40 | if(newn < n_) 41 | for(const auto &pair: edges_) 42 | if(pair.first > n_ || pair.second > n_) 43 | throw std::runtime_error("Resizing leaves dangling edges."); 44 | n_ = newn; 45 | } 46 | void add(index_type lhs, index_type rhs) { 47 | add(std::make_pair(lhs, rhs)); 48 | } 49 | void add(edge_type edge) { 50 | CONST_IF(!is_directed) { 51 | if(edge.first > edge.second) 52 | std::swap(edge.first, edge.second); 53 | } 54 | if(std::max(edge.first, edge.second) > n_) 55 | throw std::runtime_error("Can't add edges between nodes that don't exist"); 56 | Emplacer::emplace(edges_, edge); 57 | } 58 | void sort() { 59 | std::sort(edges_.begin(), edges_.end()); 60 | } 61 | }; 62 | // Representation 2: nodes explicit, with values 63 | // edges are unweighted 64 | template class EdgeContainer=std::set, 67 | template class NodeContainer=std::vector> 68 | class NodeValuedSparseGraph: public SparseGraph { 69 | protected: 70 | using super = SparseGraph; 71 | using node_type = ValueType; 72 | using super::edge_type; 73 | using super::index_type; 74 | NodeContainer nodes_; 75 | public: 76 | template 77 | NodeValuedSparseGraph(Args &&...args): nodes_(std::forward(args)...) { 78 | if(nodes_.size()) 79 | super::resize(nodes_.size()); 80 | } 81 | template 82 | auto emplace_node(Args &&...args) { 83 | ++this->n_; 84 | return Emplacer::emplace(nodes_, std::forward(args)...); 85 | } 86 | }; 87 | 88 | // TODO: Representation 3: nodes implicit, weighted edges 89 | // TODO: Representation 4: nodes explicit, weighted edges 90 | } // graph 91 | } // frp 92 | 93 | #endif /* FROOPY_GRAPH_H__ */ 94 | -------------------------------------------------------------------------------- /include/frp/ifc.h: -------------------------------------------------------------------------------- 1 | #ifndef _FAST_COPY_H__ 2 | #define _FAST_COPY_H__ 3 | 4 | #include 5 | #include 6 | #if (defined(__x86_64__) || defined(__i386__)) 7 | # include 8 | #endif 9 | 10 | #define _STORAGE_ static inline 11 | 12 | // These functions all assume that the size of memory being copied is a power of 2. 13 | 14 | #ifndef FAST_COPY_MEMCPY_THRESHOLD 15 | #define FAST_COPY_MEMCPY_THRESHOLD (1u << 20) 16 | #endif 17 | 18 | #if _FEATURE_AVX512F 19 | // If n is less than 64, defaults to memcpy. Otherwise, being a power of 2, we can just use unaligned stores and loads. 20 | _STORAGE_ void *fast_copy(void *out, void *in, size_t n) { 21 | if(n >= FAST_COPY_MEMCPY_THRESHOLD) { 22 | return memcpy(out, in, n); 23 | } 24 | n >>= 6; 25 | for(__m512 *ov = (__m512 *)out, *iv = (__m512 *)in; n--;) { 26 | _mm512_storeu_ps((float *)(ov++), _mm512_loadu_ps((float *)(iv++))); 27 | } 28 | return out; 29 | } 30 | #elif __AVX2__ 31 | // If n is less than 32, defaults to memcpy. Otherwise, being a power of 2, we can just use unaligned stores and loads. 32 | _STORAGE_ void *fast_copy(void *out, void *in, size_t n) { 33 | if(n >= FAST_COPY_MEMCPY_THRESHOLD) { 34 | return memcpy(out, in, n); 35 | } 36 | n >>= 5; 37 | for(__m256 *ov = (__m256 *)out, *iv = (__m256 *)in; n--;) { 38 | _mm256_storeu_ps((float *)(ov++), _mm256_loadu_ps((float *)(iv++))); 39 | } 40 | return out; 41 | } 42 | #elif __SSE2__ 43 | // If n is less than 16, defaults to memcpy. Otherwise, being a power of 2, we can just use unaligned stores and loads. 
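// For example, a hypothetical call fast_copy(dst, src, 1024) performs 64 unaligned
// 16-byte loads/stores in the loop below, while any n >= FAST_COPY_MEMCPY_THRESHOLD
// (1 MiB by default) takes the plain memcpy path instead.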
44 | _STORAGE_ void *fast_copy(void *out, void *in, size_t n) { 45 | if(n >= FAST_COPY_MEMCPY_THRESHOLD) { 46 | return memcpy(out, in, n); 47 | } 48 | n >>= 4; 49 | for(__m128 *ov = (__m128 *)out, *iv = (__m128 *)in; n--;) { 50 | _mm_storeu_ps((float *)(ov++), _mm_loadu_ps((float *)(iv++))); 51 | } 52 | return out; 53 | } 54 | #else 55 | _STORAGE_ void *fast_copy(void *out, void *in, size_t n) { 56 | return memcpy(out, in, n); 57 | } 58 | #endif 59 | 60 | #undef _STORAGE_ 61 | #endif 62 | -------------------------------------------------------------------------------- /include/frp/jl.h: -------------------------------------------------------------------------------- 1 | #ifndef _JL_H__ 2 | #define _JL_H__ 3 | #include 4 | #include "frp/spinner.h" 5 | 6 | namespace frp { 7 | 8 | namespace jl { 9 | 10 | template 11 | class JLTransform { 12 | using FloatType = typename MatrixType::ElementType; 13 | const size_t m_, n_; 14 | MatrixType matrix_; 15 | public: 16 | JLTransform(size_t m, size_t n): 17 | m_{m}, n_{n}, matrix_(m, n) { 18 | if(m_ >= n_) fprintf(stderr, "Warning: JLTransform has to reduce dimensionality."); 19 | } 20 | template 21 | void fill(RNG &rng, Distribution &dist, bool orthogonalize=true) { 22 | for(size_t i(0); i < m_; ++i) 23 | for(size_t j(0); j < n_; ++j) 24 | matrix_(i, j) = dist(rng); 25 | if(orthogonalize) { 26 | linalg::gram_schmidt(matrix_, linalg::RESCALE_TO_GAUSSIAN); 27 | } 28 | matrix_ *= 1. / std::sqrt(static_cast(m_)); 29 | } 30 | void fill(uint64_t seed, bool orthogonalize=true) { 31 | std::mt19937_64 rng(seed); 32 | std::normal_distribution dist; 33 | fill(rng, dist, orthogonalize); 34 | } 35 | template 36 | void apply(const InVec &in, OutVec out) { 37 | assert(out.size() == m_); 38 | assert(in.size() == n_); 39 | out = matrix_ * in; 40 | } 41 | auto size() const {return matrix_.rows() * matrix_.columns();} 42 | }; 43 | 44 | template 45 | class OrthogonalJLTransform { 46 | size_t from_, to_; 47 | std::vector> blocks_; 48 | std::vector seeds_; 49 | public: 50 | using size_type = uint64_t; 51 | 52 | OrthogonalJLTransform(size_t from, size_t to, uint64_t seed, size_t nblocks=3): from_(roundup(from)), to_(to) 53 | { 54 | std::mt19937_64 gen(seed); 55 | while(seeds_.size() < nblocks) seeds_.push_back(gen()); 56 | for(const auto seed: seeds_) blocks_.emplace_back(from, seed); 57 | } 58 | OrthogonalJLTransform(OrthogonalJLTransform &&o) = default; 59 | OrthogonalJLTransform(const OrthogonalJLTransform &o) = default; 60 | void resize(size_type newfrom, size_type newto) { 61 | //std::fprintf(stderr, "Resizing from %zu to %zu (rounded up %zu)\n", from_, roundup(newfrom), newfrom); 62 | newfrom = roundup(newfrom); 63 | resize_from(newfrom); 64 | resize_to(newto); 65 | } 66 | size_t from_size() const {return from_;} 67 | size_t to_size() const {return to_;} 68 | void reseed(size_type newseed) { 69 | seeds_.clear(); 70 | std::mt19937_64 gen(newseed); 71 | while(seeds_.size() < nblocks()) seeds_.push_back(gen()); 72 | } 73 | void resize_from(size_type newfrom) { 74 | from_ = newfrom; 75 | for(size_type i(0); i < nblocks(); ++i) { 76 | blocks_[i].seed(seeds_[i]); 77 | blocks_[i].resize(from_); 78 | } 79 | } 80 | void resize_to(size_type newto) { 81 | to_ = newto; 82 | } 83 | size_t nblocks() const {return blocks_.size();} 84 | template 85 | void transform(const Vec1 &in, Vec2 &out) const { 86 | Vec2 tmp(in); // Copy. 87 | transform_inplace(tmp); 88 | out = subvector(tmp, 0, to_); // Copy result out. 
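// (i.e., transform() is transform_inplace() on a copy, followed by keeping the
//  first to_ coordinates -- the subsampling step of the OJLT.)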
89 | } 90 | template::value>> 91 | void transform_inplace(Vec1 &in) const { 92 | for(auto it(std::rbegin(blocks_)), eit(std::rend(blocks_)); it != eit; ++it) { 93 | it->apply(in); 94 | } 95 | in *= std::sqrt(static_cast(from_) / to_); 96 | } 97 | template::value>> 98 | void transform_inplace(FloatType *in) const { 99 | for(auto it(std::rbegin(blocks_)), eit(std::rend(blocks_)); it != eit; (it++)->apply(in)); // Apply transforms 100 | // Renormalize. 101 | using SType = typename vec::SIMDTypes; 102 | const FloatType *end(in + to_); 103 | const typename SType::Type vmul = SType::set1(std::sqrt(static_cast(from_) / to_)); 104 | if(SType::aligned(in)) { 105 | do SType::store(in, SType::mul(SType::load(in), vmul)); while((in += sizeof(vmul) / sizeof(*in)) < end); 106 | } else { 107 | do SType::storeu(in, SType::mul(SType::loadu(in), vmul)); while((in += sizeof(vmul) / sizeof(*in)) < end); 108 | } 109 | } 110 | // Downstream application has to subsample itself. 111 | // Optionally add a (potentially scaled?) Guassian multiplication layer. 112 | }; 113 | 114 | class FastJLTransform { 115 | /* https://www.cs.princeton.edu/~chazelle/pubs/FJLT-sicomp09.pdf 116 | * THE FAST JOHNSON-LINDENSTRAUSS TRANSFORM AND APPROXIMATE NEAREST NEIGHBORS 117 | * SIAM J. COMPUT ©2009 Society for Industrial and Applied MathematicsVol. 39, No. 1,p. 32 118 | * 119 | * The success of this approach to accelerating ANN suggests the potential utility of the OJLT in said searches. 120 | */ 121 | size_t from_, to_; 122 | HadamardRademacherSDBlock block_; 123 | uint64_t seed_; 124 | SubsampleStrategy sample_method_; 125 | public: 126 | using size_type = uint64_t; 127 | 128 | FastJLTransform(size_t from, size_t to, uint64_t seed, SubsampleStrategy strat=FIRST_M): 129 | from_(roundup(from)), to_(to), block_(from, seed), seed_(seed), sample_method_(strat) 130 | { 131 | } 132 | FastJLTransform(FastJLTransform &&o) = default; 133 | FastJLTransform(const FastJLTransform &o) = default; 134 | void resize(size_type newfrom, size_type newto) { 135 | //std::fprintf(stderr, "Resizing from %zu to %zu (rounded up %zu)\n", from_, roundup(newfrom), newfrom); 136 | newfrom = roundup(newfrom); 137 | resize_from(newfrom); 138 | resize_to(newto); 139 | } 140 | SubsampleStrategy get_sample_method() const {return sample_method_;} 141 | SubsampleStrategy set_sample_method(SubsampleStrategy newstrat) {return sample_method_ = newstrat;} 142 | size_t from_size() const {return from_;} 143 | size_t to_size() const {return to_;} 144 | void reseed(size_type newseed) { 145 | block_ = HadamardRademacherSDBlock(from_, newseed); 146 | } 147 | void resize_from(size_type newfrom) { 148 | from_ = newfrom; 149 | reseed(seed_); 150 | } 151 | void resize_to(size_type newto) { 152 | to_ = newto; 153 | } 154 | static constexpr size_t nblocks() {return 1;} 155 | template 156 | void transform(const Vec1 &in, Vec2 &out) const { 157 | Vec2 tmp(in); // Copy. 158 | transform_inplace(tmp); 159 | out = subvector(tmp, 0, to_); // Copy result out. 160 | } 161 | template::value>> 162 | void transform_inplace(Vec1 &in) const { 163 | block_.apply(in); 164 | const auto mult = std::sqrt(static_cast(from_) / to_); 165 | // Note: we multiply in the same pass as the shuffle under the hope that the cache efficiency 166 | // of a signle pass outweighs the value of SIMD acceleration 167 | switch(sample_method_) { 168 | case FIRST_M: 169 | in.resize(to_); // The buffer following is unused/unnecessary. 
We simply sample the first d rows wlog 170 | in *= mult; 171 | break; 172 | case RANDOM_NO_REPLACEMENT: case RANDOM_NO_REPLACEMENT_HASH_SET: case RANDOM_NO_REPLACEMENT_VEC: case RANDOM_W_REPLACEMENT: default: 173 | aes::AesCtr gen(seed_ ^ 1337); 174 | if(to_ > from_) { 175 | size_t initsz = in.size(); 176 | in.resize(to_); 177 | for(size_t i = initsz; i < in.size(); in[i++] = (in[fastrange(gen(), initsz)] * mult)); 178 | } else if(to_ != from_) { 179 | for(size_t i = 0; i < to_; ++i) { 180 | uint32_t oind = fastrange(gen(), in.size() - i) + i; // This avoids replacement, just for convenience 181 | in[i] = in[oind] * mult; 182 | } 183 | in.resize(to_); 184 | } // else do nothing 185 | } 186 | } 187 | template::value>> 188 | void transform_inplace(FloatType *in) const { 189 | // Apply transform and renormalize. 190 | if(sample_method_ != FIRST_M) throw std::runtime_error("Sampling methods besides FIRST_M not implemented for pointers."); 191 | if(from_ < to_) throw std::runtime_error("FastJLTransform only supports dimensionality reduction."); 192 | block_.apply(in); 193 | using SType = typename vec::SIMDTypes; 194 | const FloatType *end(in + to_); 195 | const typename SType::Type vmul = SType::set1(std::sqrt(static_cast(from_) / to_)); 196 | if(SType::aligned(in)) { 197 | do SType::store(in, SType::mul(SType::load(in), vmul)); while((in += sizeof(vmul) / sizeof(*in)) < end); 198 | } else { 199 | do SType::storeu(in, SType::mul(SType::loadu(in), vmul)); while((in += sizeof(vmul) / sizeof(*in)) < end); 200 | } 201 | } 202 | // Downstream application has to subsample itself. 203 | // Optionally add a (potentially scaled?) Guassian multiplication layer. 204 | }; 205 | 206 | using OJLTransform = OrthogonalJLTransform; 207 | using DOJ = OrthogonalJLTransform; 208 | using OJLT = OJLTransform; 209 | using FOJ = OJLT; 210 | using FJLT = FastJLTransform; 211 | 212 | } // namespace jl 213 | using namespace jl; 214 | 215 | } // namespace frp 216 | 217 | #endif // #ifndef _JL_H__ 218 | -------------------------------------------------------------------------------- /include/frp/kama.h: -------------------------------------------------------------------------------- 1 | #ifndef KAMA_H__ 2 | #define KAMA_H__ 3 | 4 | #endif /* KAMA_H__ */ 5 | -------------------------------------------------------------------------------- /include/frp/lsh.h: -------------------------------------------------------------------------------- 1 | #ifndef FRP_LSH_H__ 2 | #define FRP_LSH_H__ 3 | #include "vec/vec.h" 4 | #include "frp/jl.h" 5 | #include "clhash/include/clhash.h" 6 | #include "flat_hash_map/flat_hash_map.hpp" 7 | 8 | 9 | namespace frp { 10 | struct mclhasher { 11 | const void *random_data_; 12 | mclhasher(uint64_t seed1=137, uint64_t seed2=777): random_data_(get_random_key_for_clhash(seed1, seed2)) {} 13 | mclhasher(const mclhasher &o): random_data_(copy_random_data(o)) {} // copy data 14 | mclhasher(mclhasher &&o): random_data_(o.random_data_) { 15 | o.random_data_ = nullptr; // move 16 | } 17 | static void *copy_random_data(const mclhasher &o) { 18 | void *ret; 19 | if(posix_memalign(&ret, sizeof(__m128i), RANDOM_BYTES_NEEDED_FOR_CLHASH)) throw std::bad_alloc(); 20 | return std::memcpy(ret, o.random_data_, RANDOM_BYTES_NEEDED_FOR_CLHASH); 21 | } 22 | template 23 | uint64_t operator()(const T *data, const size_t len) const { 24 | return clhash(random_data_, (const char *)data, len * sizeof(T)); 25 | } 26 | uint64_t operator()(const char *str) const {return operator()(str, std::strlen(str));} 27 | template 28 | 
uint64_t operator()(const T &input) const { 29 | return operator()((const char *)&input, sizeof(T)); 30 | } 31 | template 32 | uint64_t operator()(const std::vector &input) const { 33 | return operator()((const char *)input.data(), sizeof(T) * input.size()); 34 | } 35 | uint64_t operator()(const std::string &str) const { 36 | return operator()(str.data(), str.size()); 37 | } 38 | ~mclhasher() { 39 | std::free((void *)random_data_); 40 | } 41 | }; 42 | using SIMDSpace = vec::SIMDTypes; 43 | using VType = typename SIMDSpace::VType; 44 | template ATTR_CONST INLINE auto cmp_zero(V v); 45 | #if _FEATURE_AVX512F 46 | template<> ATTR_CONST INLINE auto 47 | cmp_zero (__m512 v) { 48 | return _mm512_cmp_ps_mask(v, _mm512_setzero_ps(), _CMP_GT_OQ); 49 | } 50 | template<> ATTR_CONST INLINE auto 51 | cmp_zero (__m512d v) { 52 | return _mm512_cmp_pd_mask(v, _mm512_setzero_pd(), _CMP_GT_OQ); 53 | } 54 | #elif __AVX__ 55 | template<> 56 | ATTR_CONST INLINE 57 | auto cmp_zero (__m256 v) { 58 | return _mm256_movemask_ps(_mm256_cmp_ps(v, _mm256_setzero_ps(), _CMP_GT_OQ)); 59 | } 60 | template<> 61 | ATTR_CONST INLINE 62 | auto cmp_zero (__m256d v) { 63 | return _mm256_movemask_pd(_mm256_cmp_pd(v, _mm256_setzero_pd(), _CMP_GT_OQ)); 64 | } 65 | #else 66 | #pragma message("not vectorizing signed projection hashing") 67 | #endif 68 | 69 | template 70 | struct empty { 71 | template empty(Args &&...args) {} 72 | }; 73 | 74 | template 75 | struct F2VType; 76 | #if __AVX__ 77 | 78 | template<> struct F2VType { 79 | using type = __m256; 80 | static type load(const float *a) { 81 | return _mm256_loadu_ps(a); 82 | } 83 | }; 84 | template<> struct F2VType { 85 | using type = __m256d; 86 | static type load(const double *a) { 87 | return _mm256_loadu_pd(a); 88 | } 89 | }; 90 | #endif 91 | #if HAS_AVX_512 92 | template<> struct F2VType { 93 | using type = __m512; 94 | static type load(const float *a) { 95 | return _mm512_loadu_ps(a); 96 | } 97 | }; 98 | template<> struct F2VType { 99 | using type = __m512d; 100 | static type load(const double *a) { 101 | return _mm512_loadu_pd(a); 102 | } 103 | }; 104 | #endif 105 | 106 | #if HAS_AVX_512 107 | template 108 | static constexpr int f2b(__m512d v) { 109 | return cmp_zero(v); 110 | } 111 | template 112 | static constexpr int f2b(__m512 v) { 113 | return cmp_zero(v); 114 | } 115 | //using VType = F2VType::type; 116 | #endif 117 | #if __AVX__ 118 | template 119 | static constexpr int f2b(__m256d v) { 120 | return cmp_zero(v); 121 | } 122 | template 123 | static constexpr int f2b(__m256 v) { 124 | return cmp_zero(v); 125 | } 126 | //using VType = typename F2VType::type; 127 | #endif 128 | 129 | template, typename...DistArgs> 130 | blaze::DynamicMatrix 131 | generate_randproj_matrix(size_t nr, size_t ncol, 132 | bool orthonormalize=true, uint64_t seed=0, 133 | DistArgs &&...args) 134 | { 135 | using matrix_type = blaze::DynamicMatrix; 136 | matrix_type ret(nr, ncol); 137 | seed = ((seed ^ nr) * ncol) * seed; 138 | if(orthonormalize) { 139 | try { 140 | matrix_type r, q; 141 | if(ret.rows() >= ret.columns()) { 142 | // Randomize 143 | OMP_PRAGMA("omp parallel for") 144 | for(size_t i = 0; i < ret.rows(); ++i) { 145 | blaze::RNG gen(seed + i * seed + i); 146 | DistributionType dist(std::forward(args)...); 147 | for(auto &v: row(ret, i)) 148 | v = dist(gen); 149 | } 150 | // QR 151 | blaze::qr(ret, q, r); 152 | assert(ret.columns() == q.columns()); 153 | assert(ret.rows() == q.rows()); 154 | swap(ret, q); 155 | } else { 156 | // Generate random matrix for (C, C) and then just take the 
template<typename FType, template<typename, bool> class Container=::blaze::DynamicVector, bool SO=blaze::rowMajor>
struct LSHasher {
    using CType = Container<FType, SO>;
    CType container_;
    template<typename... CArgs>
    LSHasher(CArgs &&...args): container_(std::forward<CArgs>(args)...) {}
    template<typename T>
    auto dot(const T &ov) const {
        return blaze::dot(container_, ov);
    }
    // TODO: Store full matrix to get hashes
    // TODO: Use structured matrices to speed up calculation (FFHT, then downsample to bins)
};



template<typename FType>
static INLINE uint64_t cmp2hash(const blaze::DynamicVector<FType> &c, size_t n=0) {
    uint64_t ret = 0;
    if(n == 0) {
        n = c.size(); // 0 means "hash the whole vector"; the original's self-assignment left n == 0, so every hash was 0
    }
    assert(n <= 64);
#if HAS_AVX_512
    static constexpr size_t COUNT = sizeof(__m512d) / sizeof(FType);
#elif __AVX__
    static constexpr size_t COUNT = sizeof(__m256d) / sizeof(FType);
#else
    static constexpr size_t COUNT = 0;
#endif
    size_t i = 0;
#if HAS_AVX_512 || defined(__AVX__)
    CONST_IF(COUNT) {
        using LV = F2VType<FType, COUNT * sizeof(FType)>;
        for(; i < n / COUNT; ret = (ret << COUNT) | cmp_zero(LV::load(&c[i++ * COUNT])));
        i *= COUNT;
    }
#else
    for(; i + 8 <= n; i += 8) {
        ret = (ret << 8) |
              ((c[i] > 0.) << 7) | ((c[i + 1] > 0.) << 6) |
              ((c[i + 2] > 0.) << 5) | ((c[i + 3] > 0.) << 4) |
              ((c[i + 4] > 0.) << 3) | ((c[i + 5] > 0.) << 2) |
              ((c[i + 6] > 0.) << 1) |  (c[i + 7] > 0.);
    }
#endif
    for(; i < n; ret = (ret << 1) | (c[i++] > 0.));
    return ret;
}
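// Worked example: cmp2hash packs the sign pattern of the first n coordinates
// into a uint64_t, most-significant bit first. For c = {0.5, -1.2, 3.0, -0.1}
// the sign bits are 1,0,1,0, so cmp2hash(c, 4) == 0b1010 == 10. (Sketch only.)
#if 0
blaze::DynamicVector<double> c{0.5, -1.2, 3.0, -0.1};
assert(cmp2hash(c, 4) == 0b1010u);
#endif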
template<typename FType, typename DistributionType=std::normal_distribution<FType>, bool SO=blaze::rowMajor>
struct MatrixLSHasher {
    using CType = ::blaze::DynamicMatrix<FType, SO>;
    using this_type = MatrixLSHasher<FType, DistributionType, SO>;
    using const_this_type = const this_type;
    CType container_;
    template<typename... DistArgs>
    MatrixLSHasher(size_t nr, size_t nc, bool orthonormalize=true, uint64_t seed=0,
                   DistArgs &&...args):
        container_(generate_randproj_matrix<FType, DistributionType>(
            nr, nc, orthonormalize, seed, std::forward<DistArgs>(args)...)) {}
    auto &multiply(const blaze::DynamicVector<FType> &c, blaze::DynamicVector<FType> &ret) const {
        ret = this->container_ * c;
        return ret;
    }
    blaze::DynamicVector<FType> multiply(const blaze::DynamicVector<FType> &c) const {
        blaze::DynamicVector<FType> vec;
        this->multiply(c, vec);
        return vec;
    }
    auto multiply(const blaze::DynamicVector<FType, blaze::rowVector> &c) const {
        blaze::DynamicVector<FType> vec = this->container_ * trans(c);
        return vec;
    }
    template<typename... Args>
    decltype(auto) project(Args &&...args) const {return multiply(std::forward<Args>(args)...);}
    template<bool OSO>
    uint64_t hash(const blaze::DynamicVector<FType, OSO> &c) const {
#if VERBOSE_AF
        std::cout << this->container_ << '\n';
#endif
        blaze::DynamicVector<FType> vec = multiply(c);
        return cmp2hash(vec); // This is the SRP hasher (signed random projection)
    }
    template<bool OSO>
    uint64_t operator()(const blaze::DynamicVector<FType, OSO> &c) const {
        return this->hash(c);
    }
};

template<typename FType, typename DistributionType=std::normal_distribution<FType>>
struct E2LSHasher {
    MatrixLSHasher<FType, DistributionType> superhasher_;
    blaze::DynamicVector<FType> b_;
    double r_;
    mclhasher clhasher_;
    template<typename... Args>
    E2LSHasher(unsigned d, unsigned k, double r = 1., uint64_t seed=0, Args &&...args):
        superhasher_(k, d, false, seed, std::forward<Args>(args)...),
        b_(k), r_(r), clhasher_(seed * seed + seed) { // initializers now match declaration order
        superhasher_.container_ /= r;
        std::uniform_real_distribution<double> gen(0, r_);
        std::mt19937_64 mt(seed ^ uint64_t(d * k * r));
        for(auto &v: b_)
            v = gen(mt);
    }
    E2LSHasher(const E2LSHasher &o) = default;
    E2LSHasher(E2LSHasher &&o) = default;
    template<typename... Args>
    decltype(auto) project(Args &&...args) const {
        return floor(superhasher_.project(std::forward<Args>(args)...) + b_);
    }
    template<typename... Args>
    uint64_t hash(Args &&...args) const {
        const blaze::DynamicVector<FType> proj = this->project(std::forward<Args>(args)...);
        // Hash the floored projection; the original hashed b_ here, which is the
        // same for every input and could never discriminate between points.
        return clhasher_(&proj[0], proj.size() * sizeof(FType));
    }
    template<typename... Args>
    uint64_t operator()(Args &&...args) const {
        return hash(std::forward<Args>(args)...);
    }
};

template<typename FT>
struct ThresholdedCauchyDistribution {
    std::cauchy_distribution<FT> cd_;
    FT absmax_;
    template<typename... Args>
    ThresholdedCauchyDistribution(FT absmax, Args &&...args):
        cd_(std::forward<Args>(args)...), absmax_(std::abs(absmax)) {
    }
    template<typename Gen>
    FT operator()(Gen &gen) {
        // generate_randproj_matrix invokes distributions as dist(gen), so the
        // generator must be threaded through (the original took no argument).
        return std::clamp(cd_(gen), -absmax_, absmax_);
    }
};

template<typename FType>
struct L1E2LSHasher: public E2LSHasher<FType, ThresholdedCauchyDistribution<FType>> {
    using super = E2LSHasher<FType, ThresholdedCauchyDistribution<FType>>;
    L1E2LSHasher(unsigned d, unsigned k, double r = 1., uint64_t seed=0, FType amax=1000.):
        super(d, k, r, seed, amax) {}
};
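// E2LSH recap (sketch): each of the k hash coordinates is
//     h_i(x) = floor((a_i . x + b_i) / r),
// with a_i a row of the projection matrix and b_i ~ U[0, r). The 1/r factor is
// folded into the matrix once in the constructor, so project() only needs an
// add and a floor. Illustrative use with hypothetical parameter values:
#if 0
frp::E2LSHasher<double> hasher(/*d=*/64, /*k=*/8, /*r=*/2.5, /*seed=*/42);
blaze::DynamicVector<double> x(64, 1.);
uint64_t bucket = hasher(x); // hash of the k floored, shifted projections
#endif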
TODO: this"); 355 | } else { 356 | jlt_.emplace_back(nc, nr, seed + (nc * nr), nblocks); 357 | //d_.emplace_back(roundup(nc), 0); 358 | } 359 | blaze::RNG gen(seed); 360 | DistributionType dist(std::forward(args)...); 361 | #if 0 362 | for(auto &d: d_) 363 | for(size_t i = 0; i < nc; ++i) 364 | d[i] = dist(gen); 365 | #endif 366 | } 367 | auto &multiply(const blaze::DynamicVector &c, blaze::DynamicVector &ret) const { 368 | // This will change when we support more projections than input dimensions 369 | //auto &d = d_[0]; 370 | auto &jl = jlt_[0]; 371 | const auto ts = ncroundup(); 372 | if(ret.size() != ts) ret.resize(ts); 373 | //subvector(ret, 0, nc_) = trans(c) * subvector(d, 0, nc_); 374 | subvector(ret, 0, nc_) = c; 375 | subvector(ret, nc_, ts - nc_) = 0; 376 | jl.transform_inplace(ret); 377 | return ret; 378 | } 379 | auto multiply(const blaze::DynamicVector &c) const { 380 | blaze::DynamicVector vec(ncroundup()); 381 | multiply(c, vec); 382 | return vec; 383 | } 384 | auto multiply(const blaze::DynamicVector &c) const { 385 | // This will change when we support more projections than input dimensions 386 | //auto &d = d_[0]; 387 | auto &jl = jlt_[0]; 388 | auto ts = ncroundup(); 389 | blaze::DynamicVector vec(ts); 390 | //subvector(vec, 0, nc_) = trans(c) * subvector(d, 0, nc_); if using d_ 391 | subvector(vec, 0, nc_) = trans(c); 392 | subvector(vec, nc_, ts - nc_) = 0; 393 | jl.transform_inplace(vec); 394 | return vec; 395 | } 396 | template 397 | decltype(auto) project(Args &&...args) const {return multiply(std::forward(args)...);} 398 | template 399 | uint64_t hash(const blaze::DynamicVector &c) const { 400 | blaze::DynamicVector vec = multiply(c); 401 | return cmp2hash(vec, nr_); 402 | } 403 | template 404 | uint64_t operator()(const blaze::DynamicVector &c) const { 405 | return this->hash(c); 406 | } 407 | }; 408 | 409 | 410 | template //, ContainerTemplate=template class=std::vector, 411 | //typename... 
template<typename Hasher, typename IDType=::std::uint32_t>
//, template<typename...> class ContainerTemplate=std::vector, typename... ContainerArgs>
struct LSHTable {
    using Container = std::vector<IDType>;
    Hasher hasher_;
    ska::flat_hash_map<uint64_t, Container> map_;
    IDType nadded_ = 0;
    LSHTable(Hasher &&hasher): hasher_(std::move(hasher)) {
    }
    template<typename T>
    auto add(const T &x) {
        auto id = nadded_++;
        auto v = hasher_(x);
        auto tmp = map_.emplace(v, Container{id}); // start with just ID; a bare braced list cannot be deduced here
        if(!tmp.second) tmp.first->second.push_back(id); // If already present, push back
    }
    template<typename T>
    const Container *query(const T &x) const {
        auto v = hasher_(x);
        auto it = map_.find(v);
        if(it == map_.end()) return nullptr;
        return &(it->second);
    }
};

template<typename Hasher>
auto make_lshtable(Hasher &&hasher) {
    // decay so an lvalue argument does not deduce a reference type for the table
    return LSHTable<std::decay_t<Hasher>>(std::move(hasher));
}
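// End-to-end sketch: bucket a point under an SRP hasher and look it up again.
// (Illustrative only; parameter values are arbitrary.)
#if 0
auto table = frp::make_lshtable(frp::MatrixLSHasher<double>(/*nr=*/32, /*nc=*/128));
blaze::DynamicVector<double> pt(128, 1.);
table.add(pt);
const auto *hits = table.query(pt); // identical sign pattern -> same bucket
assert(hits && hits->size() == 1);
#endif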
} // namespace frp

#endif
--------------------------------------------------------------------------------
/include/frp/mach.h:
--------------------------------------------------------------------------------
#ifndef _GFRP_MACH_H__
#define _GFRP_MACH_H__
// The include targets below were stripped by the dump; these are reconstructed
// from what the file actually uses.
#include <cassert>
#include <cctype>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <string>
#include <vector>
#include <unistd.h>
#include "kspp/ks.h"
#include "frp/util.h"

namespace frp { namespace mach {

inline void print_toks(std::vector<ks::string> &strings) { // element type stripped in the dump; ks::string assumed
    ks::string tmp;
    tmp.sprintf("Num toks: %zu\t", strings.size());
    for(const auto &str: strings) tmp.resize(tmp.size() + str.size());
    for(const auto &str: strings) tmp += str, tmp += ',';
    tmp.pop();
    fprintf(stderr, "toks: %s\n", tmp.data());
}

#ifdef __APPLE__
#define CACHE_CMD_STR "/usr/sbin/system_profiler SPHardwareDataType"
#else
#define CACHE_CMD_STR "lscpu"
#endif

template<typename T>
using ref = T&;

struct CacheSizes {
    size_t l1, l2, l3;
    operator ref<size_t[3]>() { // view the three contiguous members as a size_t[3]
        return reinterpret_cast<ref<size_t[3]>>(*this);
    }
    CacheSizes(size_t l1a, size_t l2a, size_t l3a): l1(l1a), l2(l2a), l3(l3a) {}
    CacheSizes(): l1(0), l2(0), l3(0) {}
    std::string str() const {
        char buf[64];
        sprintf(buf, "L1:%zu,L2:%zu,L3:%zu", l1, l2, l3);
        return buf;
    }
};

template<typename SizeType=size_t>
CacheSizes get_cache_sizes() {
    FILE *fp(popen(CACHE_CMD_STR, "r"));
    CacheSizes ret;
    if(fp == nullptr) return ret; // popen can fail; don't fgets on a null stream
    char buf[1 << 16];
    memset(buf, 0, sizeof(buf));
    SizeType *ptr = nullptr;
    char *line;
    while((line = fgets(buf, sizeof(buf), fp))) {
        if(strstr(line, "ache") == nullptr) continue;
        if(strstr(line, "L") == nullptr) continue;
        auto toks(ks::toksplit(line, strlen(line), 0));
        if(toks[0] == "L1i") {
            continue;
        } else if(toks[0] == "L1d") {
            ptr = &ret[0];
        } else if(toks[0] == "L2") {
            ptr = &ret[1];
        } else if(toks[0] == "L3") {
            ptr = &ret[2];
        } else {
            pclose(fp); // streams from popen must be closed with pclose, not fclose
            fprintf(stderr, "DIE (%s)\n", toks[0].data());
            exit(1);
        }
#ifdef __APPLE__
        const auto &endtok(toks.back());
        const auto &magtok(toks[toks.size() - 2]);
        *ptr = atoi(magtok.data());
        const char sizechar(endtok[0]);
#else
        const char *tmp(toks.back().data());
        *ptr = atoi(tmp);
        while(isdigit(*tmp)) ++tmp;
        const char sizechar(*tmp);
#endif
        assert(isalpha(sizechar));
        switch(sizechar) {
            case 'T': case 't': *ptr <<= 40; break;
            case 'G': case 'g': *ptr <<= 30; break;
            case 'M': case 'm': *ptr <<= 20; break;
            case 'K': case 'k': *ptr <<= 10; break;
        }
    }

    pclose(fp);
    return ret;
}

}} // namespace frp::mach


#endif // #ifndef _GFRP_MACH_H__
--------------------------------------------------------------------------------
/include/frp/mm.h:
--------------------------------------------------------------------------------
#ifndef FROOPY_MEX_H__
#define FROOPY_MEX_H__
// Include targets were stripped by the dump; reconstructed from usage.
#include <cctype>
#include <cstdlib>
#include <fstream>
#include <string>
#include "blaze/Math.h"

namespace frp {
template<typename FType>
blaze::CompressedMatrix<FType> parse_mm(std::string fn) {
    std::ifstream ifs(fn);
    std::string line;
    do {
        std::getline(ifs, line);
    } while(line.empty() || line.front() == '%');
    char *s = &line[0];
    while(std::isspace(*s)) ++s;
    unsigned long nrows = std::strtoul(s, &s, 10);
    while(std::isspace(*s)) ++s; // the original's `do ++s while(...)` was missing a statement terminator
    unsigned long ncols = std::strtoul(s, &s, 10);
    while(std::isspace(*s)) ++s;
    unsigned long nnz = std::strtoul(s, nullptr, 10);
    blaze::CompressedMatrix<FType> ret(nrows, ncols);
    ret.reserve(nnz);
    while(std::getline(ifs, line)) {
        s = &line[0];
        while(std::isspace(*s)) ++s;
        auto rownum = std::strtoul(s, &s, 10) - 1; // MatrixMarket indices are 1-based
        while(std::isspace(*s)) ++s;
        auto colnum = std::strtoul(s, &s, 10) - 1;
        while(std::isspace(*s)) ++s;
        double val = std::strtod(s, nullptr);
        ret.insert(rownum, colnum, val);
    }
    return ret;
}
} // namespace frp

#endif /* FROOPY_MEX_H__ */
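// Usage sketch for parse_mm (the file name is hypothetical): load a
// coordinate-format MatrixMarket file into a blaze sparse matrix.
#if 0
auto m = frp::parse_mm<double>("graph.mtx");
std::fprintf(stderr, "%zu x %zu, %zu nonzeros\n", m.rows(), m.columns(), m.nonZeros());
#endif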
--------------------------------------------------------------------------------
/include/frp/parser.h:
--------------------------------------------------------------------------------
#ifndef _GFRP_PARSER_H__
#define _GFRP_PARSER_H__
#include "frp/util.h"

namespace frp {


namespace io {
template<typename PtrType>
struct IOTypes;

inline size_t fgzread(FILE *fp, void *buf, unsigned len) {
    return fread(buf, 1, len, fp);
}

template<>
struct IOTypes<FILE *> {
    static constexpr decltype(&fopen) open = &fopen;
    static constexpr decltype(&fclose) close = &fclose;
    static constexpr decltype(&fgzread) read = &fgzread;
    static constexpr decltype(&feof) eof = &feof;
    static constexpr decltype(&ferror) error = &ferror;
};

template<>
struct IOTypes<gzFile> {
    static constexpr decltype(&gzopen) open = &gzopen;
    static constexpr decltype(&gzclose) close = &gzclose;
    static constexpr decltype(&gzread) read = &gzread;
    static constexpr decltype(&gzeof) eof = &gzeof;
    static constexpr decltype(&gzerror) error = &gzerror;
};

static const std::string zlibsuf = ".gz";
static const std::string bzip2suf = ".bz2";
static const std::string zstdsuf = ".zst";
static const std::string zlibcmd = "gzip -dc ";
static const std::string bzip2cmd = "bzip2 -dc ";
static const std::string zstdcmd = "zstd -dc "; // was "ztd -dc ", which is not a command

inline bool ends_with(const std::string &pat, const std::string &ref) {
    // note the argument order: checks that `ref` ends with `pat`
    return pat.size() <= ref.size() &&
           std::equal(std::rbegin(pat), std::rend(pat), std::rbegin(ref));
}

enum CType {
    UNKNOWN = -1,
    UNCOMPRESSED = 0, // FILE *
    ZLIB = 1,         // .gz
    ZSTD = 2,         // .zst
    BZIP2 = 3         // .bz2
};

inline CType infer_ctype(const std::string &path) {
    if(ends_with(zlibsuf, path)) return ZLIB;
    if(ends_with(bzip2suf, path)) return BZIP2;
    if(ends_with(zstdsuf, path)) return ZSTD;
    return UNCOMPRESSED;
}

} // namespace io

#define USE_FP(attr) static constexpr auto attr = io::IOTypes<FILE *>::attr

class LineReader {
    FILE *fp_;
    std::string path_;
    io::CType ctype_;
    char delim_;
    size_t bufsz_;
    ssize_t len_;
    char *data_;
    const std::string comment_lines_;

    /*
       Reads through a file line by line just once. Will add more functionality later.
    */
public:
    LineReader(const char *path,
               char delim='\n', size_t bufsz=0, io::CType ctype=io::UNKNOWN, std::string comment_lines="#"):
        fp_(nullptr), path_(path), ctype_(ctype >= 0 ? ctype: io::infer_ctype(path_)),
        delim_(delim), bufsz_(bufsz),
        len_(0), data_(bufsz_ ? (char *)std::malloc(bufsz_): nullptr),
        comment_lines_(std::move(comment_lines))
    {
    }
    ~LineReader() {
        if(fp_) fclose(fp_);
        std::free(data_);
    }
    class LineIterator {
        LineReader &ref_;
    public:
        LineIterator(LineReader &ref):
            ref_(ref) {}
        LineIterator &operator*() {
            return *this;
        }
        LineIterator &operator++() {
            ref_.len_ = getdelim(&ref_.data_, &ref_.bufsz_, ref_.delim_, ref_.fp_);
            if(good())
                if(std::find(ref_.comment_lines_.begin(), ref_.comment_lines_.end(), ref_.data_[0]) != ref_.comment_lines_.end())
                    return this->operator++();
            return *this;
        }
        using uivec_t = std::vector<unsigned>; // element type stripped in the dump; unsigned assumed

        ssize_t len() const {return ref_.len();}
        char *data() {return ref_.data();}
        const char *data() const {return ref_.data();}
        bool operator!=([[maybe_unused]] const LineIterator &other) const {return good();}
        bool operator< ([[maybe_unused]] const LineIterator &other) const {return good();}
        char &operator[](size_t index) {return data()[index];}
        const char &operator[](size_t index) const {return data()[index];}
        bool good() const {return ref_.len_ != -1;}
        // TODO: speed this up by avoiding making a vector of positions and just parse in the first pass.
        template