├── .gitignore ├── COPYING ├── LICENSE.txt ├── README ├── bin └── .gitignore ├── makefile ├── samples ├── .gitignore ├── add-strings │ ├── .gitignore │ ├── add-strings.c │ ├── add-strings.cu │ └── makefile ├── bin │ └── .gitignore ├── common.mk ├── grid-points │ ├── .gitignore │ ├── grid-points.cu │ └── makefile ├── include │ ├── .gitignore │ └── halloc.h ├── random-graph │ ├── .gitignore │ ├── makefile │ └── random-graph.cu └── tmp │ └── .gitignore ├── src ├── .gitignore ├── globals.cuh ├── grid.cuh ├── grid.h ├── halloc.cu ├── halloc.h ├── sbset.cuh ├── sbset.h ├── size-info.cuh ├── size-info.h ├── slab.cuh ├── slab.h ├── statistics.cuh ├── utils.cu └── utils.h └── tst ├── .gitignore ├── common-def.mk ├── common.mk ├── common ├── .gitignore ├── common.cu ├── common.h ├── cuda-malloc-wrapper.h ├── halloc-wrapper.h ├── makefile └── scatter-alloc-wrapper.h ├── corr ├── .gitignore ├── bin │ └── .gitignore ├── checkptr │ ├── .gitignore │ ├── checkptr.cu │ └── makefile ├── freeslabs │ ├── .gitignore │ ├── freeslabs.cu │ └── makefile ├── make-all.sh ├── makefile ├── prob-checkptr │ ├── .gitignore │ ├── makefile │ └── prob-checkptr.cu ├── run-all-tests.pl ├── run-test.sh ├── test │ ├── .gitignore │ ├── makefile │ └── test.cu └── tmp │ └── .gitignore ├── exp ├── .gitignore ├── common.pl ├── frag-int │ ├── .gitignore │ ├── exp-log.csv │ ├── exp-plot.gpl │ └── exp-run.pl ├── halloc-vs-scatter │ ├── .gitignore │ ├── exp-log-priv-1.csv │ ├── exp-log-priv-2.csv │ ├── exp-log-priv-3.csv │ ├── exp-log-priv.csv │ ├── exp-log-spree.csv │ ├── exp-plot.gpl │ ├── exp-run.pl │ └── graph.py ├── run-all-exps.sh ├── run-scaling-speed.sh ├── scaling │ ├── .gitignore │ ├── exp-log-lat.csv │ ├── exp-log-thru.csv │ ├── exp-plot.gpl │ ├── exp-run.pl │ └── graph.py ├── settings │ ├── .gitignore │ ├── exp-log.csv │ ├── exp-plot.gpl │ └── exp-run.pl └── speed │ ├── .gitignore │ ├── exp-log-combi.csv │ ├── exp-log-single.csv │ ├── exp-plot.gpl │ ├── exp-run.pl │ └── graph.py ├── include ├── .gitignore └── halloc.h └── perf ├── .gitignore ├── bin └── .gitignore ├── latency ├── .gitignore ├── latency.cu └── makefile ├── make-all.sh ├── makefile ├── phase-alloc-write ├── .gitignore ├── makefile └── phase-alloc-write.cu ├── phase-extfrag ├── .gitignore ├── makefile └── phase-extfrag.cu ├── phase-latency ├── .gitignore ├── makefile └── phase-latency.cu ├── phase-throughput ├── .gitignore ├── makefile └── phase-throughput.cu ├── priv-throughput ├── .gitignore ├── makefile └── priv-throughput.cu ├── run-test.sh ├── throughput ├── .gitignore ├── makefile ├── throughput-all.sh └── throughput.cu └── tmp └── .gitignore /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 Forschungszentrum Juelich 2 | 3 | Author(s): Andrew Adinetz 4 | 5 | This software is available to you under a choice of one of two 6 | licenses. 
You may choose to be licensed under the terms of the GNU 7 | General Public License (GPL) Version 2, available from the file 8 | COPYING in the main directory of this source tree, or the 9 | OpenIB.org BSD license below: 10 | 11 | Redistribution and use in source and binary forms, with or 12 | without modification, are permitted provided that the following 13 | conditions are met: 14 | 15 | - Redistributions of source code must retain the above 16 | copyright notice, this list of conditions and the following 17 | disclaimer. 18 | 19 | - Redistributions in binary form must reproduce the above 20 | copyright notice, this list of conditions and the following 21 | disclaimer in the documentation and/or other materials 22 | provided with the distribution. 23 | 24 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 25 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 26 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 27 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 28 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 29 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 30 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 31 | SOFTWARE. 32 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | Halloc GPU memory allocator, version 0.11 2 | 3 | INTRO 4 | 5 | Halloc is a high-throughput malloc/free-style dynamic memory allocator for 6 | NVidia Kepler GPUs. It uses bit arrays to represent free blocks and 7 | a hash function to quickly search for free blocks. This idea, combined 8 | with clever slab management and performance tuning, enables a very fast 9 | allocator. Halloc achieves more than 1.5 bln. mallocs/s (more than 1 10 | bln. malloc/free pairs/s) on a K20X with 16-byte allocations, tens of 11 | thousands of GPU threads and more than 100MiB allocated. This is much higher 12 | than what other state-of-the-art GPU allocators achieve. In addition, halloc's performance is 13 | more stable. This makes halloc suitable for use in GPGPU applications 14 | requiring fast dynamic memory management. Halloc is mainly designed for small 15 | allocation sizes, and delegates allocations larger than 3KiB to the CUDA allocator. 16 | 17 | 18 | REQUIREMENTS 19 | 20 | Software: CUDA 5.0 or higher (tested with 6.5) 21 | Hardware: Compute Capability 2.0 or higher (tested on CC 3.5 devices K20X and K40). 22 | 23 | Note: libraries and tests are currently not compiled for compute_50/sm_50, i.e. Maxwell. 24 | 25 | 26 | COMPILING 27 | 28 | To compile the halloc library, type (in the project's top directory): 29 | 30 | make 31 | 32 | To run correctness tests (CAUTION: takes a lot of time!): 33 | 34 | make test 35 | 36 | To build correctness tests without running them: 37 | 38 | make build-corr 39 | 40 | To build performance tests without running them: 41 | 42 | make build-perf 43 | 44 | Performance tests are then located in the ./tst/perf/bin directory, and can be 45 | invoked individually, e.g. 46 | 47 | ./tst/perf/bin/throughput 48 | ./tst/perf/bin/phase-throughput -f0.95 -F0.05 -e0.91 -g5 -t128 49 | 50 | To install, edit the PREFIX variable in the makefile to your desired install 51 | directory (default ~/usr) and type: 52 | 53 | make install 54 | 55 | To uninstall: 56 | 57 | make uninstall 58 | 59 | 60 | USING HALLOC 61 | 62 | See the samples/ directory for samples using Halloc. 
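For orientation, here is a minimal self-contained program using halloc. This is an illustrative sketch only: the kernel, the sizes and the error handling are placeholders, not part of the library; the file name myprog.cu matches the compilation example in the next section.

#include <halloc.h>

/** each thread allocates a small array, writes to it and frees it */
__global__ void use_halloc_k(int n) {
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if(i >= n)
    return;
  int *p = (int *)hamalloc(4 * sizeof(int));
  if(!p)
    return;  // allocation failed
  p[0] = i;
  hafree(p);
}

int main(void) {
  ha_init(256 * 1024 * 1024);               // make 256 MiB available to halloc
  use_halloc_k<<<1024, 128>>>(1024 * 128);  // 128K threads, one small allocation each
  cudaDeviceSynchronize();
  ha_shutdown();                            // currently a no-op
  return 0;
}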
63 | 64 | Compiling Your Program 65 | 66 | The GPU application then needs to be compiled with the halloc static library, using 67 | separate device compilation and linking. Assuming that the variable $PREFIX 68 | contains the installation prefix, and myprog.cu is the file being compiled, this 69 | can be done as follows: 70 | 71 | nvcc -arch=sm_35 -O3 -I $(PREFIX)/include -dc myprog.cu -o myprog.o 72 | nvcc -arch=sm_35 -O3 -L $(PREFIX)/lib -lhalloc -o myprog myprog.o 73 | 74 | 75 | Halloc API 76 | 77 | The functions defined by halloc are declared in the halloc.h file, which needs to be 78 | included into your code to use halloc: 79 | 80 | #include <halloc.h> 81 | 82 | Before halloc can be used in device functions, it has to be initialized with the ha_init() 83 | function: 84 | 85 | void ha_init(halloc_opts_t opts = halloc_opts_t()); 86 | 87 | It can be given a full halloc_opts_t structure to control fine-grained halloc 88 | parameters, such as the slab size or the fraction of used chunks at which a slab is 89 | considered "busy". It can also be called with just the amount of memory 90 | to allocate, or with no arguments at all to keep the defaults: 91 | 92 | ha_init(512 * 1024 * 1024); // pass memory to allocate 93 | ha_init(); // use default amount of memory
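To tune the finer parameters, a halloc_opts_t can be filled in explicitly and passed to ha_init(). The sketch below is illustrative only; the structure and its fields (memory, halloc_fraction, busy_fraction, roomy_fraction, sparse_fraction, sb_sz_sh) are declared in halloc.h, and the values shown are examples, not recommendations:

halloc_opts_t opts(1024 * 1024 * 1024);  // 1 GiB available for allocation
opts.halloc_fraction = 0.75;  // fraction of that memory managed by halloc, the rest goes to CUDA
opts.busy_fraction = 0.85;    // used-chunk fraction at which a slab is considered "busy"
opts.sb_sz_sh = 22;           // slab size shift: slabs are 2^22 bytes = 4 MiB
ha_init(opts);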
94 | 95 | Halloc defines two functions, hamalloc to allocate and hafree to free memory 96 | (malloc and free are used by the CUDA allocator, therefore halloc has to use other 97 | names). These functions can only be called from device code. 98 | 99 | void *hamalloc(size_t nbytes); 100 | void hafree(void *p); 101 | 102 | Otherwise, these functions have pretty much the same behavior as standard C 103 | malloc/free, e.g.: 104 | 105 | // allocate an array 106 | int *p = (int *)hamalloc(8 * sizeof(int)); 107 | p[0] = 0; 108 | p[1] = threadIdx.x; 109 | p[2] = 2; 110 | // ... 111 | // free the array 112 | hafree(p); 113 | 114 | 115 | // allocate a list 116 | typedef struct list_ { 117 | int element; 118 | struct list_ *next; 119 | } list; 120 | // ... 121 | list *l = (list *)hamalloc(sizeof(list)); 122 | l->element = 1; 123 | l->next = (list *)hamalloc(sizeof(list)); 124 | l->next->element = 2; 125 | l->next->next = NULL; 126 | 127 | The functions can be used in pretty much the same way as in C programs. hamalloc 128 | accepts the number of bytes to allocate, and returns a pointer to the allocated 129 | memory, or NULL if the memory cannot be allocated. Similarly, hafree accepts either 130 | a pointer returned by hamalloc or NULL, and frees the memory previously 131 | allocated. Naturally, hamalloc and hafree are thread-safe, and can be called 132 | simultaneously by threads of the same or different kernels. hamalloc allocations 133 | persist across kernel invocations, and can be used in later kernel 134 | calls. Pointers allocated by hamalloc can only be freed by hafree; 135 | they cannot be deallocated, e.g., by host/device cudaFree/free. 136 | 137 | ha_shutdown() is intended to free resources used by halloc, but is currently a 138 | no-op. 139 | 140 | 141 | LIMITATIONS 142 | 143 | There is currently no way to change parameters or allocate more memory after 144 | halloc has been initialized. 145 | 146 | 147 | BUGS 148 | 149 | Though correctness tests pass successfully, this proves nothing, of 150 | course. Some bugs are most likely there ;) 151 | -------------------------------------------------------------------------------- /bin/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | -------------------------------------------------------------------------------- /makefile: -------------------------------------------------------------------------------- 1 | PREFIX=~/usr 2 | NAME=libhalloc.a 3 | HEADER=src/halloc.h 4 | SRC_C=src/*.cu 5 | SRC_H=src/*.h src/*.cuh 6 | SRC=$(SRC_C) $(SRC_H) 7 | TGT=bin/$(NAME) 8 | ARCH= -gencode arch=compute_20,code=sm_20 \ 9 | -gencode arch=compute_30,code=sm_30 \ 10 | -gencode arch=compute_35,code=sm_35 11 | #TEST_TGT=tst/corr/bin/test 12 | 13 | TMP=*~ \\\#* src/*~ src/\\\#* tst/corr/*~ tst/corr/*.o $(TGT) $(TEST_TGT) 14 | 15 | # be careful: using the cs modifier can lead to errors. maxrregcount should be 44-64; 16 | # this allows both enough threads and enough storage (values of 44 and 54 give 17 | # good results in the phase test, while 60 and 64 provide somewhat better spree 18 | # throughput); 39-42 (39 tested) are good when operating in L1-preferred mode 19 | build: $(TGT) 20 | $(TGT): $(SRC) makefile 21 | nvcc $(ARCH) -lineinfo -O3 -lib -rdc=true -Xptxas -dlcm=cg -Xptxas -dscm=wb \ 22 | -Xptxas -maxrregcount=64 -o $(TGT) $(SRC_C) 23 | # -Xptxas -maxrregcount=42 -o $(TGT) $(SRC_C) 24 | # nvcc $(ARCH) -O3 -lib -rdc=true -Xptxas -dlcm=cs -Xptxas -dscm=cs -o $(TGT) $(SRC_C) 25 | # nvcc $(ARCH) -O3 -lib -rdc=true -o $(TGT) $(SRC_C) 26 | 27 | #test: $(TGT) makefile 28 | # make -C tst/corr/test run 29 | test: $(TGT) makefile build-corr 30 | make -C tst/corr run-only 31 | 32 | clean: 33 | rm -f $(TMP) 34 | make -C tst/common clean 35 | make -C tst/corr clean 36 | make -C tst/perf clean 37 | 38 | build-perf: $(TGT) 39 | make -C tst/common build 40 | make -C tst/perf build 41 | 42 | build-corr: $(TGT) 43 | make -C tst/common build 44 | make -C tst/corr build 45 | 46 | build-test: $(TGT) 47 | make -C tst/common build 48 | make -C tst/corr build 49 | make -C tst/perf build 50 | 51 | install: $(HEADER) $(TGT) 52 | cp $(HEADER) $(PREFIX)/include/halloc.h 53 | cp $(TGT) $(PREFIX)/lib/libhalloc.a 54 | 55 | uninstall: 56 | rm -f $(PREFIX)/include/halloc.h $(PREFIX)/lib/libhalloc.a 57 | -------------------------------------------------------------------------------- /samples/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | -------------------------------------------------------------------------------- /samples/add-strings/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | -------------------------------------------------------------------------------- /samples/add-strings/add-strings.c: -------------------------------------------------------------------------------- 1 | /** @file add-strings.c a CPU reference test that allocates and concatenates strings */ 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | int divup(int a, int b) { return a / b + (a % b ? 
1 : 0); } 12 | 13 | typedef unsigned long long int uint64; 14 | 15 | /** a random value in [a, b] range */ 16 | int random2(int a, int b) { 17 | //return a + random() % (b - a + 1); 18 | return a; 19 | } 20 | 21 | /** an array filled with random values in [a, b] range, with contiguous groups 22 | of p values starting at p being the same */ 23 | void random_array(int *arr, size_t n, int p, int a, int b) { 24 | int v = 0; 25 | for(size_t i = 0; i < n; i++) { 26 | if(i % p == 0) 27 | v = random2(a, b); 28 | arr[i] = v; 29 | } 30 | } 31 | 32 | void alloc_strs(char **strs, const int *lens, int n) { 33 | #pragma omp parallel for 34 | for(int i = 0; i < n; i++) { 35 | int l = lens[i]; 36 | char *str = (char *)malloc((l + 1) * sizeof(char)); 37 | for(int j = 0; j < l; j++) 38 | str[j] = ' '; 39 | str[l] = 0; 40 | strs[i] = str; 41 | } 42 | } //alloc_strs 43 | 44 | void free_strs(char ** strs, int n) { 45 | #pragma omp parallel for 46 | for(int i = 0; i < n; i++) 47 | free(strs[i]); 48 | } // free_strs 49 | 50 | void add_strs(char ** __restrict__ c, char **a, char **b, int n) { 51 | #pragma omp parallel for 52 | for(int i = 0; i < n; i++) { 53 | const char *sa = a[i], *sb = b[i]; 54 | int la = strlen(sa), lb = strlen(sb), lc = la + lb; 55 | char *sc = (char *)malloc((lc + 1) * sizeof(char)); 56 | strcpy(sc, sa); 57 | strcpy(sc + la, sb); 58 | c[i] = sc; 59 | } 60 | } // add_strs 61 | 62 | #define MIN_LEN 31 63 | #define MAX_LEN 31 64 | #define PERIOD 32 65 | 66 | /** a test for string addition on CPU */ 67 | void string_test_cpu(int n, int print) { 68 | int min_len = MIN_LEN; 69 | int max_len = MAX_LEN; 70 | int period = PERIOD; 71 | // string lengths on host and device 72 | int *h_la = 0, *h_lb = 0; 73 | size_t l_sz = n * sizeof(int), s_sz = n * sizeof(char *); 74 | h_la = (int *)malloc(l_sz); 75 | h_lb = (int *)malloc(l_sz); 76 | random_array(h_la, n, period, min_len, max_len); 77 | random_array(h_lb, n, period, min_len, max_len); 78 | 79 | // string arrays 80 | char **h_sa, **h_sb, **h_sc; 81 | h_sa = (char **)malloc(s_sz); 82 | h_sb = (char **)malloc(s_sz); 83 | h_sc = (char **)malloc(s_sz); 84 | 85 | // allocate strings 86 | double t1, t2; 87 | t1 = omp_get_wtime(); 88 | alloc_strs(h_sa, h_la, n); 89 | alloc_strs(h_sb, h_lb, n); 90 | t2 = omp_get_wtime(); 91 | //printf("t1 = %lf, t2 = %lf\n", t1, t2); 92 | if(print) { 93 | double t = (t2 - t1) / 2; 94 | printf("CPU allocation time: %4.2lf ms\n", t * 1e3); 95 | printf("CPU allocation performance: %4.2lf Mstrings/s\n", n / t * 1e-6); 96 | } 97 | 98 | //concatenate strings 99 | t1 = omp_get_wtime(); 100 | add_strs(h_sc, h_sa, h_sb, n); 101 | t2 = omp_get_wtime(); 102 | if(print) { 103 | double t = t2 - t1; 104 | printf("CPU concatenation time: %4.2lf ms\n", t * 1e3); 105 | printf("CPU concatenation performance: %4.2lf Mstrings/s\n", n / t * 1e-6); 106 | } 107 | 108 | // free strings 109 | t1 = omp_get_wtime(); 110 | free_strs(h_sa, n); 111 | free_strs(h_sb, n); 112 | free_strs(h_sc, n); 113 | t2 = omp_get_wtime(); 114 | if(print) { 115 | double t = (t2 - t1) / 3; 116 | //double t = (t2 - t1) / 2; 117 | printf("CPU freeing time: %4.2lf ms\n", t * 1e3); 118 | printf("CPU freeing performance: %4.2lf Mstrings/s\n", n / t * 1e-6); 119 | } 120 | // free the rest 121 | free(h_sa); 122 | free(h_sb); 123 | free(h_sc); 124 | free(h_la); 125 | free(h_lb); 126 | } // string_test_cpu 127 | 128 | 129 | int main(int argc, char **argv) { 130 | //srandom((int)time(0)); 131 | size_t memory = 512 * 1024 * 1024; 132 | printf("==============================\n"); 133 
| // CPU test 134 | string_test_cpu(10000, 0); 135 | string_test_cpu(500000, 1); 136 | //ha_shutdown(); 137 | } // main 138 | -------------------------------------------------------------------------------- /samples/add-strings/add-strings.cu: -------------------------------------------------------------------------------- 1 | /** @file grid-points.cu a test where grid points are sorted into a grid */ 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | /** a macro for checking CUDA calls */ 14 | #define cucheck(call) \ 15 | { \ 16 | cudaError_t cucheck_err = (call); \ 17 | if(cucheck_err != cudaSuccess) { \ 18 | const char* err_str = cudaGetErrorString(cucheck_err); \ 19 | fprintf(stderr, "%s (%d): %s in %s\n", __FILE__, __LINE__, err_str, #call); \ 20 | exit(-1); \ 21 | } \ 22 | } 23 | 24 | int divup(int a, int b) { return a / b + (a % b ? 1 : 0); } 25 | 26 | typedef unsigned long long int uint64; 27 | 28 | /** prefetches into L1 cache */ 29 | __device__ inline void prefetch_l1(const void *p) { 30 | asm("prefetch.global.L1 [%0];": :"l"(p)); 31 | } 32 | 33 | /** prefetches into L2 cache */ 34 | __device__ inline void prefetch_l2(const void *p) { 35 | asm("prefetch.global.L2 [%0];": :"l"(p)); 36 | } 37 | 38 | /** a random value in [a, b] range */ 39 | int random(int a, int b) { 40 | return a + random() % (b - a + 1); 41 | //return a; 42 | } 43 | 44 | /** an array filled with random values in [a, b] range, with contiguous groups 45 | of p values starting at p being the same */ 46 | void random_array(int *arr, size_t n, int p, int a, int b) { 47 | int v = 0; 48 | for(size_t i = 0; i < n; i++) { 49 | if(i % p == 0) 50 | v = random(a, b); 51 | arr[i] = v; 52 | } 53 | } 54 | 55 | void alloc_strs(char **strs, const int *lens, int n) { 56 | #pragma omp parallel for 57 | for(int i = 0; i < n; i++) { 58 | int l = lens[i]; 59 | char *str = (char *)malloc((l + 1) * sizeof(char)); 60 | //strs[i] = (char *)malloc((l + 1) * sizeof(char)); 61 | /* 62 | for(int j = 0; j < l; j++) 63 | str[j] = ' '; 64 | str[l] = 0; */ 65 | strs[i] = str; 66 | } 67 | } //alloc_strs 68 | 69 | /** a kernel that allocates and initializes an array of strings; memory for 70 | strings is allocated using halloc */ 71 | __global__ void alloc_strs_k 72 | (char ** __restrict__ strs, 73 | const int * __restrict__ lens, int n) { 74 | int i = threadIdx.x + blockIdx.x * blockDim.x; 75 | if(i >= n) 76 | return; 77 | // allocate string (don't forget zero byte!) 
78 | int l = lens[i]; 79 | //if(i == 0) 80 | // printf("l = %d\n", l); 81 | //if(i > n - 256) 82 | // printf("i = %d, l = %d, n = %d\n", i, l, n); 83 | // char *str = (char *)hamalloc((l + 1) * sizeof(char)); 84 | // for(int j = 0; j < l; j++) 85 | // str[j] = '0' + j; 86 | // str[l] = 0; 87 | 88 | uint64 *str = (uint64 *)hamalloc((l + 1) * sizeof(char)); 89 | int l_i = (l + 1) / 8; 90 | for(int j = 0; j < l_i - 1; j++) { 91 | str[j] = 0x2020202020202020ull; 92 | } 93 | str[l_i - 1] = 0x0020202020202020ull; 94 | 95 | // save string pointer 96 | strs[i] = (char *)str; 97 | } // alloc_strs_k 98 | 99 | /** a kernel that frees memory allocated for strings */ 100 | __global__ void free_strs_k 101 | (char ** __restrict__ strs, int n) { 102 | int i = threadIdx.x + blockIdx.x * blockDim.x; 103 | if(i >= n) 104 | return; 105 | hafree(strs[i]); 106 | } // free_strs_k 107 | 108 | void free_strs(char ** strs, int n) { 109 | #pragma omp parallel for 110 | for(int i = 0; i < n; i++) 111 | free(strs[i]); 112 | } // free_strs 113 | 114 | /** finds the zero byte in a long value, returns INT_MAX if not found */ 115 | __device__ inline int izero_byte(uint64 v) { 116 | int l = INT_MAX; 117 | #pragma unroll 8 118 | for(int i = 0; i < 8; i++) { 119 | if(((v >> i * 8) & 0xffu) == 0) 120 | l = min(l, i); 121 | } 122 | return l; 123 | } // zero_byte 124 | 125 | // couple of helper device functions, analogous to C library 126 | /** get the length of a string; it is assumed that s is at least 8-byte aligned */ 127 | __device__ inline int dstrlen(const char * __restrict__ s) { 128 | //int len = -1; 129 | int len = INT_MAX; 130 | //while(*s++) len++; 131 | //return len; 132 | const uint64 *s1 = (const uint64 *)s; 133 | int ll = 0; 134 | while(len == INT_MAX) { 135 | //while(true) { 136 | uint64 c1 = *s1++; 137 | #pragma unroll 8 138 | for(int i = 0; i < 8; i++) { 139 | //if(((c1 >> i * 8) & 0xffu) == 0) 140 | // return len; 141 | //len++; 142 | if(((c1 >> i * 8) & 0xffu) == 0) 143 | len = min(len, ll + i); 144 | } 145 | ll++; 146 | } 147 | return len; 148 | } // strlen 149 | 150 | /** concatenate two strings into the third string; all strings have been 151 | allocated, and the result has enough place to hold the arguments; 152 | all pointers are assumed to be 8-byte aligned 153 | */ 154 | __device__ inline void dstrcat 155 | (char * __restrict__ c, const char * __restrict__ b, 156 | const char * __restrict__ a) { 157 | // while(*c++ = *a++) {} 158 | // c--; 159 | // while(*c++ = *b++) {} 160 | uint64 *c1 = (uint64 *)c; 161 | const uint64 *a1 = (const uint64 *)a; 162 | const uint64 *b1 = (const uint64 *)b; 163 | uint64 cc = 0, aa = 0, bb = 0; 164 | int ccpos = 0; 165 | //uint cc1 = 0; 166 | // TODO: optimize computations for concatenation, similar to dstrlen() 167 | // copy first string 168 | int izb = INT_MAX; 169 | do { 170 | aa = *a1++; 171 | cc |= aa << ccpos; 172 | izb = izero_byte(aa); 173 | if(izb == INT_MAX) { 174 | *c1++ = cc; 175 | } else { 176 | ccpos = izb * 8; 177 | break; 178 | } 179 | } while(izb == INT_MAX); 180 | /* 181 | for(int i = 0; i < 8; i++) { 182 | cc1 = (uint)(aa >> i * 8) & 0xffu; 183 | //ccpos += 8; 184 | if(cc1) { 185 | cc |= (uint64)cc1 << ccpos; 186 | ccpos += 8; 187 | if(ccpos == 64) { 188 | *c1++ = cc; 189 | ccpos = 0; 190 | //cc = aa >> i * 8; 191 | cc = 0; 192 | } 193 | } else 194 | break; 195 | } 196 | } while(cc1); 197 | */ 198 | // copy second string 199 | do { 200 | bb = *b1++; 201 | // commit current character group 202 | cc |= bb << ccpos; 203 | *c1++ = cc; 204 | // update for 
next 205 | izb = izero_byte(bb); 206 | cc = bb >> ccpos; 207 | if(izb != INT_MAX) { 208 | *c1++ = cc; 209 | break; 210 | } 211 | } while(izb == INT_MAX); 212 | 213 | /* 214 | for(int i = 0; i < 8; i++) { 215 | cc1 = (uint)(bb >> i * 8) & 0xffu; 216 | cc |= (uint64)cc1 << ccpos; 217 | ccpos += 8; 218 | if(ccpos == 64) { 219 | *c1++ = cc; 220 | ccpos = 0; 221 | //cc = bb >> i * 8; 222 | cc = 0; 223 | } 224 | if(!cc1) 225 | break; 226 | } 227 | } while(cc1); 228 | if(ccpos) 229 | *c1 = cc; 230 | */ 231 | } // dstrcat 232 | 233 | /** adds two arrays of strings elementwise */ 234 | __global__ void add_strs_k 235 | (char ** __restrict__ c, const char * const * __restrict__ a, 236 | const char * const * __restrict__ b, int n) { 237 | int i = threadIdx.x + blockIdx.x * blockDim.x; 238 | if(i >= n) 239 | return; 240 | // measure strings a and b 241 | const char *sa = a[i], *sb = b[i]; 242 | int la = dstrlen(sa), lb = dstrlen(sb), lc = la + lb; 243 | //int la = 31, lb = 31, lc = la + lb; 244 | // allocate memory and get new string 245 | char *sc = (char *)hamalloc((lc + 1) * sizeof(char)); 246 | dstrcat(sc, sa, sb); 247 | c[i] = sc; 248 | } // add_strs_k 249 | 250 | void add_strs(char ** __restrict__ c, char **a, char **b, int n) { 251 | #pragma omp parallel for 252 | for(int i = 0; i < n; i++) { 253 | const char *sa = a[i], *sb = b[i]; 254 | int la = strlen(sa), lb = strlen(sb), lc = la + lb; 255 | //int la = 31, lb = 31, lc = la + lb; 256 | char *sc = (char *)malloc((lc + 1) * sizeof(char)); 257 | strcpy(sc, sa); 258 | strcpy(sc + la, sb); 259 | c[i] = sc; 260 | } 261 | } // add_strs 262 | 263 | #define MIN_LEN 31 264 | #define MAX_LEN 31 265 | #define PERIOD 32 266 | 267 | /** a test for string addition on GPU */ 268 | void string_test_gpu(int n, bool print) { 269 | int min_len = MIN_LEN; 270 | int max_len = MAX_LEN; 271 | int period = PERIOD; 272 | // string lengths on host and device 273 | int *h_la = 0, *d_la = 0, *h_lb = 0, *d_lb = 0; 274 | size_t l_sz = n * sizeof(int), s_sz = n * sizeof(char *); 275 | cucheck(cudaMallocHost((void **)&h_la, l_sz)); 276 | cucheck(cudaMallocHost((void **)&h_lb, l_sz)); 277 | cucheck(cudaMalloc((void **)&d_la, l_sz)); 278 | cucheck(cudaMalloc((void **)&d_lb, l_sz)); 279 | random_array(h_la, n, period, min_len, max_len); 280 | random_array(h_lb, n, period, min_len, max_len); 281 | cucheck(cudaMemcpy(d_la, h_la, l_sz, cudaMemcpyHostToDevice)); 282 | cucheck(cudaMemcpy(d_lb, h_lb, l_sz, cudaMemcpyHostToDevice)); 283 | 284 | // string arrays 285 | char **d_sa, **d_sb, **d_sc; 286 | cucheck(cudaMalloc((void **)&d_sa, s_sz)); 287 | cucheck(cudaMalloc((void **)&d_sb, s_sz)); 288 | cucheck(cudaMalloc((void **)&d_sc, s_sz)); 289 | 290 | // allocate strings 291 | int bs = 128, grid = divup(n, bs); 292 | double t1, t2; 293 | t1 = omp_get_wtime(); 294 | alloc_strs_k<<>>(d_sa, d_la, n); 295 | cucheck(cudaGetLastError()); 296 | alloc_strs_k<<>>(d_sb, d_lb, n); 297 | cucheck(cudaGetLastError()); 298 | cucheck(cudaStreamSynchronize(0)); 299 | t2 = omp_get_wtime(); 300 | //printf("t1 = %lf, t2 = %lf\n", t1, t2); 301 | if(print) { 302 | double t = (t2 - t1) / 2; 303 | printf("GPU allocation time: %4.2lf ms\n", t * 1e3); 304 | printf("GPU allocation performance: %4.2lf Mstrings/s\n", n / t * 1e-6); 305 | } 306 | 307 | //concatenate strings 308 | t1 = omp_get_wtime(); 309 | add_strs_k<<>>(d_sc, d_sa, d_sb, n); 310 | cucheck(cudaGetLastError()); 311 | cucheck(cudaStreamSynchronize(0)); 312 | t2 = omp_get_wtime(); 313 | if(print) { 314 | double t = t2 - t1; 315 | printf("GPU 
concatenation time: %4.2lf ms\n", t * 1e3); 316 | printf("GPU concatenation performance: %4.2lf Mstrings/s\n", n / t * 1e-6); 317 | } 318 | 319 | // free strings 320 | t1 = omp_get_wtime(); 321 | free_strs_k<<>>(d_sa, n); 322 | cucheck(cudaGetLastError()); 323 | free_strs_k<<>>(d_sb, n); 324 | cucheck(cudaGetLastError()); 325 | free_strs_k<<>>(d_sc, n); 326 | cucheck(cudaGetLastError()); 327 | cucheck(cudaStreamSynchronize(0)); 328 | t2 = omp_get_wtime(); 329 | if(print) { 330 | double t = (t2 - t1) / 3; 331 | //double t = (t2 - t1) / 2; 332 | printf("GPU freeing time: %4.2lf ms\n", t * 1e3); 333 | printf("GPU freeing performance: %4.2lf Mstrings/s\n", n / t * 1e-6); 334 | } 335 | 336 | // free the rest 337 | cucheck(cudaFree(d_sa)); 338 | cucheck(cudaFree(d_sb)); 339 | cucheck(cudaFree(d_sc)); 340 | cucheck(cudaFree(d_la)); 341 | cucheck(cudaFree(d_lb)); 342 | cucheck(cudaFreeHost(h_la)); 343 | cucheck(cudaFreeHost(h_lb)); 344 | } // string_test_gpu 345 | 346 | /** a test for string addition on CPU */ 347 | void string_test_cpu(int n, bool print) { 348 | int min_len = MIN_LEN; 349 | int max_len = MAX_LEN; 350 | int period = PERIOD; 351 | // string lengths on host and device 352 | int *h_la = 0, *h_lb = 0; 353 | size_t l_sz = n * sizeof(int), s_sz = n * sizeof(char *); 354 | h_la = (int *)malloc(l_sz); 355 | h_lb = (int *)malloc(l_sz); 356 | random_array(h_la, n, period, min_len, max_len); 357 | random_array(h_lb, n, period, min_len, max_len); 358 | 359 | // string arrays 360 | char **h_sa, **h_sb, **h_sc; 361 | h_sa = (char **)malloc(s_sz); 362 | h_sb = (char **)malloc(s_sz); 363 | h_sc = (char **)malloc(s_sz); 364 | 365 | // allocate strings 366 | double t1, t2; 367 | t1 = omp_get_wtime(); 368 | alloc_strs(h_sa, h_la, n); 369 | alloc_strs(h_sb, h_lb, n); 370 | t2 = omp_get_wtime(); 371 | //printf("t1 = %lf, t2 = %lf\n", t1, t2); 372 | if(print) { 373 | double t = (t2 - t1) / 2; 374 | printf("CPU allocation time: %4.2lf ms\n", t * 1e3); 375 | printf("CPU allocation performance: %4.2lf Mstrings/s\n", n / t * 1e-6); 376 | } 377 | 378 | //concatenate strings 379 | t1 = omp_get_wtime(); 380 | add_strs(h_sc, h_sa, h_sb, n); 381 | t2 = omp_get_wtime(); 382 | if(print) { 383 | double t = t2 - t1; 384 | printf("CPU concatenation time: %4.2lf ms\n", t * 1e3); 385 | printf("CPU concatenation performance: %4.2lf Mstrings/s\n", n / t * 1e-6); 386 | } 387 | 388 | // free strings 389 | t1 = omp_get_wtime(); 390 | free_strs(h_sa, n); 391 | free_strs(h_sb, n); 392 | free_strs(h_sc, n); 393 | t2 = omp_get_wtime(); 394 | if(print) { 395 | double t = (t2 - t1) / 3; 396 | //double t = (t2 - t1) / 2; 397 | printf("CPU freeing time: %4.2lf ms\n", t * 1e3); 398 | printf("CPU freeing performance: %4.2lf Mstrings/s\n", n / t * 1e-6); 399 | } 400 | 401 | // free the rest 402 | free(h_sa); 403 | free(h_sb); 404 | free(h_sc); 405 | free(h_la); 406 | free(h_lb); 407 | } // string_test_cpu 408 | 409 | 410 | int main(int argc, char **argv) { 411 | srandom((int)time(0)); 412 | size_t memory = 512 * 1024 * 1024; 413 | // GPU test 414 | ha_init(halloc_opts_t(memory)); 415 | //cucheck(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1)); 416 | string_test_gpu(10000, false); 417 | string_test_gpu(1000000, true); 418 | printf("==============================\n"); 419 | // CPU test 420 | string_test_cpu(10000, false); 421 | string_test_cpu(1000000, true); 422 | ha_shutdown(); 423 | } // main 424 | -------------------------------------------------------------------------------- /samples/add-strings/makefile: 
-------------------------------------------------------------------------------- 1 | NAME=add-strings 2 | 3 | include ../common.mk 4 | -------------------------------------------------------------------------------- /samples/bin/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | -------------------------------------------------------------------------------- /samples/common.mk: -------------------------------------------------------------------------------- 1 | LIB_DIR=-L../../bin 2 | #LIB_DIR=-L ~/usr/lib 3 | LIBHALLOC=-lhalloc 4 | LIBHALLOC_FILE=../../bin/libhalloc.a 5 | INCLUDE_DIR=-I../include 6 | #INCLUDE_DIR=-I ~/usr/include 7 | SRC_C=*.cu 8 | SRC_H=../include/halloc.h 9 | SRC=$(SRC_C) $(SRC_H) 10 | #SRC=$(SRC_C) 11 | TGT=../bin/$(NAME) 12 | 13 | OBJ=../tmp/$(NAME).o 14 | 15 | TMP=*~ \\\#* ../tmp/*.o $(TGT) 16 | 17 | ARCH := -gencode arch=compute_20,code=sm_20 \ 18 | -gencode arch=compute_30,code=sm_30 \ 19 | -gencode arch=compute_35,code=sm_35 20 | 21 | build: $(TGT) 22 | $(TGT): $(LIBHALLOC_FILE) $(OBJ) makefile 23 | # nvcc -arch=sm_35 -O3 -Xcompiler -fopenmp $(OBJ) $(LIBHALLOC) -o $(TGT) 24 | nvcc $(ARCH) -O3 -Xcompiler -fopenmp $(LIB_DIR) $(LIBHALLOC) -o \ 25 | $(TGT) $(OBJ) 26 | 27 | $(OBJ): $(SRC) makefile 28 | nvcc $(ARCH) -O3 -Xcompiler -fopenmp -Xptxas -dlcm=cg -Xptxas -dscm=wb \ 29 | -Xcompiler -pthread $(INCLUDE_DIR) -dc $(SRC_C) -o $(OBJ) 30 | 31 | run: $(TGT) 32 | ./$(TGT) 33 | 34 | clean: 35 | rm -f $(TMP) 36 | 37 | $(LIBHALLOC): 38 | make -C ../.. 39 | -------------------------------------------------------------------------------- /samples/grid-points/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | -------------------------------------------------------------------------------- /samples/grid-points/grid-points.cu: -------------------------------------------------------------------------------- 1 | /** @file grid-points.cu a test where grid points are sorted into a grid */ 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | /** a macro for checking CUDA calls */ 13 | #define cucheck(call) \ 14 | { \ 15 | cudaError_t cucheck_err = (call); \ 16 | if(cucheck_err != cudaSuccess) { \ 17 | const char* err_str = cudaGetErrorString(cucheck_err); \ 18 | fprintf(stderr, "%s (%d): %s in %s\n", __FILE__, __LINE__, err_str, #call); \ 19 | exit(-1); \ 20 | } \ 21 | } 22 | 23 | int divup(int a, int b) { return a / b + (a % b ? 
1 : 0); } 24 | 25 | /** a simple 3d vector */ 26 | template 27 | struct vec3 { 28 | T x, y, z; 29 | __host__ __device__ vec3(T x, T y, T z) { 30 | this->x = x; this->y = y; this->z = z; 31 | } 32 | __host__ __device__ vec3(T r = 0) { 33 | this->x = this->y = this->z = r; 34 | } 35 | }; 36 | 37 | typedef vec3 ivec3; 38 | typedef vec3 fvec3; 39 | 40 | /** a single point list */ 41 | struct point_list_t { 42 | /** point index */ 43 | int ip; 44 | /** next element in the list, or 0 if end */ 45 | point_list_t *next; 46 | }; 47 | 48 | /** gets a random float value between 0 and 1 */ 49 | float frandom(void) { 50 | const int rand_max = 65536; 51 | return (double)(random() % rand_max) / rand_max; 52 | } 53 | 54 | /** gets a random point within [0, 1]^3 cube */ 55 | fvec3 random_point(void) { 56 | return fvec3(frandom(), frandom(), frandom()); 57 | } // random_point 58 | 59 | typedef unsigned long long int uint64; 60 | 61 | /** atomicCAS wrapper for pointers (arguments same as standard atomicCAS()) */ 62 | __device__ void *atomicCAS(void **address, void *compare, void *val) { 63 | return (void *)atomicCAS((uint64 *)address, (uint64)compare, (uint64)val); 64 | } // atomicCAS 65 | 66 | /** atomicExch wrapper for void **/ 67 | __device__ void *atomicExch(void **address, void *val) { 68 | return (void *)atomicExch((uint64 *)address, (uint64)val); 69 | } 70 | 71 | /** a function to insert a point into a grid on device; this function can be 72 | called concurrently by multiple threads */ 73 | __device__ void insert_point 74 | (point_list_t **grid, int ncells, const fvec3 * __restrict__ ps, int ip, 75 | point_list_t *plist) { 76 | // compute the cell 77 | fvec3 p = ps[ip]; 78 | ivec3 cell; 79 | cell.x = max(min((int)floorf(p.x * ncells), ncells - 1), 0); 80 | cell.y = max(min((int)floorf(p.y * ncells), ncells - 1), 0); 81 | cell.z = max(min((int)floorf(p.z * ncells), ncells - 1), 0); 82 | 83 | // get the cell pointer 84 | point_list_t * volatile *pcell = grid + (cell.x + ncells * (cell.y + ncells * 85 | cell.z)); 86 | // try to take over the new start 87 | // TODO: add __threadfence() somewhere 88 | point_list_t *old = (point_list_t *)atomicExch((void **)pcell, plist); 89 | plist->ip = ip; 90 | plist->next = old; 91 | } // insert_point 92 | 93 | /** frees the grid cell; one cell can be simultaneously freed by one thread only 94 | */ 95 | __device__ void free_cell(point_list_t **grid, int ncells, ivec3 cell, 96 | point_list_t *pre_chains) { 97 | point_list_t **pcell = grid + cell.x + ncells * (cell.y + ncells * cell.z); 98 | // free all cells 99 | point_list_t *plist = *pcell, *pnext; 100 | while(plist) { 101 | pnext = plist->next; 102 | if(!pre_chains) { 103 | hafree(plist); 104 | } 105 | plist = pnext; 106 | } 107 | } // free_cell 108 | 109 | /** the kernel to insert points into the grid */ 110 | __global__ void sort_points_k 111 | (point_list_t **grid, int ncells, const fvec3 * __restrict__ ps, 112 | point_list_t *pre_chains, int n) { 113 | int ip = threadIdx.x + blockIdx.x * blockDim.x; 114 | if(ip >= n) 115 | return; 116 | 117 | // allocate memory for list element 118 | point_list_t *plist; 119 | if(pre_chains) 120 | plist = pre_chains + ip; 121 | else { 122 | plist = (point_list_t *)hamalloc(sizeof(point_list_t)); 123 | //plist = new point_list_t(); 124 | } 125 | if(!plist) { 126 | //printf("cannot allocate memory\n"); 127 | return; 128 | } 129 | 130 | insert_point(grid, ncells, ps, ip, plist); 131 | } // sort_points_k 132 | 133 | /** the kernel to free the entire grid; this is 1d kernel */ 134 | 
__global__ void free_grid_k 135 | (point_list_t **grid, int ncells, point_list_t *pre_chains) { 136 | int ncells3 = ncells * ncells * ncells; 137 | int i = threadIdx.x + blockIdx.x * blockDim.x; 138 | if(i >= ncells3) 139 | return; 140 | ivec3 cell; 141 | cell.x = i % ncells; 142 | cell.y = i % (ncells * ncells) / ncells; 143 | cell.z = i / (ncells * ncells); 144 | free_cell(grid, ncells, cell, pre_chains); 145 | } // free_grid_k 146 | 147 | // a test to fill in the grid and then free it 148 | void grid_test(int n, int ncells, bool alloc, bool print) { 149 | // points 150 | size_t sz = n * sizeof(fvec3); 151 | fvec3 *ps, *d_ps; 152 | ps = (fvec3 *)malloc(sz); 153 | cucheck(cudaMalloc((void **)&d_ps, sz)); 154 | for(int ip = 0; ip < n; ip++) { 155 | ps[ip] = random_point(); 156 | //printf("point = (%lf, %lf %lf)\n", (double)ps[ip].x, (double)ps[ip].y, 157 | // (double)ps[ip].z); 158 | } 159 | cucheck(cudaMemcpy(d_ps, ps, sz, cudaMemcpyHostToDevice)); 160 | 161 | // grid 162 | int ncells3 = ncells * ncells * ncells; 163 | size_t grid_sz = ncells3 * sizeof(point_list_t *); 164 | point_list_t **d_grid; 165 | cucheck(cudaMalloc((void **)&d_grid, grid_sz)); 166 | cucheck(cudaMemset(d_grid, 0, grid_sz)); 167 | 168 | // pre-allocated per-point chains 169 | point_list_t *pre_chains = 0; 170 | if(!alloc) { 171 | cucheck(cudaMalloc((void **)&pre_chains, n * sizeof(point_list_t))); 172 | cucheck(cudaMemset(pre_chains, 0, n * sizeof(point_list_t))); 173 | } 174 | 175 | // fill the grid 176 | double t1 = omp_get_wtime(); 177 | int bs = 128; 178 | sort_points_k<<>>(d_grid, ncells, d_ps, pre_chains, n); 179 | cucheck(cudaGetLastError()); 180 | cucheck(cudaStreamSynchronize(0)); 181 | double t2 = omp_get_wtime(); 182 | 183 | // free the grid 184 | free_grid_k<<>>(d_grid, ncells, pre_chains); 185 | cucheck(cudaGetLastError()); 186 | cucheck(cudaStreamSynchronize(0)); 187 | double t3 = omp_get_wtime(); 188 | 189 | // free everything 190 | //free(ps); 191 | cucheck(cudaFree(d_grid)); 192 | cucheck(cudaFree(d_ps)); 193 | cucheck(cudaFree(pre_chains)); 194 | 195 | // print time 196 | if(print) { 197 | printf("allocation time %.2lf ms\n", (t2 - t1) * 1e3); 198 | printf("free time %.2lf ms\n", (t3 - t2) * 1e3); 199 | printf("allocation performance %.2lf Mpoints/s\n", n / (t2 - t1) * 1e-6); 200 | printf("free performance %.2lf Mpoints/s\n", n / (t3 - t2) * 1e-6); 201 | } // if(print) 202 | 203 | } // grid_test 204 | 205 | int main(int argc, char **argv) { 206 | srandom((int)time(0)); 207 | size_t memory = 512 * 1024 * 1024; 208 | bool alloc = true; 209 | //cucheck(cudaSetDevice(0)); 210 | ha_init(halloc_opts_t(memory)); 211 | // warm-up run 212 | grid_test(10000, 8, alloc, false); 213 | // main run 214 | grid_test(1000000, 32, alloc, true); 215 | ha_shutdown(); 216 | } // main 217 | -------------------------------------------------------------------------------- /samples/grid-points/makefile: -------------------------------------------------------------------------------- 1 | NAME=grid-points 2 | 3 | include ../common.mk 4 | -------------------------------------------------------------------------------- /samples/include/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.o 5 | -------------------------------------------------------------------------------- /samples/include/halloc.h: -------------------------------------------------------------------------------- 1 | ../../src/halloc.h 
-------------------------------------------------------------------------------- /samples/random-graph/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | -------------------------------------------------------------------------------- /samples/random-graph/makefile: -------------------------------------------------------------------------------- 1 | NAME=random-graph 2 | 3 | include ../common.mk 4 | -------------------------------------------------------------------------------- /samples/random-graph/random-graph.cu: -------------------------------------------------------------------------------- 1 | /** @file grid-points.cu a test where grid points are sorted into a grid */ 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | /** a macro for checking CUDA calls */ 14 | #define cucheck(call) \ 15 | { \ 16 | cudaError_t cucheck_err = (call); \ 17 | if(cucheck_err != cudaSuccess) { \ 18 | const char* err_str = cudaGetErrorString(cucheck_err); \ 19 | fprintf(stderr, "%s (%d): %s in %s\n", __FILE__, __LINE__, err_str, #call); \ 20 | exit(-1); \ 21 | } \ 22 | } 23 | 24 | /** sets CUDA device variable */ 25 | #define cuset(symbol, T, val) \ 26 | { \ 27 | void *cuset_addr; \ 28 | cucheck(cudaGetSymbolAddress(&cuset_addr, symbol)); \ 29 | T cuset_val = (val); \ 30 | cucheck(cudaMemcpy(cuset_addr, &cuset_val, sizeof(cuset_val), \ 31 | cudaMemcpyHostToDevice)); \ 32 | } // cuset 33 | 34 | int divup(int a, int b) { return a / b + (a % b ? 1 : 0); } 35 | 36 | typedef unsigned long long int uint64; 37 | 38 | /** a random value in [a, b] range */ 39 | int random(int a, int b) { 40 | return a + random() % (b - a + 1); 41 | //return a; 42 | } 43 | 44 | /** an array filled with random values in [a, b] range, with contiguous groups 45 | of p values starting at p being the same */ 46 | void random_array(int *arr, size_t n, int p, int a, int b) { 47 | int v = 0; 48 | for(size_t i = 0; i < n; i++) { 49 | if(i % p == 0) 50 | v = random(a, b); 51 | arr[i] = v; 52 | } 53 | } 54 | 55 | /** a list of neighboring vertices */ 56 | struct vertex_t; 57 | struct edge_list_t { 58 | /** the target vertex */ 59 | vertex_t *target; 60 | /** the next element in the vertex list */ 61 | edge_list_t *next; 62 | /** creates a new list edge */ 63 | __host__ __device__ edge_list_t(vertex_t *target, edge_list_t *next = 0) 64 | : target(target), next(next) {} 65 | }; // edge_list_t 66 | 67 | /** a single vertex */ 68 | struct vertex_t { 69 | /** the id of the vertex (= illusion of some data) */ 70 | int id; 71 | /** the number of edges in the vertex */ 72 | int nedges; 73 | /** the list of edges of the vertex */ 74 | edge_list_t *edges; 75 | /** create a new vertex */ 76 | __host__ __device__ vertex_t(int id) : 77 | id(id), nedges(0), edges(0) {} 78 | /** adds an edge (to the beginning of the list) */ 79 | __device__ void add_edge(vertex_t *target) { 80 | edge_list_t *new_edges = (edge_list_t *)hamalloc(sizeof(edge_list_t)); 81 | *new_edges = edge_list_t(target, edges); 82 | edges = new_edges; 83 | nedges++; 84 | } // add_edge 85 | /** same function on the host */ 86 | __host__ void add_edge_host(vertex_t *target) { 87 | edge_list_t *new_edges = (edge_list_t *)malloc(sizeof(edge_list_t)); 88 | *new_edges = edge_list_t(target, edges); 89 | edges = new_edges; 90 | nedges++; 91 | } // add_edge 92 | 93 | }; // vertex_t 94 | 95 | /** random number data on device */ 96 | uint * __constant__ 
random_states_g; 97 | 98 | void drandom_init(void) { 99 | // TODO: somehow standardize this number 100 | const uint MAX_NTHREADS = 8 * 1024 * 1024; 101 | uint n = MAX_NTHREADS; 102 | size_t sz = n * sizeof(uint); 103 | uint *d_random_states, *h_random_states; 104 | 105 | // allocate memory 106 | cucheck(cudaMalloc((void **)&d_random_states, sz)); 107 | h_random_states = (uint *)malloc(sz); 108 | 109 | // initialize random values, respect groups 110 | //uint gp = 1; 111 | uint gp = 1; 112 | uint seed; 113 | for(uint i = 0; i < n; i++) { 114 | if(i % gp == 0) 115 | seed = random(); 116 | h_random_states[i] = seed; 117 | } 118 | cucheck(cudaMemcpy(d_random_states, h_random_states, sz, 119 | cudaMemcpyHostToDevice)); 120 | free(h_random_states); 121 | 122 | // initialize device variable 123 | cuset(random_states_g, uint *, d_random_states); 124 | } // drandom_init 125 | 126 | /** gets the next seed */ 127 | static inline __host__ __device__ uint next_seed(uint seed) { 128 | /* seed ^= (seed << 13); 129 | seed ^= (seed >> 17); 130 | seed ^= (seed << 5); */ 131 | seed = (seed ^ 61) ^ (seed >> 16); 132 | seed *= 9; 133 | seed = seed ^ (seed >> 4); 134 | seed *= 0x27d4eb2d; 135 | seed = seed ^ (seed >> 15); 136 | return seed; 137 | } // next_seed 138 | 139 | /** get the random value on the device */ 140 | static inline __device__ uint drandom(void) { 141 | uint tid = threadIdx.x + blockIdx.x * blockDim.x; 142 | uint seed = random_states_g[tid]; 143 | seed = next_seed(seed); 144 | random_states_g[tid] = seed; 145 | return seed; 146 | } // drandom 147 | 148 | /** get the random value within the specified interval (both ends inclusive) on 149 | the device */ 150 | static inline __device__ uint drandom(uint a, uint b) { 151 | return a + (drandom() & 0x00ffffffu) % (uint)(b - a + 1); 152 | } // drandom 153 | 154 | static inline __host__ uint hdrandom(uint *seed, uint a, uint b) { 155 | *seed = next_seed(*seed); 156 | return a + (*seed & 0x00ffffffu) % (uint)(b - a + 1); 157 | } // hdrandom 158 | 159 | /** get the floating-point random value between 0 and 1 */ 160 | // static inline __device__ float drandomf(void) { 161 | // float f = 1.0f / (1024.0f * 1024.0f); 162 | // uint m = 1024 * 1024; 163 | // return f * drandom(0, m - 1); 164 | // } // drandomf 165 | 166 | /** kernel building a random graph */ 167 | __global__ void random_graph_build_k 168 | (vertex_t *__restrict__ vs, int nvs, int max_degree) { 169 | int tid = threadIdx.x + blockIdx.x * blockDim.x; 170 | if(tid >= nvs) 171 | return; 172 | vertex_t v = vertex_t(tid); 173 | // build edges for each vertex 174 | int nedges = drandom(1, max_degree); 175 | for(int iedge = 0; iedge < nedges; iedge++) { 176 | vertex_t *target = &vs[drandom(1, nvs)]; 177 | v.add_edge(target); 178 | } 179 | // write the vertex out 180 | vs[tid] = v; 181 | } // random_graph_build_k 182 | 183 | /** random graph test on GPU */ 184 | void random_graph_gpu(int nvs, int max_degree, bool print) { 185 | size_t vs_sz = nvs * sizeof(vertex_t); 186 | vertex_t *d_vs; 187 | cucheck(cudaMalloc((void **)&d_vs, vs_sz)); 188 | // build the graph 189 | int bs = 128; 190 | double t1 = omp_get_wtime(); 191 | random_graph_build_k<<>>(d_vs, nvs, max_degree); 192 | cucheck(cudaGetLastError()); 193 | cucheck(cudaStreamSynchronize(0)); 194 | double t2 = omp_get_wtime(); 195 | 196 | if(print) { 197 | double t = t2 - t1; 198 | double perf = 0.5 * (max_degree + 1) * nvs / t; 199 | printf("GPU time: %.3lf ms\n", t * 1e3); 200 | printf("GPU performance: %.3lf Medges/s\n", perf * 1e-6); 201 | } 202 | 
cucheck(cudaFree(d_vs)); 203 | } // random_graph_gpu 204 | 205 | /** random graph test on CPU */ 206 | void random_graph_cpu(int nvs, int max_degree, bool print) { 207 | size_t vs_sz = nvs * sizeof(vertex_t); 208 | vertex_t *vs = (vertex_t *)malloc(vs_sz); 209 | // build the graph 210 | double t1 = omp_get_wtime(); 211 | #pragma omp parallel 212 | { 213 | uint seed = random(); 214 | #pragma omp for 215 | for(int tid = 0; tid < nvs; tid++) { 216 | vertex_t v = vertex_t(tid); 217 | // build edges for each vertex 218 | int nedges = hdrandom(&seed, 1, max_degree); 219 | for(int iedge = 0; iedge < nedges; iedge++) { 220 | vertex_t *target = &vs[hdrandom(&seed, 1, nvs)]; 221 | v.add_edge_host(target); 222 | } 223 | // write the vertex out 224 | vs[tid] = v; 225 | } 226 | } 227 | double t2 = omp_get_wtime(); 228 | 229 | if(print) { 230 | double t = t2 - t1; 231 | double perf = 0.5 * (max_degree + 1) * nvs / t; 232 | printf("CPU time: %.3lf ms\n", t * 1e3); 233 | printf("CPU performance: %.3lf Medges/s\n", perf * 1e-6); 234 | } 235 | free(vs); 236 | } // random_graph_cpu 237 | 238 | int main(int argc, char **argv) { 239 | srandom((int)time(0)); 240 | drandom_init(); 241 | size_t memory = 512 * 1024 * 1024; 242 | // GPU test 243 | ha_init(halloc_opts_t(memory)); 244 | //cucheck(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1)); 245 | random_graph_gpu(10000, 4, false); 246 | random_graph_gpu(1000000, 8, true); 247 | printf("==============================\n"); 248 | // CPU test 249 | random_graph_cpu(10000, 4, false); 250 | random_graph_cpu(1000000, 8, true); 251 | ha_shutdown(); 252 | } // main 253 | -------------------------------------------------------------------------------- /samples/tmp/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | -------------------------------------------------------------------------------- /src/globals.cuh: -------------------------------------------------------------------------------- 1 | /** @file forward-globals.cuh global variables that are used in many .cuh files, 2 | and thus require a forward declaration. This file is included into halloc.cu 3 | before all other .cuh files */ 4 | 5 | /** real possible number of superblocks (based on device memory and superblock 6 | size) */ 7 | static __constant__ uint nsbs_g; 8 | 9 | /** superblock size (common for all superblocks, power-of-two) */ 10 | static __constant__ uint sb_sz_g; 11 | /** superblock size shift (for fast division operations) */ 12 | static __constant__ uint sb_sz_sh_g; 13 | 14 | /** real number of sizes */ 15 | static __constant__ uint nsizes_g; 16 | 17 | /** slab descriptors */ 18 | static __device__ superblock_t sbs_g[MAX_NSBS]; 19 | /** slab pointers (stored separately from descriptors, as they do not change) */ 20 | __attribute__((aligned(128))) static __device__ void *sb_ptrs_g[MAX_NSBS]; 21 | /** slab (non-distributed) counters */ 22 | static __device__ uint sb_counters_g[MAX_NSBS]; 23 | -------------------------------------------------------------------------------- /src/grid.cuh: -------------------------------------------------------------------------------- 1 | /** @file grid.cuh implementation of superblock grid */ 2 | 3 | /** base address of the grid; this is the start address of the grid. 
It is 4 | always aligned to superblock size boundary */ 5 | static void * __constant__ base_addr_g; 6 | /** superblock grid; TODO: cache in L1, this helps */ 7 | __attribute__((aligned(128))) static __device__ uint64 sb_grid_g[2 * MAX_NSBS]; 8 | 9 | //extern __constant__ uint sb_sz_g; 10 | //extern __constant__ uint sb_sz_sh_g; 11 | 12 | /** add the superblock to the grid 13 | // TODO: use on device as well, also with size id 14 | */ 15 | __host__ void grid_add_sb 16 | (uint64 *cells, void *base_addr, uint sb, void *sb_addr, uint sb_sz) { 17 | void *sb_end_addr = (char *)sb_addr + sb_sz - 1; 18 | uint icell_start = ((char *)sb_addr - (char *)base_addr) / sb_sz; 19 | uint icell_end = ((char *)sb_addr + sb_sz - 1 - (char *)base_addr) / sb_sz; 20 | for(uint icell = icell_start; icell <= icell_end; icell++) { 21 | uint64 cell = cells[icell]; 22 | cell |= 1ull << GRID_INIT_POS; 23 | void *cell_start_addr = (char *)base_addr + (uint64)icell * sb_sz; 24 | void *cell_end_addr = (char *)base_addr + (uint64)(icell + 1) * sb_sz - 1; 25 | if(sb_addr <= cell_start_addr) { 26 | // set first superblock in cell 27 | uint64 first_sb_mask = ((1ull << GRID_SB_LEN) - 1) << GRID_FIRST_SB_POS; 28 | cell = ~first_sb_mask & cell | (uint64)sb << GRID_FIRST_SB_POS; 29 | } 30 | if(sb_end_addr >= cell_end_addr) { 31 | // set second superblock in cell 32 | uint64 second_sb_mask = ((1ull << GRID_SB_LEN) - 1) << GRID_SECOND_SB_POS; 33 | cell = ~second_sb_mask & cell | (uint64)sb << GRID_SECOND_SB_POS; 34 | } 35 | uint64 mid_addr_mask = ((1ull << GRID_ADDR_LEN) - 1) << GRID_ADDR_POS; 36 | // set the break address 37 | if(sb_addr > cell_start_addr) { 38 | // current superblock is the second superblock, mid address is its start 39 | uint64 mid_addr = ((char *)sb_addr - (char *)cell_start_addr) >> 40 | GRID_ADDR_SH; 41 | cell = ~mid_addr_mask & cell | mid_addr << GRID_ADDR_POS; 42 | //printf("icell = %d, cell_addr = %p, sb_addr = %p, mid_addr = %llx\n", 43 | // icell, cell_start_addr, sb_addr, mid_addr); 44 | } else if(sb_end_addr <= cell_end_addr) { 45 | // current superblock is the first superblock, mid address is end of this 46 | // superblock + 1 47 | uint64 mid_addr = ((char *)sb_end_addr + 1 - (char *)cell_start_addr) >> 48 | GRID_ADDR_SH; 49 | cell = ~mid_addr_mask & cell | mid_addr << GRID_ADDR_POS; 50 | //printf("icell = %d, cell_addr = %p, sb_addr = %p, mid_addr = %llx\n", 51 | // icell, cell_start_addr, sb_addr, mid_addr); 52 | } 53 | // save the modified cell 54 | cells[icell] = cell; 55 | } // for(each cell in interval) 56 | } // grid_add_sb 57 | 58 | /** gets the mid-address of the grid cell */ 59 | __device__ inline void *grid_mid_addr(uint icell, uint64 cell) { 60 | uint in_sb_addr = ((cell >> GRID_ADDR_POS) & ((1ull << GRID_ADDR_LEN) - 1)) 61 | << GRID_ADDR_SH; 62 | return (char *)base_addr_g + (uint64)icell * sb_sz_g + in_sb_addr; 63 | } 64 | /** gets the grid cell for the pointer */ 65 | __device__ inline uint64 grid_cell(void *p, uint *icell) { 66 | // TODO: handle stale cell data 67 | //*icell = ((char *)p - (char *)base_addr_g) / sb_sz_g; 68 | *icell = ((char *)p - (char *)base_addr_g) >> sb_sz_sh_g; 69 | //return sb_grid_g[*icell]; 70 | return ldca(sb_grid_g + *icell); 71 | } 72 | /** gets the (de)allocation size id for the pointer */ 73 | __device__ inline uint grid_size_id(uint icell, uint64 cell, void *p) { 74 | void *midp = grid_mid_addr(icell, cell); 75 | return p < midp ? 
grid_first_size_id(cell) : grid_second_size_id(cell); 76 | } 77 | /** gets the (de)allocation superblock id for the pointer */ 78 | __device__ inline uint grid_sb_id(uint icell, uint64 cell, void *p) { 79 | //void *midp = grid_mid_addr(icell, cell); 80 | uint in_sb_addr = ((cell >> GRID_ADDR_POS) & ((1ull << GRID_ADDR_LEN) - 1)) 81 | << GRID_ADDR_SH; 82 | //uint in_sb_addr = ((cell >> GRID_ADDR_POS) & ((1ull << GRID_ADDR_LEN) - 1)); 83 | //uint in_p = (char *)p - (char *)base_addr_g - ((uint64)icell << sb_sz_sh_g); 84 | uint in_p = (char *)p - (char *)base_addr_g - (uint64)icell * sb_sz_g; 85 | //uint in_p = uint(((char *)p - (char *)base_addr_g) >> GRID_ADDR_SH) - (icell << 86 | // (sb_sz_sh_g - GRID_ADDR_SH)); 87 | //return p < midp ? grid_first_sb_id(cell) : grid_second_sb_id(cell); 88 | return in_p < in_sb_addr ? grid_first_sb_id(cell) : grid_second_sb_id(cell); 89 | } 90 | -------------------------------------------------------------------------------- /src/grid.h: -------------------------------------------------------------------------------- 1 | #ifndef HALLOC_GRID_H_ 2 | #define HALLOC_GRID_H_ 3 | 4 | #include "utils.h" 5 | 6 | // constants related to grid cells 7 | #define GRID_SIZE_LEN 6 8 | #define GRID_ADDR_LEN 20 9 | #define GRID_SB_LEN 13 10 | #define GRID_INIT_POS 0 11 | #define GRID_FIRST_SIZE_POS 1 12 | #define GRID_SECOND_SIZE_POS 7 13 | #define GRID_FIRST_SB_POS 13 14 | #define GRID_SECOND_SB_POS 26 15 | #define GRID_ADDR_POS 39 16 | #define GRID_ADDR_SH 4 17 | #define GRID_SB_NONE ((1 << GRID_SB_LEN) - 1) 18 | 19 | /** initial value for the grid cell */ 20 | __host__ __device__ inline uint64 grid_cell_init() { 21 | uint64 no_sb_field = (1 << GRID_SB_LEN) - 1; 22 | return no_sb_field << GRID_FIRST_SB_POS | no_sb_field << GRID_SECOND_SB_POS; 23 | } 24 | /** checks whether the grid cell is initialized */ 25 | __device__ inline bool grid_is_init(uint64 cell) { 26 | return (cell >> GRID_INIT_POS) & 1; 27 | } 28 | /** gets the first size id of the grid cell */ 29 | __device__ inline uint grid_first_size_id(uint64 cell) { 30 | return (cell >> GRID_FIRST_SIZE_POS) & ((1ull << GRID_SIZE_LEN) - 1); 31 | } 32 | /** gets the second size id of the grid cell */ 33 | __device__ inline uint grid_second_size_id(uint64 cell) { 34 | return (cell >> GRID_SECOND_SIZE_POS) & ((1ull << GRID_SIZE_LEN) - 1); 35 | } 36 | /** gets the first superblock id of the grid cell */ 37 | __device__ inline uint grid_first_sb_id(uint64 cell) { 38 | return (cell >> GRID_FIRST_SB_POS) & ((1ull << GRID_SB_LEN) - 1); 39 | } 40 | /** gets the second superblock id of the grid cell */ 41 | __device__ inline uint grid_second_sb_id(uint64 cell) { 42 | return (cell >> GRID_SECOND_SB_POS) & ((1ull << GRID_SB_LEN) - 1); 43 | } 44 | 45 | #endif 46 | -------------------------------------------------------------------------------- /src/halloc.h: -------------------------------------------------------------------------------- 1 | #ifndef HALLOC_H_ 2 | #define HALLOC_H_ 3 | 4 | /** @file hamalloc.h header for halloc allocator */ 5 | #ifdef HALLOCLIB_COMPILING 6 | #define HALLOC_EXTERN 7 | #else 8 | #define HALLOC_EXTERN extern 9 | #endif 10 | 11 | //#define HALLOC_CPP 12 | 13 | /** structure (class) for halloc allocator options */ 14 | 15 | struct halloc_opts_t { 16 | /** total amount of memory available for allocation, bytes */ 17 | size_t memory; 18 | /** memory fraction available to halloc allocator, the rest goes to CUDA for 19 | larger allocations */ 20 | double halloc_fraction; 21 | /** occupancy fraction at which a 
slab is considered busy */ 22 | double busy_fraction; 23 | /** occupancy fraction at which a slab is considered roomy */ 24 | double roomy_fraction; 25 | /** occupancy fraction at which a slab is considered sparse */ 26 | double sparse_fraction; 27 | /** shift value for slab size (size in bytes) */ 28 | int sb_sz_sh; 29 | /** default constructor which initializes the structure with default values */ 30 | halloc_opts_t(size_t memory = 512 * 1024 * 1024) : 31 | memory(memory), halloc_fraction(0.75), busy_fraction(0.835), 32 | roomy_fraction(0.6), sparse_fraction(0.012), sb_sz_sh(22) 33 | {} 34 | }; // halloc_opts_t 35 | 36 | /** memory allocation */ 37 | //HALLOC_EXTERN __device__ __noinline__ void *hamalloc(uint nbytes); 38 | HALLOC_EXTERN __device__ __noinline__ void *hamalloc(size_t nbytes); 39 | 40 | /** freeing the memory */ 41 | HALLOC_EXTERN __device__ __noinline__ void hafree(void *p); 42 | 43 | /** initializes memory allocator host-side 44 | @param memory amount of memory which should be made available for allocation 45 | */ 46 | void ha_init(halloc_opts_t opts = halloc_opts_t()); 47 | 48 | /** shuts down memory allocator host-side */ 49 | void ha_shutdown(void); 50 | 51 | /** gets current external fragmentation 52 | @param ideal whether the ideal case is considered, i.e. all slabs do really 53 | get allocated from CUDA allocator memory; currently ignored and assumed false 54 | */ 55 | double ha_extfrag(bool ideal); 56 | 57 | // overrides for malloc and free if requested; currently unstable 58 | //#ifdef HALLOC_OVERRIDE_STDC 59 | #if 0 60 | __device__ void *malloc(uint nbytes) throw() { 61 | return hamalloc(nbytes); 62 | } 63 | inline __device__ void free(void *p) throw() { hafree(p); } 64 | extern "C" __host__ void free(void *p) throw(); 65 | #endif 66 | 67 | // overload new/delete C++ operators on device if requested 68 | // currently doesn't make much sense: the compiler treats operator new very 69 | // specially, and obviously links it against an external library, which kills 70 | // all performance 71 | #if defined(HALLOC_CPP) 72 | #include 73 | //struct halloc_tag_t; 74 | //typedef halloc_tag_t *halloc_t; 75 | //#define halloc ((halloc_t)0) 76 | __device__ void *operator new(size_t nbytes) throw(std::bad_alloc); 77 | //__device__ void *operator new[](size_t nbytes) throw(std::bad_alloc); 78 | __device__ void operator delete(void *p) throw(); 79 | //__device__ void operator delete[](void *p) throw(); 80 | #endif 81 | 82 | #endif 83 | -------------------------------------------------------------------------------- /src/sbset.cuh: -------------------------------------------------------------------------------- 1 | /** @file sbset.cuh slab set implementation */ 2 | 3 | //extern __constant__ uint nsbs_g; 4 | 5 | __device__ inline uint sbset_get_from(sbset_t sbset) { 6 | // the condition is always true, but the compiler doesn't know that 7 | // without it, performance somewhat drops 8 | #if SBSET_CTR 9 | if(nsbs_g) { 10 | int old = *(volatile int*)&sbset[SB_SET_SZ - 1]; 11 | //int old = atomicAdd((int *)&sbset[SB_SET_SZ - 1], 0); 12 | if(old <= 0) 13 | return SB_NONE; 14 | } 15 | #endif 16 | // then get it 17 | for(uint iword = 0; iword < nsbs_g / WORD_SZ; iword++) { 18 | // atomicOr() also works good here 19 | uint word = *(volatile uint *)&sbset[iword]; 20 | //uint word = atomicOr(&sbset[iword], 0); 21 | while(word) { 22 | uint ibit = __ffs(word) - 1; 23 | // try locking the bit 24 | uint mask = 1 << ibit; 25 | if(atomicAnd(&sbset[iword], ~mask) & mask) { 26 | #if SBSET_CTR 27 | 
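// the bit was set before atomicAnd() cleared it, so this thread now owns
// the slab; decrement the occupancy counter kept in the set's last word
// so it stays consistent with the bit array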
atomicSub(&sbset[SB_SET_SZ - 1], 1); 28 | #endif 29 | return iword * WORD_SZ + ibit; 30 | } 31 | word &= ~mask; 32 | } 33 | } 34 | return SB_NONE; 35 | } // sbset_get_from 36 | -------------------------------------------------------------------------------- /src/sbset.h: -------------------------------------------------------------------------------- 1 | #ifndef HALLOC_SBSET_H_ 2 | #define HALLOC_SBSET_H_ 3 | 4 | /** @file sbset.h slab set definitions */ 5 | 6 | #include "utils.h" 7 | 8 | #define SBSET_CTR 0 9 | 10 | /** superblock set type; word 0 is actually an additional counter */ 11 | typedef uint sbset_t[SB_SET_SZ]; 12 | //typedef uint *sbset_t; 13 | 14 | //#define WORD_SZ2 64 15 | 16 | /** gets superblock from set (and removes it) */ 17 | __device__ inline uint sbset_get_from(sbset_t sbset); 18 | 19 | /** adds ("returns") superblock to the set */ 20 | __device__ inline void sbset_add_to(sbset_t sbset, uint sb) { 21 | uint iword = sb / WORD_SZ, ibit = sb % WORD_SZ; 22 | uint mask = 1 << ibit; 23 | //atomicAdd((int *)&sbset[SB_SET_SZ - 1], 1); 24 | #if SBSET_CTR 25 | if(!(atomicOr(&sbset[iword], mask) & mask)) 26 | atomicAdd((int *)&sbset[SB_SET_SZ - 1], 1); 27 | #else 28 | atomicOr(&sbset[iword], mask); 29 | #endif 30 | //atomicAdd((int *)&sbset[SB_SET_SZ - 1], 31 | // 1 - ((atomicOr(&sbset[iword], mask) & mask) >> ibit)); 32 | } // sbset_add_to 33 | 34 | /** removes the specified slab from set */ 35 | __device__ inline void sbset_remove_from(sbset_t sbset, uint sb) { 36 | uint iword = sb / WORD_SZ, ibit = sb % WORD_SZ; 37 | uint mask = 1 << ibit; 38 | #if SBSET_CTR 39 | if(atomicAnd(&sbset[iword], ~mask) & mask) 40 | atomicSub((int *)&sbset[SB_SET_SZ - 1], 1); 41 | #else 42 | atomicAnd(&sbset[iword], ~mask); 43 | #endif 44 | } // sbset_remove_from 45 | 46 | #endif 47 | -------------------------------------------------------------------------------- /src/size-info.cuh: -------------------------------------------------------------------------------- 1 | /** @file size-infos.cuh implementation of some stuff related to size 2 | information */ 3 | 4 | /** information on sizes */ 5 | __attribute__((aligned(128))) static __device__ size_info_t size_infos_g[MAX_NSIZES]; 6 | //static __constant__ size_info_t size_infos_g[MAX_NSIZES]; 7 | 8 | /** same data, but in different memory */ 9 | // __device__ size_info_t size_infos_dg[MAX_NSIZES]; 10 | -------------------------------------------------------------------------------- /src/size-info.h: -------------------------------------------------------------------------------- 1 | #ifndef HALLOC_SIZE_INFO_H_ 2 | #define HALLOC_SIZE_INFO_H_ 3 | 4 | /** @file size-info.h information and definitions related to sizes */ 5 | 6 | /** size information type; this is non-changing information, to be stored in 7 | constant memory */ 8 | typedef struct { 9 | /** number of chunks in slab */ 10 | uint nchunks; 11 | /** size of a single chunk */ 12 | uint chunk_sz; 13 | /** id of the chunk to which the size belongs */ 14 | uint chunk_id; 15 | /** number of chunks in a block for this size */ 16 | uint nchunks_in_block; 17 | /** threshold (in chunks) for the slab to be declared "sparse", so that it can 18 | be reused by other sizes with the same chunk size */ 19 | uint sparse_threshold; 20 | /** step for the hash function */ 21 | uint hash_step; 22 | /** threshold (in chunks) for the slab to be declared "roomy" */ 23 | uint roomy_threshold; 24 | /** threshold (in chunks) for the slab to be declared "busy" and be detached */ 25 | uint busy_threshold; 26 | } 
size_info_t __attribute__((aligned(32))); 27 | 28 | /** maximum number of sizes supported */ 29 | #define MAX_NSIZES 64 30 | /** maximum number of different chunk sizes supported */ 31 | #define MAX_NCHUNK_IDS 8 32 | /** a "no-size" constant */ 33 | #define SZ_NONE (~0) 34 | /** block step (16 bytes by default), a power of two */ 35 | #define BLOCK_STEP 16 36 | /** minimum unit size (allocation blocks are either 2 or 3 units) */ 37 | #define MIN_UNIT_SZ 8 38 | /** maximum unit size */ 39 | #define MAX_UNIT_SZ 1024 40 | /** unit step */ 41 | #define UNIT_STEP 2 42 | /** the number of units */ 43 | #define NUNITS 8 44 | /** minimum block size */ 45 | #define MIN_BLOCK_SZ 16 46 | /** maximum block size */ 47 | #define MAX_BLOCK_SZ 3072 48 | 49 | // chunk manipulation 50 | uint chunk_val(uint chunk_sz) { 51 | //return chunk_sz; 52 | uint div3 = chunk_sz % 3 ? 1 : 3; 53 | if(chunk_sz % 3 == 0) 54 | chunk_sz /= 3; 55 | uint sh = 0; 56 | for(; (1 << sh) < chunk_sz; sh++); 57 | return div3 << 16 | sh; 58 | } 59 | 60 | __host__ __device__ inline uint chunk_mul(uint v, uint chunk_sz) { 61 | //return v * chunk_sz; 62 | return (v << (chunk_sz & 0xffffu)) * (chunk_sz >> 16); 63 | } 64 | 65 | __host__ __device__ inline uint chunk_div(uint v, uint chunk_sz) { 66 | // return v / chunk_sz 67 | if(chunk_sz >> 16u == 3u) 68 | v /= 3u; 69 | return v >> (chunk_sz & 0xffffu); 70 | } 71 | 72 | #endif 73 | -------------------------------------------------------------------------------- /src/slab.h: -------------------------------------------------------------------------------- 1 | #ifndef HALLOC_SLAB_H_ 2 | #define HALLOC_SLAB_H_ 3 | 4 | /** @file slab.h slab (superblock) header file */ 5 | 6 | #include "utils.h" 7 | 8 | /** possible slab flags */ 9 | enum { 10 | /** slab allocated from CUDA device-side memory, and must be freed into it */ 11 | SB_CUDA = 0x1 12 | }; 13 | 14 | /** superblock descriptor type; information is mostly changing; note that during 15 | allocation, a superblock is mostly identified by superblock id */ 16 | typedef struct { 17 | /** slab size id 18 | TODO: check if we really need it 19 | */ 20 | unsigned int size_id; 21 | /** whether this is a head slab */ 22 | unsigned int is_head; 23 | /** slab chunk id */ 24 | unsigned int chunk_id; 25 | /** slab chunk size */ 26 | uint chunk_sz; 27 | /** pointer to memory owned by superblock */ 28 | void *ptr; 29 | } superblock_t; 30 | 31 | /** a step to check whether the slab can be moved to another free category */ 32 | #define SB_FREE_STEP 2048 33 | /** maximum number of tries inside a slab after which the allocation 34 | attempt is abandoned */ 35 | //#define MAX_NTRIES 32 36 | #define MAX_NTRIES 32 37 | /** the number of steps after which count check needs be peformed, to ensure 38 | that the allocator is not searching in a block that is already full */ 39 | #define CHECK_NTRIES 2 40 | /** a "no-sb" constant */ 41 | #define SB_NONE (~0) 42 | /** number of heads between which to distribute allocations */ 43 | #define NHEADS 1 44 | /** whether to cache head slabs */ 45 | #define CACHE_HEAD_SBS 1 46 | /** step frequency, i.e. 
what's the step for step update */ 47 | //#define STEP_FREQ 64 48 | #define STEP_FREQ 64 49 | 50 | /** positions and sizes related to slab counters */ 51 | // modified values enable better reading of counters in hex 52 | #define SB_SIZE_POS 0 53 | //#define SB_SIZE_SZ 6 54 | #define SB_SIZE_SZ 5 55 | //#define SB_CHUNK_POS 6 56 | #define SB_CHUNK_POS 5 57 | #define SB_CHUNK_SZ 3 58 | //#define SB_HEAD_POS 9 59 | #define SB_HEAD_POS 8 60 | #define SB_HEAD_SZ 1 61 | //#define SB_COUNT_POS 10 62 | #define SB_COUNT_POS 12 63 | #define SB_COUNT_SZ 20 64 | 65 | // functions for manipulation with counter values 66 | /** gets slab allocation count */ 67 | __device__ inline uint sb_count(uint counter) { 68 | return counter >> SB_COUNT_POS; 69 | } 70 | /** gets size id */ 71 | // __device__ inline uint sb_size_id(uint counter) { 72 | // return (counter >> SB_SIZE_POS) & ((1 << SB_SIZE_SZ) - 1); 73 | // } 74 | /** gets chunk id */ 75 | __device__ inline uint sb_chunk_id(uint counter) { 76 | return (counter >> SB_CHUNK_POS) & ((1 << SB_CHUNK_SZ) - 1); 77 | } 78 | /** gets whether the slab is head (i.e., head bit is set) */ 79 | __device__ inline bool sb_is_head(uint counter) { 80 | return (counter >> SB_HEAD_POS) & 1; 81 | } 82 | /** sets the head for the counter, returns the old counter value */ 83 | __device__ inline uint sb_set_head(uint *counter) { 84 | //return atomicOr(counter, 1 << SB_HEAD_POS); 85 | return atomicAdd(counter, 1 << SB_HEAD_POS); 86 | } 87 | /** resets the head for the slab counter, returns the old counter value */ 88 | __device__ inline uint sb_reset_head(uint *counter) { 89 | //return atomicAnd(counter, ~(1 << SB_HEAD_POS)); 90 | return atomicSub(counter, 1 << SB_HEAD_POS); 91 | } 92 | /** sets the chunk size for the slab counter, returns the old counter value; the 93 | chunk must be NONE for this to work correctly */ 94 | __device__ inline uint sb_set_chunk 95 | (uint *counter, uint chunk_id) { 96 | return atomicSub 97 | (counter, ((SZ_NONE - chunk_id) & ((1 << SB_CHUNK_SZ) - 1)) << 98 | SB_CHUNK_POS); 99 | } // sb_set_chunk 100 | 101 | /** resets the chunk from the specified size to the new size; */ 102 | __device__ inline uint sb_reset_chunk 103 | (uint *counter, uint old_chunk_id) { 104 | return atomicAdd 105 | (counter, ((SZ_NONE - old_chunk_id) & ((1 << SB_CHUNK_SZ) - 1)) << 106 | SB_CHUNK_POS); 107 | } // sb_reset_chunk 108 | 109 | /** updates the size id only, returns the new counter */ 110 | // __device__ inline uint sb_update_size_id 111 | // (uint *counter, uint old_size_id, uint new_size_id) { 112 | // old_size_id = old_size_id & ((1 << SB_SIZE_SZ) - 1); 113 | // new_size_id = new_size_id & ((1 << SB_SIZE_SZ) - 1); 114 | // if(old_size_id >= new_size_id) 115 | // return atomicSub(counter, old_size_id - new_size_id); 116 | // else 117 | // return atomicAdd(counter, new_size_id - old_size_id); 118 | // } // sb_update_size_id 119 | /** gets the counter value for the specified count, size id and chunk id */ 120 | __host__ __device__ inline uint sb_counter_val 121 | (uint count, bool is_head, uint chunk_id, uint size_id) { 122 | return count << SB_COUNT_POS | (is_head ? 
1 : 0) << SB_HEAD_POS | 123 | (chunk_id & ((1 << SB_CHUNK_SZ) - 1)) << SB_CHUNK_POS | 124 | (size_id & ((1 << SB_SIZE_SZ) - 1)) << SB_SIZE_POS; 125 | } 126 | /** atomically increments/decrements slab counter, returns old slab counter value */ 127 | __device__ inline uint sb_counter_inc(uint *counter, uint change) { 128 | return atomicAdd(counter, change << SB_COUNT_POS); 129 | } 130 | __device__ inline uint sb_counter_dec(uint *counter, uint change) { 131 | return atomicSub(counter, change << SB_COUNT_POS); 132 | } 133 | 134 | /** a single-thread-in-warp slab lock; it loops until the slab is locked */ 135 | // __device__ inline void sb_lock(superblock_t *sb) { 136 | // lock(&sb->mutex); 137 | // } 138 | /** a single-thread-in-warp slab unlock; it loops until the slab is unlocked */ 139 | // __device__ inline void sb_unlock(superblock_t *sb) { 140 | // unlock(&sb->mutex); 141 | // } 142 | 143 | #endif 144 | -------------------------------------------------------------------------------- /src/statistics.cuh: -------------------------------------------------------------------------------- 1 | /** @file statistics.cuh functions for collecting memory statistics */ 2 | 3 | /** total free memory on device, B */ 4 | __device__ uint64 free_mem_g; 5 | /** maximum memory that can be allocated, B */ 6 | __device__ uint64 max_alloc_mem_g; 7 | /** total Halloc memory (incl. CUDA memory), B */ 8 | __constant__ uint64 total_mem_g; 9 | /** memory assigned to CUDA allocator, B */ 10 | __constant__ uint64 cuda_mem_g; 11 | 12 | /** one-thread kernel determining maximum allocatable memory; it does so by 13 | doing binary search on what CUDA malloc can do */ 14 | __global__ void find_max_alloc_k() { 15 | uint i = threadIdx.x + blockIdx.x * blockDim.x; 16 | if(i > 0) 17 | return; 18 | uint64 hi = cuda_mem_g, lo = 0, mid; 19 | uint64 min_diff = 1024 * 1024; 20 | while(hi - lo > min_diff) { 21 | mid = (hi + lo) / 2; 22 | void *p = malloc(mid); 23 | if(p) { 24 | lo = mid; 25 | free(p); 26 | } else 27 | hi = mid; 28 | } // while 29 | max_alloc_mem_g = mid; 30 | } // find_max_alloc_k 31 | 32 | /** multi-thread kernel that counts free memory available on device by launching 33 | one thread per slab */ 34 | __global__ void find_free_mem_k(bool ideal) { 35 | uint i = threadIdx.x + blockIdx.x * blockDim.x; 36 | if(i >= nsbs_g) 37 | return; 38 | uint sb_sz = sb_sz_g; 39 | uint chunk_sz = sbs_g[i].chunk_sz; 40 | uint nused_chunks = sb_count(sb_counters_g[i]); 41 | uint used_mem = chunk_sz != 0 ? 
chunk_mul(nused_chunks, chunk_sz) : 0; 42 | uint free_sz = sb_sz - used_mem; 43 | atomicAdd(&free_mem_g, free_sz); 44 | if(ideal && chunk_sz == 0) 45 | atomicAdd(&max_alloc_mem_g, sb_sz); 46 | if(i == 0) 47 | atomicAdd(&free_mem_g, cuda_mem_g); 48 | } // find_free_mem_k 49 | 50 | double ha_extfrag(bool ideal) { 51 | uint bs = 128; 52 | cuset(max_alloc_mem_g, uint64, 0); 53 | cuset(free_mem_g, uint64, 0); 54 | find_max_alloc_k<<<1, bs>>>(); 55 | cucheck(cudaGetLastError()); 56 | cucheck(cudaStreamSynchronize(0)); 57 | find_free_mem_k<<>>(ideal); 58 | cucheck(cudaGetLastError()); 59 | cucheck(cudaStreamSynchronize(0)); 60 | 61 | uint64 free_mem, max_alloc; 62 | //uint64 cuda_mem; 63 | cuget(&free_mem, free_mem_g); 64 | cuget(&max_alloc, max_alloc_mem_g); 65 | //cuget(&cuda_mem, cuda_mem_g); 66 | // printf("free_mem = %lld, max_alloc = %lld, cuda_mem = %lld\n", 67 | // free_mem, max_alloc, cuda_mem); 68 | return 1.0 - (double)max_alloc / free_mem; 69 | } // ha_extfrag 70 | -------------------------------------------------------------------------------- /src/utils.cu: -------------------------------------------------------------------------------- 1 | /** @file utils.cu utility function implementation */ 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | uint max_prime_below(uint n, uint nb) { 9 | for(uint p = n - 1; p >= 3; p--) { 10 | uint max_d = (uint)floor(sqrt(p)); 11 | bool is_prime = true; 12 | for(uint d = 2; d <= max_d; d++) 13 | if(p % d == 0) { 14 | is_prime = false; 15 | break; 16 | } 17 | if(is_prime && n % p && nb % p) 18 | return p; 19 | } 20 | // if we are here, we can't find prime; exit with failure 21 | fprintf(stderr, "cannot find prime below %d not dividing %d\n", n, n); 22 | exit(-1); 23 | return ~0; 24 | } // max_prime_below 25 | -------------------------------------------------------------------------------- /src/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef HALLOC_UTILS_H_ 2 | #define HALLOC_UTILS_H_ 3 | 4 | /** @file utils.h some utility macros, functions and definitions */ 5 | 6 | /** a macro for checking CUDA calls */ 7 | #define cucheck(call) \ 8 | { \ 9 | cudaError_t cucheck_err = (call); \ 10 | if(cucheck_err != cudaSuccess) { \ 11 | const char* err_str = cudaGetErrorString(cucheck_err); \ 12 | fprintf(stderr, "%s (%d): %s in %s\n", __FILE__, __LINE__, err_str, #call); \ 13 | exit(-1); \ 14 | } \ 15 | } 16 | 17 | /** sets CUDA device variable */ 18 | #define cuset(symbol, T, val) \ 19 | { \ 20 | void *cuset_addr; \ 21 | cucheck(cudaGetSymbolAddress(&cuset_addr, symbol)); \ 22 | T cuset_val = (val); \ 23 | cucheck(cudaMemcpy(cuset_addr, &cuset_val, sizeof(cuset_val), \ 24 | cudaMemcpyHostToDevice)); \ 25 | } // cuset 26 | 27 | /** gets the value of the CUDA device variable */ 28 | #define cuget(pval, symbol) \ 29 | { \ 30 | void *cuget_addr; \ 31 | cucheck(cudaGetSymbolAddress(&cuget_addr, symbol)); \ 32 | cucheck(cudaMemcpy((pval), cuget_addr, sizeof(*(pval)), \ 33 | cudaMemcpyDeviceToHost)); \ 34 | } 35 | 36 | #define cuset_arr(symbol, val) \ 37 | { \ 38 | void *cuset_addr; \ 39 | cucheck(cudaGetSymbolAddress(&cuset_addr, symbol)); \ 40 | cucheck(cudaMemcpy(cuset_addr, *val, sizeof(*val), \ 41 | cudaMemcpyHostToDevice)); \ 42 | } // cuset_arr 43 | 44 | /** acts as cudaMemset(), but accepts device variable */ 45 | #define cuvar_memset(symbol, val, sz) \ 46 | { \ 47 | void *cuvar_addr; \ 48 | cucheck(cudaGetSymbolAddress(&cuvar_addr, symbol)); \ 49 | cucheck(cudaMemset(cuvar_addr, val, sz)); \ 50 
| } // cuvar_memset 51 | 52 | /** 64-bit integer type */ 53 | typedef unsigned long long uint64; 54 | 55 | // constants 56 | /** word size (the word is uint, which is assumed to be 32-bit) */ 57 | #define WORD_SZ 32 58 | /** the warp size (32 on current NVidia architectures) */ 59 | #define WARP_SZ 32 60 | /** maximum number of superblocks */ 61 | //#define MAX_NSBS 4096 62 | #define MAX_NSBS 8192 63 | /** the size of SB set, in words; the number of used SBs can be smaller */ 64 | #define SB_SET_SZ (MAX_NSBS / WORD_SZ) 65 | /** the maximum number of warps in a thread block */ 66 | #define MAX_NWARPS 32 67 | 68 | /** division with rounding upwards, useful for kernel calls */ 69 | inline __host__ __device__ int divup 70 | (int a, int b) { return a / b + (a % b ? 1 : 0); } 71 | 72 | /** checks whether the step is in mask */ 73 | __device__ inline bool step_is_in_mask(uint mask, uint val) { 74 | return (mask >> val) & 1; 75 | } 76 | 77 | /** gets the distance to the next higher mask value */ 78 | __device__ inline uint step_next_dist(uint mask, uint val) { 79 | uint res = __ffs(mask >> (val + 1)); 80 | return res ? res : WORD_SZ - val; 81 | } 82 | 83 | /** tries single-thread-per-warp lock 84 | @returns true if locking is successful and false otherwise 85 | */ 86 | __device__ inline bool try_lock(uint *mutex) { 87 | return atomicExch(mutex, 1) == 0; 88 | } 89 | /** single-thread-per-warp lock; loops until the lock is acquired */ 90 | __device__ inline void lock(uint *mutex) { 91 | while(!try_lock(mutex)); 92 | } 93 | /** single-thread-per-warp unlock, without threadfence */ 94 | __device__ inline void unlock(uint *mutex) { 95 | __threadfence(); 96 | atomicExch(mutex, 0); 97 | } 98 | /** waits until the mutex is unlocked, but does not attempt locking */ 99 | __device__ inline void wait_unlock(uint *mutex) { 100 | while(*(volatile uint *)mutex); 101 | // { 102 | // uint64 t1 = clock64(); 103 | // while(clock64() - t1 < 1); 104 | // } 105 | } 106 | /** gets the warp leader based on the mask */ 107 | __device__ inline uint warp_leader(uint mask) { 108 | return __ffs(mask) - 1; 109 | } 110 | 111 | /** gets the lane id inside the warp */ 112 | __device__ inline uint lane_id(void) { 113 | uint lid; 114 | asm("mov.u32 %0, %%laneid;" : "=r" (lid)); 115 | return lid; 116 | // TODO: maybe use more reliable lane id computation 117 | //return threadIdx.x % WARP_SZ; 118 | } 119 | 120 | /** gets the id of the warp */ 121 | __device__ inline uint warp_id(void) { 122 | // TODO: use something more stable 123 | return threadIdx.x / WARP_SZ; 124 | } 125 | 126 | /** broadcasts a value to all participating threads in a warp */ 127 | __device__ inline uint warp_bcast(uint v, uint root_lid) { 128 | #if __CUDA_ARCH__ >= 300 129 | // use warp intrinsics 130 | return (uint) __shfl((int)v, root_lid); 131 | #else 132 | // use shared memory 133 | volatile __shared__ uint vs[MAX_NWARPS]; 134 | if(lane_id() == root_lid) 135 | vs[warp_id()] = v; 136 | return vs[warp_id()]; 137 | #endif 138 | } // warp_bcast 139 | 140 | /** loads the data with caching */ 141 | __device__ inline uint ldca(const uint *p) { 142 | uint res; 143 | asm("ld.global.ca.u32 %0, [%1];": "=r"(res) : "l"(p)); 144 | return res; 145 | } 146 | 147 | __device__ inline uint64 ldca(const uint64 *p) { 148 | uint64 res; 149 | asm("ld.global.ca.u64 %0, [%1];": "=l"(res) : "l"(p)); 150 | return res; 151 | } 152 | 153 | __device__ inline void *ldca(void * const *p) { 154 | void *res; 155 | asm("ld.global.ca.u64 %0, [%1];": "=l"(res) : "l"(p)); 156 | return res; 157 | 
} 158 | 159 | /** prefetches into L1 cache */ 160 | __device__ inline void prefetch_l1(const void *p) { 161 | asm("prefetch.global.L1 [%0];": :"l"(p)); 162 | } 163 | 164 | /** prefetches into L2 cache */ 165 | __device__ inline void prefetch_l2(const void *p) { 166 | asm("prefetch.global.L2 [%0];": :"l"(p)); 167 | } 168 | 169 | __device__ inline uint lanemask_lt() { 170 | uint mask; 171 | asm("mov.u32 %0, %%lanemask_lt;" : "=r" (mask)); 172 | return mask; 173 | } 174 | 175 | /** find the largest prime number below this one, and not dividing this one */ 176 | uint max_prime_below(uint n, uint nb); 177 | 178 | #endif 179 | -------------------------------------------------------------------------------- /tst/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | -------------------------------------------------------------------------------- /tst/common-def.mk: -------------------------------------------------------------------------------- 1 | # currently, =1 is not supported 2 | WITH_SCATTER=0 3 | -------------------------------------------------------------------------------- /tst/common.mk: -------------------------------------------------------------------------------- 1 | include ../../common-def.mk 2 | 3 | LIBHALLOC=../../../bin/libhalloc.a 4 | LIBCOMMON=../../common/libcommontest.a 5 | LIBSCATTER=../../include/libscatteralloc.a 6 | 7 | LIBS :=$(LIBHALLOC) $(LIBCOMMON) 8 | 9 | ARCH := -gencode arch=compute_20,code=sm_20 \ 10 | -gencode arch=compute_30,code=sm_30 \ 11 | -gencode arch=compute_35,code=sm_35 12 | 13 | FLAGS := $(ARCH) -O3 -Xcompiler -fopenmp 14 | CUFLAGS := $(FLAGS) -I../../include -I../../common 15 | 16 | ifeq ($(WITH_SCATTER), 1) 17 | LIBS += $(LIBSCATTER) 18 | CUFLAGS += -DWITH_SCATTER 19 | endif 20 | 21 | CUFLAGS += -dc 22 | 23 | SRC_C=*.cu 24 | SRC_H=../../include/halloc.h ../../common/*.h 25 | SRC=$(SRC_C) $(SRC_H) 26 | TGT=../bin/$(NAME) 27 | 28 | OBJ=../tmp/$(NAME).o 29 | 30 | TMP=*~ \\\#* ../tmp/*.o $(TGT) 31 | 32 | build: $(TGT) 33 | $(TGT): $(LIBS) $(OBJ) makefile 34 | nvcc $(FLAGS) $(OBJ) $(LIBS) -o $(TGT) 35 | 36 | $(OBJ): $(SRC) makefile 37 | nvcc $(CUFLAGS) -dc $(SRC_C) -o $(OBJ) 38 | 39 | run: $(TGT) 40 | ./$(TGT) 41 | 42 | clean: 43 | rm -f $(TMP) 44 | -------------------------------------------------------------------------------- /tst/common/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.o 5 | *.a 6 | -------------------------------------------------------------------------------- /tst/common/common.h: -------------------------------------------------------------------------------- 1 | #ifndef HALLOC_COMMON_H_ 2 | #define HALLOC_COMMON_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | /** number of bytes in one GiB */ 10 | #define NBYTES_IN_GIB (1024.0 * 1024.0 * 1024.0) 11 | 12 | /** a macro for checking CUDA calls */ 13 | #define cucheck(call) \ 14 | { \ 15 | cudaError_t cucheck_err = (call); \ 16 | if(cucheck_err != cudaSuccess) { \ 17 | const char* err_str = cudaGetErrorString(cucheck_err); \ 18 | fprintf(stderr, "%s (%d): %s in %s\n", __FILE__, __LINE__, err_str, #call); \ 19 | exit(-1); \ 20 | } \ 21 | } 22 | 23 | /** sets CUDA device variable */ 24 | #define cuset(symbol, T, val) \ 25 | { \ 26 | void *cuset_addr; \ 27 | cucheck(cudaGetSymbolAddress(&cuset_addr, symbol)); \ 28 | T cuset_val = (val); \ 29 | cucheck(cudaMemcpy(cuset_addr, &cuset_val, sizeof(cuset_val), \ 30 | 
cudaMemcpyHostToDevice)); \ 31 | } // cuset 32 | 33 | /** gets the value of the CUDA device variable */ 34 | #define cuget(pval, symbol) \ 35 | { \ 36 | void *cuget_addr; \ 37 | cucheck(cudaGetSymbolAddress(&cuget_addr, symbol)); \ 38 | cucheck(cudaMemcpy((pval), cuget_addr, sizeof(*(pval)), \ 39 | cudaMemcpyDeviceToHost)); \ 40 | } 41 | 42 | /** division with rounding upwards, useful for kernel calls */ 43 | inline int divup(int a, int b) { return a / b + (a % b ? 1 : 0); } 44 | 45 | /** short-name typedef for a long long unsigned type */ 46 | typedef unsigned long long uint64; 47 | 48 | /** @file common.h common functions and definitions for testing infrastructure 49 | of halloc and similar GPU memory allocations. Note that this is provided for 50 | testing, performance measurement and comparison only, and is not intended 51 | for use in end-user applications. For end-user applications, halloc or 52 | another allocator is better to be used directly. */ 53 | 54 | /** supported allocators */ 55 | typedef enum { 56 | AllocatorNone = 0, AllocatorCuda, AllocatorHalloc, AllocatorScatterAlloc, 57 | AllocatorXMalloc, AllocatorTopNone 58 | } AllocatorType; 59 | 60 | /** supported allocation size distributions */ 61 | typedef enum { 62 | DistrNone = 0, DistrUniform, DistrExpUniform, DistrExpEqual, DistrTopNone 63 | } DistrType; 64 | 65 | /** allocation action */ 66 | typedef enum { 67 | ActionNone = 0, ActionAlloc, ActionFree 68 | } ActionType; 69 | 70 | #ifdef COMMONTEST_COMPILING 71 | #define COMMONTEST_EXTERN 72 | #else 73 | #define COMMONTEST_EXTERN extern 74 | #endif 75 | 76 | /** external variable holding random values, one per thread */ 77 | COMMONTEST_EXTERN uint * __constant__ random_states_g; 78 | 79 | /** get the random value on the device */ 80 | static inline __device__ uint drandom(void) { 81 | uint tid = threadIdx.x + blockIdx.x * blockDim.x; 82 | uint seed = random_states_g[tid]; 83 | // TODO: check if other advancements algorithms are faster 84 | /* seed ^= (seed << 13); 85 | seed ^= (seed >> 17); 86 | seed ^= (seed << 5); */ 87 | seed = (seed ^ 61) ^ (seed >> 16); 88 | seed *= 9; 89 | seed = seed ^ (seed >> 4); 90 | seed *= 0x27d4eb2d; 91 | seed = seed ^ (seed >> 15); 92 | random_states_g[tid] = seed; 93 | return seed; 94 | } // drandom 95 | 96 | /** get the random value within the specified interval (both ends inclusive) on 97 | the device */ 98 | static inline __device__ uint drandom(uint a, uint b) { 99 | return a + (drandom() & 0x00ffffffu) % (uint)(b - a + 1); 100 | } // drandom 101 | 102 | /** get the floating-point random value between 0 and 1 */ 103 | static inline __device__ float drandomf(void) { 104 | float f = 1.0f / (1024.0f * 1024.0f); 105 | uint m = 1024 * 1024; 106 | return f * drandom(0, m - 1); 107 | } // drandomf 108 | 109 | /** get the random boolean value with the specified probability 110 | @param probab the probability to return true 111 | */ 112 | static inline __device__ bool drandomb(float probab) { 113 | if(0.0f < probab && probab < 1.0f) 114 | return drandomf() <= probab; 115 | else 116 | return probab >= 1.0f; 117 | } // drandomb 118 | 119 | /** common options for tests and allocator intiialization; note that some tests 120 | are free to provide their own default settings */ 121 | struct CommonOpts { 122 | /** default initialization for common options */ 123 | CommonOpts(bool dummy) 124 | : allocator(AllocatorHalloc), memory(512 * 1024 * 1024), 125 | halloc_fraction(0.75), busy_fraction(0.835), roomy_fraction(0.6), 126 | sparse_fraction(0.0125), 
sb_sz_sh(22), device(0), nthreads(1024 * 1024), 127 | ntries(8), alloc_sz(16), max_alloc_sz(16), nallocs(4), niters(1), 128 | bs(128), period_mask(0), group_sh(0), distr_type(DistrUniform), 129 | alloc_fraction(1), free_fraction(0), exec_fraction(1) { 130 | recompute_fields(); 131 | } 132 | 133 | __host__ __device__ CommonOpts() {} 134 | /** parses the options from command line, with the defaults specified; memory 135 | is also capped to fraction of device-available at this step 136 | @param [in, out] this the default options on the input, and the options 137 | provided by the command line on the output 138 | */ 139 | void parse_cmdline(int argc, char **argv); 140 | /** the allocator type, as parsed from the command line, -a */ 141 | AllocatorType allocator; 142 | // allocator arguments 143 | /** maximum allocatable memory; silently capped by a fraction (0.75) of 144 | available device memory, -m */ 145 | size_t memory; 146 | /** fraction of memory allocated for halloc allocator, halloc only, -C */ 147 | double halloc_fraction; 148 | /** slab occupancy above which it is declared busy, -B */ 149 | double busy_fraction; 150 | /** slab occupancy below which it is declared roomy, -R */ 151 | double roomy_fraction; 152 | /** slab occupancy below which it is declared sparse; currently, no option, as 153 | we don't see where it's useful */ 154 | double sparse_fraction; 155 | /** shift of slab size, -b */ 156 | int sb_sz_sh; 157 | 158 | // test parameters 159 | /** the device on which everything runs, -D */ 160 | int device; 161 | /** number of threads in the test, -n */ 162 | int nthreads; 163 | /** thread block size, -T */ 164 | int bs; 165 | /** number of tries in the test, -t */ 166 | int ntries; 167 | /** allocation size in bytes when fixed, -s */ 168 | uint alloc_sz; 169 | /** maximum alloc size in bytes, -S */ 170 | uint max_alloc_sz; 171 | /** ceil(log2(max_alloc_sz/alloc_sz) */ 172 | uint max_alloc_sh; 173 | /** number of allocations per thread, -l */ 174 | int nallocs; 175 | /** number of inside-kernel iterations, applicable only to priv-* samples, 176 | forced to one in other cases, -i */ 177 | int niters; 178 | /** period mask, indicates one of how many threads actually does allocation; 179 | -q specifies period shift 180 | */ 181 | int period_mask; 182 | /** group size for period; the "period" parameter is applied to groups, not 183 | individual threads; -g */ 184 | int group_sh; 185 | /** gets the allocation size distribution type; -d */ 186 | DistrType distr_type; 187 | /** probabilities; first dimension is the phase (alloc = 0, free = 1), second 188 | dimension is the action to be taken (alloc = 0, free = 1); these cannot be specified 189 | from command line directly, and computed instead from steady state*/ 190 | float probabs[2][2]; 191 | /** the steady state fraction threads having something allocated after the 192 | allocation phase (f' in equation terms); -f 193 | */ 194 | float alloc_fraction; 195 | /** the steady state fraction of threads having something allocated after the 196 | free phase (f'' in equation terms); -F */ 197 | float free_fraction; 198 | /** the fraction of threads which need to do (execute) something between 199 | steady states; -e */ 200 | float exec_fraction; 201 | /** gets the total number of allocations, as usually defined for tests; for 202 | randomized tests, expectation is returned; individual tests may use their own 203 | definition */ 204 | double total_nallocs(void); 205 | /** gets the total size of all the allocations; for randomized tests, 206 | 
expectation is returned 207 | */ 208 | double total_sz(void); 209 | /** gets the single allocation expectation size */ 210 | double expected_sz(void); 211 | 212 | /** gets the next action */ 213 | __device__ ActionType next_action 214 | (bool allocated, uint itry, uint iter) const { 215 | uint phase = (itry * niters + iter) % 2; 216 | uint state = allocated ? 1 : 0; 217 | if(drandomb(probabs[phase][state])) 218 | return allocated ? ActionFree : ActionAlloc; 219 | else 220 | return ActionNone; 221 | } // next_action 222 | 223 | /** gets the next allocation size, which can be random */ 224 | __device__ uint next_alloc_sz(void) const { 225 | // single-size case 226 | if(!is_random()) 227 | return alloc_sz; 228 | switch(distr_type) { 229 | case DistrUniform: 230 | { 231 | uint sz = drandom(alloc_sz, max_alloc_sz); 232 | //sz = min(sz, max_alloc_sz); 233 | //printf("sz = %d, alloc_sz = %d, max_alloc_sz = %d\n", sz, alloc_sz, 234 | // max_alloc_sz); 235 | return sz; 236 | } 237 | case DistrExpUniform: 238 | { 239 | // get random shift 240 | uint sh = drandom(0, max_alloc_sh); 241 | // get a value within the exponential group 242 | uint sz = drandom(alloc_sz << sh, (alloc_sz << (sh + 1)) - 1); 243 | sz = min(sz, max_alloc_sz); 244 | return sz; 245 | } 246 | case DistrExpEqual: 247 | { 248 | // get shift, distributed in geometric progression (shift *2 => 249 | // probability / 2) 250 | uint sh = __ffs(drandom(1, 1 << (max_alloc_sh + 1))) - 1; 251 | // get a value within the exponential group 252 | uint sz = drandom(alloc_sz << sh, (alloc_sz << (sh + 1)) - 1); 253 | sz = min(sz, max_alloc_sz); 254 | return sz; 255 | } 256 | default: 257 | // this should definitely not happen 258 | assert(0); 259 | return 0; 260 | } 261 | } // next_alloc_sz 262 | /** checks whether the thread is inactive */ 263 | __host__ __device__ bool is_thread_inactive(uint tid) const { 264 | return tid >= nthreads || (tid >> group_sh) & period_mask; 265 | } 266 | /** gets the period */ 267 | __host__ __device__ uint period(void) const { return period_mask + 1; } 268 | /** gets the group size */ 269 | __host__ __device__ uint group(void) const { return 1 << group_sh; } 270 | /** gets the (contiguous) number of pointers for the given number of threads */ 271 | __host__ __device__ uint nptrs_cont(uint nts) const { 272 | return nts / (group() * period()) * group() + 273 | min(nts % (group() * period()), group()); 274 | } 275 | /** checks whether randomization is employed */ 276 | __host__ __device__ uint is_random(void) const { 277 | return alloc_sz != max_alloc_sz; 278 | } 279 | /** recompute the fields which need be recomputed */ 280 | void recompute_fields(void); 281 | }; 282 | 283 | #ifndef COMMONTEST_COMPILING 284 | __constant__ CommonOpts opts_g; 285 | #endif 286 | 287 | /** initialize device generation of random numbers */ 288 | void drandom_init(const CommonOpts &opts); 289 | 290 | /** shutdown device generation of random numbers */ 291 | void drandom_shutdown(const CommonOpts &opts); 292 | 293 | /** checks that all the pointers are non-zero 294 | @param d_ptrs device pointers 295 | @param nptrs the number of pointers 296 | */ 297 | bool check_nz(void **d_ptrs, uint *d_ctrs, uint nptrs, const CommonOpts &opts); 298 | 299 | /** checks that all allocations are made properly, i.e. that no pointer is zero, 300 | and there's at least alloc_sz memory after each pointer (alloc_sz is the 301 | same for all allocations). 
Parameters are mostly the same as with check_nz() 302 | */ 303 | bool check_alloc(void **d_ptrs, uint *d_ctrs, uint nptrs, 304 | const CommonOpts &opts); 305 | 306 | #include "halloc-wrapper.h" 307 | #include "cuda-malloc-wrapper.h" 308 | #include "scatter-alloc-wrapper.h" 309 | 310 | /** does a test with specific allocator and test functor; it is called after 311 | command line parsing */ 312 | template class Test> 313 | void run_test(CommonOpts &opts, bool with_warmup) { 314 | T::init(opts); 315 | //warm_up(); 316 | 317 | Test test; 318 | // warmup, if necessary 319 | if(with_warmup) 320 | test(opts, true); 321 | // real run 322 | test(opts, false); 323 | 324 | T::shutdown(); 325 | } // run_test 326 | 327 | /** does a test with specific test functor; basically 328 | this is a main function for all the tests */ 329 | template class Test > 330 | void run_test(int argc, char ** argv, CommonOpts &opts, bool with_warmup = true) { 331 | // parse command line 332 | opts.parse_cmdline(argc, argv); 333 | cucheck(cudaSetDevice(opts.device)); 334 | 335 | // initialize random numbers 336 | drandom_init(opts); 337 | 338 | // instantiate based on allocator type 339 | switch(opts.allocator) { 340 | case AllocatorCuda: 341 | run_test (opts, with_warmup); 342 | break; 343 | case AllocatorHalloc: 344 | //printf("testing halloc allocator\n"); 345 | run_test (opts, with_warmup); 346 | break; 347 | #ifdef WITH_SCATTER 348 | case AllocatorScatterAlloc: 349 | run_test (opts, with_warmup); 350 | break; 351 | #endif 352 | default: 353 | fprintf(stderr, "allocator invalid or not supported\n"); 354 | exit(-1); 355 | } 356 | } // run_test 357 | 358 | #ifndef COMMONTEST_COMPILING 359 | 360 | /** helper malloc kernel used by many tests throughout */ 361 | template 362 | __global__ void malloc_k 363 | (CommonOpts opts, void **ptrs) { 364 | int n = opts.nthreads, i = threadIdx.x + blockIdx.x * blockDim.x; 365 | if(opts.is_thread_inactive(i)) 366 | return; 367 | for(int ialloc = 0; ialloc < opts.nallocs; ialloc++) { 368 | uint sz = opts.next_alloc_sz(); 369 | void *ptr = T::malloc(sz); 370 | ptrs[i + n * ialloc] = ptr; 371 | } 372 | } // malloc_k 373 | 374 | /** helper non-randomized malloc kernel */ 375 | template 376 | __global__ void malloc_corr_k 377 | (CommonOpts opts, void **ptrs) { 378 | int n = opts.nthreads, i = threadIdx.x + blockIdx.x * blockDim.x; 379 | if(opts.is_thread_inactive(i)) 380 | return; 381 | for(int ialloc = 0; ialloc < opts.nallocs; ialloc++) { 382 | uint sz = opts.next_alloc_sz(); 383 | void *ptr = T::malloc(sz); 384 | ptrs[i + n * ialloc] = ptr; 385 | if(ptr) 386 | *(uint *)ptr = sz; 387 | } 388 | } // malloc_corr_k 389 | 390 | /** helper free kernel used by many tests throughout */ 391 | template 392 | __global__ void free_k 393 | (CommonOpts opts, void **ptrs) { 394 | int n = opts.nthreads, i = threadIdx.x + blockIdx.x * blockDim.x; 395 | if(opts.is_thread_inactive(i)) 396 | return; 397 | for(int ialloc = 0; ialloc < opts.nallocs; ialloc++) 398 | T::free(ptrs[i + n * ialloc]); 399 | } // free_k 400 | 401 | /** free the rest after the throughput test; this also counts against the total 402 | time */ 403 | template __global__ void free_rest_k(void **ptrs, uint *ctrs) { 404 | uint i = threadIdx.x + blockIdx.x * blockDim.x; 405 | if(opts_g.is_thread_inactive(i)) 406 | return; 407 | uint ctr = ctrs[i], n = opts_g.nthreads; 408 | for(uint ialloc = 0; ialloc < ctr; ialloc++) { 409 | T::free(ptrs[n * ialloc + i]); 410 | } 411 | ctrs[i] = 0; 412 | } // free_rest_k 413 | 414 | #endif 415 | 416 | 
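/* Illustrative sketch (not part of the library, kept inside this comment so it
   is never compiled): a minimal correctness test built from the helpers above.
   The name MinimalTest is hypothetical; the pattern mirrors checkptr.cu --
   allocate with malloc_k, check the returned pointers, release them with
   free_k, and let run_test handle command-line parsing and allocator
   selection (-a). Extracted into its own .cu file under tst/corr, it is
   expected to build against libcommontest.a like the other tests.

   template<class T> class MinimalTest {
   public:
     void operator()(CommonOpts opts, bool warmup) {
       if(warmup) {                      // shrink the warmup run
         opts.nthreads = min(4 * opts.bs, opts.nthreads);
         opts.ntries = 1;
       }
       int n = opts.nthreads, bs = opts.bs, grid = divup(n, bs);
       int nptrs = n * opts.nallocs;
       void **d_ptrs;
       cucheck(cudaMalloc((void **)&d_ptrs, nptrs * sizeof(void *)));
       cucheck(cudaMemset(d_ptrs, 0, nptrs * sizeof(void *)));
       for(int itry = 0; itry < opts.ntries; itry++) {
         malloc_k<T> <<<grid, bs>>>(opts, d_ptrs);   // allocate
         cucheck(cudaGetLastError());
         cucheck(cudaStreamSynchronize(0));
         if(!check_nz(d_ptrs, 0, nptrs, opts))       // all pointers non-zero?
           exit(-1);
         free_k<T> <<<grid, bs>>>(opts, d_ptrs);     // free
         cucheck(cudaGetLastError());
         cucheck(cudaStreamSynchronize(0));
       }
       cucheck(cudaFree(d_ptrs));
     }
   };

   int main(int argc, char **argv) {
     CommonOpts opts(true);
     run_test<MinimalTest>(argc, argv, opts, false);
     return 0;
   }
*/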
#endif 417 | -------------------------------------------------------------------------------- /tst/common/cuda-malloc-wrapper.h: -------------------------------------------------------------------------------- 1 | #ifndef CUDA_MALLOC_WRAPPER_H_ 2 | #define CUDA_MALLOC_WRAPPER_H_ 3 | 4 | /** @file cuda-malloc-wrapper.h wrapper class for CUDA malloc allocator */ 5 | 6 | #include "common.h" 7 | 8 | class CudaMalloc { 9 | public: 10 | static void init(const CommonOpts &opts) { 11 | cucheck(cudaDeviceSetLimit(cudaLimitMallocHeapSize, opts.memory)); 12 | } 13 | 14 | static inline __device__ void *malloc(uint nbytes) { 15 | return ::malloc(nbytes); 16 | } 17 | 18 | static inline __device__ void free(void *p) { 19 | ::free(p); 20 | } 21 | 22 | static double extfrag(bool ideal) { 23 | return 0; 24 | } 25 | 26 | static void shutdown(void) {} 27 | 28 | }; 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /tst/common/halloc-wrapper.h: -------------------------------------------------------------------------------- 1 | #ifndef HALLOC_WRAPPER_H_ 2 | #define HALLOC_WRAPPER_H_ 3 | 4 | /** @file halloc-wrapper.h wrapper class for halloc allocator */ 5 | 6 | #include "common.h" 7 | #include 8 | 9 | class Halloc { 10 | public: 11 | static void init(const CommonOpts &opts) { 12 | halloc_opts_t halloc_opts(opts.memory); 13 | halloc_opts.halloc_fraction = opts.halloc_fraction; 14 | halloc_opts.busy_fraction = opts.busy_fraction; 15 | halloc_opts.roomy_fraction = opts.roomy_fraction; 16 | halloc_opts.sparse_fraction = opts.sparse_fraction; 17 | halloc_opts.sb_sz_sh = opts.sb_sz_sh; 18 | ha_init(halloc_opts); 19 | } 20 | 21 | static inline __device__ void *malloc(uint nbytes) { 22 | return hamalloc(nbytes); 23 | } 24 | 25 | static inline __device__ void free(void *p) { 26 | hafree(p); 27 | } 28 | 29 | static double extfrag(bool ideal) { 30 | return ha_extfrag(ideal); 31 | } 32 | 33 | static void shutdown(void) { 34 | ha_shutdown(); 35 | } 36 | 37 | }; 38 | 39 | #endif 40 | -------------------------------------------------------------------------------- /tst/common/makefile: -------------------------------------------------------------------------------- 1 | include ../common-def.mk 2 | 3 | TGT=libcommontest.a 4 | SRC_C=*.cu 5 | SRC_H=*.h 6 | SRC=$(SRC_C) $(SRC_H) 7 | TMP=$(TGT) *~ \#* 8 | 9 | ARCH := -gencode arch=compute_20,code=sm_20 \ 10 | -gencode arch=compute_30,code=sm_30 \ 11 | -gencode arch=compute_35,code=sm_35 12 | 13 | FLAGS= $(ARCH) -O3 -rdc=true -lib -I../include 14 | ifeq ($(WITH_SCATTER), 1) 15 | FLAGS += -DWITH_SCATTER 16 | endif 17 | 18 | build: $(TGT) 19 | $(TGT): $(SRC) makefile 20 | nvcc $(FLAGS) -o $(TGT) $(SRC_C) 21 | 22 | clean: 23 | rm -f $(TMP) 24 | -------------------------------------------------------------------------------- /tst/common/scatter-alloc-wrapper.h: -------------------------------------------------------------------------------- 1 | #ifndef SCATTER_ALLOC_WRAPPER_H_ 2 | #define SCATTER_ALLOC_WRAPPER_H_ 3 | 4 | /** @file scatter-alloc-wrapper.h wrapper class for ScatterAlloc allocator */ 5 | #ifdef WITH_SCATTER 6 | 7 | #include "common.h" 8 | #include 9 | 10 | class ScatterAlloc { 11 | public: 12 | static void init(const CommonOpts &opts) { 13 | sc_init_heap(opts.memory); 14 | } 15 | 16 | static inline __device__ void *malloc(uint nbytes) { 17 | return scmalloc(nbytes); 18 | } 19 | 20 | static inline __device__ void free(void *p) { 21 | scfree(p); 22 | } 23 | 24 | static double extfrag(bool ideal) { 25 | return 0; 26 | } 
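// no fragmentation metric is reported for ScatterAlloc in this harness:
// extfrag() above simply returns 0; only the Halloc wrapper forwards to a
// real implementation (ha_extfrag())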
27 | 28 | static void shutdown(void) { 29 | } 30 | 31 | }; 32 | 33 | #endif 34 | 35 | #endif 36 | -------------------------------------------------------------------------------- /tst/corr/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.o 5 | *.log 6 | -------------------------------------------------------------------------------- /tst/corr/bin/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | -------------------------------------------------------------------------------- /tst/corr/checkptr/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.o 5 | -------------------------------------------------------------------------------- /tst/corr/checkptr/checkptr.cu: -------------------------------------------------------------------------------- 1 | /** @file latency.cu latency test for various memory allocators */ 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | template class CheckPtrTest { 13 | 14 | public: 15 | void operator()(CommonOpts opts, bool warmup) { 16 | opts.niters = 1; 17 | // allocate memory 18 | if(warmup) { 19 | opts.nthreads = min(4 * opts.bs, opts.nthreads); 20 | opts.ntries = 1; 21 | } 22 | int n = opts.nthreads, bs = opts.bs, grid = divup(n, bs); 23 | int nptrs = n * opts.nallocs; 24 | size_t ptrs_sz = nptrs * sizeof(void *); 25 | void **d_ptrs; 26 | cucheck(cudaMalloc((void **)&d_ptrs, ptrs_sz)); 27 | cucheck(cudaMemset(d_ptrs, 0, ptrs_sz)); 28 | 29 | // do testing 30 | for(int itry = 0; itry < opts.ntries; itry++) { 31 | // allocate 32 | malloc_corr_k <<>>(opts, d_ptrs); 33 | cucheck(cudaGetLastError()); 34 | cucheck(cudaStreamSynchronize(0)); 35 | // check that pointers are correct 36 | if(!check_alloc(d_ptrs, 0, nptrs, opts)) { 37 | exit(-1); 38 | } 39 | // free 40 | free_k <<>>(opts, d_ptrs); 41 | cucheck(cudaGetLastError()); 42 | cucheck(cudaStreamSynchronize(0)); 43 | } // for(itry) 44 | 45 | // free memory 46 | cucheck(cudaFree(d_ptrs)); 47 | } // operator() 48 | 49 | }; // CheckPtrTest 50 | 51 | int main(int argc, char **argv) { 52 | CommonOpts opts(true); 53 | run_test (argc, argv, opts, false); 54 | return 0; 55 | } // main 56 | -------------------------------------------------------------------------------- /tst/corr/checkptr/makefile: -------------------------------------------------------------------------------- 1 | NAME=checkptr 2 | 3 | include ../../common.mk 4 | -------------------------------------------------------------------------------- /tst/corr/freeslabs/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.o 5 | -------------------------------------------------------------------------------- /tst/corr/freeslabs/freeslabs.cu: -------------------------------------------------------------------------------- 1 | /** @file freeslabs.cu tests whether all slabs are returned as free */ 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | template class FreeSlabsTest { 13 | 14 | public: 15 | void operator()(CommonOpts opts, bool warmup) { 16 | opts.niters = 1; 17 | // allocate memory 18 | if(warmup) { 19 | opts.nthreads = min(4 * opts.bs, opts.nthreads); 20 | opts.ntries = 1; 21 | } 22 | // override number of allocations, period and group options 23 | opts.nallocs = 1; 24 | 
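// one allocation per thread and no period/grouping, so nthreads alone
// determines how much memory is requested at each allocation size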
opts.period_mask = 0; 25 | opts.group_sh = 0; 26 | int max_n = opts.nthreads, nptrs = max_n * opts.nallocs; 27 | // note that here, nthreads is treated as the maximum thread number 28 | size_t ptrs_sz = nptrs * sizeof(void *); 29 | void **d_ptrs; 30 | cucheck(cudaMalloc((void **)&d_ptrs, ptrs_sz)); 31 | cucheck(cudaMemset(d_ptrs, 0, ptrs_sz)); 32 | 33 | // allocation fraction; increase to larger values when it's possible 34 | // to free cached or head slabs 35 | double fraction = 0.4; 36 | // do testing 37 | for(int itry = 0; itry < opts.ntries; itry++) { 38 | // step over sizes: 39 | // 16..64: step 8 40 | // 64..256: step 16 41 | // 256..1k: step 128 42 | uint step = 8; 43 | for(uint alloc_sz = 16; alloc_sz <= 1024; alloc_sz += step) { 44 | printf("allocation size %d\n", alloc_sz); 45 | int nthreads = (int)floor(fraction * opts.memory / alloc_sz); 46 | nthreads = min(max_n, nthreads); 47 | opts.nthreads = nthreads; 48 | opts.alloc_sz = opts.max_alloc_sz = alloc_sz; 49 | opts.recompute_fields(); 50 | int bs = opts.bs, grid = divup(opts.nthreads, bs); 51 | // allocate 52 | malloc_k <<>>(opts, d_ptrs); 53 | cucheck(cudaGetLastError()); 54 | cucheck(cudaStreamSynchronize(0)); 55 | // check that pointers are correct 56 | if(!check_alloc(d_ptrs, 0, opts.nthreads, opts)) { 57 | exit(-1); 58 | } 59 | // free 60 | free_k <<>>(opts, d_ptrs); 61 | cucheck(cudaGetLastError()); 62 | cucheck(cudaStreamSynchronize(0)); 63 | // set up step 64 | if(alloc_sz >= 256) 65 | step = 128; 66 | else if(alloc_sz >= 64) 67 | step = 16; 68 | else 69 | step = 8; 70 | } // for(alloc_sz) 71 | } // for(itry) 72 | 73 | // free memory 74 | cucheck(cudaFree(d_ptrs)); 75 | } // operator() 76 | 77 | }; // FreeSlabsTest 78 | 79 | int main(int argc, char **argv) { 80 | CommonOpts opts(true); 81 | opts.ntries = 4; 82 | run_test (argc, argv, opts, false); 83 | return 0; 84 | } // main 85 | -------------------------------------------------------------------------------- /tst/corr/freeslabs/makefile: -------------------------------------------------------------------------------- 1 | NAME=freeslabs 2 | 3 | include ../../common.mk 4 | -------------------------------------------------------------------------------- /tst/corr/make-all.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | # runs specific make target for each performance test 4 | ls -1 | grep -vE 'bin|tmp|make|run|\.log' | xargs -IXA_TEST -P0 \ 5 | make -C XA_TEST $1 6 | -------------------------------------------------------------------------------- /tst/corr/makefile: -------------------------------------------------------------------------------- 1 | TMP=*~ 2 | 3 | build: 4 | ./make-all.sh build 5 | 6 | clean: 7 | rm -f $(TMP) 8 | ./make-all.sh clean 9 | 10 | run: build 11 | ./run-all-tests.pl 12 | 13 | run-only: 14 | ./run-all-tests.pl 15 | -------------------------------------------------------------------------------- /tst/corr/prob-checkptr/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.o 5 | -------------------------------------------------------------------------------- /tst/corr/prob-checkptr/makefile: -------------------------------------------------------------------------------- 1 | NAME=prob-checkptr 2 | 3 | include ../../common.mk 4 | -------------------------------------------------------------------------------- /tst/corr/prob-checkptr/prob-checkptr.cu: -------------------------------------------------------------------------------- 1 | /** @file prob-throughput.cu probabalitized throughput test */ 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | /** the kernel of the probability throughput test */ 13 | template 14 | __global__ void prob_corr_k 15 | (void **ptrs, uint *ctrs, uint itry) { 16 | uint i = threadIdx.x + blockIdx.x * blockDim.x; 17 | uint n = opts_g.nthreads, nallocs = opts_g.nallocs; 18 | if(opts_g.is_thread_inactive(i)) 19 | return; 20 | uint ctr = ctrs[i]; 21 | 22 | // iterate 23 | for(uint iter = 0; iter < opts_g.niters; iter++) { 24 | // perform the action 25 | switch(opts_g.next_action(ctr > 0, itry, iter)) { 26 | case ActionAlloc: 27 | for(uint ialloc = 0; ialloc < nallocs; ialloc++) { 28 | uint sz = opts_g.next_alloc_sz(); 29 | void *ptr = T::malloc(sz); 30 | ptrs[ialloc * n + i] = ptr; 31 | if(ptr) 32 | *(uint *)ptr = sz; 33 | //printf("tid = %d, sz = %d\n", i, sz); 34 | } 35 | ctr = nallocs; 36 | break; 37 | case ActionFree: 38 | for(uint ialloc = 0; ialloc < nallocs; ialloc++) 39 | T::free(ptrs[ialloc * n + i]); 40 | ctr = 0; 41 | break; 42 | } 43 | } // for(each iteration) 44 | ctrs[i] = ctr; 45 | } // prob_throughput_k 46 | 47 | /** measures malloc throughput */ 48 | template class ProbCorrTest { 49 | 50 | public: 51 | void operator()(CommonOpts opts, bool warmup) { 52 | // allocate memory 53 | int n = opts.nthreads, bs = opts.bs, grid = divup(n, bs); 54 | int nptrs = n * opts.nallocs; 55 | size_t ptrs_sz = nptrs * sizeof(void *); 56 | uint ctrs_sz = n * sizeof(uint); 57 | void **d_ptrs; 58 | cucheck(cudaMalloc((void **)&d_ptrs, ptrs_sz)); 59 | cucheck(cudaMemset(d_ptrs, 0, ptrs_sz)); 60 | uint *d_ctrs; 61 | cucheck(cudaMalloc((void **)&d_ctrs, ctrs_sz)); 62 | cucheck(cudaMemset(d_ctrs, 0, ctrs_sz)); 63 | 64 | cuset(opts_g, CommonOpts, opts); 65 | 66 | // do testing 67 | for(int itry = 0; itry < opts.ntries; itry++) { 68 | printf("iteration %d\n", itry); 69 | // run the kernel 70 | //printf("kernel configuration: %d, %d\n", grid, bs); 71 | prob_corr_k <<>>(d_ptrs, d_ctrs, itry); 72 | cucheck(cudaGetLastError()); 73 | cucheck(cudaStreamSynchronize(0)); 74 | // check that pointers are correct 75 | if(!check_alloc(d_ptrs, d_ctrs, nptrs, opts)) { 76 | fprintf(stderr, "cannot allocate enough memory\n"); 77 | exit(-1); 78 
| } 79 | } // for(itry) 80 | 81 | // free the rest 82 | printf("freeing the rest\n"); 83 | free_rest_k <<>> (d_ptrs, d_ctrs); 84 | cucheck(cudaGetLastError()); 85 | cucheck(cudaStreamSynchronize(0)); 86 | 87 | // free memory 88 | cucheck(cudaFree(d_ptrs)); 89 | cucheck(cudaFree(d_ctrs)); 90 | } // operator() 91 | 92 | }; // ProbThroughputTest 93 | 94 | int main(int argc, char **argv) { 95 | CommonOpts opts(true); 96 | run_test(argc, argv, opts, false); 97 | return 0; 98 | } // main 99 | -------------------------------------------------------------------------------- /tst/corr/run-all-tests.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env perl 2 | 3 | # a script to run all test for halloc 4 | 5 | use POSIX; 6 | 7 | $ntests = 0; 8 | $nsuccesses = 0; 9 | $device = 0; 10 | $memory = 512 * 1024 * 1024; 11 | 12 | sub runtest { 13 | system("./run-test.sh", @_, "-D$device", "-m$memory"); 14 | if($? >> 8 == 0) { 15 | $nsuccesses++; 16 | } 17 | $ntests++; 18 | } # runtest 19 | 20 | # correctness memory allocation test; over all sizes, allocate/free 25% memory 21 | # for each small size, and 12.5% memory for each large size 22 | $step = 8; 23 | for($alloc_sz = 16; $alloc_sz <= 32 * 1024; $alloc_sz += $step) { 24 | $fraction = $alloc_sz <= 2 * 1024 ? 0.25 : 0.125; 25 | $nthreads = floor($fraction * $memory / $alloc_sz); 26 | if($nthreads == 0) { 27 | next; 28 | } 29 | runtest("checkptr", "-l1", "-t4", "-s$alloc_sz", "-n$nthreads"); 30 | # modify step 31 | if($alloc_sz >= 1024 * 1024) { 32 | $step = 1024 * 1024; 33 | } elsif($alloc_sz >= 128 * 1024) { 34 | $step = 128 * 1024; 35 | } elsif($alloc_sz >= 16 * 1024) { 36 | $step = 16 * 1024; 37 | } elsif($alloc_sz >= 2 * 1024) { 38 | $step = 2 * 1024; 39 | } elsif($alloc_sz >= 256) { 40 | $step = 256; 41 | } elsif($alloc_sz >= 64) { 42 | $step = 16; 43 | } else { 44 | $step = 8; 45 | } 46 | } # for($step) 47 | 48 | # free slabs test - to ensure that slabs are freed correctly 49 | runtest("freeslabs", "-m$memory"); 50 | 51 | # probabilitized tests 52 | $falloc = 0.5; 53 | $ffree = 0.5; 54 | $fexec = 0.75; 55 | #foreach $group (10) { 56 | foreach $group (0, 5, 10) { 57 | foreach $niters (1, 5) { 58 | # foreach $niters (1) { 59 | $ntries = $group == 1 ? 1024 : 16384; 60 | $ntries = ceil($ntries / $niters); 61 | @fixed_args = ("prob-checkptr", "-i$niters", "-t$ntries", "-f$falloc", 62 | "-F$ffree", "-e$fexec", "-g$group"); 63 | # small sizes (<= 64 bytes) 64 | $nthreads = 1024 * 1024; 65 | runtest(@fixed_args, "-l4", "-n$nthreads", "-s8", "-S64", "-duniform"); 66 | # medium sizes (<= 256 bytes) 67 | runtest(@fixed_args, "-l1", "-n$nthreads", "-s8", "-S256", "-duniform"); 68 | runtest(@fixed_args, "-l4", "-n$nthreads", "-s8", "-S256", "-dexpequal"); 69 | # large-size test (<= 3072 bytes) 70 | $nthreads = 64 * 1024; 71 | runtest(@fixed_args, "-l1", "-n$nthreads", "-s8", "-S3072", "-duniform"); 72 | $nthreads = 128 * 1024; 73 | runtest(@fixed_args, "-l4", "-n$nthreads", "-s8", "-S3072", "-dexpequal"); 74 | } 75 | } 76 | 77 | # print the total count 78 | $nfails = $ntests - $nsuccesses; 79 | print "tests: $ntests TOTAL, $nsuccesses SUCCEEDED, $nfails FAILED\n"; 80 | -------------------------------------------------------------------------------- /tst/corr/run-test.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | # usage: 4 | # ./run-test.sh 5 | 6 | TEST_NAME=$1 7 | shift 1 8 | TEST_EXE=./bin/$TEST_NAME 9 | 10 | # run the test 11 | echo $TEST_EXE $@ 12 | $TEST_EXE $@ 13 | 14 | # analyze exit code 15 | # TODO: add output coloring 16 | TEST_EXIT=$? 17 | if [ $TEST_EXIT == 0 ]; then 18 | echo "$TEST_NAME test PASSED" 19 | exit 0 20 | else 21 | echo "$TEST_NAME test FAILED with exit code $TEST_EXIT" 22 | exit -1 23 | fi 24 | -------------------------------------------------------------------------------- /tst/corr/test/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.o 5 | -------------------------------------------------------------------------------- /tst/corr/test/makefile: -------------------------------------------------------------------------------- 1 | NAME=test 2 | 3 | include ../../common.mk 4 | -------------------------------------------------------------------------------- /tst/corr/test/test.cu: -------------------------------------------------------------------------------- 1 | /** @file test.cu testing a simple idea of an allocator */ 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | /** a macro for checking CUDA calls */ 15 | #define cucheck(call) \ 16 | { \ 17 | cudaError_t cucheck_err = (call); \ 18 | if(cucheck_err != cudaSuccess) { \ 19 | const char* err_str = cudaGetErrorString(cucheck_err); \ 20 | fprintf(stderr, "%s (%d): %s in %s\n", __FILE__, __LINE__, err_str, #call); \ 21 | exit(-1); \ 22 | } \ 23 | } 24 | 25 | //#include "halloc.h" 26 | 27 | /** testing parameters */ 28 | #define NTHREADS (2 * 1024 * 1024) 29 | #define NMALLOCS 8 30 | #define NTHREADS2 (NTHREADS / NMALLOCS) 31 | //#define NTHREADS2 NTHREADS 32 | #define BS 256 33 | #define NTRIES 8 34 | #define MEMORY (4 * 16 * NTHREADS) 35 | //#define NTRIES 1 36 | 37 | // alloc/free kernel 38 | __global__ void malloc_free_k(int ntimes) { 39 | for(int i = 0; i < ntimes; i++) { 40 | void *p = hamalloc(16); 41 | if(!p) 42 | printf("cannot allocate memory\n"); 43 | hafree(p); 44 | } 45 | } // malloc_free_k 46 | 47 | // alloc-and-save-pointer kernel 48 | __global__ void malloc_k(void **ptrs, int ntimes) { 49 | int tid = threadIdx.x + blockIdx.x * blockDim.x; 50 | int nthreads = blockDim.x * gridDim.x; 51 | for(int iptr = tid; iptr < ntimes * nthreads; iptr += nthreads) { 52 | ptrs[iptr] = hamalloc(16); 53 | if(!ptrs[iptr]) 54 | printf("cannot allocate memory\n"); 55 | } 56 | } // malloc_k 57 | // read-and-free pointer kernel 58 | __global__ void free_k(void **ptrs, int ntimes) { 59 | int tid = threadIdx.x + blockIdx.x * blockDim.x; 60 | int nthreads = blockDim.x * gridDim.x; 61 | for(int iptr = tid; iptr < ntimes * nthreads; iptr += nthreads) 62 | hafree(ptrs[iptr]); 63 | } // free_k 64 | 65 | // alloc-and-save-pointer kernel 66 | __global__ void cuda_malloc_k(void **ptrs, int ntimes) { 67 | int tid = threadIdx.x + blockIdx.x * blockDim.x; 68 | int nthreads = blockDim.x * gridDim.x; 69 | for(int iptr = tid; iptr < ntimes * nthreads; iptr += nthreads) { 70 | ptrs[iptr] = malloc(16); 71 | if(!ptrs[iptr]) 72 | printf("cannot allocate memory using CUDA malloc()\n"); 73 | } 74 | } // malloc_k 75 | // read-and-free pointer kernel 76 | __global__ void cuda_free_k(void **ptrs, int ntimes) { 77 | int tid = threadIdx.x + blockIdx.x * blockDim.x; 78 | int nthreads = blockDim.x * gridDim.x; 79 | for(int iptr = tid; iptr < ntimes * nthreads; iptr += nthreads) 80 | 
free(ptrs[iptr]); 81 | } // free_k 82 | 83 | // a kernel to check whether pointers are good 84 | __global__ void check_ptrs_k(bool *good, uint sz, size_t *ptrs, uint n) { 85 | uint i = threadIdx.x + blockIdx.x * blockDim.x; 86 | size_t *ptr = (size_t *)ptrs[i]; 87 | // check 1: try to write two values at the pointer 88 | ptr[0] = ptrs[i]; 89 | ptr[1] = ptrs[i]; 90 | // check 2: check that the pointer addresses are really valid 91 | if(i < n - 1) { 92 | good[i] = ptrs[i + 1] - ptrs[i] >= sz; 93 | } else 94 | good[i] = true; 95 | } // check_ptrs_k 96 | 97 | // correctness test - checks if all allocations are correct 98 | void run_test0(void) { 99 | void **d_ptrs; 100 | size_t ptrs_sz = NTHREADS2 * NMALLOCS * sizeof(void *); 101 | uint nmallocs = NMALLOCS * NTHREADS2; 102 | cucheck(cudaMalloc(&d_ptrs, ptrs_sz)); 103 | size_t *d_addresses = (size_t *)d_ptrs; 104 | cucheck(cudaMemset(d_ptrs, 0, ptrs_sz)); 105 | // allocate data 106 | malloc_k<<>>(d_ptrs, NMALLOCS); 107 | cucheck(cudaGetLastError()); 108 | cucheck(cudaStreamSynchronize(0)); 109 | // sort pointers 110 | thrust::device_ptr dt_addresses(d_addresses); 111 | thrust::sort(dt_addresses, dt_addresses + nmallocs); 112 | // check sorted pointers 113 | bool *d_good; 114 | size_t good_sz = nmallocs * sizeof(bool); 115 | cucheck(cudaMalloc((void **)&d_good, good_sz)); 116 | check_ptrs_k<<>>(d_good, 16, d_addresses, nmallocs); 117 | cucheck(cudaGetLastError()); 118 | cucheck(cudaStreamSynchronize(0)); 119 | thrust::device_ptr dt_good(d_good); 120 | bool passed = thrust::all_of(dt_good, dt_good + nmallocs, 121 | thrust::identity()); 122 | printf("test 0 (correctness of allocation):\n"); 123 | printf("test %s\n", passed ? "PASSED" : "FAILED"); 124 | printf("\n"); 125 | // FINISHED HERE 126 | // TODO: check pointers (each should point to enough memory) 127 | // free memory 128 | free_k<<>>(d_ptrs, NMALLOCS); 129 | cucheck(cudaGetLastError()); 130 | cucheck(cudaStreamSynchronize(0)); 131 | cucheck(cudaFree(d_ptrs)); 132 | } // run_test0 133 | 134 | void run_test1(void) { 135 | double t1 = omp_get_wtime(); 136 | for(int itry = 0; itry < NTRIES; itry++) { 137 | malloc_free_k<<>>(1); 138 | cucheck(cudaGetLastError()); 139 | cucheck(cudaStreamSynchronize(0)); 140 | } 141 | double t2 = omp_get_wtime(); 142 | double nmallocs = (double)NTHREADS * NTRIES; 143 | printf("test 1 (malloc/free inside each thread):\n"); 144 | printf("test duration %.2lf ms\n", (t2 - t1) * 1e3); 145 | printf("%.0lf malloc/free pairs in the test\n", nmallocs); 146 | printf("allocation speed: %.2lf Mpairs/s\n", nmallocs / (t2 - t1) * 1e-6); 147 | printf("\n"); 148 | } // run_test1 149 | 150 | void run_test2(void) { 151 | void **d_ptrs; 152 | size_t ptrs_sz = NTHREADS2 * NMALLOCS * sizeof(void *); 153 | cucheck(cudaMalloc(&d_ptrs, ptrs_sz)); 154 | cucheck(cudaMemset(d_ptrs, 0, ptrs_sz)); 155 | double t1 = omp_get_wtime(); 156 | for(int itry = 0; itry < NTRIES; itry++) { 157 | malloc_k<<>>(d_ptrs, NMALLOCS); 158 | cucheck(cudaGetLastError()); 159 | //cucheck(cudaStreamSynchronize(0)); 160 | free_k<<>>(d_ptrs, NMALLOCS); 161 | cucheck(cudaGetLastError()); 162 | cucheck(cudaStreamSynchronize(0)); 163 | } 164 | double t2 = omp_get_wtime(); 165 | cucheck(cudaFree(d_ptrs)); 166 | double nmallocs = (double)NMALLOCS * NTHREADS2 * NTRIES; 167 | printf("test 2 (first all mallocs, then all frees):\n"); 168 | printf("test duration %.2lf ms\n", (t2 - t1) * 1e3); 169 | printf("%.0lf malloc/free pairs in the test\n", nmallocs); 170 | printf("allocation speed: %.2lf Mpairs/s\n", nmallocs / (t2 - 
t1) * 1e-6); 171 | printf("\n"); 172 | } // run_test2 173 | 174 | /** latency test */ 175 | void run_test3(void) { 176 | double t1 = omp_get_wtime(); 177 | int lat_ntries = 4, lat_nmallocs = 16 * 1024; 178 | //int lat_ntries = 1, lat_nmallocs = 1; 179 | for(int itry = 0; itry < lat_ntries; itry++) { 180 | malloc_free_k<<<1, 1>>>(lat_nmallocs); 181 | cucheck(cudaGetLastError()); 182 | cucheck(cudaStreamSynchronize(0)); 183 | } 184 | double t2 = omp_get_wtime(); 185 | double nmallocs = (double)lat_nmallocs * lat_ntries; 186 | printf("test 3 (latency):\n"); 187 | printf("test duration %.2lf ms\n", (t2 - t1) * 1e3); 188 | printf("%.0lf malloc/free pairs in the test\n", nmallocs); 189 | printf("latency: %.0lf ns\n", (t2 - t1) * 1e9 / nmallocs); 190 | printf("\n"); 191 | } // run_test3 192 | 193 | /** throughput test for CUDA allocator */ 194 | void run_test4(void) { 195 | void **d_ptrs; 196 | int cuda_nthreads = 128 * 1024, cuda_nmallocs = 2, cuda_ntries = 4; 197 | //cucheck(cudaDeviceSetLimit(cudaLimitMallocHeapSize, 32 * 1024 * 1024)); 198 | size_t ptrs_sz = cuda_nthreads * cuda_nmallocs * sizeof(void *); 199 | cucheck(cudaMalloc(&d_ptrs, ptrs_sz)); 200 | cucheck(cudaMemset(d_ptrs, 0, ptrs_sz)); 201 | double t1 = omp_get_wtime(); 202 | for(int itry = 0; itry < cuda_ntries; itry++) { 203 | cuda_malloc_k<<>>(d_ptrs, cuda_nmallocs); 204 | cucheck(cudaGetLastError()); 205 | //cucheck(cudaStreamSynchronize(0)); 206 | cuda_free_k<<>>(d_ptrs, cuda_nmallocs); 207 | cucheck(cudaGetLastError()); 208 | cucheck(cudaStreamSynchronize(0)); 209 | } 210 | double t2 = omp_get_wtime(); 211 | cucheck(cudaFree(d_ptrs)); 212 | double nmallocs = (double)cuda_nmallocs * cuda_nthreads * cuda_ntries; 213 | printf("test 4 (CUDA, first all mallocs, then all frees):\n"); 214 | printf("test duration %.2lf ms\n", (t2 - t1) * 1e3); 215 | printf("%.0lf malloc/free pairs in the test\n", nmallocs); 216 | printf("allocation speed: %.2lf Mpairs/s\n", nmallocs / (t2 - t1) * 1e-6); 217 | printf("\n"); 218 | } // run_test4 219 | 220 | // separate time, first for allocation, then for free 221 | void run_test5(void) { 222 | void **d_ptrs; 223 | size_t ptrs_sz = NTHREADS2 * NMALLOCS * sizeof(void *); 224 | cucheck(cudaMalloc(&d_ptrs, ptrs_sz)); 225 | cucheck(cudaMemset(d_ptrs, 0, ptrs_sz)); 226 | uint ntries = 1; 227 | double t1 = omp_get_wtime(); 228 | for(int itry = 0; itry < ntries; itry++) { 229 | malloc_k<<>>(d_ptrs, NMALLOCS); 230 | cucheck(cudaGetLastError()); 231 | cucheck(cudaStreamSynchronize(0)); 232 | } 233 | double t2 = omp_get_wtime(); 234 | for(int itry = 0; itry < ntries; itry++) { 235 | free_k<<>>(d_ptrs, NMALLOCS); 236 | cucheck(cudaGetLastError()); 237 | cucheck(cudaStreamSynchronize(0)); 238 | } 239 | double t3 = omp_get_wtime(); 240 | cucheck(cudaFree(d_ptrs)); 241 | double nmallocs = (double)NMALLOCS * NTHREADS2 * ntries; 242 | printf("test 5 (first mallocs, then frees, separate timing):\n"); 243 | printf("test duration: malloc %.2lf ms, free %.2lf ms\n", 244 | (t2 - t1) * 1e3, (t3 - t2) * 1e3); 245 | printf("%.0lf malloc/free pairs in the test\n", nmallocs); 246 | printf("speed: %.2lf Mmallocs/s, %.2lf Mfrees/s\n", 247 | nmallocs / (t2 - t1) * 1e-6, nmallocs / (t3 - t2) * 1e-6); 248 | printf("\n"); 249 | } // run_test5 250 | 251 | int main(int argc, char **argv) { 252 | ha_init(halloc_opts_t(MEMORY)); 253 | //ha_init(halloc_opts_t(1024 * 1024 * 1024)); 254 | run_test0(); 255 | run_test1(); 256 | run_test2(); 257 | run_test3(); 258 | run_test4(); 259 | run_test5(); 260 | ha_shutdown(); 261 | } // main 262 | 
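Note: the tests in test.cu above all follow the same basic halloc usage pattern: initialize the allocator once on the host with ha_init(), call hamalloc()/hafree() from device code, and tear down with ha_shutdown(). The following is a minimal sketch of that pattern distilled from test.cu; it is not part of the repository, the 64 MiB heap size and the 1024x256 launch configuration are illustrative values only, and it assumes halloc.h declares ha_init, halloc_opts_t, hamalloc, hafree and ha_shutdown exactly as test.cu uses them.

/* minimal-halloc-example.cu -- a sketch, not a repository file; API usage
 * mirrors test.cu, sizes and launch configuration are illustrative. */
#include <stdio.h>
#include <stdlib.h>
#include "halloc.h"

/* same CUDA error-checking idiom as the cucheck macro in test.cu */
#define cucheck(call) \
  { \
    cudaError_t err = (call); \
    if(err != cudaSuccess) { \
      fprintf(stderr, "%s (%d): %s\n", __FILE__, __LINE__, \
              cudaGetErrorString(err)); \
      exit(-1); \
    } \
  }

/* each thread allocates a small block, writes to it, then frees it */
__global__ void alloc_free_k(int nbytes) {
  int *p = (int *)hamalloc(nbytes);
  if(p) {
    *p = threadIdx.x;
    hafree(p);
  } else {
    printf("thread %d: allocation failed\n",
           threadIdx.x + blockIdx.x * blockDim.x);
  }
}

int main(void) {
  // give halloc a 64 MiB heap (illustrative size, as in ha_init(halloc_opts_t(MEMORY)))
  ha_init(halloc_opts_t(64 * 1024 * 1024));
  alloc_free_k<<<1024, 256>>>(16);
  cucheck(cudaGetLastError());
  cucheck(cudaDeviceSynchronize());
  ha_shutdown();
  return 0;
}

Like the tests above, such a program needs to be compiled with nvcc for a CC 2.0+ device (device-side printf) and linked against the halloc library.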
-------------------------------------------------------------------------------- /tst/corr/tmp/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | -------------------------------------------------------------------------------- /tst/exp/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.png 5 | *.pdf 6 | -------------------------------------------------------------------------------- /tst/exp/common.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env perl 2 | 3 | 4 | $device = 1; 5 | 6 | # runs, sets exernal variables to data extracted from test run; 7 | # negative values mean that no data has been extracted 8 | sub runtest { 9 | my $test = $_[0]; 10 | shift @_; 11 | $test = "../../perf/bin/phase-$test"; 12 | $args = join ' ', @_; 13 | #print $args; 14 | my @res = `$test -D$device $args`; 15 | shift @res; 16 | #print @res; 17 | # set standard variables to undefined 18 | #throughput 19 | $thru_malloc = -1; 20 | $thru_free = -1; 21 | $thru_pair = -1; 22 | #speed; note that there's no "free speed" 23 | $speed_malloc = -1; 24 | $speed_pair = -1; 25 | #latency: note that there's no pair latency 26 | $lat_malloc_min = -1; 27 | $lat_malloc_max = -1; 28 | $lat_malloc_avg = -1; 29 | $lat_free_min = -1; 30 | $lat_free_max = -1; 31 | $lat_free_avg = -1; 32 | # analyze result lines 33 | 34 | foreach $line (@res) { 35 | my @fields = split ' ', $line; 36 | #print (join ',', @fields); 37 | my $is_malloc = grep /malloc/, @fields; 38 | my $is_free = grep /free/, @fields; 39 | my $is_pair = grep /pair/, @fields; 40 | my $is_thru = grep /throughput/, @fields; 41 | my $is_speed = grep /speed/, @fields; 42 | my $is_lat = grep /latency/, @fields; 43 | my $is_avg = grep /avg/, @fields; 44 | my $is_min = grep /min/, @fields; 45 | my $is_max = grep /max/, @fields; 46 | #print $is_pair, $is_thru, $is_malloc, "\n"; 47 | if($is_thru) { 48 | if($is_malloc) { 49 | $thru_malloc = $fields[2]; 50 | } elsif($is_free) { 51 | $thru_free = $fields[2]; 52 | } elsif($is_pair) { 53 | $thru_pair = $fields[2]; 54 | } 55 | } elsif($is_speed) { 56 | if($is_malloc) { 57 | $speed_malloc = $fields[2]; 58 | } elsif($is_pair) { 59 | $speed_pair = $fields[2]; 60 | } 61 | } elsif($is_lat) { 62 | if($is_malloc) { 63 | if($is_min) { 64 | $lat_malloc_min = $fields[3]; 65 | } elsif($is_max) { 66 | $lat_malloc_max = $fields[3]; 67 | } elsif($is_avg) { 68 | $lat_malloc_avg = $fields[3]; 69 | } 70 | } elsif($is_free) { 71 | if($is_min) { 72 | $lat_free_min = $fields[3]; 73 | } elsif($is_max) { 74 | $lat_free_max = $fields[3]; 75 | } elsif($is_avg) { 76 | $lat_free_avg = $fields[3]; 77 | } 78 | } 79 | } 80 | } # foreach $line 81 | } # sub runtest 82 | -------------------------------------------------------------------------------- /tst/exp/frag-int/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.png 5 | *.pdf 6 | -------------------------------------------------------------------------------- /tst/exp/frag-int/exp-plot.gpl: -------------------------------------------------------------------------------- 1 | # plots results of internal fragmentation experiments 2 | 3 | set terminal pdf enhanced color 4 | set style data lines 5 | 6 | set output "frag-int.pdf" 7 | set xlabel "Allocation size, B" 8 | set ylabel "Average internal fragmentation" 9 | #plot [0:3072][] 10 | plot [0:384][] \ 11 | "exp-log.csv" u 
1:4 t "Average" 12 | # "exp-log.csv" u 1:2 t "Block",\ 13 | # "exp-log.csv" u 1:3 t "Cumulative",\ 14 | -------------------------------------------------------------------------------- /tst/exp/frag-int/exp-run.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env perl 2 | 3 | # data for internal fragmentation plot 4 | 5 | # 2, 3 6 | @alloc_szs = (16, 24, 32, 48, 64, 96, 128, 192, 256, 384, 512, 768, 1024, 1536, 7 | 2048, 3072); 8 | # 2, 3, 5 9 | #@alloc_szs = (16, 24, 32, 40, 48, 64, 80, 96, 128, 160, 192, 256, 320, 384, 512, 10 | # 640, 768, 1024, 1280, 1536, 2048, 2560, 3072); 11 | # 2, 3, 5, 7 12 | #@alloc_szs = (16, 24, 32, 40, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 256, 13 | # 320, 384, 448, 512, 640, 768, 896, 1024, 1280, 1536, 1792, 2048, 14 | # 2560, 3072); 15 | # 2, 3, 5, 7, 9 16 | #@alloc_szs = (16, 24, 32, 40, 48, 56, 64, 72, 80, 96, 112, 128, 144, 160, 192, 224, 256, 17 | # 288, 320, 384, 448, 512, 576, 640, 768, 896, 1024, 1152, 1280, 18 | # 1536, 1792, 2048, 2304, 2560, 3072); 19 | #@alloc_szs = (16, 24, 32, 40, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 20 | # 256, 320, 384, 448, 512, 640, 768, 896, 21 | # 1024, 1280, 1536, 1792, 2048, 2560, 3072); 22 | #@alloc_szs = (16, 24, 32, 40, 48, 56, 64, 80, 96, 112, 128, 144, 160, 176, 23 | # 192, 224, 256, 288, 320, 352, 384, 448, 512, 576, 640, 704, 24 | # 768, 896, 1024, 1280, 1408, 1536, 1792, 2048, 2304, 2560, 3072); 25 | #@alloc_szs = (16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 26 | # 128, 144, 160, 176, 192, 208, 224, 240, 27 | # 256, 288, 320, 352, 384, 416, 448, 480, 28 | # 512, 576, 640, 704, 768, 832, 896, 960, 29 | # 1024, 1152, 1280, 1408, 1536, 1664, 1792, 1920, 30 | # 2048, 2304, 2560, 2816, 3072); 31 | sub find_alloc_sz { 32 | my $sz = $_[0]; 33 | foreach $alloc_sz (@alloc_szs) { 34 | if($alloc_sz >= $sz) { 35 | return $alloc_sz; 36 | } 37 | } 38 | } # find_alloc_sz 39 | 40 | $min_sz = 16; 41 | $max_sz = 3072; 42 | $step_sz = 8; 43 | 44 | $OCSV = 100; 45 | $ofile = "./exp-log.csv"; 46 | open(OCSV, ">", $ofile) || die "cannot open $ofile for writing"; 47 | $oline = "nbytes block_frag cum_frag cum_frag2\n"; 48 | print OCSV $oline; 49 | #print $oline; 50 | print "$#alloc_szs sizes\n"; 51 | 52 | $sum_frag = 0; 53 | $sum_alloc_sz = 0; 54 | $sum_overhead = 0; 55 | $n = 1; 56 | 57 | for($sz = $min_sz; $sz <= $max_sz; $sz += $step_sz) { 58 | $alloc_sz = find_alloc_sz($sz); 59 | $overhead = $alloc_sz - $sz; 60 | $sum_overhead += $overhead; 61 | $sum_alloc_sz += $alloc_sz; 62 | $block_frag = $overhead / $alloc_sz; 63 | $sum_frag += $block_frag; 64 | $cum_frag = $sum_frag / $n; 65 | $cum_frag2 = $sum_overhead / $sum_alloc_sz; 66 | $n++; 67 | $oline = "$sz $block_frag $cum_frag $cum_frag2\n"; 68 | print OCSV $oline; 69 | # print $oline; 70 | } # for($sz) 71 | 72 | close OCSV; 73 | system('gnuplot exp-plot.gpl'); 74 | -------------------------------------------------------------------------------- /tst/exp/halloc-vs-scatter/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.png 5 | *.pdf 6 | -------------------------------------------------------------------------------- /tst/exp/halloc-vs-scatter/exp-log-priv-1.csv: -------------------------------------------------------------------------------- 1 | allocator alloc_sz nallocs ffree thru 2 | halloc 16 4 0.2 982.04 3 | scatter 16 4 0.2 271.01 4 | halloc 16 4 0.21 983.60 5 | scatter 16 4 0.21 269.33 6 | halloc 16 4 0.22 982.17 7 | 
scatter 16 4 0.22 268.03 8 | halloc 16 4 0.23 978.75 9 | scatter 16 4 0.23 266.21 10 | halloc 16 4 0.24 980.30 11 | scatter 16 4 0.24 263.26 12 | halloc 16 4 0.25 977.35 13 | scatter 16 4 0.25 270.15 14 | halloc 16 4 0.26 980.39 15 | scatter 16 4 0.26 271.01 16 | halloc 16 4 0.27 977.82 17 | scatter 16 4 0.27 266.86 18 | halloc 16 4 0.28 973.04 19 | scatter 16 4 0.28 256.02 20 | halloc 16 4 0.29 972.50 21 | scatter 16 4 0.29 11.33 22 | halloc 16 4 0.3 979.25 23 | scatter 16 4 0.3 7.64 24 | halloc 16 4 0.31 970.01 25 | scatter 16 4 0.31 7.29 26 | halloc 16 4 0.32 971.70 27 | scatter 16 4 0.32 6.63 28 | halloc 16 4 0.33 970.19 29 | scatter 16 4 0.33 6.53 30 | halloc 16 4 0.34 972.48 31 | scatter 16 4 0.34 5.96 32 | halloc 64 1 0.2 806.59 33 | scatter 64 1 0.2 190.25 34 | halloc 64 1 0.21 807.06 35 | scatter 64 1 0.21 189.42 36 | halloc 64 1 0.22 808.17 37 | scatter 64 1 0.22 187.81 38 | halloc 64 1 0.23 797.65 39 | scatter 64 1 0.23 63.09 40 | halloc 64 1 0.24 805.14 41 | scatter 64 1 0.24 19.88 42 | halloc 64 1 0.25 805.23 43 | scatter 64 1 0.25 9.88 44 | halloc 64 1 0.26 804.99 45 | scatter 64 1 0.26 9.58 46 | halloc 64 1 0.27 801.62 47 | scatter 64 1 0.27 8.52 48 | halloc 64 1 0.28 800.72 49 | scatter 64 1 0.28 7.69 50 | halloc 64 1 0.29 799.41 51 | scatter 64 1 0.29 5.91 52 | halloc 64 1 0.3 802.26 53 | scatter 64 1 0.3 5.88 54 | halloc 64 1 0.31 799.17 55 | scatter 64 1 0.31 4.85 56 | halloc 64 1 0.32 799.17 57 | scatter 64 1 0.32 4.65 58 | halloc 64 1 0.33 802.99 59 | scatter 64 1 0.33 3.95 60 | halloc 64 1 0.34 802.42 61 | scatter 64 1 0.34 3.67 62 | -------------------------------------------------------------------------------- /tst/exp/halloc-vs-scatter/exp-log-priv-2.csv: -------------------------------------------------------------------------------- 1 | allocator alloc_sz nallocs ffree thru 2 | halloc 16 4 0.2 965.75 3 | scatter 16 4 0.2 269.91 4 | halloc 16 4 0.21 982.14 5 | scatter 16 4 0.21 268.22 6 | halloc 16 4 0.22 981.53 7 | scatter 16 4 0.22 267.53 8 | halloc 16 4 0.23 981.10 9 | scatter 16 4 0.23 266.25 10 | halloc 16 4 0.24 983.11 11 | scatter 16 4 0.24 267.94 12 | halloc 16 4 0.25 976.61 13 | scatter 16 4 0.25 270.16 14 | halloc 16 4 0.26 975.87 15 | scatter 16 4 0.26 270.08 16 | halloc 16 4 0.27 979.27 17 | scatter 16 4 0.27 268.75 18 | halloc 16 4 0.28 975.02 19 | scatter 16 4 0.28 12.77 20 | halloc 16 4 0.29 977.26 21 | scatter 16 4 0.29 8.99 22 | halloc 16 4 0.3 976.81 23 | scatter 16 4 0.3 8.29 24 | halloc 16 4 0.31 972.75 25 | scatter 16 4 0.31 7.46 26 | halloc 16 4 0.32 975.86 27 | scatter 16 4 0.32 6.54 28 | halloc 16 4 0.33 968.50 29 | scatter 16 4 0.33 6.10 30 | halloc 16 4 0.34 975.61 31 | scatter 16 4 0.34 5.73 32 | halloc 64 1 0.2 800.17 33 | scatter 64 1 0.2 190.13 34 | halloc 64 1 0.21 805.85 35 | scatter 64 1 0.21 188.04 36 | halloc 64 1 0.22 808.36 37 | scatter 64 1 0.22 118.72 38 | halloc 64 1 0.23 804.07 39 | scatter 64 1 0.23 41.38 40 | halloc 64 1 0.24 806.42 41 | scatter 64 1 0.24 30.69 42 | halloc 64 1 0.25 803.27 43 | scatter 64 1 0.25 20.10 44 | halloc 64 1 0.26 801.75 45 | scatter 64 1 0.26 9.43 46 | halloc 64 1 0.27 804.59 47 | scatter 64 1 0.27 8.74 48 | halloc 64 1 0.28 804.18 49 | scatter 64 1 0.28 8.61 50 | halloc 64 1 0.29 798.36 51 | scatter 64 1 0.29 6.93 52 | halloc 64 1 0.3 807.24 53 | scatter 64 1 0.3 6.17 54 | halloc 64 1 0.31 801.66 55 | scatter 64 1 0.31 5.29 56 | halloc 64 1 0.32 801.73 57 | scatter 64 1 0.32 4.42 58 | halloc 64 1 0.33 802.09 59 | scatter 64 1 0.33 3.90 60 | halloc 64 1 0.34 804.09 61 | scatter 64 1 0.34 3.81 
62 | -------------------------------------------------------------------------------- /tst/exp/halloc-vs-scatter/exp-log-priv-3.csv: -------------------------------------------------------------------------------- 1 | allocator alloc_sz nallocs ntries thru 2 | halloc 16 4 0.25 929.48 3 | scatter 16 4 0.25 252.56 4 | cuda 16 4 0.25 1.13 5 | halloc 16 4 0.26 932.18 6 | scatter 16 4 0.26 251.80 7 | cuda 16 4 0.26 1.11 8 | halloc 16 4 0.27 934.27 9 | scatter 16 4 0.27 253.12 10 | cuda 16 4 0.27 1.10 11 | halloc 16 4 0.28 927.89 12 | scatter 16 4 0.28 255.17 13 | cuda 16 4 0.28 1.10 14 | halloc 16 4 0.29 923.13 15 | scatter 16 4 0.29 251.95 16 | cuda 16 4 0.29 1.10 17 | halloc 16 4 0.3 931.29 18 | scatter 16 4 0.3 8.72 19 | cuda 16 4 0.3 1.08 20 | halloc 16 4 0.31 924.89 21 | scatter 16 4 0.31 9.54 22 | cuda 16 4 0.31 1.08 23 | halloc 16 4 0.32 929.67 24 | scatter 16 4 0.32 7.07 25 | cuda 16 4 0.32 1.08 26 | halloc 16 4 0.33 928.99 27 | scatter 16 4 0.33 6.33 28 | cuda 16 4 0.33 1.04 29 | halloc 16 4 0.34 930.59 30 | scatter 16 4 0.34 6.00 31 | cuda 16 4 0.34 1.02 32 | halloc 64 1 0.25 755.22 33 | scatter 64 1 0.25 169.06 34 | cuda 64 1 0.25 1.39 35 | halloc 64 1 0.26 761.69 36 | scatter 64 1 0.26 171.18 37 | cuda 64 1 0.26 1.40 38 | halloc 64 1 0.27 760.99 39 | scatter 64 1 0.27 168.87 40 | cuda 64 1 0.27 1.40 41 | halloc 64 1 0.28 758.68 42 | scatter 64 1 0.28 169.78 43 | cuda 64 1 0.28 1.41 44 | halloc 64 1 0.29 756.84 45 | scatter 64 1 0.29 167.94 46 | cuda 64 1 0.29 1.32 47 | halloc 64 1 0.3 756.97 48 | scatter 64 1 0.3 155.65 49 | cuda 64 1 0.3 1.36 50 | halloc 64 1 0.31 759.72 51 | scatter 64 1 0.31 165.55 52 | cuda 64 1 0.31 1.41 53 | halloc 64 1 0.32 761.54 54 | scatter 64 1 0.32 69.02 55 | cuda 64 1 0.32 1.40 56 | halloc 64 1 0.33 755.98 57 | scatter 64 1 0.33 32.53 58 | cuda 64 1 0.33 1.33 59 | halloc 64 1 0.34 759.43 60 | scatter 64 1 0.34 25.85 61 | cuda 64 1 0.34 1.32 62 | -------------------------------------------------------------------------------- /tst/exp/halloc-vs-scatter/exp-log-priv.csv: -------------------------------------------------------------------------------- 1 | allocator alloc_sz nallocs ntries thru 2 | halloc 16 4 65536 1038.58 3 | scatter 16 4 65536 312.89 4 | cuda 16 4 65536 1.22 5 | halloc 16 4 131072 777.01 6 | scatter 16 4 131072 288.14 7 | cuda 16 4 131072 1.21 8 | halloc 16 4 196608 950.07 9 | scatter 16 4 196608 266.85 10 | cuda 16 4 196608 1.15 11 | halloc 16 4 262144 970.49 12 | scatter 16 4 262144 262.20 13 | cuda 16 4 262144 1.13 14 | halloc 16 4 327680 980.67 15 | scatter 16 4 327680 260.94 16 | cuda 16 4 327680 1.10 17 | halloc 16 4 393216 982.32 18 | scatter 16 4 393216 258.07 19 | cuda 16 4 393216 1.11 20 | halloc 16 4 458752 985.25 21 | scatter 16 4 458752 8.09 22 | cuda 16 4 458752 1.09 23 | halloc 16 4 524288 990.09 24 | scatter 16 4 524288 5.23 25 | cuda 16 4 524288 1.09 26 | halloc 16 4 589824 981.62 27 | scatter 16 4 589824 3.81 28 | cuda 16 4 589824 1.06 29 | halloc 16 4 655360 990.06 30 | scatter 16 4 655360 3.15 31 | cuda 16 4 655360 1.09 32 | halloc 64 1 65536 843.04 33 | scatter 64 1 65536 200.29 34 | cuda 64 1 65536 1.35 35 | halloc 64 1 131072 631.26 36 | scatter 64 1 131072 185.52 37 | cuda 64 1 131072 1.42 38 | halloc 64 1 196608 792.66 39 | scatter 64 1 196608 183.51 40 | cuda 64 1 196608 1.42 41 | halloc 64 1 262144 813.74 42 | scatter 64 1 262144 180.21 43 | cuda 64 1 262144 1.40 44 | halloc 64 1 327680 822.22 45 | scatter 64 1 327680 179.32 46 | cuda 64 1 327680 1.37 47 | halloc 64 1 393216 825.92 48 | scatter 64 1 393216 
178.73 49 | cuda 64 1 393216 1.41 50 | halloc 64 1 458752 834.72 51 | scatter 64 1 458752 97.93 52 | cuda 64 1 458752 1.40 53 | halloc 64 1 524288 833.25 54 | scatter 64 1 524288 5.73 55 | cuda 64 1 524288 1.39 56 | halloc 64 1 589824 821.45 57 | scatter 64 1 589824 3.27 58 | cuda 64 1 589824 1.38 59 | halloc 64 1 655360 841.57 60 | scatter 64 1 655360 2.60 61 | cuda 64 1 655360 1.41 62 | -------------------------------------------------------------------------------- /tst/exp/halloc-vs-scatter/exp-log-spree.csv: -------------------------------------------------------------------------------- 1 | allocator alloc_sz nallocs nthreads thru thru_malloc thru_free 2 | halloc 16 4 65536 558.85 1058.06 1166.94 3 | scatter 16 4 65536 180.58 244.43 673.03 4 | cuda 16 4 65536 0.31 0.33 7.29 5 | halloc 16 4 131072 647.81 1181.17 1412.47 6 | scatter 16 4 131072 138.49 166.11 798.98 7 | cuda 16 4 131072 0.31 0.33 7.14 8 | halloc 16 4 196608 658.35 1165.55 1488.63 9 | scatter 16 4 196608 128.39 149.57 863.38 10 | cuda 16 4 196608 0.30 0.31 7.23 11 | halloc 16 4 262144 683.10 1225.46 1519.11 12 | scatter 16 4 262144 102.87 116.73 817.47 13 | cuda 16 4 262144 0.28 0.29 7.27 14 | halloc 16 4 327680 701.69 1268.07 1546.45 15 | scatter 16 4 327680 7.80 7.82 772.93 16 | cuda 16 4 327680 0.27 0.28 7.27 17 | halloc 16 4 393216 716.42 1302.52 1567.44 18 | scatter 16 4 393216 5.62 5.62 777.60 19 | cuda 16 4 393216 0.25 0.26 7.36 20 | halloc 16 4 458752 714.80 1289.26 1579.07 21 | scatter 16 4 458752 2.43 2.42 774.32 22 | cuda 16 4 458752 0.23 0.24 7.19 23 | halloc 16 4 524288 723.18 1306.52 1594.38 24 | scatter 16 4 524288 1.13 1.13 711.65 25 | cuda 16 4 524288 0.22 0.22 7.24 26 | halloc 16 4 589824 740.49 1359.65 1601.15 27 | scatter 16 4 589824 0.98 0.97 714.50 28 | cuda 16 4 589824 0.20 0.20 7.15 29 | halloc 16 4 655360 736.18 1333.57 1617.76 30 | scatter 16 4 655360 0.76 0.75 680.68 31 | cuda 16 4 655360 0.19 0.19 7.18 32 | halloc 64 1 65536 303.24 572.84 634.75 33 | scatter 64 1 65536 105.39 152.90 331.62 34 | cuda 64 1 65536 0.29 0.30 5.90 35 | halloc 64 1 131072 365.00 644.84 827.53 36 | scatter 64 1 131072 61.70 73.22 374.97 37 | cuda 64 1 131072 0.31 0.33 6.66 38 | halloc 64 1 196608 408.28 744.14 890.58 39 | scatter 64 1 196608 47.34 53.84 370.44 40 | cuda 64 1 196608 0.32 0.33 7.02 41 | halloc 64 1 262144 428.33 760.95 964.29 42 | scatter 64 1 262144 48.93 55.29 400.87 43 | cuda 64 1 262144 0.31 0.33 7.08 44 | halloc 64 1 327680 450.79 804.79 1008.55 45 | scatter 64 1 327680 47.26 53.25 395.21 46 | cuda 64 1 327680 0.30 0.31 7.13 47 | halloc 64 1 393216 456.52 804.09 1039.07 48 | scatter 64 1 393216 44.96 50.00 416.80 49 | cuda 64 1 393216 0.29 0.30 7.29 50 | halloc 64 1 458752 467.69 824.36 1063.50 51 | scatter 64 1 458752 3.45 3.46 421.65 52 | cuda 64 1 458752 0.29 0.30 7.32 53 | halloc 64 1 524288 475.88 838.48 1082.67 54 | scatter 64 1 524288 1.34 1.34 378.90 55 | cuda 64 1 524288 0.29 0.30 7.27 56 | halloc 64 1 589824 481.29 847.43 1095.94 57 | scatter 64 1 589824 0.93 0.92 371.99 58 | cuda 64 1 589824 0.28 0.29 7.36 59 | halloc 64 1 655360 496.02 884.87 1110.82 60 | scatter 64 1 655360 0.88 0.87 377.13 61 | cuda 64 1 655360 0.29 0.30 7.35 62 | -------------------------------------------------------------------------------- /tst/exp/halloc-vs-scatter/exp-plot.gpl: -------------------------------------------------------------------------------- 1 | # fixed slab size, varying occupancy 2 | set terminal pdf enhanced color font ",10" 3 | set pointsize 0.65 4 | set style data linespoints 5 | 6 | set 
output "vs-priv.pdf" 7 | set logscale y 2 8 | set xlabel "Allocation fraction" 9 | set ylabel "Throughput, Mops/s" 10 | plot [0.19:0.35][2:4096] \ 11 | "", $ocsv_name) 25 | || die "cannot open file $ocsv_name for writing"; 26 | $oline = "allocator alloc_sz nallocs ntries thru\n"; 27 | print $oline; 28 | print OCSV $oline; 29 | foreach $alloc_sz (16, 64) { 30 | # foreach $alloc_sz (16) { 31 | # for($nallocs = 1; $nallocs < 16; $nallocs++) { 32 | # for($ffree = 0.25; $ffree <= 0.35; $ffree += 0.01) { 33 | for($nthreads = 64 * 1024; $nthreads <= $max_nthreads; 34 | $nthreads += 64 * 1024) { 35 | # for($ntries = 2; $ntries <= 32; $ntries += 2) { 36 | # $falloc = $ffree + $fexec - 0.01; 37 | $falloc = 0.9; 38 | $nallocs = $alloc_sz == 16 ? 4 : 1; 39 | # $nthreads = $max_nthreads; 40 | # foreach $allocator ("halloc", "scatter", "cuda") { 41 | foreach $allocator ("halloc", "scatter", "cuda") { 42 | $args = "-a$allocator -n$nthreads -l$nallocs -s$alloc_sz " . 43 | "-f$falloc -F$ffree -e$fexec"; 44 | # private speed 45 | $niters = 16; 46 | $ntries = $total_niters / $niters; 47 | if($allocator eq "cuda") { 48 | $ntries = 1; 49 | } 50 | runtest("throughput", $common, $args, "-i$niters -t$ntries"); 51 | $oline = "$allocator $alloc_sz $nallocs $nthreads $thru_pair\n"; 52 | # $oline = "$allocator $alloc_sz $nallocs $ffree $thru_pair\n"; 53 | print OCSV $oline; 54 | print $oline; 55 | } 56 | } 57 | } # foreach $alloc_sz 58 | close OCSV; 59 | } # sub priv_test 60 | 61 | # spree test: fractions fixed, nthreads varies 62 | sub spree_test { 63 | $ocsv_name = "./exp-log-spree.csv"; 64 | $OCSV = 100; 65 | $falloc = 0.9; $ffree = 0.2; $fexec = 0.71; 66 | $total_niters = 16; 67 | $common = "-f$falloc -F$ffree -e$fexec -m$memory -g$group"; 68 | open(OCSV, ">", $ocsv_name) 69 | || die "cannot open file $ocsv_name for writing"; 70 | $oline = "allocator alloc_sz nallocs nthreads thru thru_malloc thru_free\n"; 71 | print $oline; 72 | print OCSV $oline; 73 | foreach $alloc_sz (16, 64) { 74 | for($nthreads = 64 * 1024; $nthreads <= $max_nthreads; 75 | $nthreads += 64 * 1024) { 76 | $nallocs = $alloc_sz == 16 ? 4 : 1; 77 | foreach $allocator ("halloc", "scatter", "cuda") { 78 | $args = "-a$allocator -n$nthreads -l$nallocs -s$alloc_sz"; 79 | # private speed 80 | $niters = 1; 81 | $ntries = $total_niters / $niters; 82 | runtest("throughput", $common, $args, "-i$niters -t$ntries"); 83 | $oline = "$allocator $alloc_sz $nallocs $nthreads $thru_pair " 84 | . 
"$thru_malloc $thru_free\n"; 85 | print OCSV $oline; 86 | print $oline; 87 | } 88 | } 89 | } # foreach $alloc_sz 90 | close OCSV; 91 | } # sub spree_test 92 | 93 | # main 94 | priv_test(); 95 | spree_test(); 96 | # run gnuplot 97 | system('gnuplot', './exp-plot.gpl'); 98 | -------------------------------------------------------------------------------- /tst/exp/halloc-vs-scatter/graph.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import numpy as np #numerical stuff 3 | import sys 4 | import os 5 | 6 | import prettyplotlib as ppl # makes nicer colors and generally better to look at graphs 7 | import matplotlib.pyplot as plt 8 | import matplotlib as mpl 9 | from prettyplotlib import brewer2mpl 10 | 11 | def funlink(path): 12 | try: 13 | os.unlink(path) 14 | except: 15 | pass 16 | 17 | # filtering the numpy array for specific sz and l values 18 | def np_filter(data, sz): 19 | nps = data.shape[0] 20 | return np.array([data[i,:] for i in range(nps) if 21 | data[i,0]==sz]) 22 | 23 | inputFileName = "exp-log-priv.csv" 24 | data = np.loadtxt(inputFileName, skiprows=1, usecols=[1,2,3,4]) 25 | 26 | allocators = ['Halloc', 'ScatterAlloc', 'CUDA'] 27 | #allocators = ['Halloc'] 28 | 29 | nps0 = data.shape[0] 30 | fig = plt.figure(figsize=(12,7)) 31 | ax = fig.add_subplot(111) 32 | ax.set_yscale('log') 33 | #ymin = 1 34 | ymin = np.amin(data[:,3]) / 1.5 35 | ymax = np.amax(data[:,3]) * 1.5 36 | #ymin = 0.1 37 | #ymax = 5 * 10**3 38 | for ialloc in range(len(allocators)): 39 | for sz in [16, 64]: 40 | l = 1 41 | if(sz == 16): 42 | l = 4 43 | alloc = allocators[ialloc]; 44 | curData = data[np.array(range(nps0/3))*3 + ialloc, :] 45 | curData = np_filter(curData, sz) 46 | xs = range(curData.shape[0]) 47 | # allocation throughput for different sizes 48 | ppl.plot(ax, xs, curData[:,3], '-o', 49 | label=('%dx%d B %s' % (l,sz,alloc)), linewidth=2) 50 | ax.set_xlabel('#threads, x 1024') 51 | ax.set_ylabel('Throughput, Mops/s') 52 | if(ialloc == len(allocators) - 1 and sz == 64): 53 | ax.set_xticks(xs) 54 | ax.set_xticklabels(['%.0lf' % d for d in curData[:,2] / 1024.0]) 55 | ax.axis(xmin=-1, xmax=len(xs), ymin=ymin, ymax=ymax) 56 | ax.grid(axis='y', color='0.3', linestyle=':', antialiased=True) 57 | 58 | ax.set_title('Private Test Pair Throughput') 59 | ppl.legend(ax, loc=0) 60 | plt.tick_params(axis='both', which='major', direction='in', bottom=True) 61 | outputfilename = 'vs-priv-pair.pdf' 62 | funlink(outputfilename) 63 | fig.savefig(outputfilename, dpi=300, bbox_inches='tight') 64 | 65 | #plt.show() 66 | 67 | 68 | inputFileName = "exp-log-spree.csv" 69 | data = np.loadtxt(inputFileName, skiprows=1, usecols=[1,2,3,4,5,6]) 70 | 71 | allocators = ['Halloc', 'ScatterAlloc', 'CUDA'] 72 | nps0 = data.shape[0] 73 | fig = plt.figure(figsize=(12,7)) 74 | ax = fig.add_subplot(111) 75 | ax.set_yscale('log') 76 | ymin = np.amin(data[:,4]) / 1.5 77 | ymax = np.amax(data[:,4]) * 1.5 78 | for ialloc in range(len(allocators)): 79 | for sz in [16, 64]: 80 | l = 1 81 | if(sz == 16): 82 | l = 4 83 | alloc = allocators[ialloc]; 84 | curData = data[np.array(range(nps0/3))*3 + ialloc, :] 85 | curData = np_filter(curData, sz) 86 | xs = range(curData.shape[0]) 87 | # allocation throughput for different sizes 88 | ppl.plot(ax, xs, curData[:,4], '-o', 89 | label=('%dx%d B %s' % (l,sz,alloc)), linewidth=2) 90 | ax.set_xlabel('#threads, x 1024') 91 | ax.set_ylabel('Throughput, Mops/s') 92 | if(ialloc == len(allocators) - 1 and sz == 64): 93 | ax.set_xticks(xs) 94 | 
ax.set_xticklabels(['%.0lf' % d for d in curData[:,2] / 1024.0]) 95 | ax.axis(xmin=-1, xmax=len(xs), ymin=ymin, ymax=ymax) 96 | ax.grid(axis='y', color='0.3', linestyle=':', antialiased=True) 97 | 98 | ax.set_title('Spree Test malloc() Throughput') 99 | ppl.legend(ax, loc=0) 100 | plt.tick_params(axis='both', which='major', direction='in', bottom=True) 101 | outputfilename = 'vs-spree-malloc.pdf' 102 | funlink(outputfilename) 103 | fig.savefig(outputfilename, dpi=300, bbox_inches='tight') 104 | 105 | #plt.show() 106 | -------------------------------------------------------------------------------- /tst/exp/run-all-exps.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | cd scaling 4 | ./exp-run.pl 5 | cd .. 6 | 7 | cd settings 8 | ./exp-run.pl 9 | cd .. 10 | 11 | cd speed 12 | ./exp-run.pl 13 | cd .. 14 | 15 | cd halloc-vs-scatter 16 | ./exp-run.pl 17 | cd .. 18 | -------------------------------------------------------------------------------- /tst/exp/run-scaling-speed.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | 4 | cd scaling 5 | ./exp-run.pl 6 | #gnuplot ./exp-plot.gpl 7 | cd .. 8 | 9 | cd settings 10 | ./exp-run.pl 11 | #gnuplot ./exp-plot.gpl 12 | cd .. 13 | 14 | cd speed 15 | ./exp-run.pl 16 | #gnuplot ./exp-plot.gpl 17 | cd .. 18 | -------------------------------------------------------------------------------- /tst/exp/scaling/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.png 5 | *.pdf 6 | -------------------------------------------------------------------------------- /tst/exp/scaling/exp-log-lat.csv: -------------------------------------------------------------------------------- 1 | alloc_sz nallocs nthreads malloc_min malloc_avg malloc_max free_min free_avg free_max 2 | 16 1 1 2669.00 3046.49 5233.00 1966.00 2053.32 2606.00 3 | 256 1 1 2664.00 3020.17 5285.00 1970.00 2051.83 2578.00 4 | 16 1 2 2665.00 3034.24 5190.00 1970.00 2075.22 2574.00 5 | 256 1 2 2672.00 3050.47 5385.00 1970.00 2072.81 2542.00 6 | 16 1 4 2705.00 3068.02 5213.00 2042.00 2117.30 2578.00 7 | 256 1 4 2689.00 3048.25 5297.00 2006.00 2085.87 2582.00 8 | 16 1 8 2757.00 3106.10 5337.00 2090.00 2175.38 2602.00 9 | 256 1 8 2685.00 3063.93 5233.00 2010.00 2091.18 2490.00 10 | 16 1 16 2753.00 3170.95 5445.00 2094.00 2218.83 2730.00 11 | 256 1 16 2729.00 3117.48 5329.00 2018.00 2112.97 2594.00 12 | 16 1 32 2897.00 3181.90 4801.00 2182.00 2295.02 2758.00 13 | 256 1 32 2720.00 3056.85 4717.00 2050.00 2155.36 2622.00 14 | 16 1 64 2905.00 3174.13 4585.00 2182.00 2282.11 2682.00 15 | 256 1 64 2713.00 3011.81 4672.00 2046.00 2135.31 2506.00 16 | 16 1 128 2885.00 3151.48 4584.00 2166.00 2269.20 2646.00 17 | 256 1 128 2697.00 2964.06 4572.00 2034.00 2119.09 2586.00 18 | 16 1 256 2885.00 3141.89 4556.00 2166.00 2262.80 2790.00 19 | 256 1 256 2693.00 2948.81 4548.00 2030.00 2123.28 2562.00 20 | 16 1 512 2825.00 3157.58 4636.00 2162.00 2272.25 2958.00 21 | 256 1 512 2689.00 2969.11 4600.00 2030.00 2129.53 2790.00 22 | 16 1 1024 2809.00 3108.37 4576.00 2146.00 2256.25 2946.00 23 | 256 1 1024 2677.00 2950.17 4536.00 2010.00 2111.54 2866.00 24 | 16 1 2048 2796.00 3090.98 4589.00 2126.00 2248.39 2942.00 25 | 256 1 2048 2661.00 2950.79 4501.00 1994.00 2104.90 2910.00 26 | 16 1 4096 2792.00 3133.48 4643.00 2129.00 2278.27 3025.00 27 | 256 1 4096 2650.00 2994.24 4517.00 1994.00 2129.01 2942.00 28 | 16 1 8192 2806.00 3269.68 5114.00 2134.00 
2349.96 3266.00 29 | 256 1 8192 2666.00 3104.65 5758.00 1994.00 2192.73 3172.00 30 | 16 1 16384 2805.00 3993.63 9289.00 2130.00 2522.31 5405.00 31 | 256 1 16384 2665.00 3771.82 25824.00 1994.00 2434.00 5683.00 32 | 16 1 32768 2794.00 3699.55 9389.00 2128.00 2488.10 5508.00 33 | 256 1 32768 2652.00 3631.28 26175.00 1994.00 2362.26 5788.00 34 | 16 1 65536 2796.00 3543.83 9365.00 2126.00 2468.92 5542.00 35 | 256 1 65536 2649.00 12856.61 66737.00 1986.00 2472.33 5755.00 36 | 16 1 131072 2805.00 3488.33 9691.00 2134.00 2463.11 5517.00 37 | 256 1 131072 2676.00 14862.38 124004.00 1990.00 2500.34 6001.00 38 | 16 1 262144 2801.00 3472.84 10822.00 2129.00 2458.63 5435.00 39 | 256 1 262144 2667.00 17748.94 169232.00 1993.00 2556.95 5812.00 40 | 16 1 524288 2797.00 3510.06 10846.00 2130.00 2456.23 5655.00 41 | 256 1 524288 2681.00 30596.38 197553.00 2012.00 3071.61 7347.00 42 | 16 1 1048576 2805.00 3629.24 14415.00 2130.00 2455.04 5298.00 43 | 256 1 1048576 2683.00 42801.76 250025.00 2002.00 3734.92 9445.00 44 | -------------------------------------------------------------------------------- /tst/exp/scaling/exp-log-thru.csv: -------------------------------------------------------------------------------- 1 | alloc_sz nallocs nthreads priv_pair spree_pair spree_malloc spree_free 2 | 16 1 32 3.11 0.61 1.17 1.25 3 | 16 4 32 4.06 1.74 3.27 3.66 4 | 256 1 32 3.21 0.60 1.16 1.24 5 | 16 1 64 5.97 1.19 2.30 2.43 6 | 16 4 64 7.97 3.33 6.28 7.01 7 | 256 1 64 6.32 1.20 2.31 2.47 8 | 16 1 128 11.79 2.40 4.66 4.90 9 | 16 4 128 15.42 6.76 12.75 14.22 10 | 256 1 128 12.63 2.68 5.15 5.51 11 | 16 1 256 23.68 5.37 10.36 11.00 12 | 16 4 256 30.63 14.45 27.15 30.53 13 | 256 1 256 24.64 5.37 10.19 11.20 14 | 16 1 512 44.59 7.66 15.00 15.49 15 | 16 4 512 59.58 22.65 43.36 46.87 16 | 256 1 512 46.30 7.58 14.56 15.62 17 | 16 1 1024 89.53 15.43 30.34 31.04 18 | 16 4 1024 118.87 45.02 86.98 92.29 19 | 256 1 1024 92.89 15.51 30.41 31.31 20 | 16 1 2048 177.10 30.63 60.00 61.87 21 | 16 4 2048 236.00 89.26 172.32 183.08 22 | 256 1 2048 184.24 30.54 59.76 61.75 23 | 16 1 4096 347.55 59.95 119.07 119.41 24 | 16 4 4096 458.70 172.43 335.65 350.60 25 | 256 1 4096 361.17 61.05 120.40 122.48 26 | 16 1 8192 667.23 118.77 234.67 237.82 27 | 16 4 8192 879.54 336.12 650.60 687.53 28 | 256 1 8192 692.55 120.26 237.31 241.12 29 | 16 1 16384 758.85 205.24 392.85 424.84 30 | 16 4 16384 932.55 501.49 928.16 1077.97 31 | 256 1 16384 788.59 190.48 351.94 410.25 32 | 16 1 32768 1012.49 347.03 654.60 729.97 33 | 16 4 32768 1218.44 753.21 1373.81 1647.22 34 | 256 1 32768 621.78 312.41 557.77 701.38 35 | 16 1 65536 1214.44 546.46 1026.08 1155.39 36 | 16 4 65536 1431.13 887.71 1712.50 1822.22 37 | 256 1 65536 454.22 419.25 675.25 1089.95 38 | 16 1 131072 1334.22 677.16 1276.19 1425.88 39 | 16 4 131072 1552.65 1064.33 2039.86 2200.11 40 | 256 1 131072 506.78 530.69 862.80 1359.21 41 | 16 1 262144 1366.82 792.88 1562.20 1592.19 42 | 16 4 262144 1544.41 1151.00 2176.99 2413.93 43 | 256 1 262144 421.84 610.48 998.63 1548.64 44 | 16 1 524288 1380.35 895.23 1767.78 1793.64 45 | 16 4 524288 1457.55 1193.52 2213.27 2559.74 46 | 256 1 524288 268.46 626.00 974.51 1723.85 47 | 16 1 1048576 1366.76 940.35 1818.48 1925.30 48 | 16 4 1048576 1396.75 1220.92 2269.06 2611.85 49 | 256 1 1048576 190.58 557.76 807.69 1770.94 50 | -------------------------------------------------------------------------------- /tst/exp/scaling/exp-plot.gpl: -------------------------------------------------------------------------------- 1 | # fixed slab size, varying occupancy 2 | set 
terminal pdf enhanced color font ",10" 3 | set pointsize 0.65 4 | set style data linespoints 5 | 6 | set output "1x16b-thru.pdf" 7 | set xlabel "#threads" 8 | set logscale x 2 9 | set ylabel "Throughput, Mops/s" 10 | plot [1024:2*1024*1024][0:2000] \ 11 | "", $ocsv_name) 24 | || die "cannot open file $ocsv_name for writing"; 25 | $oline = "alloc_sz nallocs nthreads priv_pair spree_pair spree_malloc spree_free\n"; 26 | print $oline; 27 | print OCSV $oline; 28 | for($nthreads = 32; $nthreads <= 1024 * 1024; $nthreads *= 2) { 29 | foreach $alloc_sz (16, 256) { 30 | foreach $nallocs (1, 4) { 31 | if($nallocs == 4 && $alloc_sz > 64) { 32 | next; 33 | } 34 | # spree test 35 | $ntries = $total_niters; 36 | $args = "-n$nthreads -l$nallocs -s$alloc_sz"; 37 | runtest("throughput", $common, $args, "-i1 -t$ntries"); 38 | $spree_pair = $thru_pair; 39 | $spree_malloc = $thru_malloc; 40 | $spree_free = $thru_free; 41 | # private test 42 | $niters = 32; 43 | $ntries = $total_niters / $niters; 44 | runtest("throughput", $common, $args, "-i$niters -t$ntries"); 45 | $priv_pair = $thru_pair; 46 | $oline = "$alloc_sz $nallocs $nthreads $priv_pair " . 47 | "$spree_pair $spree_malloc $spree_free\n"; 48 | print OCSV $oline; 49 | print $oline; 50 | } # foreach $nallocs 51 | } # foreach $alloc_sz 52 | } # for($nthreads) 53 | close OCSV; 54 | } # sub thru_test 55 | 56 | # latency test 57 | sub lat_test { 58 | $total_niters = 128; 59 | $ocsv_name = "./exp-log-lat.csv"; 60 | $OCSV = 100; 61 | open(OCSV, ">", $ocsv_name) 62 | || die "cannot open file $ocsv_name for writing"; 63 | $oline = "alloc_sz nallocs nthreads malloc_min malloc_avg malloc_max " . 64 | "free_min free_avg free_max\n"; 65 | print $oline; 66 | print OCSV $oline; 67 | for($nthreads = 1; $nthreads <= 1024 * 1024; $nthreads *= 2) { 68 | foreach $alloc_sz (16, 256) { 69 | $nallocs = 1; 70 | #foreach $nallocs (1, 4) { 71 | # if($nallocs == 4 && $alloc_sz > 64) { 72 | # next; 73 | # } 74 | # private test 75 | $niters = 16; 76 | $ntries = $total_niters / $niters; 77 | $args = "-n$nthreads -l$nallocs -s$alloc_sz"; 78 | runtest("latency", $common, $args, "-i$niters -t$ntries"); 79 | $priv_pair = $thru_pair; 80 | $oline = "$alloc_sz $nallocs $nthreads " . 81 | "$lat_malloc_min $lat_malloc_avg $lat_malloc_max " . 
82 | "$lat_free_min $lat_free_avg $lat_free_max\n"; 83 | print OCSV $oline; 84 | print $oline; 85 | #} # foreach $nallocs 86 | } # foreach $alloc_sz 87 | } # for($nthreads) 88 | close OCSV; 89 | } # sub lat_test 90 | 91 | # main 92 | thru_test(); 93 | lat_test(); 94 | # run gnuplot 95 | system('gnuplot', './exp-plot.gpl'); 96 | -------------------------------------------------------------------------------- /tst/exp/scaling/graph.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import numpy as np #numerical stuff 3 | import sys 4 | import os 5 | 6 | import prettyplotlib as ppl # makes nicer colors and generally better to look at graphs 7 | import matplotlib.pyplot as plt 8 | import matplotlib as mpl 9 | from prettyplotlib import brewer2mpl 10 | 11 | def funlink(path): 12 | try: 13 | os.unlink(path) 14 | except: 15 | pass 16 | 17 | inputFileName = "exp-log-thru.csv" 18 | data = np.loadtxt(inputFileName, skiprows=1) 19 | 20 | # filtering the numpy array for specific sz and l values 21 | def np_filter(data, l, sz): 22 | nps = data.shape[0] 23 | return np.array([data[i,:] for i in range(nps) if 24 | data[i,0]==sz and data[i,1]==l and data[i,2]>=1024]) 25 | 26 | for l in [1, 4]: 27 | for sz in [16, 256]: 28 | if(sz == 256 and l == 4): 29 | continue 30 | curData = np_filter(data, l, sz) 31 | xs = range(curData.shape[0]) 32 | 33 | # allocation throughput for different sizes 34 | fig = plt.figure(figsize=(12,7)) 35 | ax = fig.add_subplot(111) 36 | ppl.plot(ax, xs, curData[:,3], '-o', label="Private", linewidth=2) 37 | ppl.plot(ax, xs, curData[:,4], '-o', label="Spree", linewidth=2) 38 | ppl.plot(ax, xs, curData[:,5], '-o', label="Spree malloc", linewidth=2) 39 | ppl.plot(ax, xs, curData[:,6], '-o', label="Spree free", linewidth=2) 40 | ax.set_title("%dx%d B Throughput" % (l,sz)); 41 | ax.set_xlabel("#threads, x 1024") 42 | ax.set_ylabel("Throughput, Mops/s") 43 | ax.set_xticks(xs) 44 | ax.set_xticklabels(['%.0lf' % d for d in curData[:,2] / 1024]) 45 | ax.axis(xmin=-1, xmax=len(xs), ymin=0) 46 | ax.grid(axis='y', color='0.3', linestyle=':', antialiased=True) 47 | ppl.legend(ax, loc=0) 48 | plt.tick_params(axis='both', which='major', direction='in', bottom=True) 49 | outputfilename = '%dx%db-thru.pdf' % (l,sz) 50 | funlink(outputfilename) 51 | fig.savefig(outputfilename, dpi=300, bbox_inches='tight') 52 | 53 | 54 | inputFileName = "exp-log-lat.csv" 55 | data = np.loadtxt(inputFileName, skiprows=1) 56 | l = 1 57 | for sz in [16, 256]: 58 | for iaction in [0, 1]: 59 | actions = ['Malloc', 'Free'] 60 | action = actions[iaction] 61 | curData = np_filter(data, l, sz) 62 | xs = range(curData.shape[0]) 63 | # allocation throughput for different sizes 64 | fig = plt.figure(figsize=(12,7)) 65 | ax = fig.add_subplot(111) 66 | divd=0.732*1000 67 | ppl.plot(ax, xs, curData[:,3+3*iaction]/divd, '-o', label="Min", linewidth=2) 68 | ppl.plot(ax, xs, curData[:,4+3*iaction]/divd, '-o', label="Avg", linewidth=2) 69 | ppl.plot(ax, xs, curData[:,5+3*iaction]/divd, '-o', label="Max", linewidth=2) 70 | ax.set_title("%s Latency" % (action)); 71 | ax.set_xlabel("#threads, x 1024") 72 | ax.set_ylabel("Latency, us") 73 | ax.set_xticks(xs) 74 | ax.set_xticklabels(['%.0lf' % d for d in curData[:,2] / 1024]) 75 | ax.set_yscale('log') 76 | ax.axis(xmin=-1, xmax=len(xs), ymin=1) 77 | ax.grid(axis='y', color='0.3', linestyle=':', antialiased=True) 78 | ppl.legend(ax, loc=0) 79 | plt.tick_params(axis='both', which='major', direction='in', bottom=True) 80 | 
outputfilename = '1x%dB-lat-%s.pdf' % (sz,action.lower()) 81 | funlink(outputfilename) 82 | fig.savefig(outputfilename, dpi=300, bbox_inches='tight') 83 | -------------------------------------------------------------------------------- /tst/exp/settings/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.png 5 | *.pdf 6 | -------------------------------------------------------------------------------- /tst/exp/settings/exp-log.csv: -------------------------------------------------------------------------------- 1 | slab_size busy nallocs alloc_sz throughput speed 2 | 1 0.745 4 16 879.20 13.10 3 | 1 0.745 1 256 78.70 18.76 4 | 1 0.755 4 16 840.40 12.52 5 | 1 0.755 1 256 77.48 18.47 6 | 1 0.765 4 16 812.89 12.11 7 | 1 0.765 1 256 76.54 18.25 8 | 1 0.775 4 16 776.14 11.57 9 | 1 0.775 1 256 75.02 17.89 10 | 1 0.785 4 16 744.02 11.09 11 | 1 0.785 1 256 73.82 17.60 12 | 1 0.795 4 16 732.39 10.91 13 | 1 0.795 1 256 72.82 17.36 14 | 1 0.805 4 16 714.30 10.64 15 | 1 0.805 1 256 71.24 16.98 16 | 1 0.815 4 16 665.75 9.92 17 | 1 0.815 1 256 70.58 16.83 18 | 1 0.825 4 16 613.18 9.14 19 | 1 0.825 1 256 69.28 16.52 20 | 1 0.835 4 16 612.29 9.12 21 | 1 0.835 1 256 68.53 16.34 22 | 1 0.845 4 16 598.90 8.92 23 | 1 0.845 1 256 67.58 16.11 24 | 1 0.855 4 16 575.33 8.57 25 | 1 0.855 1 256 66.76 15.92 26 | 1 0.865 4 16 542.84 8.09 27 | 1 0.865 1 256 65.74 15.67 28 | 1 0.875 4 16 565.86 8.43 29 | 1 0.875 1 256 64.81 15.45 30 | 1 0.885 4 16 541.23 8.06 31 | 1 0.885 1 256 63.81 15.21 32 | 1 0.895 4 16 521.58 7.77 33 | 1 0.895 1 256 62.99 15.02 34 | 1 0.905 4 16 544.74 8.12 35 | 1 0.905 1 256 62.01 14.79 36 | 1 0.915 4 16 510.70 7.61 37 | 1 0.915 1 256 60.87 14.51 38 | 1 0.925 4 16 438.11 6.53 39 | 1 0.925 1 256 59.94 14.29 40 | 1 0.935 4 16 471.63 7.03 41 | 1 0.935 1 256 59.21 14.12 42 | 1 0.945 4 16 409.59 6.10 43 | 1 0.945 1 256 57.93 13.81 44 | 2 0.745 4 16 1443.03 21.50 45 | 2 0.745 1 256 156.03 37.20 46 | 2 0.755 4 16 1445.88 21.55 47 | 2 0.755 1 256 153.99 36.71 48 | 2 0.765 4 16 1442.89 21.50 49 | 2 0.765 1 256 153.37 36.57 50 | 2 0.775 4 16 1432.73 21.35 51 | 2 0.775 1 256 151.29 36.07 52 | 2 0.785 4 16 1425.57 21.24 53 | 2 0.785 1 256 149.80 35.72 54 | 2 0.795 4 16 1421.52 21.18 55 | 2 0.795 1 256 147.74 35.22 56 | 2 0.805 4 16 1421.90 21.19 57 | 2 0.805 1 256 145.90 34.79 58 | 2 0.815 4 16 1417.70 21.13 59 | 2 0.815 1 256 143.77 34.28 60 | 2 0.825 4 16 1367.36 20.38 61 | 2 0.825 1 256 142.19 33.90 62 | 2 0.835 4 16 1366.65 20.36 63 | 2 0.835 1 256 140.04 33.39 64 | 2 0.845 4 16 1304.42 19.44 65 | 2 0.845 1 256 138.25 32.96 66 | 2 0.855 4 16 1313.96 19.58 67 | 2 0.855 1 256 135.85 32.39 68 | 2 0.865 4 16 1279.31 19.06 69 | 2 0.865 1 256 133.39 31.80 70 | 2 0.875 4 16 1210.28 18.03 71 | 2 0.875 1 256 131.32 31.31 72 | 2 0.885 4 16 1142.57 17.03 73 | 2 0.885 1 256 129.17 30.80 74 | 2 0.895 4 16 1110.86 16.55 75 | 2 0.895 1 256 125.69 29.97 76 | 2 0.905 4 16 1037.18 15.46 77 | 2 0.905 1 256 123.12 29.35 78 | 2 0.915 4 16 964.39 14.37 79 | 2 0.915 1 256 119.22 28.42 80 | 2 0.925 4 16 908.65 13.54 81 | 2 0.925 1 256 116.32 27.73 82 | 2 0.935 4 16 843.35 12.57 83 | 2 0.935 1 256 114.11 27.21 84 | 2 0.945 4 16 798.92 11.90 85 | 2 0.945 1 256 112.85 26.91 86 | 4 0.745 4 16 1441.52 21.48 87 | 4 0.745 1 256 348.23 83.02 88 | 4 0.755 4 16 1420.42 21.17 89 | 4 0.755 1 256 339.54 80.95 90 | 4 0.765 4 16 1427.30 21.27 91 | 4 0.765 1 256 343.84 81.98 92 | 4 0.775 4 16 1463.45 21.81 93 | 4 0.775 1 256 338.31 80.66 94 | 4 0.785 4 16 1465.73 21.84 95 | 4 
0.785 1 256 339.53 80.95 96 | 4 0.795 4 16 1442.23 21.49 97 | 4 0.795 1 256 323.45 77.12 98 | 4 0.805 4 16 1402.25 20.90 99 | 4 0.805 1 256 333.15 79.43 100 | 4 0.815 4 16 1383.01 20.61 101 | 4 0.815 1 256 329.53 78.57 102 | 4 0.825 4 16 1344.60 20.04 103 | 4 0.825 1 256 329.00 78.44 104 | 4 0.835 4 16 1369.94 20.41 105 | 4 0.835 1 256 319.74 76.23 106 | 4 0.845 4 16 1407.73 20.98 107 | 4 0.845 1 256 332.08 79.17 108 | 4 0.855 4 16 1428.62 21.29 109 | 4 0.855 1 256 323.09 77.03 110 | 4 0.865 4 16 1361.36 20.29 111 | 4 0.865 1 256 319.26 76.12 112 | 4 0.875 4 16 1335.62 19.90 113 | 4 0.875 1 256 317.13 75.61 114 | 4 0.885 4 16 1274.91 19.00 115 | 4 0.885 1 256 309.58 73.81 116 | 4 0.895 4 16 1253.91 18.68 117 | 4 0.895 1 256 310.67 74.07 118 | 4 0.905 4 16 1309.75 19.52 119 | 4 0.905 1 256 300.99 71.76 120 | 4 0.915 4 16 1287.82 19.19 121 | 4 0.915 1 256 294.47 70.21 122 | 4 0.925 4 16 1238.84 18.46 123 | 4 0.925 1 256 301.45 71.87 124 | 4 0.935 4 16 1086.71 16.19 125 | 4 0.935 1 256 280.21 66.81 126 | 4 0.945 4 16 936.88 13.96 127 | 4 0.945 1 256 291.00 69.38 128 | 8 0.745 4 16 1443.02 21.50 129 | 8 0.745 1 256 909.74 216.90 130 | 8 0.755 4 16 1441.99 21.49 131 | 8 0.755 1 256 889.23 212.01 132 | 8 0.765 4 16 1445.51 21.54 133 | 8 0.765 1 256 874.22 208.43 134 | 8 0.775 4 16 1448.12 21.58 135 | 8 0.775 1 256 788.44 187.98 136 | 8 0.785 4 16 1451.03 21.62 137 | 8 0.785 1 256 830.44 197.99 138 | 8 0.795 4 16 1457.40 21.72 139 | 8 0.795 1 256 838.23 199.85 140 | 8 0.805 4 16 1459.32 21.75 141 | 8 0.805 1 256 833.81 198.80 142 | 8 0.815 4 16 1457.17 21.71 143 | 8 0.815 1 256 817.50 194.91 144 | 8 0.825 4 16 1453.37 21.66 145 | 8 0.825 1 256 812.52 193.72 146 | 8 0.835 4 16 1443.71 21.51 147 | 8 0.835 1 256 814.79 194.26 148 | 8 0.845 4 16 1315.38 19.60 149 | 8 0.845 1 256 809.40 192.98 150 | 8 0.855 4 16 1235.75 18.41 151 | 8 0.855 1 256 800.73 190.91 152 | 8 0.865 4 16 1227.53 18.29 153 | 8 0.865 1 256 782.43 186.55 154 | 8 0.875 4 16 1193.70 17.79 155 | 8 0.875 1 256 776.16 185.05 156 | 8 0.885 4 16 1237.99 18.45 157 | 8 0.885 1 256 757.69 180.65 158 | 8 0.895 4 16 1266.08 18.87 159 | 8 0.895 1 256 743.97 177.38 160 | 8 0.905 4 16 1277.63 19.04 161 | 8 0.905 1 256 719.34 171.50 162 | 8 0.915 4 16 1290.66 19.23 163 | 8 0.915 1 256 729.92 174.03 164 | 8 0.925 4 16 1273.92 18.98 165 | 8 0.925 1 256 672.34 160.30 166 | 8 0.935 4 16 1235.11 18.40 167 | 8 0.935 1 256 672.28 160.28 168 | 8 0.945 4 16 1176.06 17.52 169 | 8 0.945 1 256 661.98 157.83 170 | -------------------------------------------------------------------------------- /tst/exp/settings/exp-plot.gpl: -------------------------------------------------------------------------------- 1 | # fixed slab size, varying occupancy 2 | set terminal pdf enhanced color font ",10" 3 | set pointsize 0.65 4 | set style data linespoints 5 | 6 | set output "var-occup.pdf" 7 | set xlabel "Busy threshold" 8 | set ylabel "Speed, GiB/s" 9 | plot [][0:140] \ 10 | "", $ocsv_name) 23 | || die "cannot open file $ocsv_name for writing"; 24 | $oline = "slab_size busy nallocs alloc_sz throughput speed\n"; 25 | print OCSV $oline; 26 | print $oline; 27 | foreach $slab_size (20, 21, 22, 23) { 28 | #foreach $slab_size (22) { 29 | # foreach $busy (0.75, 0.835, 0.9, 0.95) { 30 | # foreach $busy (0.835) { 31 | for($busy = 0.745; $busy <= 0.955; $busy += 0.01) { 32 | foreach $alloc_sz (16, 256) { 33 | # foreach $alloc_sz (16) { 34 | my $nallocs = $alloc_sz == 16 ? 
4 : 1; 35 | runtest("throughput", $common, "-b$slab_size", "-B$busy", 36 | "-s$alloc_sz", "-l$nallocs"); 37 | my $slab_sz = 2 ** ($slab_size - 20); 38 | $oline = 39 | "$slab_sz $busy $nallocs $alloc_sz $thru_pair $speed_pair\n"; 40 | print OCSV $oline; 41 | print $oline; 42 | } # foreach alloc_sz 43 | } # foreach busy 44 | } # foreach slab_size 45 | 46 | close OCSV; 47 | # run gnuplot 48 | system('gnuplot', './exp-plot.gpl'); 49 | -------------------------------------------------------------------------------- /tst/exp/speed/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.png 5 | *.pdf 6 | -------------------------------------------------------------------------------- /tst/exp/speed/exp-log-combi.csv: -------------------------------------------------------------------------------- 1 | block priv_speed priv_thru spree_speed spree_speed_malloc speed_thru speed_thru_malloc 2 | 8..32 25.17 1351.07 21.56 38.48 1157.69 2065.96 3 | 8..64 32.00 954.48 37.15 65.65 1108.01 1958.13 4 | 8..256 113.98 927.18 97.75 177.29 795.18 1442.11 5 | 8..3072 93.01 64.85 126.69 137.21 88.33 95.66 6 | -------------------------------------------------------------------------------- /tst/exp/speed/exp-log-single.csv: -------------------------------------------------------------------------------- 1 | block priv_speed priv_thru spree_speed spree_speed_malloc speed_thru speed_thru_malloc 2 | 16 20.89 1401.70 18.18 33.79 1220.15 2267.83 3 | 24 25.11 1123.53 25.83 46.64 1155.82 2086.62 4 | 32 41.83 1403.64 35.65 65.45 1196.23 2196.17 5 | 48 22.70 507.83 47.93 84.00 1072.26 1879.02 6 | 64 23.26 390.20 59.86 102.81 1004.20 1724.82 7 | 96 36.88 412.49 67.94 106.74 759.91 1193.91 8 | 128 35.26 295.80 82.85 131.64 695.02 1104.25 9 | 192 53.44 298.88 53.32 66.25 298.19 370.48 10 | 256 46.47 194.92 60.05 70.94 251.85 297.56 11 | 384 47.71 133.39 79.04 92.03 221.00 257.33 12 | 512 60.11 126.06 89.41 102.16 187.50 214.25 13 | 768 66.74 93.31 78.01 85.12 109.07 119.01 14 | 1024 68.13 71.44 92.00 99.52 96.47 104.36 15 | 1536 65.66 45.90 82.36 86.91 57.57 60.76 16 | 2048 60.71 31.83 100.74 106.33 52.82 55.75 17 | 3072 56.30 19.68 85.90 89.04 30.02 31.12 18 | -------------------------------------------------------------------------------- /tst/exp/speed/exp-plot.gpl: -------------------------------------------------------------------------------- 1 | # fixed slab size, varying occupancy 2 | set terminal pdf enhanced color font ",10" 3 | set pointsize 0.65 4 | set style data linespoints 5 | 6 | set output "single-speed.pdf" 7 | set xlabel "Allocation size, B" 8 | set logscale x 2 9 | set ylabel "Speed, GiB/s" 10 | plot [12:4096][0:120] \ 11 | "exp-log-single.csv" u 1:2 title "Private",\ 12 | "exp-log-single.csv" u 1:4 title "Spree",\ 13 | "exp-log-single.csv" u 1:5 title "Spree malloc" 14 | 15 | set output "single-thru.pdf" 16 | set xlabel "Allocation size, B" 17 | set logscale xy 2 18 | set ylabel "Throughput, Mops/s" 19 | plot [12:4096][10:2000] \ 20 | "exp-log-single.csv" u 1:3 title "Private",\ 21 | "exp-log-single.csv" u 1:6 title "Spree",\ 22 | "exp-log-single.csv" u 1:7 title "Spree malloc" 23 | 24 | set style data boxes 25 | set boxwidth 0.225 26 | set style fill solid 0.6 27 | 28 | set output "combi-speed.pdf" 29 | set xlabel "Allocation size range, B" 30 | unset logscale xy 31 | set ylabel "Speed, GiB/s" 32 | plot [][0:160] \ 33 | "exp-log-combi.csv" u 0:2 title "Private",\ 34 | "exp-log-combi.csv" u ($0+0.25):4:xticlabels(1) title "Spree",\ 35 | 
"exp-log-combi.csv" u ($0+0.5):5 title "Spree malloc" 36 | 37 | set output "combi-thru.pdf" 38 | set xlabel "Allocation size range, B" 39 | set logscale y 2 40 | set ylabel "Throughput, Mops/s" 41 | plot [][16:2048] \ 42 | "exp-log-combi.csv" u 0:3 title "Private",\ 43 | "exp-log-combi.csv" u ($0+0.25):6:xticlabels(1) title "Spree",\ 44 | "exp-log-combi.csv" u ($0+0.5):7 title "Spree malloc" 45 | -------------------------------------------------------------------------------- /tst/exp/speed/exp-run.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env perl 2 | 3 | # a script to perform experiment with settings 4 | use POSIX; 5 | 6 | #include common functions 7 | do '../common.pl'; 8 | 9 | $falloc = 0.95; 10 | $ffree = 0.05; 11 | $fexec = 0.91; 12 | $memory = 512 * 1024 * 1024; 13 | $group = 5; 14 | $max_nthreads = 1024 * 1024; 15 | $mem_fraction = 0.4; 16 | $total_niters = 512; 17 | $common = "-f$falloc -F$ffree -e$fexec -m$memory -g$group"; 18 | 19 | # running a speed test 20 | sub run_speedtest { 21 | # spree speed 22 | $ntries = $total_niters; 23 | runtest("throughput", $common, $_[0], "-i1 -t$ntries"); 24 | $spree_speed = $speed_pair; 25 | $spree_speed_malloc = $speed_malloc; 26 | $spree_thru = $thru_pair; 27 | $spree_thru_malloc = $thru_malloc; 28 | # private speed 29 | $niters = 32; 30 | $ntries = $total_niters / $niters; 31 | runtest("throughput", $common, $_[0], "-i$niters -t$ntries"); 32 | $priv_speed = $speed_pair; 33 | $priv_thru = $thru_pair; 34 | } # run_speedtest 35 | 36 | # single-size test 37 | sub single_size { 38 | @alloc_szs = (16, 24, 32, 48, 64, 96, 128, 192, 256, 384, 512, 768, 1024, 1536, 39 | 2048, 3072); 40 | $ocsv_name = "./exp-log-single.csv"; 41 | $OCSV = 100; 42 | open(OCSV, ">", $ocsv_name) 43 | || die "cannot open file $ocsv_name for writing"; 44 | $oline = "block priv_speed priv_thru spree_speed spree_speed_malloc " . 45 | "speed_thru speed_thru_malloc\n"; 46 | print $oline; 47 | print OCSV $oline; 48 | foreach $alloc_sz (@alloc_szs) { 49 | $nallocs = 1; 50 | if($alloc_sz <= 64) { 51 | $nallocs = 4; 52 | } elsif($alloc_sz <= 128) { 53 | $nallocs = 2; 54 | } 55 | $nthreads = floor($mem_fraction * $memory / ($alloc_sz * $nallocs)); 56 | $nthreads = $nthreads > $max_nthreads ? $max_nthreads : $nthreads; 57 | run_speedtest("-n$nthreads -l$nallocs -s$alloc_sz"); 58 | $oline = "$alloc_sz $priv_speed $priv_thru " . 59 | "$spree_speed $spree_speed_malloc $spree_thru $spree_thru_malloc\n"; 60 | print OCSV $oline; 61 | print $oline; 62 | } # foreach $alloc_sz 63 | 64 | close OCSV; 65 | } # sub single_size 66 | 67 | # combined-size tests 68 | sub combi_size { 69 | @min_alloc_szs = (8, 8, 8, 8); 70 | @max_alloc_szs = (32, 64, 256, 3072); 71 | $ocsv_name = "./exp-log-combi.csv"; 72 | $OCSV = 100; 73 | open(OCSV, ">", $ocsv_name) 74 | || die "cannot open file $ocsv_name for writing"; 75 | $oline = "block priv_speed priv_thru spree_speed spree_speed_malloc " . 76 | "speed_thru speed_thru_malloc\n"; 77 | print $oline; 78 | print OCSV $oline; 79 | for($isz = 0; $isz < @min_alloc_szs; $isz++) { 80 | $min_sz = $min_alloc_szs[$isz]; 81 | $max_sz = $max_alloc_szs[$isz]; 82 | $nallocs = 1; 83 | if($max_sz <= 64) { 84 | $nallocs = 4; 85 | } 86 | $avg_sz = ($min_sz + $max_sz) / 2; 87 | $nthreads = floor($mem_fraction * $memory / ($avg_sz * $nallocs)); 88 | $nthreads = $nthreads > $max_nthreads ? 
$max_nthreads : $nthreads; 89 | run_speedtest("-n$nthreads -l$nallocs -s$min_sz -S$max_sz"); 90 | $oline = "$min_sz..$max_sz $priv_speed $priv_thru " . 91 | "$spree_speed $spree_speed_malloc $spree_thru $spree_thru_malloc\n"; 92 | print OCSV $oline; 93 | print $oline; 94 | } # foreach $alloc_sz 95 | 96 | close OCSV; 97 | } # sub combi_size 98 | 99 | 100 | # main 101 | single_size(); 102 | combi_size(); 103 | # run gnuplot 104 | system('gnuplot', './exp-plot.gpl'); 105 | -------------------------------------------------------------------------------- /tst/exp/speed/graph.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import numpy as np #numerical stuff 3 | import sys 4 | import os 5 | 6 | import prettyplotlib as ppl # makes nicer colors and generally better to look at graphs 7 | import matplotlib.pyplot as plt 8 | import matplotlib as mpl 9 | from prettyplotlib import brewer2mpl 10 | 11 | def funlink(path): 12 | try: 13 | os.unlink(path) 14 | except: 15 | pass 16 | 17 | # change font to Open Sans (has some kerning issues, though) 18 | #mpl.rcParams.update({'font.family':'Open Sans'}) 19 | 20 | inputFileName = "exp-log-single.csv" 21 | 22 | data = np.loadtxt(inputFileName, skiprows=1) 23 | xs = range(data.shape[0]) 24 | 25 | # allocation speed for different sizes 26 | fig = plt.figure(figsize=(12,7)) 27 | ax = fig.add_subplot(111) 28 | ppl.plot(ax, xs, data[:,1], '-o', label="Private", linewidth=2) 29 | ppl.plot(ax, xs, data[:,3], '-o', label="Spree", linewidth=2) 30 | ppl.plot(ax, xs, data[:,4], '-o', label="Spree malloc", linewidth=2) 31 | ax.set_title("Allocation Speed for Different Sizes"); 32 | ax.set_xlabel("Allocation size, B") 33 | ax.set_ylabel("Speed, GiB/s") 34 | ax.set_xticks(xs) 35 | ax.set_xticklabels(['%.0lf' % d for d in data[:, 0]]) 36 | ax.axis(xmin=-1, ymin=0) 37 | ax.grid(axis='y', color='0.3', linestyle=':', antialiased=True) 38 | ppl.legend(ax, loc=0) 39 | plt.tick_params(axis='both', which='major', direction='in', bottom=True) 40 | outputfilename = 'single-speed.pdf' 41 | funlink(outputfilename) 42 | fig.savefig(outputfilename, dpi=300, bbox_inches='tight') 43 | 44 | # allocation throughput for different sizes 45 | fig = plt.figure(figsize=(12,7)) 46 | ax = fig.add_subplot(111) 47 | ppl.plot(ax, xs, data[:,2], '-o', label="Private", linewidth=2) 48 | ppl.plot(ax, xs, data[:,5], '-o', label="Spree", linewidth=2) 49 | ppl.plot(ax, xs, data[:,6], '-o', label="Spree malloc", linewidth=2) 50 | ax.set_title("Allocation Throughput for Different Sizes"); 51 | ax.set_xlabel("Allocation size, B") 52 | ax.set_ylabel("Throughput, Mop/s") 53 | ax.set_yscale('log') 54 | ax.set_xticks(xs) 55 | ax.set_xticklabels(['%.0lf' % d for d in data[:, 0]]) 56 | ax.axis(xmin=-1, ymin=1) 57 | ax.grid(axis='y', color='0.3', linestyle=':', antialiased=True) 58 | ppl.legend(ax, loc=0) 59 | plt.tick_params(axis='both', which='major', direction='in', bottom=True) 60 | outputfilename = 'single-thru.pdf' 61 | funlink(outputfilename) 62 | fig.savefig(outputfilename, dpi=300, bbox_inches='tight') 63 | 64 | # size combinations 65 | data = np.loadtxt('exp-log-combi.csv', skiprows=1, usecols=[1, 2, 3, 4, 5, 6]) 66 | labels = ['8..32', '8..64', '8..256', '8..3072'] 67 | xs = np.array(range(data.shape[0])) * 2 68 | step=0.3 69 | width=0.25 70 | 71 | # allocation speed for different size combinations 72 | fig = plt.figure(figsize=(12,7)) 73 | ax = fig.add_subplot(111) 74 | ppl.bar(ax, xs, data[:,0], color='b', width=width, label="Private") 75 
| ppl.bar(ax, xs + step, data[:,2], color='g', width=width, label="Spree") 76 | ppl.bar(ax, xs + 2*step, data[:,3], color='r', width=width, label="Spree malloc") 77 | ax.set_title("Allocation Speed for Combinations of Sizes"); 78 | ax.set_xlabel("Allocation size, B") 79 | ax.set_ylabel("Speed, GiB/s") 80 | ax.set_xticks(xs + 0.45) 81 | ax.set_xticklabels(labels) 82 | ax.axis(xmin=-0.5, ymin=1) 83 | ax.grid(axis='y', color='0.3', linestyle=':', antialiased=True) 84 | ppl.legend(ax, loc=0) 85 | outputfilename = 'combi-speed.pdf' 86 | funlink(outputfilename) 87 | fig.savefig(outputfilename, dpi=300, bbox_inches='tight') 88 | 89 | # allocation throughput for different size combinations 90 | fig = plt.figure(figsize=(12,7)) 91 | ax = fig.add_subplot(111) 92 | ppl.bar(ax, xs, data[:,1], color='b', width=width, label="Private") 93 | ppl.bar(ax, xs + step, data[:,4], color='g', width=width, label="Spree") 94 | ppl.bar(ax, xs + 2*step, data[:,5], color='r', width=width, label="Spree malloc") 95 | ax.set_title("Allocation Throughput for Combinations of Sizes"); 96 | ax.set_xlabel("Allocation size, B") 97 | ax.set_ylabel("Throughput, Mop/s") 98 | ax.set_xticks(xs + 0.45) 99 | ax.set_xticklabels(labels) 100 | ax.axis(xmin=-0.5, ymin=1) 101 | ax.grid(axis='y', color='0.3', linestyle=':', antialiased=True) 102 | ppl.legend(ax, loc=0) 103 | outputfilename = 'combi-thru.pdf' 104 | funlink(outputfilename) 105 | fig.savefig(outputfilename, dpi=300, bbox_inches='tight') 106 | 107 | #plt.show() 108 | -------------------------------------------------------------------------------- /tst/include/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.o 5 | libscatteralloc.a 6 | scatter-alloc.h 7 | -------------------------------------------------------------------------------- /tst/include/halloc.h: -------------------------------------------------------------------------------- 1 | ../../src/halloc.h -------------------------------------------------------------------------------- /tst/perf/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.o 5 | -------------------------------------------------------------------------------- /tst/perf/bin/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | -------------------------------------------------------------------------------- /tst/perf/latency/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.o 5 | -------------------------------------------------------------------------------- /tst/perf/latency/latency.cu: -------------------------------------------------------------------------------- 1 | /** @file latency.cu latency test for various memory allocators */ 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | /** measures malloc latencies; note that latencies are averaged per-thread, 14 | per-allocation latencies are not preserved; latencies here are measured in cycles */ 15 | template 16 | __global__ void latency_malloc_k 17 | (CommonOpts opts, void **ptrs, double *latencies) { 18 | int n = opts.nthreads, i = threadIdx.x + blockIdx.x * blockDim.x; 19 | if(opts.is_thread_inactive(i)) 20 | return; 21 | for(int ialloc = 0; ialloc < opts.nallocs; ialloc++) { 22 | uint sz = opts.next_alloc_sz(); 23 | uint64 t1 = clock64(); 24 | ptrs[i + n * ialloc] 
= T::malloc(sz); 25 | uint64 t2 = clock64(), latency = t2 - t1; 26 | latencies[i + ialloc * n] = (double)latency; 27 | } 28 | } // latency_malloc_k 29 | 30 | // TODO: verify that all pointers are non-zero 31 | 32 | template 33 | __global__ void latency_free_k 34 | (CommonOpts opts, void **ptrs, double *latencies) { 35 | int n = opts.nthreads, i = threadIdx.x + blockIdx.x * blockDim.x; 36 | if(opts.is_thread_inactive(i)) 37 | return; 38 | for(int ialloc = 0; ialloc < opts.nallocs; ialloc++) { 39 | uint64 t1 = clock64(); 40 | T::free(ptrs[i + n * ialloc]); 41 | uint64 t2 = clock64(), latency = t2 - t1; 42 | latencies[i + ialloc * n] = (double)latency; 43 | } 44 | } // latency_free_k 45 | 46 | template class LatencyTest { 47 | 48 | public: 49 | void operator()(CommonOpts opts, bool warmup) { 50 | opts.niters = 1; 51 | // allocate memory 52 | if(warmup) { 53 | opts.nthreads = min(4 * opts.bs, opts.nthreads); 54 | opts.ntries = 1; 55 | } 56 | if(!warmup) 57 | printf("latency test\n"); 58 | int n = opts.nthreads, bs = opts.bs, grid = divup(n, bs); 59 | int nptrs = n * opts.nallocs; 60 | size_t ptrs_sz = nptrs * sizeof(void *); 61 | size_t lat_sz = nptrs * sizeof(double); 62 | void **d_ptrs; 63 | cucheck(cudaMalloc((void **)&d_ptrs, ptrs_sz)); 64 | cucheck(cudaMemset(d_ptrs, 0, ptrs_sz)); 65 | double *h_malloc_latencies = (double *)malloc(lat_sz); 66 | double *h_free_latencies = (double *)malloc(lat_sz); 67 | double *d_malloc_latencies, *d_free_latencies; 68 | cucheck(cudaMalloc((void **)&d_malloc_latencies, lat_sz)); 69 | cucheck(cudaMalloc((void **)&d_free_latencies, lat_sz)); 70 | cucheck(cudaMemset(d_malloc_latencies, 0, lat_sz)); 71 | cucheck(cudaMemset(d_free_latencies, 0, lat_sz)); 72 | 73 | // latency variables 74 | double avg_malloc_latency = 0, avg_free_latency = 0; 75 | double min_malloc_latency = FLT_MAX, min_free_latency = FLT_MAX; 76 | double max_malloc_latency = FLT_MIN, max_free_latency = FLT_MIN; 77 | 78 | // do testing 79 | for(int itry = 0; itry < opts.ntries; itry++) { 80 | // allocate 81 | latency_malloc_k <<>>(opts, d_ptrs, d_malloc_latencies); 82 | cucheck(cudaGetLastError()); 83 | cucheck(cudaStreamSynchronize(0)); 84 | // check that pointers are correct 85 | if(!check_nz(d_ptrs, 0, nptrs, opts)) { 86 | fprintf(stderr, "cannot allocate enough memory\n"); 87 | exit(-1); 88 | } 89 | // free 90 | latency_free_k <<>>(opts, d_ptrs, d_free_latencies); 91 | cucheck(cudaGetLastError()); 92 | cucheck(cudaStreamSynchronize(0)); 93 | // collect latency infos 94 | if(!warmup) { 95 | cucheck(cudaMemcpy(h_malloc_latencies, d_malloc_latencies, lat_sz, 96 | cudaMemcpyDeviceToHost)); 97 | cucheck(cudaMemcpy(h_free_latencies, d_free_latencies, lat_sz, 98 | cudaMemcpyDeviceToHost)); 99 | for(int i = 0; i < n; i += opts.period_mask + 1) { 100 | for(int ialloc = 0; ialloc < opts.nallocs; ialloc++) { 101 | double malloc_latency = h_malloc_latencies[ialloc * n + i]; 102 | double free_latency = h_free_latencies[ialloc * n + i]; 103 | avg_malloc_latency += malloc_latency; 104 | avg_free_latency += free_latency; 105 | min_malloc_latency = min(min_malloc_latency, malloc_latency); 106 | min_free_latency = min(min_free_latency, free_latency); 107 | max_malloc_latency = max(max_malloc_latency, malloc_latency); 108 | max_free_latency = max(max_free_latency, free_latency); 109 | } 110 | } 111 | } // if(not warmup) 112 | } // for(itry) 113 | 114 | // output latency infos 115 | if(!warmup) { 116 | avg_malloc_latency /= opts.total_nallocs(); 117 | avg_free_latency /= opts.total_nallocs(); 118 | printf("min 
malloc latency %.2lf cycles\n", min_malloc_latency); 119 | printf("avg malloc latency %.2lf cycles\n", avg_malloc_latency); 120 | printf("max malloc latency %.2lf cycles\n", max_malloc_latency); 121 | printf("min free latency %.2lf cycles\n", min_free_latency); 122 | printf("avg free latency %.2lf cycles\n", avg_free_latency); 123 | printf("max free latency %.2lf cycles\n", max_free_latency); 124 | printf("avg pair latency %.2lf cycles\n", 125 | avg_malloc_latency + avg_free_latency); 126 | } // output latency infos 127 | 128 | // free memory 129 | free(h_malloc_latencies); 130 | free(h_free_latencies); 131 | cucheck(cudaFree(d_malloc_latencies)); 132 | cucheck(cudaFree(d_free_latencies)); 133 | cucheck(cudaFree(d_ptrs)); 134 | } // operator() 135 | 136 | }; // LatencyTest 137 | 138 | int main(int argc, char **argv) { 139 | CommonOpts opts(true); 140 | run_test (argc, argv, opts); 141 | return 0; 142 | } // main 143 | -------------------------------------------------------------------------------- /tst/perf/latency/makefile: -------------------------------------------------------------------------------- 1 | NAME=latency 2 | 3 | include ../../common.mk 4 | -------------------------------------------------------------------------------- /tst/perf/make-all.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # runs specific make target for each performance test 4 | ls -1 | grep -vE 'bin|tmp|make|run' | xargs -IXA_TEST -P0 make -C XA_TEST $1 5 | -------------------------------------------------------------------------------- /tst/perf/makefile: -------------------------------------------------------------------------------- 1 | TMP=*~ 2 | 3 | build: 4 | ./make-all.sh build 5 | 6 | clean: 7 | rm -f $(TMP) 8 | ./make-all.sh clean 9 | -------------------------------------------------------------------------------- /tst/perf/phase-alloc-write/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.o 5 | -------------------------------------------------------------------------------- /tst/perf/phase-alloc-write/makefile: -------------------------------------------------------------------------------- 1 | NAME=phase-alloc-write 2 | 3 | include ../../common.mk 4 | -------------------------------------------------------------------------------- /tst/perf/phase-alloc-write/phase-alloc-write.cu: -------------------------------------------------------------------------------- 1 | /** @file prob-throughput.cu probabalitized throughput test */ 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | /** global counters for number of allocations, frees and total size allocated 13 | */ 14 | __device__ uint64 nmallocs_g = 0; 15 | 16 | /** the kernel of the probability throughput test */ 17 | template 18 | __global__ void prob_throughput_k 19 | (void **ptrs, uint *ctrs, uint itry) { 20 | uint i = threadIdx.x + blockIdx.x * blockDim.x; 21 | uint n = opts_g.nthreads; 22 | //uint nallocs = opts_g.nallocs; 23 | if(opts_g.is_thread_inactive(i)) 24 | return; 25 | uint ctr = ctrs[i]; 26 | //uint nmallocs = 0; 27 | 28 | // iterate 29 | for(uint iter = 0; iter < opts_g.niters; iter++) { 30 | // perform the action 31 | switch(opts_g.next_action(ctr > 0, itry, iter)) { 32 | //switch(ctr > 0 ? 
ActionFree : ActionAlloc) { 33 | case ActionAlloc: 34 | for(uint ialloc = 0; ialloc < opts_g.nallocs; ialloc++) { 35 | // allocate 36 | uint alloc_sz = opts_g.next_alloc_sz(); 37 | uint64 *p = (uint64 *)T::malloc(alloc_sz); 38 | for(int iword = 0; iword < alloc_sz / (uint)sizeof(uint64); iword++) 39 | p[iword] = 123ull; 40 | ptrs[ialloc * n + i] = p; 41 | } 42 | ctr = opts_g.nallocs; 43 | //nmallocs += nallocs; 44 | break; 45 | case ActionFree: 46 | for(uint ialloc = 0; ialloc < opts_g.nallocs; ialloc++) 47 | T::free(ptrs[ialloc * n + i]); 48 | ctr = 0; 49 | break; 50 | case ActionNone: 51 | //printf("no action taken\n"); 52 | break; 53 | } 54 | } // for(each iteration) 55 | ctrs[i] = ctr; 56 | } // prob_throughput_k 57 | 58 | /** measures malloc throughput */ 59 | template class PhaseThroughputTest { 60 | 61 | public: 62 | void operator()(CommonOpts opts, bool warmup) { 63 | // allocate memory 64 | if(warmup) { 65 | opts.nthreads = min(4 * opts.bs, opts.nthreads); 66 | opts.ntries = 1; 67 | opts.niters = 1; 68 | } 69 | if(!warmup) 70 | printf("two-phase throuhgput test\n"); 71 | cuset(opts_g, CommonOpts, opts); 72 | int n = opts.nthreads, bs = opts.bs, grid = divup(n, bs); 73 | int nptrs = n * opts.nallocs; 74 | size_t ptrs_sz = nptrs * sizeof(void *); 75 | uint ctrs_sz = n * sizeof(uint); 76 | void **d_ptrs; 77 | cucheck(cudaMalloc((void **)&d_ptrs, ptrs_sz)); 78 | cucheck(cudaMemset(d_ptrs, 0, ptrs_sz)); 79 | uint *d_ctrs; 80 | cucheck(cudaMalloc((void **)&d_ctrs, ctrs_sz)); 81 | cucheck(cudaMemset(d_ctrs, 0, ctrs_sz)); 82 | 83 | double t_malloc = 0, t_free = 0, t_pair = 0; 84 | cuset(nmallocs_g, uint64, 0); 85 | 86 | // do testing 87 | for(int itry = 0; itry < opts.ntries; itry++) { 88 | // run the kernel 89 | double t_start = omp_get_wtime(); 90 | prob_throughput_k <<>>(/* opts, */ d_ptrs, d_ctrs, itry); 91 | cucheck(cudaGetLastError()); 92 | cucheck(cudaStreamSynchronize(0)); 93 | double t_end = omp_get_wtime(), dt = t_end - t_start; 94 | t_pair += dt; 95 | if(itry % 2) 96 | t_free += dt; 97 | else 98 | t_malloc += dt; 99 | // check that pointers are correct 100 | if(!check_nz(d_ptrs, d_ctrs, nptrs, opts)) { 101 | fprintf(stderr, "cannot allocate enough memory\n"); 102 | exit(-1); 103 | } 104 | } // for(itry) 105 | 106 | // free the rest 107 | { 108 | double t_start = omp_get_wtime(); 109 | free_rest_k <<>> (/* opts, */ d_ptrs, d_ctrs); 110 | cucheck(cudaGetLastError()); 111 | cucheck(cudaStreamSynchronize(0)); 112 | double t_end = omp_get_wtime(), dt = t_end - t_start; 113 | t_pair += dt; 114 | t_free += dt; 115 | } 116 | 117 | // output throughput infos 118 | if(!warmup) { 119 | //uint64 nallocs; 120 | //cuget(&nallocs, nmallocs_g); 121 | 122 | //double malloc_throughput = opts.total_nallocs() / t_malloc * 1e-6; 123 | //double free_throughput = opts.total_nallocs() / t_free * 1e-6; 124 | double npairs = 0.5 * opts.total_nallocs() * opts.exec_fraction; 125 | double nmallocs = 0.25 * opts.total_nallocs() * 126 | (opts.exec_fraction + opts.alloc_fraction - opts.free_fraction); 127 | double nfrees = 0.25 * opts.total_nallocs() * 128 | (opts.exec_fraction + opts.alloc_fraction - opts.free_fraction); 129 | double pair_throughput = npairs / t_pair * 1e-6; 130 | double malloc_throughput = nmallocs / t_malloc * 1e-6; 131 | double free_throughput = nfrees / t_free * 1e-6; 132 | double malloc_speed = nmallocs * opts.expected_sz() / t_malloc / 133 | NBYTES_IN_GIB; 134 | double pair_speed = npairs * opts.expected_sz() / t_pair / 135 | NBYTES_IN_GIB; 136 | if(opts.niters == 1) { 137 | 
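// per-phase malloc/free figures are reported only for single-iteration runs,
// where each kernel launch is (almost) entirely an allocation phase or a free phase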
printf("malloc throughput %.2lf Mmallocs/s\n", malloc_throughput); 138 | printf("free throughput %.2lf Mfrees/s\n", free_throughput); 139 | } 140 | //printf("total test time %.2lf ms\n", t_pair * 1e3); 141 | printf("pair throughput %.2lf Mpairs/s\n", pair_throughput); 142 | if(opts.niters == 1) { 143 | printf("malloc speed %.2lf GiB/s\n", malloc_speed); 144 | } 145 | printf("pair speed %.2lf GiB/s\n", pair_speed); 146 | } // output latency infos 147 | 148 | // free memory 149 | cucheck(cudaFree(d_ptrs)); 150 | cucheck(cudaFree(d_ctrs)); 151 | } // operator() 152 | 153 | }; // PhaseThroughputTest 154 | 155 | int main(int argc, char **argv) { 156 | CommonOpts opts(true); 157 | run_test(argc, argv, opts); 158 | return 0; 159 | } // main 160 | -------------------------------------------------------------------------------- /tst/perf/phase-extfrag/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.o 5 | -------------------------------------------------------------------------------- /tst/perf/phase-extfrag/makefile: -------------------------------------------------------------------------------- 1 | NAME=phase-extfrag 2 | 3 | include ../../common.mk 4 | -------------------------------------------------------------------------------- /tst/perf/phase-extfrag/phase-extfrag.cu: -------------------------------------------------------------------------------- 1 | /** @file phase-extfrag.cu probabalitized external fragmentation test */ 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | /** global counters for number of allocations, frees and total size allocated 13 | */ 14 | __device__ uint64 nmallocs_g = 0; 15 | 16 | /** the kernel of the probability throughput test */ 17 | template 18 | __global__ void prob_throughput_k 19 | (void **ptrs, uint *ctrs, uint itry) { 20 | uint i = threadIdx.x + blockIdx.x * blockDim.x; 21 | uint n = opts_g.nthreads; 22 | //uint nallocs = opts_g.nallocs; 23 | if(opts_g.is_thread_inactive(i)) 24 | return; 25 | uint ctr = ctrs[i]; 26 | //uint nmallocs = 0; 27 | 28 | // iterate 29 | for(uint iter = 0; iter < opts_g.niters; iter++) { 30 | // perform the action 31 | switch(opts_g.next_action(ctr > 0, itry, iter)) { 32 | //switch(ctr > 0 ? 
ActionFree : ActionAlloc) { 33 | case ActionAlloc: 34 | for(uint ialloc = 0; ialloc < opts_g.nallocs; ialloc++) 35 | ptrs[ialloc * n + i] = T::malloc(opts_g.next_alloc_sz()); 36 | ctr = opts_g.nallocs; 37 | //nmallocs += nallocs; 38 | break; 39 | case ActionFree: 40 | for(uint ialloc = 0; ialloc < opts_g.nallocs; ialloc++) 41 | T::free(ptrs[ialloc * n + i]); 42 | ctr = 0; 43 | break; 44 | case ActionNone: 45 | //printf("no action taken\n"); 46 | break; 47 | } 48 | } // for(each iteration) 49 | ctrs[i] = ctr; 50 | } // prob_throughput_k 51 | 52 | /** measures malloc throughput */ 53 | template class PhaseExtFragTest { 54 | 55 | public: 56 | void operator()(CommonOpts opts, bool warmup) { 57 | // allocate memory 58 | if(warmup) { 59 | opts.nthreads = min(4 * opts.bs, opts.nthreads); 60 | opts.ntries = 1; 61 | opts.niters = 1; 62 | } 63 | if(!warmup) 64 | printf("two-phase throuhgput test\n"); 65 | cuset(opts_g, CommonOpts, opts); 66 | int n = opts.nthreads, bs = opts.bs, grid = divup(n, bs); 67 | int nptrs = n * opts.nallocs; 68 | size_t ptrs_sz = nptrs * sizeof(void *); 69 | uint ctrs_sz = n * sizeof(uint); 70 | void **d_ptrs; 71 | cucheck(cudaMalloc((void **)&d_ptrs, ptrs_sz)); 72 | cucheck(cudaMemset(d_ptrs, 0, ptrs_sz)); 73 | uint *d_ctrs; 74 | cucheck(cudaMalloc((void **)&d_ctrs, ctrs_sz)); 75 | cucheck(cudaMemset(d_ctrs, 0, ctrs_sz)); 76 | 77 | // do testing 78 | for(int itry = 0; itry < opts.ntries; itry++) { 79 | // run the kernel 80 | prob_throughput_k <<>>(d_ptrs, d_ctrs, itry); 81 | cucheck(cudaGetLastError()); 82 | cucheck(cudaStreamSynchronize(0)); 83 | // check that pointers are correct 84 | if(!check_nz(d_ptrs, d_ctrs, nptrs, opts)) { 85 | fprintf(stderr, "cannot allocate enough memory\n"); 86 | exit(-1); 87 | } 88 | if(!warmup) 89 | printf("external fragmentation %d %.2lf %.2lf\n", itry, T::extfrag(false), 90 | T::extfrag(true)); 91 | } // for(itry) 92 | 93 | // free the rest 94 | { 95 | free_rest_k <<>> (/* opts, */ d_ptrs, d_ctrs); 96 | cucheck(cudaGetLastError()); 97 | cucheck(cudaStreamSynchronize(0)); 98 | } 99 | 100 | // free memory 101 | cucheck(cudaFree(d_ptrs)); 102 | cucheck(cudaFree(d_ctrs)); 103 | } // operator() 104 | 105 | }; // PhaseExtFragTest 106 | 107 | int main(int argc, char **argv) { 108 | CommonOpts opts(true); 109 | run_test(argc, argv, opts); 110 | return 0; 111 | } // main 112 | -------------------------------------------------------------------------------- /tst/perf/phase-latency/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.o 5 | -------------------------------------------------------------------------------- /tst/perf/phase-latency/makefile: -------------------------------------------------------------------------------- 1 | NAME=phase-latency 2 | 3 | include ../../common.mk 4 | -------------------------------------------------------------------------------- /tst/perf/phase-latency/phase-latency.cu: -------------------------------------------------------------------------------- 1 | /** @file prob-throughput.cu probabalitized throughput test */ 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | /** global counters for number of allocations, frees and total size allocated 14 | */ 15 | __device__ uint64 nmallocs_g = 0; 16 | 17 | /** the kernel of the probability throughput test */ 18 | template 19 | __global__ void phase_latency_k 20 | (void **ptrs, uint *ctrs, uint itry, ActionType *actions, uint 
*latencies) { 21 | uint i = threadIdx.x + blockIdx.x * blockDim.x; 22 | uint n = opts_g.nthreads; 23 | uint nallocs = opts_g.nallocs; 24 | if(opts_g.is_thread_inactive(i)) 25 | return; 26 | uint ctr = ctrs[i]; 27 | //uint nmallocs = 0; 28 | 29 | // iterate 30 | for(uint iter = 0; iter < opts_g.niters; iter++) { 31 | // perform the action 32 | ActionType action = opts_g.next_action(ctr > 0, itry, iter); 33 | actions[iter * n + i] = action; 34 | switch(action) { 35 | //switch(ctr > 0 ? ActionFree : ActionAlloc) { 36 | case ActionAlloc: 37 | for(uint ialloc = 0; ialloc < opts_g.nallocs; ialloc++) { 38 | uint t1 = clock(); 39 | ptrs[ialloc * n + i] = T::malloc(opts_g.next_alloc_sz()); 40 | uint t2 = clock(); 41 | latencies[(iter * nallocs + ialloc) * n + i] = t2 - t1; 42 | } 43 | ctr = opts_g.nallocs; 44 | //nmallocs += nallocs; 45 | break; 46 | case ActionFree: 47 | for(uint ialloc = 0; ialloc < opts_g.nallocs; ialloc++) { 48 | uint t1 = clock(); 49 | T::free(ptrs[ialloc * n + i]); 50 | uint t2 = clock(); 51 | latencies[(iter * nallocs + ialloc) * n + i] = t2 - t1; 52 | } 53 | ctr = 0; 54 | break; 55 | case ActionNone: 56 | //printf("no action taken\n"); 57 | break; 58 | } 59 | } // for(each iteration) 60 | ctrs[i] = ctr; 61 | } // phase_latency_k 62 | 63 | /** measures malloc/free latency */ 64 | template class PhaseLatencyTest { 65 | 66 | public: 67 | void operator()(CommonOpts opts, bool warmup) { 68 | // allocate memory 69 | if(warmup) { 70 | opts.nthreads = min(4 * opts.bs, opts.nthreads); 71 | opts.ntries = 1; 72 | opts.niters = 1; 73 | } 74 | if(!warmup) 75 | printf("two-phase latency test\n"); 76 | cuset(opts_g, CommonOpts, opts); 77 | int n = opts.nthreads, bs = opts.bs, grid = divup(n, bs); 78 | int nptrs = n * opts.nallocs; 79 | size_t ptrs_sz = nptrs * sizeof(void *); 80 | uint ctrs_sz = n * sizeof(uint); 81 | size_t lat_sz = n * opts.niters * opts.nallocs * sizeof(uint); 82 | size_t act_sz = n * opts.niters * sizeof(ActionType); 83 | void **d_ptrs; 84 | cucheck(cudaMalloc((void **)&d_ptrs, ptrs_sz)); 85 | cucheck(cudaMemset(d_ptrs, 0, ptrs_sz)); 86 | uint *d_ctrs; 87 | cucheck(cudaMalloc((void **)&d_ctrs, ctrs_sz)); 88 | cucheck(cudaMemset(d_ctrs, 0, ctrs_sz)); 89 | uint *d_latencies; 90 | cucheck(cudaMalloc((void **)&d_latencies, lat_sz)); 91 | cucheck(cudaMemset(d_latencies, 0, lat_sz)); 92 | ActionType *d_actions; 93 | cucheck(cudaMalloc((void **)&d_actions, act_sz)); 94 | cucheck(cudaMemset(d_actions, 0, act_sz)); 95 | uint *h_latencies; 96 | cucheck(cudaMallocHost((void **)&h_latencies, lat_sz)); 97 | ActionType *h_actions; 98 | cucheck(cudaMallocHost((void **)&h_actions, act_sz)); 99 | 100 | //cuset(nmallocs_g, uint64, 0); 101 | 102 | // latency variables 103 | double avg_malloc_latency = 0, avg_free_latency = 0; 104 | double min_malloc_latency = FLT_MAX, min_free_latency = FLT_MAX; 105 | double max_malloc_latency = FLT_MIN, max_free_latency = FLT_MIN; 106 | double nmallocs = 0, nfrees = 0; 107 | 108 | // do testing 109 | for(int itry = 0; itry < opts.ntries; itry++) { 110 | // run the kernel 111 | phase_latency_k <<>> 112 | (d_ptrs, d_ctrs, itry, d_actions, d_latencies); 113 | cucheck(cudaGetLastError()); 114 | cucheck(cudaStreamSynchronize(0)); 115 | // check that pointers are correct 116 | if(!check_nz(d_ptrs, d_ctrs, nptrs, opts)) { 117 | fprintf(stderr, "cannot allocate enough memory\n"); 118 | exit(-1); 119 | } 120 | // compute the latencies 121 | if(!warmup) { 122 | cucheck(cudaMemcpy(h_latencies, d_latencies, lat_sz, 123 | cudaMemcpyDeviceToHost)); 124 | 
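// the recorded action types are copied back as well, so that each latency
// sample below can be attributed to either a malloc or a free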
cucheck(cudaMemcpy(h_actions, d_actions, act_sz, 125 | cudaMemcpyDeviceToHost)); 126 | for(int iter = 0; iter < opts.niters; iter++) { 127 | for(int i = 0; i < n; i += opts.period_mask + 1) { 128 | ActionType action = h_actions[iter * n + i]; 129 | for(int ialloc = 0; ialloc < opts.nallocs; ialloc++) { 130 | uint latency = 131 | h_latencies[(iter * opts.nallocs + ialloc) * n + i]; 132 | //double malloc_latency = h_malloc_latencies[ialloc * n + i]; 133 | //double free_latency = h_free_latencies[ialloc * n + i]; 134 | switch(action) { 135 | case ActionAlloc: 136 | nmallocs++; 137 | avg_malloc_latency += (double)latency; 138 | min_malloc_latency = min(min_malloc_latency, (double)latency); 139 | max_malloc_latency = max(max_malloc_latency, (double)latency); 140 | break; 141 | case ActionFree: 142 | nfrees++; 143 | avg_free_latency += (double)latency; 144 | min_free_latency = min(min_free_latency, (double)latency); 145 | max_free_latency = max(max_free_latency, (double)latency); 146 | break; 147 | // otherwise, do nothing 148 | } 149 | } // for(ialloc) 150 | } // for(i) 151 | } // for(iter) 152 | } // if(not warmup) 153 | 154 | } // for(itry) 155 | 156 | // free the rest - this is not timed for latency 157 | { 158 | free_rest_k <<>> (/* opts, */ d_ptrs, d_ctrs); 159 | cucheck(cudaGetLastError()); 160 | cucheck(cudaStreamSynchronize(0)); 161 | } 162 | 163 | // output throughput infos 164 | if(!warmup) { 165 | avg_malloc_latency /= nmallocs; 166 | avg_free_latency /= nfrees; 167 | printf("min malloc latency %.2lf cycles\n", min_malloc_latency); 168 | printf("avg malloc latency %.2lf cycles\n", avg_malloc_latency); 169 | printf("max malloc latency %.2lf cycles\n", max_malloc_latency); 170 | printf("min free latency %.2lf cycles\n", min_free_latency); 171 | printf("avg free latency %.2lf cycles\n", avg_free_latency); 172 | printf("max free latency %.2lf cycles\n", max_free_latency); 173 | printf("avg pair latency %.2lf cycles\n", 174 | avg_malloc_latency + avg_free_latency); 175 | } // output latency infos 176 | 177 | // free memory 178 | cucheck(cudaFree(d_ptrs)); 179 | cucheck(cudaFree(d_ctrs)); 180 | } // operator() 181 | }; // PhaseLatencyTest 182 | 183 | int main(int argc, char **argv) { 184 | CommonOpts opts(true); 185 | run_test(argc, argv, opts); 186 | return 0; 187 | } // main 188 | -------------------------------------------------------------------------------- /tst/perf/phase-throughput/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.o 5 | -------------------------------------------------------------------------------- /tst/perf/phase-throughput/makefile: -------------------------------------------------------------------------------- 1 | NAME=phase-throughput 2 | 3 | include ../../common.mk 4 | -------------------------------------------------------------------------------- /tst/perf/phase-throughput/phase-throughput.cu: -------------------------------------------------------------------------------- 1 | /** @file prob-throughput.cu probabalitized throughput test */ 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | /** global counters for number of allocations, frees and total size allocated 13 | */ 14 | __device__ uint64 nmallocs_g = 0; 15 | 16 | /** the kernel of the probability throughput test */ 17 | template 18 | __global__ void prob_throughput_k 19 | (void **ptrs, uint *ctrs, uint itry) { 20 | uint i = threadIdx.x + blockIdx.x * blockDim.x; 21 | 
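// test options are read from the device-global opts_g, which the host side
// sets via cuset() before launching the kernel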
uint n = opts_g.nthreads; 22 | //uint nallocs = opts_g.nallocs; 23 | if(opts_g.is_thread_inactive(i)) 24 | return; 25 | uint ctr = ctrs[i]; 26 | //uint nmallocs = 0; 27 | 28 | // iterate 29 | for(uint iter = 0; iter < opts_g.niters; iter++) { 30 | // perform the action 31 | switch(opts_g.next_action(ctr > 0, itry, iter)) { 32 | //switch(ctr > 0 ? ActionFree : ActionAlloc) { 33 | case ActionAlloc: 34 | for(uint ialloc = 0; ialloc < opts_g.nallocs; ialloc++) 35 | ptrs[ialloc * n + i] = T::malloc(opts_g.next_alloc_sz()); 36 | ctr = opts_g.nallocs; 37 | //nmallocs += nallocs; 38 | break; 39 | case ActionFree: 40 | for(uint ialloc = 0; ialloc < opts_g.nallocs; ialloc++) 41 | T::free(ptrs[ialloc * n + i]); 42 | ctr = 0; 43 | break; 44 | case ActionNone: 45 | //printf("no action taken\n"); 46 | break; 47 | } 48 | } // for(each iteration) 49 | ctrs[i] = ctr; 50 | } // prob_throughput_k 51 | 52 | /** measures malloc throughput */ 53 | template class PhaseThroughputTest { 54 | 55 | public: 56 | void operator()(CommonOpts opts, bool warmup) { 57 | // allocate memory 58 | if(warmup) { 59 | opts.nthreads = min(4 * opts.bs, opts.nthreads); 60 | opts.ntries = 1; 61 | opts.niters = 1; 62 | } 63 | if(!warmup) 64 | printf("two-phase throuhgput test\n"); 65 | cuset(opts_g, CommonOpts, opts); 66 | int n = opts.nthreads, bs = opts.bs, grid = divup(n, bs); 67 | int nptrs = n * opts.nallocs; 68 | size_t ptrs_sz = nptrs * sizeof(void *); 69 | uint ctrs_sz = n * sizeof(uint); 70 | void **d_ptrs; 71 | cucheck(cudaMalloc((void **)&d_ptrs, ptrs_sz)); 72 | cucheck(cudaMemset(d_ptrs, 0, ptrs_sz)); 73 | uint *d_ctrs; 74 | cucheck(cudaMalloc((void **)&d_ctrs, ctrs_sz)); 75 | cucheck(cudaMemset(d_ctrs, 0, ctrs_sz)); 76 | 77 | double t_malloc = 0, t_free = 0, t_pair = 0; 78 | cuset(nmallocs_g, uint64, 0); 79 | 80 | // do testing 81 | for(int itry = 0; itry < opts.ntries; itry++) { 82 | // run the kernel 83 | double t_start = omp_get_wtime(); 84 | prob_throughput_k <<>>(/* opts, */ d_ptrs, d_ctrs, itry); 85 | cucheck(cudaGetLastError()); 86 | cucheck(cudaStreamSynchronize(0)); 87 | double t_end = omp_get_wtime(), dt = t_end - t_start; 88 | t_pair += dt; 89 | if(itry % 2) 90 | t_free += dt; 91 | else 92 | t_malloc += dt; 93 | // check that pointers are correct 94 | if(!check_nz(d_ptrs, d_ctrs, nptrs, opts)) { 95 | fprintf(stderr, "cannot allocate enough memory\n"); 96 | exit(-1); 97 | } 98 | } // for(itry) 99 | 100 | // free the rest 101 | { 102 | double t_start = omp_get_wtime(); 103 | free_rest_k <<>> (/* opts, */ d_ptrs, d_ctrs); 104 | cucheck(cudaGetLastError()); 105 | cucheck(cudaStreamSynchronize(0)); 106 | double t_end = omp_get_wtime(), dt = t_end - t_start; 107 | t_pair += dt; 108 | t_free += dt; 109 | } 110 | 111 | // output throughput infos 112 | if(!warmup) { 113 | //uint64 nallocs; 114 | //cuget(&nallocs, nmallocs_g); 115 | 116 | //double malloc_throughput = opts.total_nallocs() / t_malloc * 1e-6; 117 | //double free_throughput = opts.total_nallocs() / t_free * 1e-6; 118 | double npairs = 0.5 * opts.total_nallocs() * opts.exec_fraction; 119 | double nmallocs = 0.25 * opts.total_nallocs() * 120 | (opts.exec_fraction + opts.alloc_fraction - opts.free_fraction); 121 | double nfrees = 0.25 * opts.total_nallocs() * 122 | (opts.exec_fraction + opts.alloc_fraction - opts.free_fraction); 123 | double pair_throughput = npairs / t_pair * 1e-6; 124 | double malloc_throughput = nmallocs / t_malloc * 1e-6; 125 | double free_throughput = nfrees / t_free * 1e-6; 126 | double malloc_speed = nmallocs * opts.expected_sz() / 
t_malloc / 127 | NBYTES_IN_GIB; 128 | double pair_speed = npairs * opts.expected_sz() / t_pair / 129 | NBYTES_IN_GIB; 130 | if(opts.niters == 1) { 131 | printf("malloc throughput %.2lf Mmallocs/s\n", malloc_throughput); 132 | printf("free throughput %.2lf Mfrees/s\n", free_throughput); 133 | } 134 | //printf("total test time %.2lf ms\n", t_pair * 1e3); 135 | printf("pair throughput %.2lf Mpairs/s\n", pair_throughput); 136 | if(opts.niters == 1) { 137 | printf("malloc speed %.2lf GiB/s\n", malloc_speed); 138 | } 139 | printf("pair speed %.2lf GiB/s\n", pair_speed); 140 | } // output latency infos 141 | 142 | // free memory 143 | cucheck(cudaFree(d_ptrs)); 144 | cucheck(cudaFree(d_ctrs)); 145 | } // operator() 146 | 147 | }; // PhaseThroughputTest 148 | 149 | int main(int argc, char **argv) { 150 | CommonOpts opts(true); 151 | run_test(argc, argv, opts); 152 | return 0; 153 | } // main 154 | -------------------------------------------------------------------------------- /tst/perf/priv-throughput/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.o 5 | -------------------------------------------------------------------------------- /tst/perf/priv-throughput/makefile: -------------------------------------------------------------------------------- 1 | NAME=priv-throughput 2 | 3 | include ../../common.mk 4 | -------------------------------------------------------------------------------- /tst/perf/priv-throughput/priv-throughput.cu: -------------------------------------------------------------------------------- 1 | /** @file throughput.cu throughput test for various memory allocators */ 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | /** measures malloc throughput */ 13 | template 14 | __global__ void throughput_malloc_free_k 15 | (CommonOpts opts, void **ptrs) { 16 | int n = opts.nthreads, i = threadIdx.x + blockIdx.x * blockDim.x; 17 | if(opts.is_thread_inactive(i)) 18 | return; 19 | for(int iter = 0; iter < opts.niters; iter++) { 20 | // first allocate 21 | for(int ialloc = 0; ialloc < opts.nallocs; ialloc++) 22 | ptrs[i + n * ialloc] = T::malloc(opts.next_alloc_sz()); 23 | // then free 24 | for(int ialloc = 0; ialloc < opts.nallocs; ialloc++) 25 | T::free(ptrs[i + n * ialloc]); 26 | } 27 | } // throughput_malloc_k 28 | 29 | template class PrivThroughputTest { 30 | 31 | public: 32 | void operator()(CommonOpts opts, bool warmup) { 33 | // allocate memory 34 | if(warmup) { 35 | opts.nthreads = min(4 * opts.bs, opts.nthreads); 36 | opts.ntries = 1; 37 | } 38 | if(!warmup) 39 | printf("private throughput test\n"); 40 | int n = opts.nthreads, bs = opts.bs, grid = divup(n, bs); 41 | int nptrs = n * opts.nallocs; 42 | size_t ptrs_sz = nptrs * sizeof(void *); 43 | void **d_ptrs; 44 | cucheck(cudaMalloc((void **)&d_ptrs, ptrs_sz)); 45 | cucheck(cudaMemset(d_ptrs, 0, ptrs_sz)); 46 | 47 | double t_pair = 0; 48 | 49 | // do testing 50 | for(int itry = 0; itry < opts.ntries; itry++) { 51 | // allocate 52 | double t_pair_start = omp_get_wtime(); 53 | throughput_malloc_free_k <<>>(opts, d_ptrs); 54 | cucheck(cudaGetLastError()); 55 | cucheck(cudaStreamSynchronize(0)); 56 | double t_pair_end = omp_get_wtime(); 57 | t_pair += t_pair_end - t_pair_start; 58 | // as pointers have not been zeroed out, check them nevertheless 59 | if(!check_nz(d_ptrs, 0, nptrs, opts)) { 60 | fprintf(stderr, "cannot allocate enough memory\n"); 61 | exit(-1); 62 | } 63 | } // for(itry) 64 | 65 | // 
output throughput infos; no individual malloc/free throughput can be 66 | // estimated 67 | if(!warmup) { 68 | double pair_throughput = opts.total_nallocs() / t_pair * 1e-6; 69 | double pair_speed = opts.total_sz() / t_pair / NBYTES_IN_GIB; 70 | printf("pair throughput %.2lf Mpairs/s\n", pair_throughput); 71 | printf("pair speed %.2lf GiB/s\n", pair_speed); 72 | } // output latency infos 73 | 74 | // free memory 75 | cucheck(cudaFree(d_ptrs)); 76 | } // operator() 77 | }; // PrivThroughputTest 78 | 79 | int main(int argc, char **argv) { 80 | CommonOpts opts(true); 81 | run_test(argc, argv, opts); 82 | return 0; 83 | } // main 84 | -------------------------------------------------------------------------------- /tst/perf/run-test.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # usage: 4 | # ./run-test.sh 5 | 6 | TEST_NAME=$1 7 | shift 1 8 | TEST_EXE=./bin/$TEST_NAME 9 | 10 | # run the test 11 | echo $TEST_EXE $@ 12 | $TEST_EXE $@ 13 | -------------------------------------------------------------------------------- /tst/perf/throughput/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.o 5 | -------------------------------------------------------------------------------- /tst/perf/throughput/makefile: -------------------------------------------------------------------------------- 1 | NAME=throughput 2 | 3 | include ../../common.mk 4 | 5 | throughput-all: $(TGT) 6 | cd .. && ./throughput/throughput-all.sh 7 | -------------------------------------------------------------------------------- /tst/perf/throughput/throughput-all.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # runs throughput test for all sizes 4 | ./run-test.sh throughput -s16 -l4 5 | ./run-test.sh throughput -s24 -l4 6 | ./run-test.sh throughput -s32 -l4 7 | ./run-test.sh throughput -s48 -l4 8 | ./run-test.sh throughput -s64 -l4 9 | ./run-test.sh throughput -s96 -l2 10 | ./run-test.sh throughput -s128 -l2 11 | ./run-test.sh throughput -s192 -l1 12 | ./run-test.sh throughput -s256 -l1 13 | ./run-test.sh throughput -s384 -l1 -n$((512*1024)) 14 | ./run-test.sh throughput -s512 -l1 -n$((512*1024)) 15 | ./run-test.sh throughput -s768 -l1 -n$((256*1024)) 16 | ./run-test.sh throughput -s1024 -l1 -n$((256*1024)) 17 | ./run-test.sh throughput -s1536 -l1 -n$((128*1024)) 18 | ./run-test.sh throughput -s2048 -l1 -n$((128*1024)) 19 | ./run-test.sh throughput -s3072 -l1 -n$((64*1024)) 20 | 21 | -------------------------------------------------------------------------------- /tst/perf/throughput/throughput.cu: -------------------------------------------------------------------------------- 1 | /** @file throughput.cu throughput test for various memory allocators */ 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | /** measures malloc throughput */ 13 | 14 | template class ThroughputTest { 15 | 16 | public: 17 | void operator()(CommonOpts opts, bool warmup) { 18 | opts.niters = 1; 19 | // allocate memory 20 | if(warmup) { 21 | opts.nthreads = min(4 * opts.bs, opts.nthreads); 22 | opts.ntries = 1; 23 | } 24 | if(!warmup) 25 | printf("throughput test\n"); 26 | int n = opts.nthreads, bs = opts.bs, grid = divup(n, bs); 27 | int nptrs = n * opts.nallocs; 28 | size_t ptrs_sz = nptrs * sizeof(void *); 29 | void **d_ptrs; 30 | cucheck(cudaMalloc((void **)&d_ptrs, ptrs_sz)); 31 | 
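// zero the pointer array first, so that check_nz() below can treat NULL
// entries as failed allocations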
cucheck(cudaMemset(d_ptrs, 0, ptrs_sz)); 32 | 33 | double t_malloc = 0, t_free = 0; 34 | 35 | // do testing 36 | for(int itry = 0; itry < opts.ntries; itry++) { 37 | // allocate 38 | double t_malloc_start = omp_get_wtime(); 39 | malloc_k <<>>(opts, d_ptrs); 40 | cucheck(cudaGetLastError()); 41 | cucheck(cudaStreamSynchronize(0)); 42 | double t_malloc_end = omp_get_wtime(); 43 | t_malloc += t_malloc_end - t_malloc_start; 44 | // check that pointers are correct 45 | if(!check_nz(d_ptrs, 0, nptrs, opts)) { 46 | fprintf(stderr, "cannot allocate enough memory\n"); 47 | exit(-1); 48 | } 49 | // free 50 | double t_free_start = omp_get_wtime(); 51 | free_k <<>>(opts, d_ptrs); 52 | cucheck(cudaGetLastError()); 53 | cucheck(cudaStreamSynchronize(0)); 54 | double t_free_end = omp_get_wtime(); 55 | t_free += t_free_end - t_free_start; 56 | } // for(itry) 57 | 58 | // output latency infos 59 | if(!warmup) { 60 | double malloc_throughput = opts.total_nallocs() / t_malloc * 1e-6; 61 | double free_throughput = opts.total_nallocs() / t_free * 1e-6; 62 | double pair_throughput = opts.total_nallocs() / (t_malloc + t_free) 63 | * 1e-6; 64 | double malloc_speed = opts.total_sz() / t_malloc / NBYTES_IN_GIB; 65 | double pair_speed = opts.total_sz() / (t_malloc + t_free) / NBYTES_IN_GIB; 66 | printf("malloc throughput %.2lf Mmallocs/s\n", malloc_throughput); 67 | printf("free throughput %.2lf Mfrees/s\n", free_throughput); 68 | printf("pair throughput %.2lf Mpairs/s\n", pair_throughput); 69 | printf("malloc speed %.2lf GiB/s\n", malloc_speed); 70 | printf("pair speed %.2lf GiB/s\n", pair_speed); 71 | } // output latency infos 72 | 73 | // free memory 74 | cucheck(cudaFree(d_ptrs)); 75 | } // operator() 76 | 77 | }; // LatencyTest 78 | 79 | int main(int argc, char **argv) { 80 | CommonOpts opts(true); 81 | run_test(argc, argv, opts); 82 | return 0; 83 | } // main 84 | -------------------------------------------------------------------------------- /tst/perf/tmp/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | --------------------------------------------------------------------------------
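As a reference for extending the harness, below is a minimal sketch of a new performance test written against the same pattern as the tests above. It is a sketch only, assuming that the tst/common helpers (CommonOpts, run_test, cucheck, divup, check_nz) behave as they are used in throughput.cu and latency.cu, that run_test takes the test functor as a template argument, and that the allocator wrapper T exposes static device-side malloc/free; the file name simple.cu and the kernel and class names are illustrative, not part of the repository.

/** @file simple.cu minimal allocate-then-free test following the harness pattern */

#include <cstdio>
#include <cstdlib>
#include "common.h"   // assumed location of CommonOpts, run_test, cucheck, divup, check_nz

/** allocate one block per (active) thread and per allocation slot */
template<class T>
__global__ void simple_malloc_k(CommonOpts opts, void **ptrs) {
	int n = opts.nthreads, i = threadIdx.x + blockIdx.x * blockDim.x;
	if(opts.is_thread_inactive(i))
		return;
	for(int ialloc = 0; ialloc < opts.nallocs; ialloc++)
		ptrs[i + n * ialloc] = T::malloc(opts.next_alloc_sz());
}  // simple_malloc_k

/** free everything allocated by simple_malloc_k */
template<class T>
__global__ void simple_free_k(CommonOpts opts, void **ptrs) {
	int n = opts.nthreads, i = threadIdx.x + blockIdx.x * blockDim.x;
	if(opts.is_thread_inactive(i))
		return;
	for(int ialloc = 0; ialloc < opts.nallocs; ialloc++)
		T::free(ptrs[i + n * ialloc]);
}  // simple_free_k

template<class T> class SimpleTest {
public:
	void operator()(CommonOpts opts, bool warmup) {
		if(warmup) {
			opts.nthreads = min(4 * opts.bs, opts.nthreads);
			opts.ntries = 1;
		}
		int n = opts.nthreads, bs = opts.bs, grid = divup(n, bs);
		int nptrs = n * opts.nallocs;
		size_t ptrs_sz = nptrs * sizeof(void *);
		void **d_ptrs;
		cucheck(cudaMalloc((void **)&d_ptrs, ptrs_sz));
		cucheck(cudaMemset(d_ptrs, 0, ptrs_sz));
		for(int itry = 0; itry < opts.ntries; itry++) {
			// allocate, then verify that no thread got a NULL pointer
			simple_malloc_k<T> <<<grid, bs>>>(opts, d_ptrs);
			cucheck(cudaGetLastError());
			cucheck(cudaStreamSynchronize(0));
			if(!check_nz(d_ptrs, 0, nptrs, opts)) {
				fprintf(stderr, "cannot allocate enough memory\n");
				exit(-1);
			}
			// free everything again before the next try
			simple_free_k<T> <<<grid, bs>>>(opts, d_ptrs);
			cucheck(cudaGetLastError());
			cucheck(cudaStreamSynchronize(0));
		}
		cucheck(cudaFree(d_ptrs));
	}  // operator()
};  // SimpleTest

int main(int argc, char **argv) {
	CommonOpts opts(true);
	run_test<SimpleTest>(argc, argv, opts);
	return 0;
}  // main

Such a test would get its own subdirectory under tst/perf with a two-line makefile that sets NAME and includes ../../common.mk, matching the existing tests.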