├── .gitignore ├── COPYING ├── LICENSE.txt ├── README ├── bin └── .gitignore ├── makefile ├── samples ├── .gitignore ├── add-strings │ ├── .gitignore │ ├── add-strings.c │ ├── add-strings.cu │ └── makefile ├── bin │ └── .gitignore ├── common.mk ├── grid-points │ ├── .gitignore │ ├── grid-points.cu │ └── makefile ├── include │ ├── .gitignore │ └── halloc.h ├── random-graph │ ├── .gitignore │ ├── makefile │ └── random-graph.cu └── tmp │ └── .gitignore ├── src ├── .gitignore ├── globals.cuh ├── grid.cuh ├── grid.h ├── halloc.cu ├── halloc.h ├── sbset.cuh ├── sbset.h ├── size-info.cuh ├── size-info.h ├── slab.cuh ├── slab.h ├── statistics.cuh ├── utils.cu └── utils.h └── tst ├── .gitignore ├── common-def.mk ├── common.mk ├── common ├── .gitignore ├── common.cu ├── common.h ├── cuda-malloc-wrapper.h ├── halloc-wrapper.h ├── makefile └── scatter-alloc-wrapper.h ├── corr ├── .gitignore ├── bin │ └── .gitignore ├── checkptr │ ├── .gitignore │ ├── checkptr.cu │ └── makefile ├── freeslabs │ ├── .gitignore │ ├── freeslabs.cu │ └── makefile ├── make-all.sh ├── makefile ├── prob-checkptr │ ├── .gitignore │ ├── makefile │ └── prob-checkptr.cu ├── run-all-tests.pl ├── run-test.sh ├── test │ ├── .gitignore │ ├── makefile │ └── test.cu └── tmp │ └── .gitignore ├── exp ├── .gitignore ├── common.pl ├── frag-int │ ├── .gitignore │ ├── exp-log.csv │ ├── exp-plot.gpl │ └── exp-run.pl ├── halloc-vs-scatter │ ├── .gitignore │ ├── exp-log-priv-1.csv │ ├── exp-log-priv-2.csv │ ├── exp-log-priv-3.csv │ ├── exp-log-priv.csv │ ├── exp-log-spree.csv │ ├── exp-plot.gpl │ ├── exp-run.pl │ └── graph.py ├── run-all-exps.sh ├── run-scaling-speed.sh ├── scaling │ ├── .gitignore │ ├── exp-log-lat.csv │ ├── exp-log-thru.csv │ ├── exp-plot.gpl │ ├── exp-run.pl │ └── graph.py ├── settings │ ├── .gitignore │ ├── exp-log.csv │ ├── exp-plot.gpl │ └── exp-run.pl └── speed │ ├── .gitignore │ ├── exp-log-combi.csv │ ├── exp-log-single.csv │ ├── exp-plot.gpl │ ├── exp-run.pl │ └── graph.py ├── include ├── .gitignore └── halloc.h └── perf ├── .gitignore ├── bin └── .gitignore ├── latency ├── .gitignore ├── latency.cu └── makefile ├── make-all.sh ├── makefile ├── phase-alloc-write ├── .gitignore ├── makefile └── phase-alloc-write.cu ├── phase-extfrag ├── .gitignore ├── makefile └── phase-extfrag.cu ├── phase-latency ├── .gitignore ├── makefile └── phase-latency.cu ├── phase-throughput ├── .gitignore ├── makefile └── phase-throughput.cu ├── priv-throughput ├── .gitignore ├── makefile └── priv-throughput.cu ├── run-test.sh ├── throughput ├── .gitignore ├── makefile ├── throughput-all.sh └── throughput.cu └── tmp └── .gitignore /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 Forschungszentrum Juelich 2 | 3 | Author(s): Andrew Adinetz 4 | 5 | This software is available to you under a choice of one of two 6 | licenses. 
You may choose to be licensed under the terms of the GNU 7 | General Public License (GPL) Version 2, available from the file 8 | COPYING in the main directory of this source tree, or the 9 | OpenIB.org BSD license below: 10 | 11 | Redistribution and use in source and binary forms, with or 12 | without modification, are permitted provided that the following 13 | conditions are met: 14 | 15 | - Redistributions of source code must retain the above 16 | copyright notice, this list of conditions and the following 17 | disclaimer. 18 | 19 | - Redistributions in binary form must reproduce the above 20 | copyright notice, this list of conditions and the following 21 | disclaimer in the documentation and/or other materials 22 | provided with the distribution. 23 | 24 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 25 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 26 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 27 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 28 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 29 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 30 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 31 | SOFTWARE. 32 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | Halloc GPU memory allocator, version 0.11 2 | 3 | INTRO 4 | 5 | Halloc is a high-throughput malloc/free-style dynamic memory allocator for 6 | NVidia Kepler GPUs. It uses bit arrays to represent free blocks and 7 | a hash function to quickly search for free blocks. This idea, combined 8 | with clever slab management and performance tuning, enables a very fast 9 | allocator. Halloc achieves more than 1.5 bln. mallocs/s (more than 1 10 | bln. malloc/free pairs/s) on a K20X with 16-byte allocations, tens of 11 | thousands of GPU threads and more than 100MiB allocated. This is much higher 12 | than what other state-of-the-art GPU allocators achieve. In addition, halloc's performance is 13 | more stable. This makes halloc suitable for use in GPGPU applications 14 | requiring fast dynamic memory management. Halloc is mainly designed for small 15 | allocation sizes, and delegates allocations larger than 3KiB to the CUDA allocator. 16 | 17 | 18 | REQUIREMENTS 19 | 20 | Software: CUDA 5.0 or higher (tested with 6.5) 21 | Hardware: Compute Capability 2.0 or higher (tested on CC 3.5 devices K20X and K40). 22 | 23 | Note: libraries and tests are currently not compiled for compute_50/sm_50, i.e. Maxwell. 24 | 25 | 26 | COMPILING 27 | 28 | To compile the halloc library, type (in the project's top directory): 29 | 30 | make 31 | 32 | To run correctness tests (CAUTION: takes a lot of time!): 33 | 34 | make test 35 | 36 | To build correctness tests without running them: 37 | 38 | make build-corr 39 | 40 | To build performance tests without running them: 41 | 42 | make build-perf 43 | 44 | Performance tests are then located in the ./tst/perf/bin directory, and can be 45 | invoked individually, e.g. 46 | 47 | ./tst/perf/bin/throughput 48 | ./tst/perf/bin/phase-throughput -f0.95 -F0.05 -e0.91 -g5 -t128 49 | 50 | To install, edit the PREFIX variable in the makefile to your desired install 51 | directory (default ~/usr) and type: 52 | 53 | make install 54 | 55 | To uninstall: 56 | 57 | make uninstall 58 | 59 | 60 | USING HALLOC 61 | 62 | See the samples/ directory for samples using Halloc. 
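For orientation, here is a minimal self-contained program using halloc. This is an illustrative sketch only: the kernel, the sizes and the error handling are placeholders, not part of the library; the file name myprog.cu matches the compilation example in the next section.

#include <halloc.h>

/** each thread allocates a small array, writes to it and frees it */
__global__ void use_halloc_k(int n) {
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if(i >= n)
    return;
  int *p = (int *)hamalloc(4 * sizeof(int));
  if(!p)
    return;  // allocation failed
  p[0] = i;
  hafree(p);
}

int main(void) {
  ha_init(256 * 1024 * 1024);               // make 256 MiB available to halloc
  use_halloc_k<<<1024, 128>>>(1024 * 128);  // 128K threads, one small allocation each
  cudaDeviceSynchronize();
  ha_shutdown();                            // currently a no-op
  return 0;
}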
63 | 64 | Compiling Your Program 65 | 66 | The GPU application then needs to be compiled with the halloc static library, using 67 | separate device compilation and linking. Assuming that the variable $PREFIX 68 | contains the installation prefix, and myprog.cu is the file being compiled, this 69 | can be done as follows: 70 | 71 | nvcc -arch=sm_35 -O3 -I $(PREFIX)/include -dc myprog.cu -o myprog.o 72 | nvcc -arch=sm_35 -O3 -L $(PREFIX)/lib -lhalloc -o myprog myprog.o 73 | 74 | 75 | Halloc API 76 | 77 | The functions defined by halloc are declared in the halloc.h file, which needs to be 78 | included into your code to use halloc: 79 | 80 | #include <halloc.h> 81 | 82 | Before halloc can be used in device functions, it has to be initialized with the ha_init() 83 | function: 84 | 85 | void ha_init(halloc_opts_t opts = halloc_opts_t()); 86 | 87 | It can be given a full halloc_opts_t structure to control fine-grained halloc 88 | parameters, such as the slab size or the fraction of used chunks at which a slab is 89 | considered "busy". It can also be called with just the amount of memory 90 | to allocate, or with no arguments at all to keep the defaults: 91 | 92 | ha_init(512 * 1024 * 1024); // pass memory to allocate 93 | ha_init(); // use default amount of memory
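To tune the finer parameters, a halloc_opts_t can be filled in explicitly and passed to ha_init(). The sketch below is illustrative only; the structure and its fields (memory, halloc_fraction, busy_fraction, roomy_fraction, sparse_fraction, sb_sz_sh) are declared in halloc.h, and the values shown are examples, not recommendations:

halloc_opts_t opts(1024 * 1024 * 1024);  // 1 GiB available for allocation
opts.halloc_fraction = 0.75;  // fraction of that memory managed by halloc, the rest goes to CUDA
opts.busy_fraction = 0.85;    // used-chunk fraction at which a slab is considered "busy"
opts.sb_sz_sh = 22;           // slab size shift: slabs are 2^22 bytes = 4 MiB
ha_init(opts);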
94 | 95 | Halloc defines two functions, hamalloc to allocate and hafree to free memory 96 | (malloc and free are used by the CUDA allocator, therefore halloc has to use other 97 | names). These functions can only be called from device code. 98 | 99 | void *hamalloc(size_t nbytes); 100 | void hafree(void *p); 101 | 102 | Otherwise, these functions have pretty much the same behavior as standard C 103 | malloc/free, e.g.: 104 | 105 | // allocate an array 106 | int *p = (int *)hamalloc(8 * sizeof(int)); 107 | p[0] = 0; 108 | p[1] = threadIdx.x; 109 | p[2] = 2; 110 | // ... 111 | // free the array 112 | hafree(p); 113 | 114 | 115 | // allocate a list 116 | typedef struct list_ { 117 | int element; 118 | struct list_ *next; 119 | } list; 120 | // ... 121 | list *l = (list *)hamalloc(sizeof(list)); 122 | l->element = 1; 123 | l->next = (list *)hamalloc(sizeof(list)); 124 | l->next->element = 2; 125 | l->next->next = NULL; 126 | 127 | The functions can be used in pretty much the same way as in C programs. hamalloc 128 | accepts the number of bytes to allocate, and returns a pointer to the allocated 129 | memory, or NULL if the memory cannot be allocated. Similarly, hafree accepts either 130 | a pointer returned by hamalloc or NULL, and frees the memory previously 131 | allocated. Naturally, hamalloc and hafree are thread-safe, and can be called 132 | simultaneously by threads of the same or different kernels. hamalloc allocations 133 | persist across kernel invocations, and can be used in later kernel 134 | calls. Pointers allocated by hamalloc can only be freed by hafree; 135 | they cannot be deallocated, e.g., by host/device cudaFree/free. 136 | 137 | ha_shutdown() is intended to free resources used by halloc, but is currently a 138 | no-op. 139 | 140 | 141 | LIMITATIONS 142 | 143 | There is currently no way to change parameters or allocate more memory after 144 | halloc has been initialized. 145 | 146 | 147 | BUGS 148 | 149 | Though correctness tests pass successfully, this proves nothing, of 150 | course. Some bugs are most likely there ;) 151 | -------------------------------------------------------------------------------- /bin/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | -------------------------------------------------------------------------------- /makefile: -------------------------------------------------------------------------------- 1 | PREFIX=~/usr 2 | NAME=libhalloc.a 3 | HEADER=src/halloc.h 4 | SRC_C=src/*.cu 5 | SRC_H=src/*.h src/*.cuh 6 | SRC=$(SRC_C) $(SRC_H) 7 | TGT=bin/$(NAME) 8 | ARCH= -gencode arch=compute_20,code=sm_20 \ 9 | -gencode arch=compute_30,code=sm_30 \ 10 | -gencode arch=compute_35,code=sm_35 11 | #TEST_TGT=tst/corr/bin/test 12 | 13 | TMP=*~ \\\#* src/*~ src/\\\#* tst/corr/*~ tst/corr/*.o $(TGT) $(TEST_TGT) 14 | 15 | # be careful: using the cs modifier can lead to errors. maxrregcount should be 44-64; 16 | # this allows both enough threads and enough storage (values of 44 and 54 give 17 | # good results in the phase test, while 60 and 64 provide somewhat better spree 18 | # throughput); 39-42 (39 tested) are good when operating in L1-preferred mode 19 | build: $(TGT) 20 | $(TGT): $(SRC) makefile 21 | nvcc $(ARCH) -lineinfo -O3 -lib -rdc=true -Xptxas -dlcm=cg -Xptxas -dscm=wb \ 22 | -Xptxas -maxrregcount=64 -o $(TGT) $(SRC_C) 23 | # -Xptxas -maxrregcount=42 -o $(TGT) $(SRC_C) 24 | # nvcc $(ARCH) -O3 -lib -rdc=true -Xptxas -dlcm=cs -Xptxas -dscm=cs -o $(TGT) $(SRC_C) 25 | # nvcc $(ARCH) -O3 -lib -rdc=true -o $(TGT) $(SRC_C) 26 | 27 | #test: $(TGT) makefile 28 | # make -C tst/corr/test run 29 | test: $(TGT) makefile build-corr 30 | make -C tst/corr run-only 31 | 32 | clean: 33 | rm -f $(TMP) 34 | make -C tst/common clean 35 | make -C tst/corr clean 36 | make -C tst/perf clean 37 | 38 | build-perf: $(TGT) 39 | make -C tst/common build 40 | make -C tst/perf build 41 | 42 | build-corr: $(TGT) 43 | make -C tst/common build 44 | make -C tst/corr build 45 | 46 | build-test: $(TGT) 47 | make -C tst/common build 48 | make -C tst/corr build 49 | make -C tst/perf build 50 | 51 | install: $(HEADER) $(TGT) 52 | cp $(HEADER) $(PREFIX)/include/halloc.h 53 | cp $(TGT) $(PREFIX)/lib/libhalloc.a 54 | 55 | uninstall: 56 | rm -f $(PREFIX)/include/halloc.h $(PREFIX)/lib/libhalloc.a 57 | -------------------------------------------------------------------------------- /samples/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | -------------------------------------------------------------------------------- /samples/add-strings/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | -------------------------------------------------------------------------------- /samples/add-strings/add-strings.c: -------------------------------------------------------------------------------- 1 | /** @file add-strings.c a CPU reference test that allocates and concatenates strings */ 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | int divup(int a, int b) { return a / b + (a % b ? 
1 : 0); } 12 | 13 | typedef unsigned long long int uint64; 14 | 15 | /** a random value in [a, b] range */ 16 | int random2(int a, int b) { 17 | //return a + random() % (b - a + 1); 18 | return a; 19 | } 20 | 21 | /** an array filled with random values in [a, b] range, with contiguous groups 22 | of p values starting at p being the same */ 23 | void random_array(int *arr, size_t n, int p, int a, int b) { 24 | int v = 0; 25 | for(size_t i = 0; i < n; i++) { 26 | if(i % p == 0) 27 | v = random2(a, b); 28 | arr[i] = v; 29 | } 30 | } 31 | 32 | void alloc_strs(char **strs, const int *lens, int n) { 33 | #pragma omp parallel for 34 | for(int i = 0; i < n; i++) { 35 | int l = lens[i]; 36 | char *str = (char *)malloc((l + 1) * sizeof(char)); 37 | for(int j = 0; j < l; j++) 38 | str[j] = ' '; 39 | str[l] = 0; 40 | strs[i] = str; 41 | } 42 | } //alloc_strs 43 | 44 | void free_strs(char ** strs, int n) { 45 | #pragma omp parallel for 46 | for(int i = 0; i < n; i++) 47 | free(strs[i]); 48 | } // free_strs 49 | 50 | void add_strs(char ** __restrict__ c, char **a, char **b, int n) { 51 | #pragma omp parallel for 52 | for(int i = 0; i < n; i++) { 53 | const char *sa = a[i], *sb = b[i]; 54 | int la = strlen(sa), lb = strlen(sb), lc = la + lb; 55 | char *sc = (char *)malloc((lc + 1) * sizeof(char)); 56 | strcpy(sc, sa); 57 | strcpy(sc + la, sb); 58 | c[i] = sc; 59 | } 60 | } // add_strs 61 | 62 | #define MIN_LEN 31 63 | #define MAX_LEN 31 64 | #define PERIOD 32 65 | 66 | /** a test for string addition on CPU */ 67 | void string_test_cpu(int n, int print) { 68 | int min_len = MIN_LEN; 69 | int max_len = MAX_LEN; 70 | int period = PERIOD; 71 | // string lengths on host and device 72 | int *h_la = 0, *h_lb = 0; 73 | size_t l_sz = n * sizeof(int), s_sz = n * sizeof(char *); 74 | h_la = (int *)malloc(l_sz); 75 | h_lb = (int *)malloc(l_sz); 76 | random_array(h_la, n, period, min_len, max_len); 77 | random_array(h_lb, n, period, min_len, max_len); 78 | 79 | // string arrays 80 | char **h_sa, **h_sb, **h_sc; 81 | h_sa = (char **)malloc(s_sz); 82 | h_sb = (char **)malloc(s_sz); 83 | h_sc = (char **)malloc(s_sz); 84 | 85 | // allocate strings 86 | double t1, t2; 87 | t1 = omp_get_wtime(); 88 | alloc_strs(h_sa, h_la, n); 89 | alloc_strs(h_sb, h_lb, n); 90 | t2 = omp_get_wtime(); 91 | //printf("t1 = %lf, t2 = %lf\n", t1, t2); 92 | if(print) { 93 | double t = (t2 - t1) / 2; 94 | printf("CPU allocation time: %4.2lf ms\n", t * 1e3); 95 | printf("CPU allocation performance: %4.2lf Mstrings/s\n", n / t * 1e-6); 96 | } 97 | 98 | //concatenate strings 99 | t1 = omp_get_wtime(); 100 | add_strs(h_sc, h_sa, h_sb, n); 101 | t2 = omp_get_wtime(); 102 | if(print) { 103 | double t = t2 - t1; 104 | printf("CPU concatenation time: %4.2lf ms\n", t * 1e3); 105 | printf("CPU concatenation performance: %4.2lf Mstrings/s\n", n / t * 1e-6); 106 | } 107 | 108 | // free strings 109 | t1 = omp_get_wtime(); 110 | free_strs(h_sa, n); 111 | free_strs(h_sb, n); 112 | free_strs(h_sc, n); 113 | t2 = omp_get_wtime(); 114 | if(print) { 115 | double t = (t2 - t1) / 3; 116 | //double t = (t2 - t1) / 2; 117 | printf("CPU freeing time: %4.2lf ms\n", t * 1e3); 118 | printf("CPU freeing performance: %4.2lf Mstrings/s\n", n / t * 1e-6); 119 | } 120 | // free the rest 121 | free(h_sa); 122 | free(h_sb); 123 | free(h_sc); 124 | free(h_la); 125 | free(h_lb); 126 | } // string_test_cpu 127 | 128 | 129 | int main(int argc, char **argv) { 130 | //srandom((int)time(0)); 131 | size_t memory = 512 * 1024 * 1024; 132 | printf("==============================\n"); 133 
| // CPU test 134 | string_test_cpu(10000, 0); 135 | string_test_cpu(500000, 1); 136 | //ha_shutdown(); 137 | } // main 138 | -------------------------------------------------------------------------------- /samples/add-strings/add-strings.cu: -------------------------------------------------------------------------------- 1 | /** @file grid-points.cu a test where grid points are sorted into a grid */ 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | /** a macro for checking CUDA calls */ 14 | #define cucheck(call) \ 15 | { \ 16 | cudaError_t cucheck_err = (call); \ 17 | if(cucheck_err != cudaSuccess) { \ 18 | const char* err_str = cudaGetErrorString(cucheck_err); \ 19 | fprintf(stderr, "%s (%d): %s in %s\n", __FILE__, __LINE__, err_str, #call); \ 20 | exit(-1); \ 21 | } \ 22 | } 23 | 24 | int divup(int a, int b) { return a / b + (a % b ? 1 : 0); } 25 | 26 | typedef unsigned long long int uint64; 27 | 28 | /** prefetches into L1 cache */ 29 | __device__ inline void prefetch_l1(const void *p) { 30 | asm("prefetch.global.L1 [%0];": :"l"(p)); 31 | } 32 | 33 | /** prefetches into L2 cache */ 34 | __device__ inline void prefetch_l2(const void *p) { 35 | asm("prefetch.global.L2 [%0];": :"l"(p)); 36 | } 37 | 38 | /** a random value in [a, b] range */ 39 | int random(int a, int b) { 40 | return a + random() % (b - a + 1); 41 | //return a; 42 | } 43 | 44 | /** an array filled with random values in [a, b] range, with contiguous groups 45 | of p values starting at p being the same */ 46 | void random_array(int *arr, size_t n, int p, int a, int b) { 47 | int v = 0; 48 | for(size_t i = 0; i < n; i++) { 49 | if(i % p == 0) 50 | v = random(a, b); 51 | arr[i] = v; 52 | } 53 | } 54 | 55 | void alloc_strs(char **strs, const int *lens, int n) { 56 | #pragma omp parallel for 57 | for(int i = 0; i < n; i++) { 58 | int l = lens[i]; 59 | char *str = (char *)malloc((l + 1) * sizeof(char)); 60 | //strs[i] = (char *)malloc((l + 1) * sizeof(char)); 61 | /* 62 | for(int j = 0; j < l; j++) 63 | str[j] = ' '; 64 | str[l] = 0; */ 65 | strs[i] = str; 66 | } 67 | } //alloc_strs 68 | 69 | /** a kernel that allocates and initializes an array of strings; memory for 70 | strings is allocated using halloc */ 71 | __global__ void alloc_strs_k 72 | (char ** __restrict__ strs, 73 | const int * __restrict__ lens, int n) { 74 | int i = threadIdx.x + blockIdx.x * blockDim.x; 75 | if(i >= n) 76 | return; 77 | // allocate string (don't forget zero byte!) 
78 | int l = lens[i]; 79 | //if(i == 0) 80 | // printf("l = %d\n", l); 81 | //if(i > n - 256) 82 | // printf("i = %d, l = %d, n = %d\n", i, l, n); 83 | // char *str = (char *)hamalloc((l + 1) * sizeof(char)); 84 | // for(int j = 0; j < l; j++) 85 | // str[j] = '0' + j; 86 | // str[l] = 0; 87 | 88 | uint64 *str = (uint64 *)hamalloc((l + 1) * sizeof(char)); 89 | int l_i = (l + 1) / 8; 90 | for(int j = 0; j < l_i - 1; j++) { 91 | str[j] = 0x2020202020202020ull; 92 | } 93 | str[l_i - 1] = 0x0020202020202020ull; 94 | 95 | // save string pointer 96 | strs[i] = (char *)str; 97 | } // alloc_strs_k 98 | 99 | /** a kernel that frees memory allocated for strings */ 100 | __global__ void free_strs_k 101 | (char ** __restrict__ strs, int n) { 102 | int i = threadIdx.x + blockIdx.x * blockDim.x; 103 | if(i >= n) 104 | return; 105 | hafree(strs[i]); 106 | } // free_strs_k 107 | 108 | void free_strs(char ** strs, int n) { 109 | #pragma omp parallel for 110 | for(int i = 0; i < n; i++) 111 | free(strs[i]); 112 | } // free_strs 113 | 114 | /** finds the zero byte in a long value, returns INT_MAX if not found */ 115 | __device__ inline int izero_byte(uint64 v) { 116 | int l = INT_MAX; 117 | #pragma unroll 8 118 | for(int i = 0; i < 8; i++) { 119 | if(((v >> i * 8) & 0xffu) == 0) 120 | l = min(l, i); 121 | } 122 | return l; 123 | } // zero_byte 124 | 125 | // couple of helper device functions, analogous to C library 126 | /** get the length of a string; it is assumed that s is at least 8-byte aligned */ 127 | __device__ inline int dstrlen(const char * __restrict__ s) { 128 | //int len = -1; 129 | int len = INT_MAX; 130 | //while(*s++) len++; 131 | //return len; 132 | const uint64 *s1 = (const uint64 *)s; 133 | int ll = 0; 134 | while(len == INT_MAX) { 135 | //while(true) { 136 | uint64 c1 = *s1++; 137 | #pragma unroll 8 138 | for(int i = 0; i < 8; i++) { 139 | //if(((c1 >> i * 8) & 0xffu) == 0) 140 | // return len; 141 | //len++; 142 | if(((c1 >> i * 8) & 0xffu) == 0) 143 | len = min(len, ll + i); 144 | } 145 | ll++; 146 | } 147 | return len; 148 | } // strlen 149 | 150 | /** concatenate two strings into the third string; all strings have been 151 | allocated, and the result has enough place to hold the arguments; 152 | all pointers are assumed to be 8-byte aligned 153 | */ 154 | __device__ inline void dstrcat 155 | (char * __restrict__ c, const char * __restrict__ b, 156 | const char * __restrict__ a) { 157 | // while(*c++ = *a++) {} 158 | // c--; 159 | // while(*c++ = *b++) {} 160 | uint64 *c1 = (uint64 *)c; 161 | const uint64 *a1 = (const uint64 *)a; 162 | const uint64 *b1 = (const uint64 *)b; 163 | uint64 cc = 0, aa = 0, bb = 0; 164 | int ccpos = 0; 165 | //uint cc1 = 0; 166 | // TODO: optimize computations for concatenation, similar to dstrlen() 167 | // copy first string 168 | int izb = INT_MAX; 169 | do { 170 | aa = *a1++; 171 | cc |= aa << ccpos; 172 | izb = izero_byte(aa); 173 | if(izb == INT_MAX) { 174 | *c1++ = cc; 175 | } else { 176 | ccpos = izb * 8; 177 | break; 178 | } 179 | } while(izb == INT_MAX); 180 | /* 181 | for(int i = 0; i < 8; i++) { 182 | cc1 = (uint)(aa >> i * 8) & 0xffu; 183 | //ccpos += 8; 184 | if(cc1) { 185 | cc |= (uint64)cc1 << ccpos; 186 | ccpos += 8; 187 | if(ccpos == 64) { 188 | *c1++ = cc; 189 | ccpos = 0; 190 | //cc = aa >> i * 8; 191 | cc = 0; 192 | } 193 | } else 194 | break; 195 | } 196 | } while(cc1); 197 | */ 198 | // copy second string 199 | do { 200 | bb = *b1++; 201 | // commit current character group 202 | cc |= bb << ccpos; 203 | *c1++ = cc; 204 | // update for 
next 205 | izb = izero_byte(bb); 206 | cc = bb >> ccpos; 207 | if(izb != INT_MAX) { 208 | *c1++ = cc; 209 | break; 210 | } 211 | } while(izb == INT_MAX); 212 | 213 | /* 214 | for(int i = 0; i < 8; i++) { 215 | cc1 = (uint)(bb >> i * 8) & 0xffu; 216 | cc |= (uint64)cc1 << ccpos; 217 | ccpos += 8; 218 | if(ccpos == 64) { 219 | *c1++ = cc; 220 | ccpos = 0; 221 | //cc = bb >> i * 8; 222 | cc = 0; 223 | } 224 | if(!cc1) 225 | break; 226 | } 227 | } while(cc1); 228 | if(ccpos) 229 | *c1 = cc; 230 | */ 231 | } // dstrcat 232 | 233 | /** adds two arrays of strings elementwise */ 234 | __global__ void add_strs_k 235 | (char ** __restrict__ c, const char * const * __restrict__ a, 236 | const char * const * __restrict__ b, int n) { 237 | int i = threadIdx.x + blockIdx.x * blockDim.x; 238 | if(i >= n) 239 | return; 240 | // measure strings a and b 241 | const char *sa = a[i], *sb = b[i]; 242 | int la = dstrlen(sa), lb = dstrlen(sb), lc = la + lb; 243 | //int la = 31, lb = 31, lc = la + lb; 244 | // allocate memory and get new string 245 | char *sc = (char *)hamalloc((lc + 1) * sizeof(char)); 246 | dstrcat(sc, sa, sb); 247 | c[i] = sc; 248 | } // add_strs_k 249 | 250 | void add_strs(char ** __restrict__ c, char **a, char **b, int n) { 251 | #pragma omp parallel for 252 | for(int i = 0; i < n; i++) { 253 | const char *sa = a[i], *sb = b[i]; 254 | int la = strlen(sa), lb = strlen(sb), lc = la + lb; 255 | //int la = 31, lb = 31, lc = la + lb; 256 | char *sc = (char *)malloc((lc + 1) * sizeof(char)); 257 | strcpy(sc, sa); 258 | strcpy(sc + la, sb); 259 | c[i] = sc; 260 | } 261 | } // add_strs 262 | 263 | #define MIN_LEN 31 264 | #define MAX_LEN 31 265 | #define PERIOD 32 266 | 267 | /** a test for string addition on GPU */ 268 | void string_test_gpu(int n, bool print) { 269 | int min_len = MIN_LEN; 270 | int max_len = MAX_LEN; 271 | int period = PERIOD; 272 | // string lengths on host and device 273 | int *h_la = 0, *d_la = 0, *h_lb = 0, *d_lb = 0; 274 | size_t l_sz = n * sizeof(int), s_sz = n * sizeof(char *); 275 | cucheck(cudaMallocHost((void **)&h_la, l_sz)); 276 | cucheck(cudaMallocHost((void **)&h_lb, l_sz)); 277 | cucheck(cudaMalloc((void **)&d_la, l_sz)); 278 | cucheck(cudaMalloc((void **)&d_lb, l_sz)); 279 | random_array(h_la, n, period, min_len, max_len); 280 | random_array(h_lb, n, period, min_len, max_len); 281 | cucheck(cudaMemcpy(d_la, h_la, l_sz, cudaMemcpyHostToDevice)); 282 | cucheck(cudaMemcpy(d_lb, h_lb, l_sz, cudaMemcpyHostToDevice)); 283 | 284 | // string arrays 285 | char **d_sa, **d_sb, **d_sc; 286 | cucheck(cudaMalloc((void **)&d_sa, s_sz)); 287 | cucheck(cudaMalloc((void **)&d_sb, s_sz)); 288 | cucheck(cudaMalloc((void **)&d_sc, s_sz)); 289 | 290 | // allocate strings 291 | int bs = 128, grid = divup(n, bs); 292 | double t1, t2; 293 | t1 = omp_get_wtime(); 294 | alloc_strs_k<<>>(d_sa, d_la, n); 295 | cucheck(cudaGetLastError()); 296 | alloc_strs_k<<>>(d_sb, d_lb, n); 297 | cucheck(cudaGetLastError()); 298 | cucheck(cudaStreamSynchronize(0)); 299 | t2 = omp_get_wtime(); 300 | //printf("t1 = %lf, t2 = %lf\n", t1, t2); 301 | if(print) { 302 | double t = (t2 - t1) / 2; 303 | printf("GPU allocation time: %4.2lf ms\n", t * 1e3); 304 | printf("GPU allocation performance: %4.2lf Mstrings/s\n", n / t * 1e-6); 305 | } 306 | 307 | //concatenate strings 308 | t1 = omp_get_wtime(); 309 | add_strs_k<<>>(d_sc, d_sa, d_sb, n); 310 | cucheck(cudaGetLastError()); 311 | cucheck(cudaStreamSynchronize(0)); 312 | t2 = omp_get_wtime(); 313 | if(print) { 314 | double t = t2 - t1; 315 | printf("GPU 
concatenation time: %4.2lf ms\n", t * 1e3); 316 | printf("GPU concatenation performance: %4.2lf Mstrings/s\n", n / t * 1e-6); 317 | } 318 | 319 | // free strings 320 | t1 = omp_get_wtime(); 321 | free_strs_k<<>>(d_sa, n); 322 | cucheck(cudaGetLastError()); 323 | free_strs_k<<>>(d_sb, n); 324 | cucheck(cudaGetLastError()); 325 | free_strs_k<<>>(d_sc, n); 326 | cucheck(cudaGetLastError()); 327 | cucheck(cudaStreamSynchronize(0)); 328 | t2 = omp_get_wtime(); 329 | if(print) { 330 | double t = (t2 - t1) / 3; 331 | //double t = (t2 - t1) / 2; 332 | printf("GPU freeing time: %4.2lf ms\n", t * 1e3); 333 | printf("GPU freeing performance: %4.2lf Mstrings/s\n", n / t * 1e-6); 334 | } 335 | 336 | // free the rest 337 | cucheck(cudaFree(d_sa)); 338 | cucheck(cudaFree(d_sb)); 339 | cucheck(cudaFree(d_sc)); 340 | cucheck(cudaFree(d_la)); 341 | cucheck(cudaFree(d_lb)); 342 | cucheck(cudaFreeHost(h_la)); 343 | cucheck(cudaFreeHost(h_lb)); 344 | } // string_test_gpu 345 | 346 | /** a test for string addition on CPU */ 347 | void string_test_cpu(int n, bool print) { 348 | int min_len = MIN_LEN; 349 | int max_len = MAX_LEN; 350 | int period = PERIOD; 351 | // string lengths on host and device 352 | int *h_la = 0, *h_lb = 0; 353 | size_t l_sz = n * sizeof(int), s_sz = n * sizeof(char *); 354 | h_la = (int *)malloc(l_sz); 355 | h_lb = (int *)malloc(l_sz); 356 | random_array(h_la, n, period, min_len, max_len); 357 | random_array(h_lb, n, period, min_len, max_len); 358 | 359 | // string arrays 360 | char **h_sa, **h_sb, **h_sc; 361 | h_sa = (char **)malloc(s_sz); 362 | h_sb = (char **)malloc(s_sz); 363 | h_sc = (char **)malloc(s_sz); 364 | 365 | // allocate strings 366 | double t1, t2; 367 | t1 = omp_get_wtime(); 368 | alloc_strs(h_sa, h_la, n); 369 | alloc_strs(h_sb, h_lb, n); 370 | t2 = omp_get_wtime(); 371 | //printf("t1 = %lf, t2 = %lf\n", t1, t2); 372 | if(print) { 373 | double t = (t2 - t1) / 2; 374 | printf("CPU allocation time: %4.2lf ms\n", t * 1e3); 375 | printf("CPU allocation performance: %4.2lf Mstrings/s\n", n / t * 1e-6); 376 | } 377 | 378 | //concatenate strings 379 | t1 = omp_get_wtime(); 380 | add_strs(h_sc, h_sa, h_sb, n); 381 | t2 = omp_get_wtime(); 382 | if(print) { 383 | double t = t2 - t1; 384 | printf("CPU concatenation time: %4.2lf ms\n", t * 1e3); 385 | printf("CPU concatenation performance: %4.2lf Mstrings/s\n", n / t * 1e-6); 386 | } 387 | 388 | // free strings 389 | t1 = omp_get_wtime(); 390 | free_strs(h_sa, n); 391 | free_strs(h_sb, n); 392 | free_strs(h_sc, n); 393 | t2 = omp_get_wtime(); 394 | if(print) { 395 | double t = (t2 - t1) / 3; 396 | //double t = (t2 - t1) / 2; 397 | printf("CPU freeing time: %4.2lf ms\n", t * 1e3); 398 | printf("CPU freeing performance: %4.2lf Mstrings/s\n", n / t * 1e-6); 399 | } 400 | 401 | // free the rest 402 | free(h_sa); 403 | free(h_sb); 404 | free(h_sc); 405 | free(h_la); 406 | free(h_lb); 407 | } // string_test_cpu 408 | 409 | 410 | int main(int argc, char **argv) { 411 | srandom((int)time(0)); 412 | size_t memory = 512 * 1024 * 1024; 413 | // GPU test 414 | ha_init(halloc_opts_t(memory)); 415 | //cucheck(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1)); 416 | string_test_gpu(10000, false); 417 | string_test_gpu(1000000, true); 418 | printf("==============================\n"); 419 | // CPU test 420 | string_test_cpu(10000, false); 421 | string_test_cpu(1000000, true); 422 | ha_shutdown(); 423 | } // main 424 | -------------------------------------------------------------------------------- /samples/add-strings/makefile: 
-------------------------------------------------------------------------------- 1 | NAME=add-strings 2 | 3 | include ../common.mk 4 | -------------------------------------------------------------------------------- /samples/bin/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | -------------------------------------------------------------------------------- /samples/common.mk: -------------------------------------------------------------------------------- 1 | LIB_DIR=-L../../bin 2 | #LIB_DIR=-L ~/usr/lib 3 | LIBHALLOC=-lhalloc 4 | LIBHALLOC_FILE=../../bin/libhalloc.a 5 | INCLUDE_DIR=-I../include 6 | #INCLUDE_DIR=-I ~/usr/include 7 | SRC_C=*.cu 8 | SRC_H=../include/halloc.h 9 | SRC=$(SRC_C) $(SRC_H) 10 | #SRC=$(SRC_C) 11 | TGT=../bin/$(NAME) 12 | 13 | OBJ=../tmp/$(NAME).o 14 | 15 | TMP=*~ \\\#* ../tmp/*.o $(TGT) 16 | 17 | ARCH := -gencode arch=compute_20,code=sm_20 \ 18 | -gencode arch=compute_30,code=sm_30 \ 19 | -gencode arch=compute_35,code=sm_35 20 | 21 | build: $(TGT) 22 | $(TGT): $(LIBHALLOC_FILE) $(OBJ) makefile 23 | # nvcc -arch=sm_35 -O3 -Xcompiler -fopenmp $(OBJ) $(LIBHALLOC) -o $(TGT) 24 | nvcc $(ARCH) -O3 -Xcompiler -fopenmp $(LIB_DIR) $(LIBHALLOC) -o \ 25 | $(TGT) $(OBJ) 26 | 27 | $(OBJ): $(SRC) makefile 28 | nvcc $(ARCH) -O3 -Xcompiler -fopenmp -Xptxas -dlcm=cg -Xptxas -dscm=wb \ 29 | -Xcompiler -pthread $(INCLUDE_DIR) -dc $(SRC_C) -o $(OBJ) 30 | 31 | run: $(TGT) 32 | ./$(TGT) 33 | 34 | clean: 35 | rm -f $(TMP) 36 | 37 | $(LIBHALLOC): 38 | make -C ../.. 39 | -------------------------------------------------------------------------------- /samples/grid-points/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | -------------------------------------------------------------------------------- /samples/grid-points/grid-points.cu: -------------------------------------------------------------------------------- 1 | /** @file grid-points.cu a test where grid points are sorted into a grid */ 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | /** a macro for checking CUDA calls */ 13 | #define cucheck(call) \ 14 | { \ 15 | cudaError_t cucheck_err = (call); \ 16 | if(cucheck_err != cudaSuccess) { \ 17 | const char* err_str = cudaGetErrorString(cucheck_err); \ 18 | fprintf(stderr, "%s (%d): %s in %s\n", __FILE__, __LINE__, err_str, #call); \ 19 | exit(-1); \ 20 | } \ 21 | } 22 | 23 | int divup(int a, int b) { return a / b + (a % b ? 
1 : 0); } 24 | 25 | /** a simple 3d vector */ 26 | template 27 | struct vec3 { 28 | T x, y, z; 29 | __host__ __device__ vec3(T x, T y, T z) { 30 | this->x = x; this->y = y; this->z = z; 31 | } 32 | __host__ __device__ vec3(T r = 0) { 33 | this->x = this->y = this->z = r; 34 | } 35 | }; 36 | 37 | typedef vec3 ivec3; 38 | typedef vec3 fvec3; 39 | 40 | /** a single point list */ 41 | struct point_list_t { 42 | /** point index */ 43 | int ip; 44 | /** next element in the list, or 0 if end */ 45 | point_list_t *next; 46 | }; 47 | 48 | /** gets a random float value between 0 and 1 */ 49 | float frandom(void) { 50 | const int rand_max = 65536; 51 | return (double)(random() % rand_max) / rand_max; 52 | } 53 | 54 | /** gets a random point within [0, 1]^3 cube */ 55 | fvec3 random_point(void) { 56 | return fvec3(frandom(), frandom(), frandom()); 57 | } // random_point 58 | 59 | typedef unsigned long long int uint64; 60 | 61 | /** atomicCAS wrapper for pointers (arguments same as standard atomicCAS()) */ 62 | __device__ void *atomicCAS(void **address, void *compare, void *val) { 63 | return (void *)atomicCAS((uint64 *)address, (uint64)compare, (uint64)val); 64 | } // atomicCAS 65 | 66 | /** atomicExch wrapper for void **/ 67 | __device__ void *atomicExch(void **address, void *val) { 68 | return (void *)atomicExch((uint64 *)address, (uint64)val); 69 | } 70 | 71 | /** a function to insert a point into a grid on device; this function can be 72 | called concurrently by multiple threads */ 73 | __device__ void insert_point 74 | (point_list_t **grid, int ncells, const fvec3 * __restrict__ ps, int ip, 75 | point_list_t *plist) { 76 | // compute the cell 77 | fvec3 p = ps[ip]; 78 | ivec3 cell; 79 | cell.x = max(min((int)floorf(p.x * ncells), ncells - 1), 0); 80 | cell.y = max(min((int)floorf(p.y * ncells), ncells - 1), 0); 81 | cell.z = max(min((int)floorf(p.z * ncells), ncells - 1), 0); 82 | 83 | // get the cell pointer 84 | point_list_t * volatile *pcell = grid + (cell.x + ncells * (cell.y + ncells * 85 | cell.z)); 86 | // try to take over the new start 87 | // TODO: add __threadfence() somewhere 88 | point_list_t *old = (point_list_t *)atomicExch((void **)pcell, plist); 89 | plist->ip = ip; 90 | plist->next = old; 91 | } // insert_point 92 | 93 | /** frees the grid cell; one cell can be simultaneously freed by one thread only 94 | */ 95 | __device__ void free_cell(point_list_t **grid, int ncells, ivec3 cell, 96 | point_list_t *pre_chains) { 97 | point_list_t **pcell = grid + cell.x + ncells * (cell.y + ncells * cell.z); 98 | // free all cells 99 | point_list_t *plist = *pcell, *pnext; 100 | while(plist) { 101 | pnext = plist->next; 102 | if(!pre_chains) { 103 | hafree(plist); 104 | } 105 | plist = pnext; 106 | } 107 | } // free_cell 108 | 109 | /** the kernel to insert points into the grid */ 110 | __global__ void sort_points_k 111 | (point_list_t **grid, int ncells, const fvec3 * __restrict__ ps, 112 | point_list_t *pre_chains, int n) { 113 | int ip = threadIdx.x + blockIdx.x * blockDim.x; 114 | if(ip >= n) 115 | return; 116 | 117 | // allocate memory for list element 118 | point_list_t *plist; 119 | if(pre_chains) 120 | plist = pre_chains + ip; 121 | else { 122 | plist = (point_list_t *)hamalloc(sizeof(point_list_t)); 123 | //plist = new point_list_t(); 124 | } 125 | if(!plist) { 126 | //printf("cannot allocate memory\n"); 127 | return; 128 | } 129 | 130 | insert_point(grid, ncells, ps, ip, plist); 131 | } // sort_points_k 132 | 133 | /** the kernel to free the entire grid; this is 1d kernel */ 134 | 
__global__ void free_grid_k 135 | (point_list_t **grid, int ncells, point_list_t *pre_chains) { 136 | int ncells3 = ncells * ncells * ncells; 137 | int i = threadIdx.x + blockIdx.x * blockDim.x; 138 | if(i >= ncells3) 139 | return; 140 | ivec3 cell; 141 | cell.x = i % ncells; 142 | cell.y = i % (ncells * ncells) / ncells; 143 | cell.z = i / (ncells * ncells); 144 | free_cell(grid, ncells, cell, pre_chains); 145 | } // free_grid_k 146 | 147 | // a test to fill in the grid and then free it 148 | void grid_test(int n, int ncells, bool alloc, bool print) { 149 | // points 150 | size_t sz = n * sizeof(fvec3); 151 | fvec3 *ps, *d_ps; 152 | ps = (fvec3 *)malloc(sz); 153 | cucheck(cudaMalloc((void **)&d_ps, sz)); 154 | for(int ip = 0; ip < n; ip++) { 155 | ps[ip] = random_point(); 156 | //printf("point = (%lf, %lf %lf)\n", (double)ps[ip].x, (double)ps[ip].y, 157 | // (double)ps[ip].z); 158 | } 159 | cucheck(cudaMemcpy(d_ps, ps, sz, cudaMemcpyHostToDevice)); 160 | 161 | // grid 162 | int ncells3 = ncells * ncells * ncells; 163 | size_t grid_sz = ncells3 * sizeof(point_list_t *); 164 | point_list_t **d_grid; 165 | cucheck(cudaMalloc((void **)&d_grid, grid_sz)); 166 | cucheck(cudaMemset(d_grid, 0, grid_sz)); 167 | 168 | // pre-allocated per-point chains 169 | point_list_t *pre_chains = 0; 170 | if(!alloc) { 171 | cucheck(cudaMalloc((void **)&pre_chains, n * sizeof(point_list_t))); 172 | cucheck(cudaMemset(pre_chains, 0, n * sizeof(point_list_t))); 173 | } 174 | 175 | // fill the grid 176 | double t1 = omp_get_wtime(); 177 | int bs = 128; 178 | sort_points_k<<>>(d_grid, ncells, d_ps, pre_chains, n); 179 | cucheck(cudaGetLastError()); 180 | cucheck(cudaStreamSynchronize(0)); 181 | double t2 = omp_get_wtime(); 182 | 183 | // free the grid 184 | free_grid_k<<>>(d_grid, ncells, pre_chains); 185 | cucheck(cudaGetLastError()); 186 | cucheck(cudaStreamSynchronize(0)); 187 | double t3 = omp_get_wtime(); 188 | 189 | // free everything 190 | //free(ps); 191 | cucheck(cudaFree(d_grid)); 192 | cucheck(cudaFree(d_ps)); 193 | cucheck(cudaFree(pre_chains)); 194 | 195 | // print time 196 | if(print) { 197 | printf("allocation time %.2lf ms\n", (t2 - t1) * 1e3); 198 | printf("free time %.2lf ms\n", (t3 - t2) * 1e3); 199 | printf("allocation performance %.2lf Mpoints/s\n", n / (t2 - t1) * 1e-6); 200 | printf("free performance %.2lf Mpoints/s\n", n / (t3 - t2) * 1e-6); 201 | } // if(print) 202 | 203 | } // grid_test 204 | 205 | int main(int argc, char **argv) { 206 | srandom((int)time(0)); 207 | size_t memory = 512 * 1024 * 1024; 208 | bool alloc = true; 209 | //cucheck(cudaSetDevice(0)); 210 | ha_init(halloc_opts_t(memory)); 211 | // warm-up run 212 | grid_test(10000, 8, alloc, false); 213 | // main run 214 | grid_test(1000000, 32, alloc, true); 215 | ha_shutdown(); 216 | } // main 217 | -------------------------------------------------------------------------------- /samples/grid-points/makefile: -------------------------------------------------------------------------------- 1 | NAME=grid-points 2 | 3 | include ../common.mk 4 | -------------------------------------------------------------------------------- /samples/include/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.o 5 | -------------------------------------------------------------------------------- /samples/include/halloc.h: -------------------------------------------------------------------------------- 1 | ../../src/halloc.h 
-------------------------------------------------------------------------------- /samples/random-graph/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | -------------------------------------------------------------------------------- /samples/random-graph/makefile: -------------------------------------------------------------------------------- 1 | NAME=random-graph 2 | 3 | include ../common.mk 4 | -------------------------------------------------------------------------------- /samples/random-graph/random-graph.cu: -------------------------------------------------------------------------------- 1 | /** @file grid-points.cu a test where grid points are sorted into a grid */ 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | /** a macro for checking CUDA calls */ 14 | #define cucheck(call) \ 15 | { \ 16 | cudaError_t cucheck_err = (call); \ 17 | if(cucheck_err != cudaSuccess) { \ 18 | const char* err_str = cudaGetErrorString(cucheck_err); \ 19 | fprintf(stderr, "%s (%d): %s in %s\n", __FILE__, __LINE__, err_str, #call); \ 20 | exit(-1); \ 21 | } \ 22 | } 23 | 24 | /** sets CUDA device variable */ 25 | #define cuset(symbol, T, val) \ 26 | { \ 27 | void *cuset_addr; \ 28 | cucheck(cudaGetSymbolAddress(&cuset_addr, symbol)); \ 29 | T cuset_val = (val); \ 30 | cucheck(cudaMemcpy(cuset_addr, &cuset_val, sizeof(cuset_val), \ 31 | cudaMemcpyHostToDevice)); \ 32 | } // cuset 33 | 34 | int divup(int a, int b) { return a / b + (a % b ? 1 : 0); } 35 | 36 | typedef unsigned long long int uint64; 37 | 38 | /** a random value in [a, b] range */ 39 | int random(int a, int b) { 40 | return a + random() % (b - a + 1); 41 | //return a; 42 | } 43 | 44 | /** an array filled with random values in [a, b] range, with contiguous groups 45 | of p values starting at p being the same */ 46 | void random_array(int *arr, size_t n, int p, int a, int b) { 47 | int v = 0; 48 | for(size_t i = 0; i < n; i++) { 49 | if(i % p == 0) 50 | v = random(a, b); 51 | arr[i] = v; 52 | } 53 | } 54 | 55 | /** a list of neighboring vertices */ 56 | struct vertex_t; 57 | struct edge_list_t { 58 | /** the target vertex */ 59 | vertex_t *target; 60 | /** the next element in the vertex list */ 61 | edge_list_t *next; 62 | /** creates a new list edge */ 63 | __host__ __device__ edge_list_t(vertex_t *target, edge_list_t *next = 0) 64 | : target(target), next(next) {} 65 | }; // edge_list_t 66 | 67 | /** a single vertex */ 68 | struct vertex_t { 69 | /** the id of the vertex (= illusion of some data) */ 70 | int id; 71 | /** the number of edges in the vertex */ 72 | int nedges; 73 | /** the list of edges of the vertex */ 74 | edge_list_t *edges; 75 | /** create a new vertex */ 76 | __host__ __device__ vertex_t(int id) : 77 | id(id), nedges(0), edges(0) {} 78 | /** adds an edge (to the beginning of the list) */ 79 | __device__ void add_edge(vertex_t *target) { 80 | edge_list_t *new_edges = (edge_list_t *)hamalloc(sizeof(edge_list_t)); 81 | *new_edges = edge_list_t(target, edges); 82 | edges = new_edges; 83 | nedges++; 84 | } // add_edge 85 | /** same function on the host */ 86 | __host__ void add_edge_host(vertex_t *target) { 87 | edge_list_t *new_edges = (edge_list_t *)malloc(sizeof(edge_list_t)); 88 | *new_edges = edge_list_t(target, edges); 89 | edges = new_edges; 90 | nedges++; 91 | } // add_edge 92 | 93 | }; // vertex_t 94 | 95 | /** random number data on device */ 96 | uint * __constant__ 
random_states_g; 97 | 98 | void drandom_init(void) { 99 | // TODO: somehow standardize this number 100 | const uint MAX_NTHREADS = 8 * 1024 * 1024; 101 | uint n = MAX_NTHREADS; 102 | size_t sz = n * sizeof(uint); 103 | uint *d_random_states, *h_random_states; 104 | 105 | // allocate memory 106 | cucheck(cudaMalloc((void **)&d_random_states, sz)); 107 | h_random_states = (uint *)malloc(sz); 108 | 109 | // initialize random values, respect groups 110 | //uint gp = 1; 111 | uint gp = 1; 112 | uint seed; 113 | for(uint i = 0; i < n; i++) { 114 | if(i % gp == 0) 115 | seed = random(); 116 | h_random_states[i] = seed; 117 | } 118 | cucheck(cudaMemcpy(d_random_states, h_random_states, sz, 119 | cudaMemcpyHostToDevice)); 120 | free(h_random_states); 121 | 122 | // initialize device variable 123 | cuset(random_states_g, uint *, d_random_states); 124 | } // drandom_init 125 | 126 | /** gets the next seed */ 127 | static inline __host__ __device__ uint next_seed(uint seed) { 128 | /* seed ^= (seed << 13); 129 | seed ^= (seed >> 17); 130 | seed ^= (seed << 5); */ 131 | seed = (seed ^ 61) ^ (seed >> 16); 132 | seed *= 9; 133 | seed = seed ^ (seed >> 4); 134 | seed *= 0x27d4eb2d; 135 | seed = seed ^ (seed >> 15); 136 | return seed; 137 | } // next_seed 138 | 139 | /** get the random value on the device */ 140 | static inline __device__ uint drandom(void) { 141 | uint tid = threadIdx.x + blockIdx.x * blockDim.x; 142 | uint seed = random_states_g[tid]; 143 | seed = next_seed(seed); 144 | random_states_g[tid] = seed; 145 | return seed; 146 | } // drandom 147 | 148 | /** get the random value within the specified interval (both ends inclusive) on 149 | the device */ 150 | static inline __device__ uint drandom(uint a, uint b) { 151 | return a + (drandom() & 0x00ffffffu) % (uint)(b - a + 1); 152 | } // drandom 153 | 154 | static inline __host__ uint hdrandom(uint *seed, uint a, uint b) { 155 | *seed = next_seed(*seed); 156 | return a + (*seed & 0x00ffffffu) % (uint)(b - a + 1); 157 | } // hdrandom 158 | 159 | /** get the floating-point random value between 0 and 1 */ 160 | // static inline __device__ float drandomf(void) { 161 | // float f = 1.0f / (1024.0f * 1024.0f); 162 | // uint m = 1024 * 1024; 163 | // return f * drandom(0, m - 1); 164 | // } // drandomf 165 | 166 | /** kernel building a random graph */ 167 | __global__ void random_graph_build_k 168 | (vertex_t *__restrict__ vs, int nvs, int max_degree) { 169 | int tid = threadIdx.x + blockIdx.x * blockDim.x; 170 | if(tid >= nvs) 171 | return; 172 | vertex_t v = vertex_t(tid); 173 | // build edges for each vertex 174 | int nedges = drandom(1, max_degree); 175 | for(int iedge = 0; iedge < nedges; iedge++) { 176 | vertex_t *target = &vs[drandom(1, nvs)]; 177 | v.add_edge(target); 178 | } 179 | // write the vertex out 180 | vs[tid] = v; 181 | } // random_graph_build_k 182 | 183 | /** random graph test on GPU */ 184 | void random_graph_gpu(int nvs, int max_degree, bool print) { 185 | size_t vs_sz = nvs * sizeof(vertex_t); 186 | vertex_t *d_vs; 187 | cucheck(cudaMalloc((void **)&d_vs, vs_sz)); 188 | // build the graph 189 | int bs = 128; 190 | double t1 = omp_get_wtime(); 191 | random_graph_build_k<<>>(d_vs, nvs, max_degree); 192 | cucheck(cudaGetLastError()); 193 | cucheck(cudaStreamSynchronize(0)); 194 | double t2 = omp_get_wtime(); 195 | 196 | if(print) { 197 | double t = t2 - t1; 198 | double perf = 0.5 * (max_degree + 1) * nvs / t; 199 | printf("GPU time: %.3lf ms\n", t * 1e3); 200 | printf("GPU performance: %.3lf Medges/s\n", perf * 1e-6); 201 | } 202 | 
cucheck(cudaFree(d_vs)); 203 | } // random_graph_gpu 204 | 205 | /** random graph test on CPU */ 206 | void random_graph_cpu(int nvs, int max_degree, bool print) { 207 | size_t vs_sz = nvs * sizeof(vertex_t); 208 | vertex_t *vs = (vertex_t *)malloc(vs_sz); 209 | // build the graph 210 | double t1 = omp_get_wtime(); 211 | #pragma omp parallel 212 | { 213 | uint seed = random(); 214 | #pragma omp for 215 | for(int tid = 0; tid < nvs; tid++) { 216 | vertex_t v = vertex_t(tid); 217 | // build edges for each vertex 218 | int nedges = hdrandom(&seed, 1, max_degree); 219 | for(int iedge = 0; iedge < nedges; iedge++) { 220 | vertex_t *target = &vs[hdrandom(&seed, 1, nvs)]; 221 | v.add_edge_host(target); 222 | } 223 | // write the vertex out 224 | vs[tid] = v; 225 | } 226 | } 227 | double t2 = omp_get_wtime(); 228 | 229 | if(print) { 230 | double t = t2 - t1; 231 | double perf = 0.5 * (max_degree + 1) * nvs / t; 232 | printf("CPU time: %.3lf ms\n", t * 1e3); 233 | printf("CPU performance: %.3lf Medges/s\n", perf * 1e-6); 234 | } 235 | free(vs); 236 | } // random_graph_cpu 237 | 238 | int main(int argc, char **argv) { 239 | srandom((int)time(0)); 240 | drandom_init(); 241 | size_t memory = 512 * 1024 * 1024; 242 | // GPU test 243 | ha_init(halloc_opts_t(memory)); 244 | //cucheck(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1)); 245 | random_graph_gpu(10000, 4, false); 246 | random_graph_gpu(1000000, 8, true); 247 | printf("==============================\n"); 248 | // CPU test 249 | random_graph_cpu(10000, 4, false); 250 | random_graph_cpu(1000000, 8, true); 251 | ha_shutdown(); 252 | } // main 253 | -------------------------------------------------------------------------------- /samples/tmp/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | -------------------------------------------------------------------------------- /src/globals.cuh: -------------------------------------------------------------------------------- 1 | /** @file forward-globals.cuh global variables that are used in many .cuh files, 2 | and thus require a forward declaration. This file is included into halloc.cu 3 | before all other .cuh files */ 4 | 5 | /** real possible number of superblocks (based on device memory and superblock 6 | size) */ 7 | static __constant__ uint nsbs_g; 8 | 9 | /** superblock size (common for all superblocks, power-of-two) */ 10 | static __constant__ uint sb_sz_g; 11 | /** superblock size shift (for fast division operations) */ 12 | static __constant__ uint sb_sz_sh_g; 13 | 14 | /** real number of sizes */ 15 | static __constant__ uint nsizes_g; 16 | 17 | /** slab descriptors */ 18 | static __device__ superblock_t sbs_g[MAX_NSBS]; 19 | /** slab pointers (stored separately from descriptors, as they do not change) */ 20 | __attribute__((aligned(128))) static __device__ void *sb_ptrs_g[MAX_NSBS]; 21 | /** slab (non-distributed) counters */ 22 | static __device__ uint sb_counters_g[MAX_NSBS]; 23 | -------------------------------------------------------------------------------- /src/grid.cuh: -------------------------------------------------------------------------------- 1 | /** @file grid.cuh implementation of superblock grid */ 2 | 3 | /** base address of the grid; this is the start address of the grid. 
It is 4 | always aligned to superblock size boundary */ 5 | static void * __constant__ base_addr_g; 6 | /** superblock grid; TODO: cache in L1, this helps */ 7 | __attribute__((aligned(128))) static __device__ uint64 sb_grid_g[2 * MAX_NSBS]; 8 | 9 | //extern __constant__ uint sb_sz_g; 10 | //extern __constant__ uint sb_sz_sh_g; 11 | 12 | /** add the superblock to the grid 13 | // TODO: use on device as well, also with size id 14 | */ 15 | __host__ void grid_add_sb 16 | (uint64 *cells, void *base_addr, uint sb, void *sb_addr, uint sb_sz) { 17 | void *sb_end_addr = (char *)sb_addr + sb_sz - 1; 18 | uint icell_start = ((char *)sb_addr - (char *)base_addr) / sb_sz; 19 | uint icell_end = ((char *)sb_addr + sb_sz - 1 - (char *)base_addr) / sb_sz; 20 | for(uint icell = icell_start; icell <= icell_end; icell++) { 21 | uint64 cell = cells[icell]; 22 | cell |= 1ull << GRID_INIT_POS; 23 | void *cell_start_addr = (char *)base_addr + (uint64)icell * sb_sz; 24 | void *cell_end_addr = (char *)base_addr + (uint64)(icell + 1) * sb_sz - 1; 25 | if(sb_addr <= cell_start_addr) { 26 | // set first superblock in cell 27 | uint64 first_sb_mask = ((1ull << GRID_SB_LEN) - 1) << GRID_FIRST_SB_POS; 28 | cell = ~first_sb_mask & cell | (uint64)sb << GRID_FIRST_SB_POS; 29 | } 30 | if(sb_end_addr >= cell_end_addr) { 31 | // set second superblock in cell 32 | uint64 second_sb_mask = ((1ull << GRID_SB_LEN) - 1) << GRID_SECOND_SB_POS; 33 | cell = ~second_sb_mask & cell | (uint64)sb << GRID_SECOND_SB_POS; 34 | } 35 | uint64 mid_addr_mask = ((1ull << GRID_ADDR_LEN) - 1) << GRID_ADDR_POS; 36 | // set the break address 37 | if(sb_addr > cell_start_addr) { 38 | // current superblock is the second superblock, mid address is its start 39 | uint64 mid_addr = ((char *)sb_addr - (char *)cell_start_addr) >> 40 | GRID_ADDR_SH; 41 | cell = ~mid_addr_mask & cell | mid_addr << GRID_ADDR_POS; 42 | //printf("icell = %d, cell_addr = %p, sb_addr = %p, mid_addr = %llx\n", 43 | // icell, cell_start_addr, sb_addr, mid_addr); 44 | } else if(sb_end_addr <= cell_end_addr) { 45 | // current superblock is the first superblock, mid address is end of this 46 | // superblock + 1 47 | uint64 mid_addr = ((char *)sb_end_addr + 1 - (char *)cell_start_addr) >> 48 | GRID_ADDR_SH; 49 | cell = ~mid_addr_mask & cell | mid_addr << GRID_ADDR_POS; 50 | //printf("icell = %d, cell_addr = %p, sb_addr = %p, mid_addr = %llx\n", 51 | // icell, cell_start_addr, sb_addr, mid_addr); 52 | } 53 | // save the modified cell 54 | cells[icell] = cell; 55 | } // for(each cell in interval) 56 | } // grid_add_sb 57 | 58 | /** gets the mid-address of the grid cell */ 59 | __device__ inline void *grid_mid_addr(uint icell, uint64 cell) { 60 | uint in_sb_addr = ((cell >> GRID_ADDR_POS) & ((1ull << GRID_ADDR_LEN) - 1)) 61 | << GRID_ADDR_SH; 62 | return (char *)base_addr_g + (uint64)icell * sb_sz_g + in_sb_addr; 63 | } 64 | /** gets the grid cell for the pointer */ 65 | __device__ inline uint64 grid_cell(void *p, uint *icell) { 66 | // TODO: handle stale cell data 67 | //*icell = ((char *)p - (char *)base_addr_g) / sb_sz_g; 68 | *icell = ((char *)p - (char *)base_addr_g) >> sb_sz_sh_g; 69 | //return sb_grid_g[*icell]; 70 | return ldca(sb_grid_g + *icell); 71 | } 72 | /** gets the (de)allocation size id for the pointer */ 73 | __device__ inline uint grid_size_id(uint icell, uint64 cell, void *p) { 74 | void *midp = grid_mid_addr(icell, cell); 75 | return p < midp ? 
grid_first_size_id(cell) : grid_second_size_id(cell); 76 | } 77 | /** gets the (de)allocation superblock id for the pointer */ 78 | __device__ inline uint grid_sb_id(uint icell, uint64 cell, void *p) { 79 | //void *midp = grid_mid_addr(icell, cell); 80 | uint in_sb_addr = ((cell >> GRID_ADDR_POS) & ((1ull << GRID_ADDR_LEN) - 1)) 81 | << GRID_ADDR_SH; 82 | //uint in_sb_addr = ((cell >> GRID_ADDR_POS) & ((1ull << GRID_ADDR_LEN) - 1)); 83 | //uint in_p = (char *)p - (char *)base_addr_g - ((uint64)icell << sb_sz_sh_g); 84 | uint in_p = (char *)p - (char *)base_addr_g - (uint64)icell * sb_sz_g; 85 | //uint in_p = uint(((char *)p - (char *)base_addr_g) >> GRID_ADDR_SH) - (icell << 86 | // (sb_sz_sh_g - GRID_ADDR_SH)); 87 | //return p < midp ? grid_first_sb_id(cell) : grid_second_sb_id(cell); 88 | return in_p < in_sb_addr ? grid_first_sb_id(cell) : grid_second_sb_id(cell); 89 | } 90 | -------------------------------------------------------------------------------- /src/grid.h: -------------------------------------------------------------------------------- 1 | #ifndef HALLOC_GRID_H_ 2 | #define HALLOC_GRID_H_ 3 | 4 | #include "utils.h" 5 | 6 | // constants related to grid cells 7 | #define GRID_SIZE_LEN 6 8 | #define GRID_ADDR_LEN 20 9 | #define GRID_SB_LEN 13 10 | #define GRID_INIT_POS 0 11 | #define GRID_FIRST_SIZE_POS 1 12 | #define GRID_SECOND_SIZE_POS 7 13 | #define GRID_FIRST_SB_POS 13 14 | #define GRID_SECOND_SB_POS 26 15 | #define GRID_ADDR_POS 39 16 | #define GRID_ADDR_SH 4 17 | #define GRID_SB_NONE ((1 << GRID_SB_LEN) - 1) 18 | 19 | /** initial value for the grid cell */ 20 | __host__ __device__ inline uint64 grid_cell_init() { 21 | uint64 no_sb_field = (1 << GRID_SB_LEN) - 1; 22 | return no_sb_field << GRID_FIRST_SB_POS | no_sb_field << GRID_SECOND_SB_POS; 23 | } 24 | /** checks whether the grid cell is initialized */ 25 | __device__ inline bool grid_is_init(uint64 cell) { 26 | return (cell >> GRID_INIT_POS) & 1; 27 | } 28 | /** gets the first size id of the grid cell */ 29 | __device__ inline uint grid_first_size_id(uint64 cell) { 30 | return (cell >> GRID_FIRST_SIZE_POS) & ((1ull << GRID_SIZE_LEN) - 1); 31 | } 32 | /** gets the second size id of the grid cell */ 33 | __device__ inline uint grid_second_size_id(uint64 cell) { 34 | return (cell >> GRID_SECOND_SIZE_POS) & ((1ull << GRID_SIZE_LEN) - 1); 35 | } 36 | /** gets the first superblock id of the grid cell */ 37 | __device__ inline uint grid_first_sb_id(uint64 cell) { 38 | return (cell >> GRID_FIRST_SB_POS) & ((1ull << GRID_SB_LEN) - 1); 39 | } 40 | /** gets the second superblock id of the grid cell */ 41 | __device__ inline uint grid_second_sb_id(uint64 cell) { 42 | return (cell >> GRID_SECOND_SB_POS) & ((1ull << GRID_SB_LEN) - 1); 43 | } 44 | 45 | #endif 46 | -------------------------------------------------------------------------------- /src/halloc.h: -------------------------------------------------------------------------------- 1 | #ifndef HALLOC_H_ 2 | #define HALLOC_H_ 3 | 4 | /** @file hamalloc.h header for halloc allocator */ 5 | #ifdef HALLOCLIB_COMPILING 6 | #define HALLOC_EXTERN 7 | #else 8 | #define HALLOC_EXTERN extern 9 | #endif 10 | 11 | //#define HALLOC_CPP 12 | 13 | /** structure (class) for halloc allocator options */ 14 | 15 | struct halloc_opts_t { 16 | /** total amount of memory available for allocation, bytes */ 17 | size_t memory; 18 | /** memory fraction available to halloc allocator, the rest goes to CUDA for 19 | larger allocations */ 20 | double halloc_fraction; 21 | /** occupancy fraction at which a 
slab is considered busy */ 22 | double busy_fraction; 23 | /** occupancy fraction at which a slab is considered roomy */ 24 | double roomy_fraction; 25 | /** occupancy fraction at which a slab is considered sparse */ 26 | double sparse_fraction; 27 | /** shift value for slab size (size in bytes) */ 28 | int sb_sz_sh; 29 | /** default constructor which initializes the structure with default values */ 30 | halloc_opts_t(size_t memory = 512 * 1024 * 1024) : 31 | memory(memory), halloc_fraction(0.75), busy_fraction(0.835), 32 | roomy_fraction(0.6), sparse_fraction(0.012), sb_sz_sh(22) 33 | {} 34 | }; // halloc_opts_t 35 | 36 | /** memory allocation */ 37 | //HALLOC_EXTERN __device__ __noinline__ void *hamalloc(uint nbytes); 38 | HALLOC_EXTERN __device__ __noinline__ void *hamalloc(size_t nbytes); 39 | 40 | /** freeing the memory */ 41 | HALLOC_EXTERN __device__ __noinline__ void hafree(void *p); 42 | 43 | /** initializes memory allocator host-side 44 | @param memory amount of memory which should be made available for allocation 45 | */ 46 | void ha_init(halloc_opts_t opts = halloc_opts_t()); 47 | 48 | /** shuts down memory allocator host-side */ 49 | void ha_shutdown(void); 50 | 51 | /** gets current external fragmentation 52 | @param ideal whether the ideal case is considered, i.e. all slabs do really 53 | get allocated from CUDA allocator memory; currently ignored and assumed false 54 | */ 55 | double ha_extfrag(bool ideal); 56 | 57 | // overrides for malloc and free if requested; currently unstable 58 | //#ifdef HALLOC_OVERRIDE_STDC 59 | #if 0 60 | __device__ void *malloc(uint nbytes) throw() { 61 | return hamalloc(nbytes); 62 | } 63 | inline __device__ void free(void *p) throw() { hafree(p); } 64 | extern "C" __host__ void free(void *p) throw(); 65 | #endif 66 | 67 | // overload new/delete C++ operators on device if requested 68 | // currently doesn't make much sense: the compiler treats operator new very 69 | // specially, and obviously links it against an external library, which kills 70 | // all performance 71 | #if defined(HALLOC_CPP) 72 | #include 73 | //struct halloc_tag_t; 74 | //typedef halloc_tag_t *halloc_t; 75 | //#define halloc ((halloc_t)0) 76 | __device__ void *operator new(size_t nbytes) throw(std::bad_alloc); 77 | //__device__ void *operator new[](size_t nbytes) throw(std::bad_alloc); 78 | __device__ void operator delete(void *p) throw(); 79 | //__device__ void operator delete[](void *p) throw(); 80 | #endif 81 | 82 | #endif 83 | -------------------------------------------------------------------------------- /src/sbset.cuh: -------------------------------------------------------------------------------- 1 | /** @file sbset.cuh slab set implementation */ 2 | 3 | //extern __constant__ uint nsbs_g; 4 | 5 | __device__ inline uint sbset_get_from(sbset_t sbset) { 6 | // the condition is always true, but the compiler doesn't know that 7 | // without it, performance somewhat drops 8 | #if SBSET_CTR 9 | if(nsbs_g) { 10 | int old = *(volatile int*)&sbset[SB_SET_SZ - 1]; 11 | //int old = atomicAdd((int *)&sbset[SB_SET_SZ - 1], 0); 12 | if(old <= 0) 13 | return SB_NONE; 14 | } 15 | #endif 16 | // then get it 17 | for(uint iword = 0; iword < nsbs_g / WORD_SZ; iword++) { 18 | // atomicOr() also works good here 19 | uint word = *(volatile uint *)&sbset[iword]; 20 | //uint word = atomicOr(&sbset[iword], 0); 21 | while(word) { 22 | uint ibit = __ffs(word) - 1; 23 | // try locking the bit 24 | uint mask = 1 << ibit; 25 | if(atomicAnd(&sbset[iword], ~mask) & mask) { 26 | #if SBSET_CTR 27 | 
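// the bit was set before atomicAnd() cleared it, so this thread now owns
// the slab; decrement the occupancy counter kept in the set's last word
// so it stays consistent with the bit array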
atomicSub(&sbset[SB_SET_SZ - 1], 1); 28 | #endif 29 | return iword * WORD_SZ + ibit; 30 | } 31 | word &= ~mask; 32 | } 33 | } 34 | return SB_NONE; 35 | } // sbset_get_from 36 | -------------------------------------------------------------------------------- /src/sbset.h: -------------------------------------------------------------------------------- 1 | #ifndef HALLOC_SBSET_H_ 2 | #define HALLOC_SBSET_H_ 3 | 4 | /** @file sbset.h slab set definitions */ 5 | 6 | #include "utils.h" 7 | 8 | #define SBSET_CTR 0 9 | 10 | /** superblock set type; word 0 is actually an additional counter */ 11 | typedef uint sbset_t[SB_SET_SZ]; 12 | //typedef uint *sbset_t; 13 | 14 | //#define WORD_SZ2 64 15 | 16 | /** gets superblock from set (and removes it) */ 17 | __device__ inline uint sbset_get_from(sbset_t sbset); 18 | 19 | /** adds ("returns") superblock to the set */ 20 | __device__ inline void sbset_add_to(sbset_t sbset, uint sb) { 21 | uint iword = sb / WORD_SZ, ibit = sb % WORD_SZ; 22 | uint mask = 1 << ibit; 23 | //atomicAdd((int *)&sbset[SB_SET_SZ - 1], 1); 24 | #if SBSET_CTR 25 | if(!(atomicOr(&sbset[iword], mask) & mask)) 26 | atomicAdd((int *)&sbset[SB_SET_SZ - 1], 1); 27 | #else 28 | atomicOr(&sbset[iword], mask); 29 | #endif 30 | //atomicAdd((int *)&sbset[SB_SET_SZ - 1], 31 | // 1 - ((atomicOr(&sbset[iword], mask) & mask) >> ibit)); 32 | } // sbset_add_to 33 | 34 | /** removes the specified slab from set */ 35 | __device__ inline void sbset_remove_from(sbset_t sbset, uint sb) { 36 | uint iword = sb / WORD_SZ, ibit = sb % WORD_SZ; 37 | uint mask = 1 << ibit; 38 | #if SBSET_CTR 39 | if(atomicAnd(&sbset[iword], ~mask) & mask) 40 | atomicSub((int *)&sbset[SB_SET_SZ - 1], 1); 41 | #else 42 | atomicAnd(&sbset[iword], ~mask); 43 | #endif 44 | } // sbset_remove_from 45 | 46 | #endif 47 | -------------------------------------------------------------------------------- /src/size-info.cuh: -------------------------------------------------------------------------------- 1 | /** @file size-infos.cuh implementation of some stuff related to size 2 | information */ 3 | 4 | /** information on sizes */ 5 | __attribute__((aligned(128))) static __device__ size_info_t size_infos_g[MAX_NSIZES]; 6 | //static __constant__ size_info_t size_infos_g[MAX_NSIZES]; 7 | 8 | /** same data, but in different memory */ 9 | // __device__ size_info_t size_infos_dg[MAX_NSIZES]; 10 | -------------------------------------------------------------------------------- /src/size-info.h: -------------------------------------------------------------------------------- 1 | #ifndef HALLOC_SIZE_INFO_H_ 2 | #define HALLOC_SIZE_INFO_H_ 3 | 4 | /** @file size-info.h information and definitions related to sizes */ 5 | 6 | /** size information type; this is non-changing information, to be stored in 7 | constant memory */ 8 | typedef struct { 9 | /** number of chunks in slab */ 10 | uint nchunks; 11 | /** size of a single chunk */ 12 | uint chunk_sz; 13 | /** id of the chunk to which the size belongs */ 14 | uint chunk_id; 15 | /** number of chunks in a block for this size */ 16 | uint nchunks_in_block; 17 | /** threshold (in chunks) for the slab to be declared "sparse", so that it can 18 | be reused by other sizes with the same chunk size */ 19 | uint sparse_threshold; 20 | /** step for the hash function */ 21 | uint hash_step; 22 | /** threshold (in chunks) for the slab to be declared "roomy" */ 23 | uint roomy_threshold; 24 | /** threshold (in chunks) for the slab to be declared "busy" and be detached */ 25 | uint busy_threshold; 26 | } 
size_info_t __attribute__((aligned(32))); 27 | 28 | /** maximum number of sizes supported */ 29 | #define MAX_NSIZES 64 30 | /** maximum number of different chunk sizes supported */ 31 | #define MAX_NCHUNK_IDS 8 32 | /** a "no-size" constant */ 33 | #define SZ_NONE (~0) 34 | /** block step (16 bytes by default), a power of two */ 35 | #define BLOCK_STEP 16 36 | /** minimum unit size (allocation blocks are either 2 or 3 units) */ 37 | #define MIN_UNIT_SZ 8 38 | /** maximum unit size */ 39 | #define MAX_UNIT_SZ 1024 40 | /** unit step */ 41 | #define UNIT_STEP 2 42 | /** the number of units */ 43 | #define NUNITS 8 44 | /** minimum block size */ 45 | #define MIN_BLOCK_SZ 16 46 | /** maximum block size */ 47 | #define MAX_BLOCK_SZ 3072 48 | 49 | // chunk manipulation 50 | uint chunk_val(uint chunk_sz) { 51 | //return chunk_sz; 52 | uint div3 = chunk_sz % 3 ? 1 : 3; 53 | if(chunk_sz % 3 == 0) 54 | chunk_sz /= 3; 55 | uint sh = 0; 56 | for(; (1 << sh) < chunk_sz; sh++); 57 | return div3 << 16 | sh; 58 | } 59 | 60 | __host__ __device__ inline uint chunk_mul(uint v, uint chunk_sz) { 61 | //return v * chunk_sz; 62 | return (v << (chunk_sz & 0xffffu)) * (chunk_sz >> 16); 63 | } 64 | 65 | __host__ __device__ inline uint chunk_div(uint v, uint chunk_sz) { 66 | // return v / chunk_sz 67 | if(chunk_sz >> 16u == 3u) 68 | v /= 3u; 69 | return v >> (chunk_sz & 0xffffu); 70 | } 71 | 72 | #endif 73 | -------------------------------------------------------------------------------- /src/slab.h: -------------------------------------------------------------------------------- 1 | #ifndef HALLOC_SLAB_H_ 2 | #define HALLOC_SLAB_H_ 3 | 4 | /** @file slab.h slab (superblock) header file */ 5 | 6 | #include "utils.h" 7 | 8 | /** possible slab flags */ 9 | enum { 10 | /** slab allocated from CUDA device-side memory, and must be freed into it */ 11 | SB_CUDA = 0x1 12 | }; 13 | 14 | /** superblock descriptor type; information is mostly changing; note that during 15 | allocation, a superblock is mostly identified by superblock id */ 16 | typedef struct { 17 | /** slab size id 18 | TODO: check if we really need it 19 | */ 20 | unsigned int size_id; 21 | /** whether this is a head slab */ 22 | unsigned int is_head; 23 | /** slab chunk id */ 24 | unsigned int chunk_id; 25 | /** slab chunk size */ 26 | uint chunk_sz; 27 | /** pointer to memory owned by superblock */ 28 | void *ptr; 29 | } superblock_t; 30 | 31 | /** a step to check whether the slab can be moved to another free category */ 32 | #define SB_FREE_STEP 2048 33 | /** maximum number of tries inside a slab after which the allocation 34 | attempt is abandoned */ 35 | //#define MAX_NTRIES 32 36 | #define MAX_NTRIES 32 37 | /** the number of steps after which count check needs be peformed, to ensure 38 | that the allocator is not searching in a block that is already full */ 39 | #define CHECK_NTRIES 2 40 | /** a "no-sb" constant */ 41 | #define SB_NONE (~0) 42 | /** number of heads between which to distribute allocations */ 43 | #define NHEADS 1 44 | /** whether to cache head slabs */ 45 | #define CACHE_HEAD_SBS 1 46 | /** step frequency, i.e. 
what's the step for step update */ 47 | //#define STEP_FREQ 64 48 | #define STEP_FREQ 64 49 | 50 | /** positions and sizes related to slab counters */ 51 | // modified values enable better reading of counters in hex 52 | #define SB_SIZE_POS 0 53 | //#define SB_SIZE_SZ 6 54 | #define SB_SIZE_SZ 5 55 | //#define SB_CHUNK_POS 6 56 | #define SB_CHUNK_POS 5 57 | #define SB_CHUNK_SZ 3 58 | //#define SB_HEAD_POS 9 59 | #define SB_HEAD_POS 8 60 | #define SB_HEAD_SZ 1 61 | //#define SB_COUNT_POS 10 62 | #define SB_COUNT_POS 12 63 | #define SB_COUNT_SZ 20 64 | 65 | // functions for manipulation with counter values 66 | /** gets slab allocation count */ 67 | __device__ inline uint sb_count(uint counter) { 68 | return counter >> SB_COUNT_POS; 69 | } 70 | /** gets size id */ 71 | // __device__ inline uint sb_size_id(uint counter) { 72 | // return (counter >> SB_SIZE_POS) & ((1 << SB_SIZE_SZ) - 1); 73 | // } 74 | /** gets chunk id */ 75 | __device__ inline uint sb_chunk_id(uint counter) { 76 | return (counter >> SB_CHUNK_POS) & ((1 << SB_CHUNK_SZ) - 1); 77 | } 78 | /** gets whether the slab is head (i.e., head bit is set) */ 79 | __device__ inline bool sb_is_head(uint counter) { 80 | return (counter >> SB_HEAD_POS) & 1; 81 | } 82 | /** sets the head for the counter, returns the old counter value */ 83 | __device__ inline uint sb_set_head(uint *counter) { 84 | //return atomicOr(counter, 1 << SB_HEAD_POS); 85 | return atomicAdd(counter, 1 << SB_HEAD_POS); 86 | } 87 | /** resets the head for the slab counter, returns the old counter value */ 88 | __device__ inline uint sb_reset_head(uint *counter) { 89 | //return atomicAnd(counter, ~(1 << SB_HEAD_POS)); 90 | return atomicSub(counter, 1 << SB_HEAD_POS); 91 | } 92 | /** sets the chunk size for the slab counter, returns the old counter value; the 93 | chunk must be NONE for this to work correctly */ 94 | __device__ inline uint sb_set_chunk 95 | (uint *counter, uint chunk_id) { 96 | return atomicSub 97 | (counter, ((SZ_NONE - chunk_id) & ((1 << SB_CHUNK_SZ) - 1)) << 98 | SB_CHUNK_POS); 99 | } // sb_set_chunk 100 | 101 | /** resets the chunk from the specified size to the new size; */ 102 | __device__ inline uint sb_reset_chunk 103 | (uint *counter, uint old_chunk_id) { 104 | return atomicAdd 105 | (counter, ((SZ_NONE - old_chunk_id) & ((1 << SB_CHUNK_SZ) - 1)) << 106 | SB_CHUNK_POS); 107 | } // sb_reset_chunk 108 | 109 | /** updates the size id only, returns the new counter */ 110 | // __device__ inline uint sb_update_size_id 111 | // (uint *counter, uint old_size_id, uint new_size_id) { 112 | // old_size_id = old_size_id & ((1 << SB_SIZE_SZ) - 1); 113 | // new_size_id = new_size_id & ((1 << SB_SIZE_SZ) - 1); 114 | // if(old_size_id >= new_size_id) 115 | // return atomicSub(counter, old_size_id - new_size_id); 116 | // else 117 | // return atomicAdd(counter, new_size_id - old_size_id); 118 | // } // sb_update_size_id 119 | /** gets the counter value for the specified count, size id and chunk id */ 120 | __host__ __device__ inline uint sb_counter_val 121 | (uint count, bool is_head, uint chunk_id, uint size_id) { 122 | return count << SB_COUNT_POS | (is_head ? 
1 : 0) << SB_HEAD_POS | 123 | (chunk_id & ((1 << SB_CHUNK_SZ) - 1)) << SB_CHUNK_POS | 124 | (size_id & ((1 << SB_SIZE_SZ) - 1)) << SB_SIZE_POS; 125 | } 126 | /** atomically increments/decrements slab counter, returns old slab counter value */ 127 | __device__ inline uint sb_counter_inc(uint *counter, uint change) { 128 | return atomicAdd(counter, change << SB_COUNT_POS); 129 | } 130 | __device__ inline uint sb_counter_dec(uint *counter, uint change) { 131 | return atomicSub(counter, change << SB_COUNT_POS); 132 | } 133 | 134 | /** a single-thread-in-warp slab lock; it loops until the slab is locked */ 135 | // __device__ inline void sb_lock(superblock_t *sb) { 136 | // lock(&sb->mutex); 137 | // } 138 | /** a single-thread-in-warp slab unlock; it loops until the slab is unlocked */ 139 | // __device__ inline void sb_unlock(superblock_t *sb) { 140 | // unlock(&sb->mutex); 141 | // } 142 | 143 | #endif 144 | -------------------------------------------------------------------------------- /src/statistics.cuh: -------------------------------------------------------------------------------- 1 | /** @file statistics.cuh functions for collecting memory statistics */ 2 | 3 | /** total free memory on device, B */ 4 | __device__ uint64 free_mem_g; 5 | /** maximum memory that can be allocated, B */ 6 | __device__ uint64 max_alloc_mem_g; 7 | /** total Halloc memory (incl. CUDA memory), B */ 8 | __constant__ uint64 total_mem_g; 9 | /** memory assigned to CUDA allocator, B */ 10 | __constant__ uint64 cuda_mem_g; 11 | 12 | /** one-thread kernel determining maximum allocatable memory; it does so by 13 | doing binary search on what CUDA malloc can do */ 14 | __global__ void find_max_alloc_k() { 15 | uint i = threadIdx.x + blockIdx.x * blockDim.x; 16 | if(i > 0) 17 | return; 18 | uint64 hi = cuda_mem_g, lo = 0, mid; 19 | uint64 min_diff = 1024 * 1024; 20 | while(hi - lo > min_diff) { 21 | mid = (hi + lo) / 2; 22 | void *p = malloc(mid); 23 | if(p) { 24 | lo = mid; 25 | free(p); 26 | } else 27 | hi = mid; 28 | } // while 29 | max_alloc_mem_g = mid; 30 | } // find_max_alloc_k 31 | 32 | /** multi-thread kernel that counts free memory available on device by launching 33 | one thread per slab */ 34 | __global__ void find_free_mem_k(bool ideal) { 35 | uint i = threadIdx.x + blockIdx.x * blockDim.x; 36 | if(i >= nsbs_g) 37 | return; 38 | uint sb_sz = sb_sz_g; 39 | uint chunk_sz = sbs_g[i].chunk_sz; 40 | uint nused_chunks = sb_count(sb_counters_g[i]); 41 | uint used_mem = chunk_sz != 0 ? 
chunk_mul(nused_chunks, chunk_sz) : 0; 42 | uint free_sz = sb_sz - used_mem; 43 | atomicAdd(&free_mem_g, free_sz); 44 | if(ideal && chunk_sz == 0) 45 | atomicAdd(&max_alloc_mem_g, sb_sz); 46 | if(i == 0) 47 | atomicAdd(&free_mem_g, cuda_mem_g); 48 | } // find_free_mem_k 49 | 50 | double ha_extfrag(bool ideal) { 51 | uint bs = 128; 52 | cuset(max_alloc_mem_g, uint64, 0); 53 | cuset(free_mem_g, uint64, 0); 54 | find_max_alloc_k<<<1, bs>>>(); 55 | cucheck(cudaGetLastError()); 56 | cucheck(cudaStreamSynchronize(0)); 57 | find_free_mem_k<<>>(ideal); 58 | cucheck(cudaGetLastError()); 59 | cucheck(cudaStreamSynchronize(0)); 60 | 61 | uint64 free_mem, max_alloc; 62 | //uint64 cuda_mem; 63 | cuget(&free_mem, free_mem_g); 64 | cuget(&max_alloc, max_alloc_mem_g); 65 | //cuget(&cuda_mem, cuda_mem_g); 66 | // printf("free_mem = %lld, max_alloc = %lld, cuda_mem = %lld\n", 67 | // free_mem, max_alloc, cuda_mem); 68 | return 1.0 - (double)max_alloc / free_mem; 69 | } // ha_extfrag 70 | -------------------------------------------------------------------------------- /src/utils.cu: -------------------------------------------------------------------------------- 1 | /** @file utils.cu utility function implementation */ 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | uint max_prime_below(uint n, uint nb) { 9 | for(uint p = n - 1; p >= 3; p--) { 10 | uint max_d = (uint)floor(sqrt(p)); 11 | bool is_prime = true; 12 | for(uint d = 2; d <= max_d; d++) 13 | if(p % d == 0) { 14 | is_prime = false; 15 | break; 16 | } 17 | if(is_prime && n % p && nb % p) 18 | return p; 19 | } 20 | // if we are here, we can't find prime; exit with failure 21 | fprintf(stderr, "cannot find prime below %d not dividing %d\n", n, n); 22 | exit(-1); 23 | return ~0; 24 | } // max_prime_below 25 | -------------------------------------------------------------------------------- /src/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef HALLOC_UTILS_H_ 2 | #define HALLOC_UTILS_H_ 3 | 4 | /** @file utils.h some utility macros, functions and definitions */ 5 | 6 | /** a macro for checking CUDA calls */ 7 | #define cucheck(call) \ 8 | { \ 9 | cudaError_t cucheck_err = (call); \ 10 | if(cucheck_err != cudaSuccess) { \ 11 | const char* err_str = cudaGetErrorString(cucheck_err); \ 12 | fprintf(stderr, "%s (%d): %s in %s\n", __FILE__, __LINE__, err_str, #call); \ 13 | exit(-1); \ 14 | } \ 15 | } 16 | 17 | /** sets CUDA device variable */ 18 | #define cuset(symbol, T, val) \ 19 | { \ 20 | void *cuset_addr; \ 21 | cucheck(cudaGetSymbolAddress(&cuset_addr, symbol)); \ 22 | T cuset_val = (val); \ 23 | cucheck(cudaMemcpy(cuset_addr, &cuset_val, sizeof(cuset_val), \ 24 | cudaMemcpyHostToDevice)); \ 25 | } // cuset 26 | 27 | /** gets the value of the CUDA device variable */ 28 | #define cuget(pval, symbol) \ 29 | { \ 30 | void *cuget_addr; \ 31 | cucheck(cudaGetSymbolAddress(&cuget_addr, symbol)); \ 32 | cucheck(cudaMemcpy((pval), cuget_addr, sizeof(*(pval)), \ 33 | cudaMemcpyDeviceToHost)); \ 34 | } 35 | 36 | #define cuset_arr(symbol, val) \ 37 | { \ 38 | void *cuset_addr; \ 39 | cucheck(cudaGetSymbolAddress(&cuset_addr, symbol)); \ 40 | cucheck(cudaMemcpy(cuset_addr, *val, sizeof(*val), \ 41 | cudaMemcpyHostToDevice)); \ 42 | } // cuset_arr 43 | 44 | /** acts as cudaMemset(), but accepts device variable */ 45 | #define cuvar_memset(symbol, val, sz) \ 46 | { \ 47 | void *cuvar_addr; \ 48 | cucheck(cudaGetSymbolAddress(&cuvar_addr, symbol)); \ 49 | cucheck(cudaMemset(cuvar_addr, val, sz)); \ 50 
| } // cuvar_memset 51 | 52 | /** 64-bit integer type */ 53 | typedef unsigned long long uint64; 54 | 55 | // constants 56 | /** word size (the word is uint, which is assumed to be 32-bit) */ 57 | #define WORD_SZ 32 58 | /** the warp size (32 on current NVidia architectures) */ 59 | #define WARP_SZ 32 60 | /** maximum number of superblocks */ 61 | //#define MAX_NSBS 4096 62 | #define MAX_NSBS 8192 63 | /** the size of SB set, in words; the number of used SBs can be smaller */ 64 | #define SB_SET_SZ (MAX_NSBS / WORD_SZ) 65 | /** the maximum number of warps in a thread block */ 66 | #define MAX_NWARPS 32 67 | 68 | /** division with rounding upwards, useful for kernel calls */ 69 | inline __host__ __device__ int divup 70 | (int a, int b) { return a / b + (a % b ? 1 : 0); } 71 | 72 | /** checks whether the step is in mask */ 73 | __device__ inline bool step_is_in_mask(uint mask, uint val) { 74 | return (mask >> val) & 1; 75 | } 76 | 77 | /** gets the distance to the next higher mask value */ 78 | __device__ inline uint step_next_dist(uint mask, uint val) { 79 | uint res = __ffs(mask >> (val + 1)); 80 | return res ? res : WORD_SZ - val; 81 | } 82 | 83 | /** tries single-thread-per-warp lock 84 | @returns true if locking is successful and false otherwise 85 | */ 86 | __device__ inline bool try_lock(uint *mutex) { 87 | return atomicExch(mutex, 1) == 0; 88 | } 89 | /** single-thread-per-warp lock; loops until the lock is acquired */ 90 | __device__ inline void lock(uint *mutex) { 91 | while(!try_lock(mutex)); 92 | } 93 | /** single-thread-per-warp unlock, without threadfence */ 94 | __device__ inline void unlock(uint *mutex) { 95 | __threadfence(); 96 | atomicExch(mutex, 0); 97 | } 98 | /** waits until the mutex is unlocked, but does not attempt locking */ 99 | __device__ inline void wait_unlock(uint *mutex) { 100 | while(*(volatile uint *)mutex); 101 | // { 102 | // uint64 t1 = clock64(); 103 | // while(clock64() - t1 < 1); 104 | // } 105 | } 106 | /** gets the warp leader based on the mask */ 107 | __device__ inline uint warp_leader(uint mask) { 108 | return __ffs(mask) - 1; 109 | } 110 | 111 | /** gets the lane id inside the warp */ 112 | __device__ inline uint lane_id(void) { 113 | uint lid; 114 | asm("mov.u32 %0, %%laneid;" : "=r" (lid)); 115 | return lid; 116 | // TODO: maybe use more reliable lane id computation 117 | //return threadIdx.x % WARP_SZ; 118 | } 119 | 120 | /** gets the id of the warp */ 121 | __device__ inline uint warp_id(void) { 122 | // TODO: use something more stable 123 | return threadIdx.x / WARP_SZ; 124 | } 125 | 126 | /** broadcasts a value to all participating threads in a warp */ 127 | __device__ inline uint warp_bcast(uint v, uint root_lid) { 128 | #if __CUDA_ARCH__ >= 300 129 | // use warp intrinsics 130 | return (uint) __shfl((int)v, root_lid); 131 | #else 132 | // use shared memory 133 | volatile __shared__ uint vs[MAX_NWARPS]; 134 | if(lane_id() == root_lid) 135 | vs[warp_id()] = v; 136 | return vs[warp_id()]; 137 | #endif 138 | } // warp_bcast 139 | 140 | /** loads the data with caching */ 141 | __device__ inline uint ldca(const uint *p) { 142 | uint res; 143 | asm("ld.global.ca.u32 %0, [%1];": "=r"(res) : "l"(p)); 144 | return res; 145 | } 146 | 147 | __device__ inline uint64 ldca(const uint64 *p) { 148 | uint64 res; 149 | asm("ld.global.ca.u64 %0, [%1];": "=l"(res) : "l"(p)); 150 | return res; 151 | } 152 | 153 | __device__ inline void *ldca(void * const *p) { 154 | void *res; 155 | asm("ld.global.ca.u64 %0, [%1];": "=l"(res) : "l"(p)); 156 | return res; 157 | 
} 158 | 159 | /** prefetches into L1 cache */ 160 | __device__ inline void prefetch_l1(const void *p) { 161 | asm("prefetch.global.L1 [%0];": :"l"(p)); 162 | } 163 | 164 | /** prefetches into L2 cache */ 165 | __device__ inline void prefetch_l2(const void *p) { 166 | asm("prefetch.global.L2 [%0];": :"l"(p)); 167 | } 168 | 169 | __device__ inline uint lanemask_lt() { 170 | uint mask; 171 | asm("mov.u32 %0, %%lanemask_lt;" : "=r" (mask)); 172 | return mask; 173 | } 174 | 175 | /** find the largest prime number below this one, and not dividing this one */ 176 | uint max_prime_below(uint n, uint nb); 177 | 178 | #endif 179 | -------------------------------------------------------------------------------- /tst/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | -------------------------------------------------------------------------------- /tst/common-def.mk: -------------------------------------------------------------------------------- 1 | # currently, =1 is not supported 2 | WITH_SCATTER=0 3 | -------------------------------------------------------------------------------- /tst/common.mk: -------------------------------------------------------------------------------- 1 | include ../../common-def.mk 2 | 3 | LIBHALLOC=../../../bin/libhalloc.a 4 | LIBCOMMON=../../common/libcommontest.a 5 | LIBSCATTER=../../include/libscatteralloc.a 6 | 7 | LIBS :=$(LIBHALLOC) $(LIBCOMMON) 8 | 9 | ARCH := -gencode arch=compute_20,code=sm_20 \ 10 | -gencode arch=compute_30,code=sm_30 \ 11 | -gencode arch=compute_35,code=sm_35 12 | 13 | FLAGS := $(ARCH) -O3 -Xcompiler -fopenmp 14 | CUFLAGS := $(FLAGS) -I../../include -I../../common 15 | 16 | ifeq ($(WITH_SCATTER), 1) 17 | LIBS += $(LIBSCATTER) 18 | CUFLAGS += -DWITH_SCATTER 19 | endif 20 | 21 | CUFLAGS += -dc 22 | 23 | SRC_C=*.cu 24 | SRC_H=../../include/halloc.h ../../common/*.h 25 | SRC=$(SRC_C) $(SRC_H) 26 | TGT=../bin/$(NAME) 27 | 28 | OBJ=../tmp/$(NAME).o 29 | 30 | TMP=*~ \\\#* ../tmp/*.o $(TGT) 31 | 32 | build: $(TGT) 33 | $(TGT): $(LIBS) $(OBJ) makefile 34 | nvcc $(FLAGS) $(OBJ) $(LIBS) -o $(TGT) 35 | 36 | $(OBJ): $(SRC) makefile 37 | nvcc $(CUFLAGS) -dc $(SRC_C) -o $(OBJ) 38 | 39 | run: $(TGT) 40 | ./$(TGT) 41 | 42 | clean: 43 | rm -f $(TMP) 44 | -------------------------------------------------------------------------------- /tst/common/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.o 5 | *.a 6 | -------------------------------------------------------------------------------- /tst/common/common.h: -------------------------------------------------------------------------------- 1 | #ifndef HALLOC_COMMON_H_ 2 | #define HALLOC_COMMON_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | /** number of bytes in one GiB */ 10 | #define NBYTES_IN_GIB (1024.0 * 1024.0 * 1024.0) 11 | 12 | /** a macro for checking CUDA calls */ 13 | #define cucheck(call) \ 14 | { \ 15 | cudaError_t cucheck_err = (call); \ 16 | if(cucheck_err != cudaSuccess) { \ 17 | const char* err_str = cudaGetErrorString(cucheck_err); \ 18 | fprintf(stderr, "%s (%d): %s in %s\n", __FILE__, __LINE__, err_str, #call); \ 19 | exit(-1); \ 20 | } \ 21 | } 22 | 23 | /** sets CUDA device variable */ 24 | #define cuset(symbol, T, val) \ 25 | { \ 26 | void *cuset_addr; \ 27 | cucheck(cudaGetSymbolAddress(&cuset_addr, symbol)); \ 28 | T cuset_val = (val); \ 29 | cucheck(cudaMemcpy(cuset_addr, &cuset_val, sizeof(cuset_val), \ 30 | 
cudaMemcpyHostToDevice)); \ 31 | } // cuset 32 | 33 | /** gets the value of the CUDA device variable */ 34 | #define cuget(pval, symbol) \ 35 | { \ 36 | void *cuget_addr; \ 37 | cucheck(cudaGetSymbolAddress(&cuget_addr, symbol)); \ 38 | cucheck(cudaMemcpy((pval), cuget_addr, sizeof(*(pval)), \ 39 | cudaMemcpyDeviceToHost)); \ 40 | } 41 | 42 | /** division with rounding upwards, useful for kernel calls */ 43 | inline int divup(int a, int b) { return a / b + (a % b ? 1 : 0); } 44 | 45 | /** short-name typedef for a long long unsigned type */ 46 | typedef unsigned long long uint64; 47 | 48 | /** @file common.h common functions and definitions for testing infrastructure 49 | of halloc and similar GPU memory allocations. Note that this is provided for 50 | testing, performance measurement and comparison only, and is not intended 51 | for use in end-user applications. For end-user applications, halloc or 52 | another allocator is better to be used directly. */ 53 | 54 | /** supported allocators */ 55 | typedef enum { 56 | AllocatorNone = 0, AllocatorCuda, AllocatorHalloc, AllocatorScatterAlloc, 57 | AllocatorXMalloc, AllocatorTopNone 58 | } AllocatorType; 59 | 60 | /** supported allocation size distributions */ 61 | typedef enum { 62 | DistrNone = 0, DistrUniform, DistrExpUniform, DistrExpEqual, DistrTopNone 63 | } DistrType; 64 | 65 | /** allocation action */ 66 | typedef enum { 67 | ActionNone = 0, ActionAlloc, ActionFree 68 | } ActionType; 69 | 70 | #ifdef COMMONTEST_COMPILING 71 | #define COMMONTEST_EXTERN 72 | #else 73 | #define COMMONTEST_EXTERN extern 74 | #endif 75 | 76 | /** external variable holding random values, one per thread */ 77 | COMMONTEST_EXTERN uint * __constant__ random_states_g; 78 | 79 | /** get the random value on the device */ 80 | static inline __device__ uint drandom(void) { 81 | uint tid = threadIdx.x + blockIdx.x * blockDim.x; 82 | uint seed = random_states_g[tid]; 83 | // TODO: check if other advancements algorithms are faster 84 | /* seed ^= (seed << 13); 85 | seed ^= (seed >> 17); 86 | seed ^= (seed << 5); */ 87 | seed = (seed ^ 61) ^ (seed >> 16); 88 | seed *= 9; 89 | seed = seed ^ (seed >> 4); 90 | seed *= 0x27d4eb2d; 91 | seed = seed ^ (seed >> 15); 92 | random_states_g[tid] = seed; 93 | return seed; 94 | } // drandom 95 | 96 | /** get the random value within the specified interval (both ends inclusive) on 97 | the device */ 98 | static inline __device__ uint drandom(uint a, uint b) { 99 | return a + (drandom() & 0x00ffffffu) % (uint)(b - a + 1); 100 | } // drandom 101 | 102 | /** get the floating-point random value between 0 and 1 */ 103 | static inline __device__ float drandomf(void) { 104 | float f = 1.0f / (1024.0f * 1024.0f); 105 | uint m = 1024 * 1024; 106 | return f * drandom(0, m - 1); 107 | } // drandomf 108 | 109 | /** get the random boolean value with the specified probability 110 | @param probab the probability to return true 111 | */ 112 | static inline __device__ bool drandomb(float probab) { 113 | if(0.0f < probab && probab < 1.0f) 114 | return drandomf() <= probab; 115 | else 116 | return probab >= 1.0f; 117 | } // drandomb 118 | 119 | /** common options for tests and allocator intiialization; note that some tests 120 | are free to provide their own default settings */ 121 | struct CommonOpts { 122 | /** default initialization for common options */ 123 | CommonOpts(bool dummy) 124 | : allocator(AllocatorHalloc), memory(512 * 1024 * 1024), 125 | halloc_fraction(0.75), busy_fraction(0.835), roomy_fraction(0.6), 126 | sparse_fraction(0.0125), 
sb_sz_sh(22), device(0), nthreads(1024 * 1024), 127 | ntries(8), alloc_sz(16), max_alloc_sz(16), nallocs(4), niters(1), 128 | bs(128), period_mask(0), group_sh(0), distr_type(DistrUniform), 129 | alloc_fraction(1), free_fraction(0), exec_fraction(1) { 130 | recompute_fields(); 131 | } 132 | 133 | __host__ __device__ CommonOpts() {} 134 | /** parses the options from command line, with the defaults specified; memory 135 | is also capped to fraction of device-available at this step 136 | @param [in, out] this the default options on the input, and the options 137 | provided by the command line on the output 138 | */ 139 | void parse_cmdline(int argc, char **argv); 140 | /** the allocator type, as parsed from the command line, -a */ 141 | AllocatorType allocator; 142 | // allocator arguments 143 | /** maximum allocatable memory; silently capped by a fraction (0.75) of 144 | available device memory, -m */ 145 | size_t memory; 146 | /** fraction of memory allocated for halloc allocator, halloc only, -C */ 147 | double halloc_fraction; 148 | /** slab occupancy above which it is declared busy, -B */ 149 | double busy_fraction; 150 | /** slab occupancy below which it is declared roomy, -R */ 151 | double roomy_fraction; 152 | /** slab occupancy below which it is declared sparse; currently, no option, as 153 | we don't see where it's useful */ 154 | double sparse_fraction; 155 | /** shift of slab size, -b */ 156 | int sb_sz_sh; 157 | 158 | // test parameters 159 | /** the device on which everything runs, -D */ 160 | int device; 161 | /** number of threads in the test, -n */ 162 | int nthreads; 163 | /** thread block size, -T */ 164 | int bs; 165 | /** number of tries in the test, -t */ 166 | int ntries; 167 | /** allocation size in bytes when fixed, -s */ 168 | uint alloc_sz; 169 | /** maximum alloc size in bytes, -S */ 170 | uint max_alloc_sz; 171 | /** ceil(log2(max_alloc_sz/alloc_sz) */ 172 | uint max_alloc_sh; 173 | /** number of allocations per thread, -l */ 174 | int nallocs; 175 | /** number of inside-kernel iterations, applicable only to priv-* samples, 176 | forced to one in other cases, -i */ 177 | int niters; 178 | /** period mask, indicates one of how many threads actually does allocation; 179 | -q specifies period shift 180 | */ 181 | int period_mask; 182 | /** group size for period; the "period" parameter is applied to groups, not 183 | individual threads; -g */ 184 | int group_sh; 185 | /** gets the allocation size distribution type; -d */ 186 | DistrType distr_type; 187 | /** probabilities; first dimension is the phase (alloc = 0, free = 1), second 188 | dimension is the action to be taken (alloc = 0, free = 1); these cannot be specified 189 | from command line directly, and computed instead from steady state*/ 190 | float probabs[2][2]; 191 | /** the steady state fraction threads having something allocated after the 192 | allocation phase (f' in equation terms); -f 193 | */ 194 | float alloc_fraction; 195 | /** the steady state fraction of threads having something allocated after the 196 | free phase (f'' in equation terms); -F */ 197 | float free_fraction; 198 | /** the fraction of threads which need to do (execute) something between 199 | steady states; -e */ 200 | float exec_fraction; 201 | /** gets the total number of allocations, as usually defined for tests; for 202 | randomized tests, expectation is returned; individual tests may use their own 203 | definition */ 204 | double total_nallocs(void); 205 | /** gets the total size of all the allocations; for randomized tests, 206 | 
expectation is returned 207 | */ 208 | double total_sz(void); 209 | /** gets the single allocation expectation size */ 210 | double expected_sz(void); 211 | 212 | /** gets the next action */ 213 | __device__ ActionType next_action 214 | (bool allocated, uint itry, uint iter) const { 215 | uint phase = (itry * niters + iter) % 2; 216 | uint state = allocated ? 1 : 0; 217 | if(drandomb(probabs[phase][state])) 218 | return allocated ? ActionFree : ActionAlloc; 219 | else 220 | return ActionNone; 221 | } // next_action 222 | 223 | /** gets the next allocation size, which can be random */ 224 | __device__ uint next_alloc_sz(void) const { 225 | // single-size case 226 | if(!is_random()) 227 | return alloc_sz; 228 | switch(distr_type) { 229 | case DistrUniform: 230 | { 231 | uint sz = drandom(alloc_sz, max_alloc_sz); 232 | //sz = min(sz, max_alloc_sz); 233 | //printf("sz = %d, alloc_sz = %d, max_alloc_sz = %d\n", sz, alloc_sz, 234 | // max_alloc_sz); 235 | return sz; 236 | } 237 | case DistrExpUniform: 238 | { 239 | // get random shift 240 | uint sh = drandom(0, max_alloc_sh); 241 | // get a value within the exponential group 242 | uint sz = drandom(alloc_sz << sh, (alloc_sz << (sh + 1)) - 1); 243 | sz = min(sz, max_alloc_sz); 244 | return sz; 245 | } 246 | case DistrExpEqual: 247 | { 248 | // get shift, distributed in geometric progression (shift *2 => 249 | // probability / 2) 250 | uint sh = __ffs(drandom(1, 1 << (max_alloc_sh + 1))) - 1; 251 | // get a value within the exponential group 252 | uint sz = drandom(alloc_sz << sh, (alloc_sz << (sh + 1)) - 1); 253 | sz = min(sz, max_alloc_sz); 254 | return sz; 255 | } 256 | default: 257 | // this should definitely not happen 258 | assert(0); 259 | return 0; 260 | } 261 | } // next_alloc_sz 262 | /** checks whether the thread is inactive */ 263 | __host__ __device__ bool is_thread_inactive(uint tid) const { 264 | return tid >= nthreads || (tid >> group_sh) & period_mask; 265 | } 266 | /** gets the period */ 267 | __host__ __device__ uint period(void) const { return period_mask + 1; } 268 | /** gets the group size */ 269 | __host__ __device__ uint group(void) const { return 1 << group_sh; } 270 | /** gets the (contiguous) number of pointers for the given number of threads */ 271 | __host__ __device__ uint nptrs_cont(uint nts) const { 272 | return nts / (group() * period()) * group() + 273 | min(nts % (group() * period()), group()); 274 | } 275 | /** checks whether randomization is employed */ 276 | __host__ __device__ uint is_random(void) const { 277 | return alloc_sz != max_alloc_sz; 278 | } 279 | /** recompute the fields which need be recomputed */ 280 | void recompute_fields(void); 281 | }; 282 | 283 | #ifndef COMMONTEST_COMPILING 284 | __constant__ CommonOpts opts_g; 285 | #endif 286 | 287 | /** initialize device generation of random numbers */ 288 | void drandom_init(const CommonOpts &opts); 289 | 290 | /** shutdown device generation of random numbers */ 291 | void drandom_shutdown(const CommonOpts &opts); 292 | 293 | /** checks that all the pointers are non-zero 294 | @param d_ptrs device pointers 295 | @param nptrs the number of pointers 296 | */ 297 | bool check_nz(void **d_ptrs, uint *d_ctrs, uint nptrs, const CommonOpts &opts); 298 | 299 | /** checks that all allocations are made properly, i.e. that no pointer is zero, 300 | and there's at least alloc_sz memory after each pointer (alloc_sz is the 301 | same for all allocations). 
Parameters are mostly the same as with check_nz() 302 | */ 303 | bool check_alloc(void **d_ptrs, uint *d_ctrs, uint nptrs, 304 | const CommonOpts &opts); 305 | 306 | #include "halloc-wrapper.h" 307 | #include "cuda-malloc-wrapper.h" 308 | #include "scatter-alloc-wrapper.h" 309 | 310 | /** does a test with specific allocator and test functor; it is called after 311 | command line parsing */ 312 | template class Test> 313 | void run_test(CommonOpts &opts, bool with_warmup) { 314 | T::init(opts); 315 | //warm_up(); 316 | 317 | Test test; 318 | // warmup, if necessary 319 | if(with_warmup) 320 | test(opts, true); 321 | // real run 322 | test(opts, false); 323 | 324 | T::shutdown(); 325 | } // run_test 326 | 327 | /** does a test with specific test functor; basically 328 | this is a main function for all the tests */ 329 | template class Test > 330 | void run_test(int argc, char ** argv, CommonOpts &opts, bool with_warmup = true) { 331 | // parse command line 332 | opts.parse_cmdline(argc, argv); 333 | cucheck(cudaSetDevice(opts.device)); 334 | 335 | // initialize random numbers 336 | drandom_init(opts); 337 | 338 | // instantiate based on allocator type 339 | switch(opts.allocator) { 340 | case AllocatorCuda: 341 | run_test (opts, with_warmup); 342 | break; 343 | case AllocatorHalloc: 344 | //printf("testing halloc allocator\n"); 345 | run_test (opts, with_warmup); 346 | break; 347 | #ifdef WITH_SCATTER 348 | case AllocatorScatterAlloc: 349 | run_test (opts, with_warmup); 350 | break; 351 | #endif 352 | default: 353 | fprintf(stderr, "allocator invalid or not supported\n"); 354 | exit(-1); 355 | } 356 | } // run_test 357 | 358 | #ifndef COMMONTEST_COMPILING 359 | 360 | /** helper malloc kernel used by many tests throughout */ 361 | template 362 | __global__ void malloc_k 363 | (CommonOpts opts, void **ptrs) { 364 | int n = opts.nthreads, i = threadIdx.x + blockIdx.x * blockDim.x; 365 | if(opts.is_thread_inactive(i)) 366 | return; 367 | for(int ialloc = 0; ialloc < opts.nallocs; ialloc++) { 368 | uint sz = opts.next_alloc_sz(); 369 | void *ptr = T::malloc(sz); 370 | ptrs[i + n * ialloc] = ptr; 371 | } 372 | } // malloc_k 373 | 374 | /** helper non-randomized malloc kernel */ 375 | template 376 | __global__ void malloc_corr_k 377 | (CommonOpts opts, void **ptrs) { 378 | int n = opts.nthreads, i = threadIdx.x + blockIdx.x * blockDim.x; 379 | if(opts.is_thread_inactive(i)) 380 | return; 381 | for(int ialloc = 0; ialloc < opts.nallocs; ialloc++) { 382 | uint sz = opts.next_alloc_sz(); 383 | void *ptr = T::malloc(sz); 384 | ptrs[i + n * ialloc] = ptr; 385 | if(ptr) 386 | *(uint *)ptr = sz; 387 | } 388 | } // malloc_corr_k 389 | 390 | /** helper free kernel used by many tests throughout */ 391 | template 392 | __global__ void free_k 393 | (CommonOpts opts, void **ptrs) { 394 | int n = opts.nthreads, i = threadIdx.x + blockIdx.x * blockDim.x; 395 | if(opts.is_thread_inactive(i)) 396 | return; 397 | for(int ialloc = 0; ialloc < opts.nallocs; ialloc++) 398 | T::free(ptrs[i + n * ialloc]); 399 | } // free_k 400 | 401 | /** free the rest after the throughput test; this also counts against the total 402 | time */ 403 | template __global__ void free_rest_k(void **ptrs, uint *ctrs) { 404 | uint i = threadIdx.x + blockIdx.x * blockDim.x; 405 | if(opts_g.is_thread_inactive(i)) 406 | return; 407 | uint ctr = ctrs[i], n = opts_g.nthreads; 408 | for(uint ialloc = 0; ialloc < ctr; ialloc++) { 409 | T::free(ptrs[n * ialloc + i]); 410 | } 411 | ctrs[i] = 0; 412 | } // free_rest_k 413 | 414 | #endif 415 | 416 | 
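/* Illustrative sketch (not part of the library, kept inside this comment so it
   is never compiled): a minimal correctness test built from the helpers above.
   The name MinimalTest is hypothetical; the pattern mirrors checkptr.cu --
   allocate with malloc_k, check the returned pointers, release them with
   free_k, and let run_test handle command-line parsing and allocator
   selection (-a). Extracted into its own .cu file under tst/corr, it is
   expected to build against libcommontest.a like the other tests.

   template<class T> class MinimalTest {
   public:
     void operator()(CommonOpts opts, bool warmup) {
       if(warmup) {                      // shrink the warmup run
         opts.nthreads = min(4 * opts.bs, opts.nthreads);
         opts.ntries = 1;
       }
       int n = opts.nthreads, bs = opts.bs, grid = divup(n, bs);
       int nptrs = n * opts.nallocs;
       void **d_ptrs;
       cucheck(cudaMalloc((void **)&d_ptrs, nptrs * sizeof(void *)));
       cucheck(cudaMemset(d_ptrs, 0, nptrs * sizeof(void *)));
       for(int itry = 0; itry < opts.ntries; itry++) {
         malloc_k<T> <<<grid, bs>>>(opts, d_ptrs);   // allocate
         cucheck(cudaGetLastError());
         cucheck(cudaStreamSynchronize(0));
         if(!check_nz(d_ptrs, 0, nptrs, opts))       // all pointers non-zero?
           exit(-1);
         free_k<T> <<<grid, bs>>>(opts, d_ptrs);     // free
         cucheck(cudaGetLastError());
         cucheck(cudaStreamSynchronize(0));
       }
       cucheck(cudaFree(d_ptrs));
     }
   };

   int main(int argc, char **argv) {
     CommonOpts opts(true);
     run_test<MinimalTest>(argc, argv, opts, false);
     return 0;
   }
*/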
#endif 417 | -------------------------------------------------------------------------------- /tst/common/cuda-malloc-wrapper.h: -------------------------------------------------------------------------------- 1 | #ifndef CUDA_MALLOC_WRAPPER_H_ 2 | #define CUDA_MALLOC_WRAPPER_H_ 3 | 4 | /** @file cuda-malloc-wrapper.h wrapper class for CUDA malloc allocator */ 5 | 6 | #include "common.h" 7 | 8 | class CudaMalloc { 9 | public: 10 | static void init(const CommonOpts &opts) { 11 | cucheck(cudaDeviceSetLimit(cudaLimitMallocHeapSize, opts.memory)); 12 | } 13 | 14 | static inline __device__ void *malloc(uint nbytes) { 15 | return ::malloc(nbytes); 16 | } 17 | 18 | static inline __device__ void free(void *p) { 19 | ::free(p); 20 | } 21 | 22 | static double extfrag(bool ideal) { 23 | return 0; 24 | } 25 | 26 | static void shutdown(void) {} 27 | 28 | }; 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /tst/common/halloc-wrapper.h: -------------------------------------------------------------------------------- 1 | #ifndef HALLOC_WRAPPER_H_ 2 | #define HALLOC_WRAPPER_H_ 3 | 4 | /** @file halloc-wrapper.h wrapper class for halloc allocator */ 5 | 6 | #include "common.h" 7 | #include 8 | 9 | class Halloc { 10 | public: 11 | static void init(const CommonOpts &opts) { 12 | halloc_opts_t halloc_opts(opts.memory); 13 | halloc_opts.halloc_fraction = opts.halloc_fraction; 14 | halloc_opts.busy_fraction = opts.busy_fraction; 15 | halloc_opts.roomy_fraction = opts.roomy_fraction; 16 | halloc_opts.sparse_fraction = opts.sparse_fraction; 17 | halloc_opts.sb_sz_sh = opts.sb_sz_sh; 18 | ha_init(halloc_opts); 19 | } 20 | 21 | static inline __device__ void *malloc(uint nbytes) { 22 | return hamalloc(nbytes); 23 | } 24 | 25 | static inline __device__ void free(void *p) { 26 | hafree(p); 27 | } 28 | 29 | static double extfrag(bool ideal) { 30 | return ha_extfrag(ideal); 31 | } 32 | 33 | static void shutdown(void) { 34 | ha_shutdown(); 35 | } 36 | 37 | }; 38 | 39 | #endif 40 | -------------------------------------------------------------------------------- /tst/common/makefile: -------------------------------------------------------------------------------- 1 | include ../common-def.mk 2 | 3 | TGT=libcommontest.a 4 | SRC_C=*.cu 5 | SRC_H=*.h 6 | SRC=$(SRC_C) $(SRC_H) 7 | TMP=$(TGT) *~ \#* 8 | 9 | ARCH := -gencode arch=compute_20,code=sm_20 \ 10 | -gencode arch=compute_30,code=sm_30 \ 11 | -gencode arch=compute_35,code=sm_35 12 | 13 | FLAGS= $(ARCH) -O3 -rdc=true -lib -I../include 14 | ifeq ($(WITH_SCATTER), 1) 15 | FLAGS += -DWITH_SCATTER 16 | endif 17 | 18 | build: $(TGT) 19 | $(TGT): $(SRC) makefile 20 | nvcc $(FLAGS) -o $(TGT) $(SRC_C) 21 | 22 | clean: 23 | rm -f $(TMP) 24 | -------------------------------------------------------------------------------- /tst/common/scatter-alloc-wrapper.h: -------------------------------------------------------------------------------- 1 | #ifndef SCATTER_ALLOC_WRAPPER_H_ 2 | #define SCATTER_ALLOC_WRAPPER_H_ 3 | 4 | /** @file scatter-alloc-wrapper.h wrapper class for ScatterAlloc allocator */ 5 | #ifdef WITH_SCATTER 6 | 7 | #include "common.h" 8 | #include 9 | 10 | class ScatterAlloc { 11 | public: 12 | static void init(const CommonOpts &opts) { 13 | sc_init_heap(opts.memory); 14 | } 15 | 16 | static inline __device__ void *malloc(uint nbytes) { 17 | return scmalloc(nbytes); 18 | } 19 | 20 | static inline __device__ void free(void *p) { 21 | scfree(p); 22 | } 23 | 24 | static double extfrag(bool ideal) { 25 | return 0; 26 | } 
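// no fragmentation metric is reported for ScatterAlloc in this harness:
// extfrag() above simply returns 0; only the Halloc wrapper forwards to a
// real implementation (ha_extfrag())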
27 | 28 | static void shutdown(void) { 29 | } 30 | 31 | }; 32 | 33 | #endif 34 | 35 | #endif 36 | -------------------------------------------------------------------------------- /tst/corr/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.o 5 | *.log 6 | -------------------------------------------------------------------------------- /tst/corr/bin/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | -------------------------------------------------------------------------------- /tst/corr/checkptr/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.o 5 | -------------------------------------------------------------------------------- /tst/corr/checkptr/checkptr.cu: -------------------------------------------------------------------------------- 1 | /** @file latency.cu latency test for various memory allocators */ 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | template class CheckPtrTest { 13 | 14 | public: 15 | void operator()(CommonOpts opts, bool warmup) { 16 | opts.niters = 1; 17 | // allocate memory 18 | if(warmup) { 19 | opts.nthreads = min(4 * opts.bs, opts.nthreads); 20 | opts.ntries = 1; 21 | } 22 | int n = opts.nthreads, bs = opts.bs, grid = divup(n, bs); 23 | int nptrs = n * opts.nallocs; 24 | size_t ptrs_sz = nptrs * sizeof(void *); 25 | void **d_ptrs; 26 | cucheck(cudaMalloc((void **)&d_ptrs, ptrs_sz)); 27 | cucheck(cudaMemset(d_ptrs, 0, ptrs_sz)); 28 | 29 | // do testing 30 | for(int itry = 0; itry < opts.ntries; itry++) { 31 | // allocate 32 | malloc_corr_k <<>>(opts, d_ptrs); 33 | cucheck(cudaGetLastError()); 34 | cucheck(cudaStreamSynchronize(0)); 35 | // check that pointers are correct 36 | if(!check_alloc(d_ptrs, 0, nptrs, opts)) { 37 | exit(-1); 38 | } 39 | // free 40 | free_k <<>>(opts, d_ptrs); 41 | cucheck(cudaGetLastError()); 42 | cucheck(cudaStreamSynchronize(0)); 43 | } // for(itry) 44 | 45 | // free memory 46 | cucheck(cudaFree(d_ptrs)); 47 | } // operator() 48 | 49 | }; // CheckPtrTest 50 | 51 | int main(int argc, char **argv) { 52 | CommonOpts opts(true); 53 | run_test (argc, argv, opts, false); 54 | return 0; 55 | } // main 56 | -------------------------------------------------------------------------------- /tst/corr/checkptr/makefile: -------------------------------------------------------------------------------- 1 | NAME=checkptr 2 | 3 | include ../../common.mk 4 | -------------------------------------------------------------------------------- /tst/corr/freeslabs/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.o 5 | -------------------------------------------------------------------------------- /tst/corr/freeslabs/freeslabs.cu: -------------------------------------------------------------------------------- 1 | /** @file freeslabs.cu tests whether all slabs are returned as free */ 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | template class FreeSlabsTest { 13 | 14 | public: 15 | void operator()(CommonOpts opts, bool warmup) { 16 | opts.niters = 1; 17 | // allocate memory 18 | if(warmup) { 19 | opts.nthreads = min(4 * opts.bs, opts.nthreads); 20 | opts.ntries = 1; 21 | } 22 | // override number of allocations, period and group options 23 | opts.nallocs = 1; 24 | 
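// one allocation per thread and no period/grouping, so nthreads alone
// determines how much memory is requested at each allocation size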
opts.period_mask = 0; 25 | opts.group_sh = 0; 26 | int max_n = opts.nthreads, nptrs = max_n * opts.nallocs; 27 | // note that here, nthreads is treated as the maximum thread number 28 | size_t ptrs_sz = nptrs * sizeof(void *); 29 | void **d_ptrs; 30 | cucheck(cudaMalloc((void **)&d_ptrs, ptrs_sz)); 31 | cucheck(cudaMemset(d_ptrs, 0, ptrs_sz)); 32 | 33 | // allocation fraction; increase to larger values when it's possible 34 | // to free cached or head slabs 35 | double fraction = 0.4; 36 | // do testing 37 | for(int itry = 0; itry < opts.ntries; itry++) { 38 | // step over sizes: 39 | // 16..64: step 8 40 | // 64..256: step 16 41 | // 256..1k: step 128 42 | uint step = 8; 43 | for(uint alloc_sz = 16; alloc_sz <= 1024; alloc_sz += step) { 44 | printf("allocation size %d\n", alloc_sz); 45 | int nthreads = (int)floor(fraction * opts.memory / alloc_sz); 46 | nthreads = min(max_n, nthreads); 47 | opts.nthreads = nthreads; 48 | opts.alloc_sz = opts.max_alloc_sz = alloc_sz; 49 | opts.recompute_fields(); 50 | int bs = opts.bs, grid = divup(opts.nthreads, bs); 51 | // allocate 52 | malloc_k <<>>(opts, d_ptrs); 53 | cucheck(cudaGetLastError()); 54 | cucheck(cudaStreamSynchronize(0)); 55 | // check that pointers are correct 56 | if(!check_alloc(d_ptrs, 0, opts.nthreads, opts)) { 57 | exit(-1); 58 | } 59 | // free 60 | free_k <<>>(opts, d_ptrs); 61 | cucheck(cudaGetLastError()); 62 | cucheck(cudaStreamSynchronize(0)); 63 | // set up step 64 | if(alloc_sz >= 256) 65 | step = 128; 66 | else if(alloc_sz >= 64) 67 | step = 16; 68 | else 69 | step = 8; 70 | } // for(alloc_sz) 71 | } // for(itry) 72 | 73 | // free memory 74 | cucheck(cudaFree(d_ptrs)); 75 | } // operator() 76 | 77 | }; // FreeSlabsTest 78 | 79 | int main(int argc, char **argv) { 80 | CommonOpts opts(true); 81 | opts.ntries = 4; 82 | run_test (argc, argv, opts, false); 83 | return 0; 84 | } // main 85 | -------------------------------------------------------------------------------- /tst/corr/freeslabs/makefile: -------------------------------------------------------------------------------- 1 | NAME=freeslabs 2 | 3 | include ../../common.mk 4 | -------------------------------------------------------------------------------- /tst/corr/make-all.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | # runs specific make target for each performance test 4 | ls -1 | grep -vE 'bin|tmp|make|run|\.log' | xargs -IXA_TEST -P0 \ 5 | make -C XA_TEST $1 6 | -------------------------------------------------------------------------------- /tst/corr/makefile: -------------------------------------------------------------------------------- 1 | TMP=*~ 2 | 3 | build: 4 | ./make-all.sh build 5 | 6 | clean: 7 | rm -f $(TMP) 8 | ./make-all.sh clean 9 | 10 | run: build 11 | ./run-all-tests.pl 12 | 13 | run-only: 14 | ./run-all-tests.pl 15 | -------------------------------------------------------------------------------- /tst/corr/prob-checkptr/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.o 5 | -------------------------------------------------------------------------------- /tst/corr/prob-checkptr/makefile: -------------------------------------------------------------------------------- 1 | NAME=prob-checkptr 2 | 3 | include ../../common.mk 4 | -------------------------------------------------------------------------------- /tst/corr/prob-checkptr/prob-checkptr.cu: -------------------------------------------------------------------------------- 1 | /** @file prob-throughput.cu probabalitized throughput test */ 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | /** the kernel of the probability throughput test */ 13 | template 14 | __global__ void prob_corr_k 15 | (void **ptrs, uint *ctrs, uint itry) { 16 | uint i = threadIdx.x + blockIdx.x * blockDim.x; 17 | uint n = opts_g.nthreads, nallocs = opts_g.nallocs; 18 | if(opts_g.is_thread_inactive(i)) 19 | return; 20 | uint ctr = ctrs[i]; 21 | 22 | // iterate 23 | for(uint iter = 0; iter < opts_g.niters; iter++) { 24 | // perform the action 25 | switch(opts_g.next_action(ctr > 0, itry, iter)) { 26 | case ActionAlloc: 27 | for(uint ialloc = 0; ialloc < nallocs; ialloc++) { 28 | uint sz = opts_g.next_alloc_sz(); 29 | void *ptr = T::malloc(sz); 30 | ptrs[ialloc * n + i] = ptr; 31 | if(ptr) 32 | *(uint *)ptr = sz; 33 | //printf("tid = %d, sz = %d\n", i, sz); 34 | } 35 | ctr = nallocs; 36 | break; 37 | case ActionFree: 38 | for(uint ialloc = 0; ialloc < nallocs; ialloc++) 39 | T::free(ptrs[ialloc * n + i]); 40 | ctr = 0; 41 | break; 42 | } 43 | } // for(each iteration) 44 | ctrs[i] = ctr; 45 | } // prob_throughput_k 46 | 47 | /** measures malloc throughput */ 48 | template class ProbCorrTest { 49 | 50 | public: 51 | void operator()(CommonOpts opts, bool warmup) { 52 | // allocate memory 53 | int n = opts.nthreads, bs = opts.bs, grid = divup(n, bs); 54 | int nptrs = n * opts.nallocs; 55 | size_t ptrs_sz = nptrs * sizeof(void *); 56 | uint ctrs_sz = n * sizeof(uint); 57 | void **d_ptrs; 58 | cucheck(cudaMalloc((void **)&d_ptrs, ptrs_sz)); 59 | cucheck(cudaMemset(d_ptrs, 0, ptrs_sz)); 60 | uint *d_ctrs; 61 | cucheck(cudaMalloc((void **)&d_ctrs, ctrs_sz)); 62 | cucheck(cudaMemset(d_ctrs, 0, ctrs_sz)); 63 | 64 | cuset(opts_g, CommonOpts, opts); 65 | 66 | // do testing 67 | for(int itry = 0; itry < opts.ntries; itry++) { 68 | printf("iteration %d\n", itry); 69 | // run the kernel 70 | //printf("kernel configuration: %d, %d\n", grid, bs); 71 | prob_corr_k <<>>(d_ptrs, d_ctrs, itry); 72 | cucheck(cudaGetLastError()); 73 | cucheck(cudaStreamSynchronize(0)); 74 | // check that pointers are correct 75 | if(!check_alloc(d_ptrs, d_ctrs, nptrs, opts)) { 76 | fprintf(stderr, "cannot allocate enough memory\n"); 77 | exit(-1); 78 
| } 79 | } // for(itry) 80 | 81 | // free the rest 82 | printf("freeing the rest\n"); 83 | free_rest_k <<>> (d_ptrs, d_ctrs); 84 | cucheck(cudaGetLastError()); 85 | cucheck(cudaStreamSynchronize(0)); 86 | 87 | // free memory 88 | cucheck(cudaFree(d_ptrs)); 89 | cucheck(cudaFree(d_ctrs)); 90 | } // operator() 91 | 92 | }; // ProbThroughputTest 93 | 94 | int main(int argc, char **argv) { 95 | CommonOpts opts(true); 96 | run_test(argc, argv, opts, false); 97 | return 0; 98 | } // main 99 | -------------------------------------------------------------------------------- /tst/corr/run-all-tests.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env perl 2 | 3 | # a script to run all test for halloc 4 | 5 | use POSIX; 6 | 7 | $ntests = 0; 8 | $nsuccesses = 0; 9 | $device = 0; 10 | $memory = 512 * 1024 * 1024; 11 | 12 | sub runtest { 13 | system("./run-test.sh", @_, "-D$device", "-m$memory"); 14 | if($? >> 8 == 0) { 15 | $nsuccesses++; 16 | } 17 | $ntests++; 18 | } # runtest 19 | 20 | # correctness memory allocation test; over all sizes, allocate/free 25% memory 21 | # for each small size, and 12.5% memory for each large size 22 | $step = 8; 23 | for($alloc_sz = 16; $alloc_sz <= 32 * 1024; $alloc_sz += $step) { 24 | $fraction = $alloc_sz <= 2 * 1024 ? 0.25 : 0.125; 25 | $nthreads = floor($fraction * $memory / $alloc_sz); 26 | if($nthreads == 0) { 27 | next; 28 | } 29 | runtest("checkptr", "-l1", "-t4", "-s$alloc_sz", "-n$nthreads"); 30 | # modify step 31 | if($alloc_sz >= 1024 * 1024) { 32 | $step = 1024 * 1024; 33 | } elsif($alloc_sz >= 128 * 1024) { 34 | $step = 128 * 1024; 35 | } elsif($alloc_sz >= 16 * 1024) { 36 | $step = 16 * 1024; 37 | } elsif($alloc_sz >= 2 * 1024) { 38 | $step = 2 * 1024; 39 | } elsif($alloc_sz >= 256) { 40 | $step = 256; 41 | } elsif($alloc_sz >= 64) { 42 | $step = 16; 43 | } else { 44 | $step = 8; 45 | } 46 | } # for($step) 47 | 48 | # free slabs test - to ensure that slabs are freed correctly 49 | runtest("freeslabs", "-m$memory"); 50 | 51 | # probabilitized tests 52 | $falloc = 0.5; 53 | $ffree = 0.5; 54 | $fexec = 0.75; 55 | #foreach $group (10) { 56 | foreach $group (0, 5, 10) { 57 | foreach $niters (1, 5) { 58 | # foreach $niters (1) { 59 | $ntries = $group == 1 ? 1024 : 16384; 60 | $ntries = ceil($ntries / $niters); 61 | @fixed_args = ("prob-checkptr", "-i$niters", "-t$ntries", "-f$falloc", 62 | "-F$ffree", "-e$fexec", "-g$group"); 63 | # small sizes (<= 64 bytes) 64 | $nthreads = 1024 * 1024; 65 | runtest(@fixed_args, "-l4", "-n$nthreads", "-s8", "-S64", "-duniform"); 66 | # medium sizes (<= 256 bytes) 67 | runtest(@fixed_args, "-l1", "-n$nthreads", "-s8", "-S256", "-duniform"); 68 | runtest(@fixed_args, "-l4", "-n$nthreads", "-s8", "-S256", "-dexpequal"); 69 | # large-size test (<= 3072 bytes) 70 | $nthreads = 64 * 1024; 71 | runtest(@fixed_args, "-l1", "-n$nthreads", "-s8", "-S3072", "-duniform"); 72 | $nthreads = 128 * 1024; 73 | runtest(@fixed_args, "-l4", "-n$nthreads", "-s8", "-S3072", "-dexpequal"); 74 | } 75 | } 76 | 77 | # print the total count 78 | $nfails = $ntests - $nsuccesses; 79 | print "tests: $ntests TOTAL, $nsuccesses SUCCEEDED, $nfails FAILED\n"; 80 | -------------------------------------------------------------------------------- /tst/corr/run-test.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | # usage: 4 | # ./run-test.sh 5 | 6 | TEST_NAME=$1 7 | shift 1 8 | TEST_EXE=./bin/$TEST_NAME 9 | 10 | # run the test 11 | echo $TEST_EXE $@ 12 | $TEST_EXE $@ 13 | 14 | # analyze exit code 15 | # TODO: add output coloring 16 | TEST_EXIT=$? 17 | if [ $TEST_EXIT == 0 ]; then 18 | echo "$TEST_NAME test PASSED" 19 | exit 0 20 | else 21 | echo "$TEST_NAME test FAILED with exit code $TEST_EXIT" 22 | exit -1 23 | fi 24 | -------------------------------------------------------------------------------- /tst/corr/test/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.o 5 | -------------------------------------------------------------------------------- /tst/corr/test/makefile: -------------------------------------------------------------------------------- 1 | NAME=test 2 | 3 | include ../../common.mk 4 | -------------------------------------------------------------------------------- /tst/corr/test/test.cu: -------------------------------------------------------------------------------- 1 | /** @file test.cu testing a simple idea of an allocator */ 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | /** a macro for checking CUDA calls */ 15 | #define cucheck(call) \ 16 | { \ 17 | cudaError_t cucheck_err = (call); \ 18 | if(cucheck_err != cudaSuccess) { \ 19 | const char* err_str = cudaGetErrorString(cucheck_err); \ 20 | fprintf(stderr, "%s (%d): %s in %s\n", __FILE__, __LINE__, err_str, #call); \ 21 | exit(-1); \ 22 | } \ 23 | } 24 | 25 | //#include "halloc.h" 26 | 27 | /** testing parameters */ 28 | #define NTHREADS (2 * 1024 * 1024) 29 | #define NMALLOCS 8 30 | #define NTHREADS2 (NTHREADS / NMALLOCS) 31 | //#define NTHREADS2 NTHREADS 32 | #define BS 256 33 | #define NTRIES 8 34 | #define MEMORY (4 * 16 * NTHREADS) 35 | //#define NTRIES 1 36 | 37 | // alloc/free kernel 38 | __global__ void malloc_free_k(int ntimes) { 39 | for(int i = 0; i < ntimes; i++) { 40 | void *p = hamalloc(16); 41 | if(!p) 42 | printf("cannot allocate memory\n"); 43 | hafree(p); 44 | } 45 | } // malloc_free_k 46 | 47 | // alloc-and-save-pointer kernel 48 | __global__ void malloc_k(void **ptrs, int ntimes) { 49 | int tid = threadIdx.x + blockIdx.x * blockDim.x; 50 | int nthreads = blockDim.x * gridDim.x; 51 | for(int iptr = tid; iptr < ntimes * nthreads; iptr += nthreads) { 52 | ptrs[iptr] = hamalloc(16); 53 | if(!ptrs[iptr]) 54 | printf("cannot allocate memory\n"); 55 | } 56 | } // malloc_k 57 | // read-and-free pointer kernel 58 | __global__ void free_k(void **ptrs, int ntimes) { 59 | int tid = threadIdx.x + blockIdx.x * blockDim.x; 60 | int nthreads = blockDim.x * gridDim.x; 61 | for(int iptr = tid; iptr < ntimes * nthreads; iptr += nthreads) 62 | hafree(ptrs[iptr]); 63 | } // free_k 64 | 65 | // alloc-and-save-pointer kernel 66 | __global__ void cuda_malloc_k(void **ptrs, int ntimes) { 67 | int tid = threadIdx.x + blockIdx.x * blockDim.x; 68 | int nthreads = blockDim.x * gridDim.x; 69 | for(int iptr = tid; iptr < ntimes * nthreads; iptr += nthreads) { 70 | ptrs[iptr] = malloc(16); 71 | if(!ptrs[iptr]) 72 | printf("cannot allocate memory using CUDA malloc()\n"); 73 | } 74 | } // malloc_k 75 | // read-and-free pointer kernel 76 | __global__ void cuda_free_k(void **ptrs, int ntimes) { 77 | int tid = threadIdx.x + blockIdx.x * blockDim.x; 78 | int nthreads = blockDim.x * gridDim.x; 79 | for(int iptr = tid; iptr < ntimes * nthreads; iptr += nthreads) 80 | 
free(ptrs[iptr]); 81 | } // free_k 82 | 83 | // a kernel to check whether pointers are good 84 | __global__ void check_ptrs_k(bool *good, uint sz, size_t *ptrs, uint n) { 85 | uint i = threadIdx.x + blockIdx.x * blockDim.x; 86 | size_t *ptr = (size_t *)ptrs[i]; 87 | // check 1: try to write two values at the pointer 88 | ptr[0] = ptrs[i]; 89 | ptr[1] = ptrs[i]; 90 | // check 2: check that the pointer addresses are really valid 91 | if(i < n - 1) { 92 | good[i] = ptrs[i + 1] - ptrs[i] >= sz; 93 | } else 94 | good[i] = true; 95 | } // check_ptrs_k 96 | 97 | // correctness test - checks if all allocations are correct 98 | void run_test0(void) { 99 | void **d_ptrs; 100 | size_t ptrs_sz = NTHREADS2 * NMALLOCS * sizeof(void *); 101 | uint nmallocs = NMALLOCS * NTHREADS2; 102 | cucheck(cudaMalloc(&d_ptrs, ptrs_sz)); 103 | size_t *d_addresses = (size_t *)d_ptrs; 104 | cucheck(cudaMemset(d_ptrs, 0, ptrs_sz)); 105 | // allocate data 106 | malloc_k<<>>(d_ptrs, NMALLOCS); 107 | cucheck(cudaGetLastError()); 108 | cucheck(cudaStreamSynchronize(0)); 109 | // sort pointers 110 | thrust::device_ptr dt_addresses(d_addresses); 111 | thrust::sort(dt_addresses, dt_addresses + nmallocs); 112 | // check sorted pointers 113 | bool *d_good; 114 | size_t good_sz = nmallocs * sizeof(bool); 115 | cucheck(cudaMalloc((void **)&d_good, good_sz)); 116 | check_ptrs_k<<>>(d_good, 16, d_addresses, nmallocs); 117 | cucheck(cudaGetLastError()); 118 | cucheck(cudaStreamSynchronize(0)); 119 | thrust::device_ptr dt_good(d_good); 120 | bool passed = thrust::all_of(dt_good, dt_good + nmallocs, 121 | thrust::identity()); 122 | printf("test 0 (correctness of allocation):\n"); 123 | printf("test %s\n", passed ? "PASSED" : "FAILED"); 124 | printf("\n"); 125 | // FINISHED HERE 126 | // TODO: check pointers (each should point to enough memory) 127 | // free memory 128 | free_k<<>>(d_ptrs, NMALLOCS); 129 | cucheck(cudaGetLastError()); 130 | cucheck(cudaStreamSynchronize(0)); 131 | cucheck(cudaFree(d_ptrs)); 132 | } // run_test0 133 | 134 | void run_test1(void) { 135 | double t1 = omp_get_wtime(); 136 | for(int itry = 0; itry < NTRIES; itry++) { 137 | malloc_free_k<<>>(1); 138 | cucheck(cudaGetLastError()); 139 | cucheck(cudaStreamSynchronize(0)); 140 | } 141 | double t2 = omp_get_wtime(); 142 | double nmallocs = (double)NTHREADS * NTRIES; 143 | printf("test 1 (malloc/free inside each thread):\n"); 144 | printf("test duration %.2lf ms\n", (t2 - t1) * 1e3); 145 | printf("%.0lf malloc/free pairs in the test\n", nmallocs); 146 | printf("allocation speed: %.2lf Mpairs/s\n", nmallocs / (t2 - t1) * 1e-6); 147 | printf("\n"); 148 | } // run_test1 149 | 150 | void run_test2(void) { 151 | void **d_ptrs; 152 | size_t ptrs_sz = NTHREADS2 * NMALLOCS * sizeof(void *); 153 | cucheck(cudaMalloc(&d_ptrs, ptrs_sz)); 154 | cucheck(cudaMemset(d_ptrs, 0, ptrs_sz)); 155 | double t1 = omp_get_wtime(); 156 | for(int itry = 0; itry < NTRIES; itry++) { 157 | malloc_k<<>>(d_ptrs, NMALLOCS); 158 | cucheck(cudaGetLastError()); 159 | //cucheck(cudaStreamSynchronize(0)); 160 | free_k<<>>(d_ptrs, NMALLOCS); 161 | cucheck(cudaGetLastError()); 162 | cucheck(cudaStreamSynchronize(0)); 163 | } 164 | double t2 = omp_get_wtime(); 165 | cucheck(cudaFree(d_ptrs)); 166 | double nmallocs = (double)NMALLOCS * NTHREADS2 * NTRIES; 167 | printf("test 2 (first all mallocs, then all frees):\n"); 168 | printf("test duration %.2lf ms\n", (t2 - t1) * 1e3); 169 | printf("%.0lf malloc/free pairs in the test\n", nmallocs); 170 | printf("allocation speed: %.2lf Mpairs/s\n", nmallocs / (t2 - 
t1) * 1e-6); 171 | printf("\n"); 172 | } // run_test2 173 | 174 | /** latency test */ 175 | void run_test3(void) { 176 | double t1 = omp_get_wtime(); 177 | int lat_ntries = 4, lat_nmallocs = 16 * 1024; 178 | //int lat_ntries = 1, lat_nmallocs = 1; 179 | for(int itry = 0; itry < lat_ntries; itry++) { 180 | malloc_free_k<<<1, 1>>>(lat_nmallocs); 181 | cucheck(cudaGetLastError()); 182 | cucheck(cudaStreamSynchronize(0)); 183 | } 184 | double t2 = omp_get_wtime(); 185 | double nmallocs = (double)lat_nmallocs * lat_ntries; 186 | printf("test 3 (latency):\n"); 187 | printf("test duration %.2lf ms\n", (t2 - t1) * 1e3); 188 | printf("%.0lf malloc/free pairs in the test\n", nmallocs); 189 | printf("latency: %.0lf ns\n", (t2 - t1) * 1e9 / nmallocs); 190 | printf("\n"); 191 | } // run_test3 192 | 193 | /** throughput test for CUDA allocator */ 194 | void run_test4(void) { 195 | void **d_ptrs; 196 | int cuda_nthreads = 128 * 1024, cuda_nmallocs = 2, cuda_ntries = 4; 197 | //cucheck(cudaDeviceSetLimit(cudaLimitMallocHeapSize, 32 * 1024 * 1024)); 198 | size_t ptrs_sz = cuda_nthreads * cuda_nmallocs * sizeof(void *); 199 | cucheck(cudaMalloc(&d_ptrs, ptrs_sz)); 200 | cucheck(cudaMemset(d_ptrs, 0, ptrs_sz)); 201 | double t1 = omp_get_wtime(); 202 | for(int itry = 0; itry < cuda_ntries; itry++) { 203 | cuda_malloc_k<<>>(d_ptrs, cuda_nmallocs); 204 | cucheck(cudaGetLastError()); 205 | //cucheck(cudaStreamSynchronize(0)); 206 | cuda_free_k<<>>(d_ptrs, cuda_nmallocs); 207 | cucheck(cudaGetLastError()); 208 | cucheck(cudaStreamSynchronize(0)); 209 | } 210 | double t2 = omp_get_wtime(); 211 | cucheck(cudaFree(d_ptrs)); 212 | double nmallocs = (double)cuda_nmallocs * cuda_nthreads * cuda_ntries; 213 | printf("test 4 (CUDA, first all mallocs, then all frees):\n"); 214 | printf("test duration %.2lf ms\n", (t2 - t1) * 1e3); 215 | printf("%.0lf malloc/free pairs in the test\n", nmallocs); 216 | printf("allocation speed: %.2lf Mpairs/s\n", nmallocs / (t2 - t1) * 1e-6); 217 | printf("\n"); 218 | } // run_test4 219 | 220 | // separate time, first for allocation, then for free 221 | void run_test5(void) { 222 | void **d_ptrs; 223 | size_t ptrs_sz = NTHREADS2 * NMALLOCS * sizeof(void *); 224 | cucheck(cudaMalloc(&d_ptrs, ptrs_sz)); 225 | cucheck(cudaMemset(d_ptrs, 0, ptrs_sz)); 226 | uint ntries = 1; 227 | double t1 = omp_get_wtime(); 228 | for(int itry = 0; itry < ntries; itry++) { 229 | malloc_k<<>>(d_ptrs, NMALLOCS); 230 | cucheck(cudaGetLastError()); 231 | cucheck(cudaStreamSynchronize(0)); 232 | } 233 | double t2 = omp_get_wtime(); 234 | for(int itry = 0; itry < ntries; itry++) { 235 | free_k<<>>(d_ptrs, NMALLOCS); 236 | cucheck(cudaGetLastError()); 237 | cucheck(cudaStreamSynchronize(0)); 238 | } 239 | double t3 = omp_get_wtime(); 240 | cucheck(cudaFree(d_ptrs)); 241 | double nmallocs = (double)NMALLOCS * NTHREADS2 * ntries; 242 | printf("test 5 (first mallocs, then frees, separate timing):\n"); 243 | printf("test duration: malloc %.2lf ms, free %.2lf ms\n", 244 | (t2 - t1) * 1e3, (t3 - t2) * 1e3); 245 | printf("%.0lf malloc/free pairs in the test\n", nmallocs); 246 | printf("speed: %.2lf Mmallocs/s, %.2lf Mfrees/s\n", 247 | nmallocs / (t2 - t1) * 1e-6, nmallocs / (t3 - t2) * 1e-6); 248 | printf("\n"); 249 | } // run_test5 250 | 251 | int main(int argc, char **argv) { 252 | ha_init(halloc_opts_t(MEMORY)); 253 | //ha_init(halloc_opts_t(1024 * 1024 * 1024)); 254 | run_test0(); 255 | run_test1(); 256 | run_test2(); 257 | run_test3(); 258 | run_test4(); 259 | run_test5(); 260 | ha_shutdown(); 261 | } // main 262 | 
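Note: the tests in test.cu above all follow the same basic halloc usage pattern: initialize the allocator once on the host with ha_init(), call hamalloc()/hafree() from device code, and tear down with ha_shutdown(). The following is a minimal sketch of that pattern distilled from test.cu; it is not part of the repository, the 64 MiB heap size and the 1024x256 launch configuration are illustrative values only, and it assumes halloc.h declares ha_init, halloc_opts_t, hamalloc, hafree and ha_shutdown exactly as test.cu uses them.

/* minimal-halloc-example.cu -- a sketch, not a repository file; API usage
 * mirrors test.cu, sizes and launch configuration are illustrative. */
#include <stdio.h>
#include <stdlib.h>
#include "halloc.h"

/* same CUDA error-checking idiom as the cucheck macro in test.cu */
#define cucheck(call) \
  { \
    cudaError_t err = (call); \
    if(err != cudaSuccess) { \
      fprintf(stderr, "%s (%d): %s\n", __FILE__, __LINE__, \
              cudaGetErrorString(err)); \
      exit(-1); \
    } \
  }

/* each thread allocates a small block, writes to it, then frees it */
__global__ void alloc_free_k(int nbytes) {
  int *p = (int *)hamalloc(nbytes);
  if(p) {
    *p = threadIdx.x;
    hafree(p);
  } else {
    printf("thread %d: allocation failed\n",
           threadIdx.x + blockIdx.x * blockDim.x);
  }
}

int main(void) {
  // give halloc a 64 MiB heap (illustrative size, as in ha_init(halloc_opts_t(MEMORY)))
  ha_init(halloc_opts_t(64 * 1024 * 1024));
  alloc_free_k<<<1024, 256>>>(16);
  cucheck(cudaGetLastError());
  cucheck(cudaDeviceSynchronize());
  ha_shutdown();
  return 0;
}

Like the tests above, such a program needs to be compiled with nvcc for a CC 2.0+ device (device-side printf) and linked against the halloc library.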
-------------------------------------------------------------------------------- /tst/corr/tmp/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | -------------------------------------------------------------------------------- /tst/exp/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.png 5 | *.pdf 6 | -------------------------------------------------------------------------------- /tst/exp/common.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env perl 2 | 3 | 4 | $device = 1; 5 | 6 | # runs, sets exernal variables to data extracted from test run; 7 | # negative values mean that no data has been extracted 8 | sub runtest { 9 | my $test = $_[0]; 10 | shift @_; 11 | $test = "../../perf/bin/phase-$test"; 12 | $args = join ' ', @_; 13 | #print $args; 14 | my @res = `$test -D$device $args`; 15 | shift @res; 16 | #print @res; 17 | # set standard variables to undefined 18 | #throughput 19 | $thru_malloc = -1; 20 | $thru_free = -1; 21 | $thru_pair = -1; 22 | #speed; note that there's no "free speed" 23 | $speed_malloc = -1; 24 | $speed_pair = -1; 25 | #latency: note that there's no pair latency 26 | $lat_malloc_min = -1; 27 | $lat_malloc_max = -1; 28 | $lat_malloc_avg = -1; 29 | $lat_free_min = -1; 30 | $lat_free_max = -1; 31 | $lat_free_avg = -1; 32 | # analyze result lines 33 | 34 | foreach $line (@res) { 35 | my @fields = split ' ', $line; 36 | #print (join ',', @fields); 37 | my $is_malloc = grep /malloc/, @fields; 38 | my $is_free = grep /free/, @fields; 39 | my $is_pair = grep /pair/, @fields; 40 | my $is_thru = grep /throughput/, @fields; 41 | my $is_speed = grep /speed/, @fields; 42 | my $is_lat = grep /latency/, @fields; 43 | my $is_avg = grep /avg/, @fields; 44 | my $is_min = grep /min/, @fields; 45 | my $is_max = grep /max/, @fields; 46 | #print $is_pair, $is_thru, $is_malloc, "\n"; 47 | if($is_thru) { 48 | if($is_malloc) { 49 | $thru_malloc = $fields[2]; 50 | } elsif($is_free) { 51 | $thru_free = $fields[2]; 52 | } elsif($is_pair) { 53 | $thru_pair = $fields[2]; 54 | } 55 | } elsif($is_speed) { 56 | if($is_malloc) { 57 | $speed_malloc = $fields[2]; 58 | } elsif($is_pair) { 59 | $speed_pair = $fields[2]; 60 | } 61 | } elsif($is_lat) { 62 | if($is_malloc) { 63 | if($is_min) { 64 | $lat_malloc_min = $fields[3]; 65 | } elsif($is_max) { 66 | $lat_malloc_max = $fields[3]; 67 | } elsif($is_avg) { 68 | $lat_malloc_avg = $fields[3]; 69 | } 70 | } elsif($is_free) { 71 | if($is_min) { 72 | $lat_free_min = $fields[3]; 73 | } elsif($is_max) { 74 | $lat_free_max = $fields[3]; 75 | } elsif($is_avg) { 76 | $lat_free_avg = $fields[3]; 77 | } 78 | } 79 | } 80 | } # foreach $line 81 | } # sub runtest 82 | -------------------------------------------------------------------------------- /tst/exp/frag-int/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.png 5 | *.pdf 6 | -------------------------------------------------------------------------------- /tst/exp/frag-int/exp-plot.gpl: -------------------------------------------------------------------------------- 1 | # plots results of internal fragmentation experiments 2 | 3 | set terminal pdf enhanced color 4 | set style data lines 5 | 6 | set output "frag-int.pdf" 7 | set xlabel "Allocation size, B" 8 | set ylabel "Average internal fragmentation" 9 | #plot [0:3072][] 10 | plot [0:384][] \ 11 | "exp-log.csv" u 
1:4 t "Average" 12 | # "exp-log.csv" u 1:2 t "Block",\ 13 | # "exp-log.csv" u 1:3 t "Cumulative",\ 14 | -------------------------------------------------------------------------------- /tst/exp/frag-int/exp-run.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env perl 2 | 3 | # data for internal fragmentation plot 4 | 5 | # 2, 3 6 | @alloc_szs = (16, 24, 32, 48, 64, 96, 128, 192, 256, 384, 512, 768, 1024, 1536, 7 | 2048, 3072); 8 | # 2, 3, 5 9 | #@alloc_szs = (16, 24, 32, 40, 48, 64, 80, 96, 128, 160, 192, 256, 320, 384, 512, 10 | # 640, 768, 1024, 1280, 1536, 2048, 2560, 3072); 11 | # 2, 3, 5, 7 12 | #@alloc_szs = (16, 24, 32, 40, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 256, 13 | # 320, 384, 448, 512, 640, 768, 896, 1024, 1280, 1536, 1792, 2048, 14 | # 2560, 3072); 15 | # 2, 3, 5, 7, 9 16 | #@alloc_szs = (16, 24, 32, 40, 48, 56, 64, 72, 80, 96, 112, 128, 144, 160, 192, 224, 256, 17 | # 288, 320, 384, 448, 512, 576, 640, 768, 896, 1024, 1152, 1280, 18 | # 1536, 1792, 2048, 2304, 2560, 3072); 19 | #@alloc_szs = (16, 24, 32, 40, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 20 | # 256, 320, 384, 448, 512, 640, 768, 896, 21 | # 1024, 1280, 1536, 1792, 2048, 2560, 3072); 22 | #@alloc_szs = (16, 24, 32, 40, 48, 56, 64, 80, 96, 112, 128, 144, 160, 176, 23 | # 192, 224, 256, 288, 320, 352, 384, 448, 512, 576, 640, 704, 24 | # 768, 896, 1024, 1280, 1408, 1536, 1792, 2048, 2304, 2560, 3072); 25 | #@alloc_szs = (16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 26 | # 128, 144, 160, 176, 192, 208, 224, 240, 27 | # 256, 288, 320, 352, 384, 416, 448, 480, 28 | # 512, 576, 640, 704, 768, 832, 896, 960, 29 | # 1024, 1152, 1280, 1408, 1536, 1664, 1792, 1920, 30 | # 2048, 2304, 2560, 2816, 3072); 31 | sub find_alloc_sz { 32 | my $sz = $_[0]; 33 | foreach $alloc_sz (@alloc_szs) { 34 | if($alloc_sz >= $sz) { 35 | return $alloc_sz; 36 | } 37 | } 38 | } # find_alloc_sz 39 | 40 | $min_sz = 16; 41 | $max_sz = 3072; 42 | $step_sz = 8; 43 | 44 | $OCSV = 100; 45 | $ofile = "./exp-log.csv"; 46 | open(OCSV, ">", $ofile) || die "cannot open $ofile for writing"; 47 | $oline = "nbytes block_frag cum_frag cum_frag2\n"; 48 | print OCSV $oline; 49 | #print $oline; 50 | print "$#alloc_szs sizes\n"; 51 | 52 | $sum_frag = 0; 53 | $sum_alloc_sz = 0; 54 | $sum_overhead = 0; 55 | $n = 1; 56 | 57 | for($sz = $min_sz; $sz <= $max_sz; $sz += $step_sz) { 58 | $alloc_sz = find_alloc_sz($sz); 59 | $overhead = $alloc_sz - $sz; 60 | $sum_overhead += $overhead; 61 | $sum_alloc_sz += $alloc_sz; 62 | $block_frag = $overhead / $alloc_sz; 63 | $sum_frag += $block_frag; 64 | $cum_frag = $sum_frag / $n; 65 | $cum_frag2 = $sum_overhead / $sum_alloc_sz; 66 | $n++; 67 | $oline = "$sz $block_frag $cum_frag $cum_frag2\n"; 68 | print OCSV $oline; 69 | # print $oline; 70 | } # for($sz) 71 | 72 | close OCSV; 73 | system('gnuplot exp-plot.gpl'); 74 | -------------------------------------------------------------------------------- /tst/exp/halloc-vs-scatter/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.png 5 | *.pdf 6 | -------------------------------------------------------------------------------- /tst/exp/halloc-vs-scatter/exp-log-priv-1.csv: -------------------------------------------------------------------------------- 1 | allocator alloc_sz nallocs ffree thru 2 | halloc 16 4 0.2 982.04 3 | scatter 16 4 0.2 271.01 4 | halloc 16 4 0.21 983.60 5 | scatter 16 4 0.21 269.33 6 | halloc 16 4 0.22 982.17 7 | 
scatter 16 4 0.22 268.03 8 | halloc 16 4 0.23 978.75 9 | scatter 16 4 0.23 266.21 10 | halloc 16 4 0.24 980.30 11 | scatter 16 4 0.24 263.26 12 | halloc 16 4 0.25 977.35 13 | scatter 16 4 0.25 270.15 14 | halloc 16 4 0.26 980.39 15 | scatter 16 4 0.26 271.01 16 | halloc 16 4 0.27 977.82 17 | scatter 16 4 0.27 266.86 18 | halloc 16 4 0.28 973.04 19 | scatter 16 4 0.28 256.02 20 | halloc 16 4 0.29 972.50 21 | scatter 16 4 0.29 11.33 22 | halloc 16 4 0.3 979.25 23 | scatter 16 4 0.3 7.64 24 | halloc 16 4 0.31 970.01 25 | scatter 16 4 0.31 7.29 26 | halloc 16 4 0.32 971.70 27 | scatter 16 4 0.32 6.63 28 | halloc 16 4 0.33 970.19 29 | scatter 16 4 0.33 6.53 30 | halloc 16 4 0.34 972.48 31 | scatter 16 4 0.34 5.96 32 | halloc 64 1 0.2 806.59 33 | scatter 64 1 0.2 190.25 34 | halloc 64 1 0.21 807.06 35 | scatter 64 1 0.21 189.42 36 | halloc 64 1 0.22 808.17 37 | scatter 64 1 0.22 187.81 38 | halloc 64 1 0.23 797.65 39 | scatter 64 1 0.23 63.09 40 | halloc 64 1 0.24 805.14 41 | scatter 64 1 0.24 19.88 42 | halloc 64 1 0.25 805.23 43 | scatter 64 1 0.25 9.88 44 | halloc 64 1 0.26 804.99 45 | scatter 64 1 0.26 9.58 46 | halloc 64 1 0.27 801.62 47 | scatter 64 1 0.27 8.52 48 | halloc 64 1 0.28 800.72 49 | scatter 64 1 0.28 7.69 50 | halloc 64 1 0.29 799.41 51 | scatter 64 1 0.29 5.91 52 | halloc 64 1 0.3 802.26 53 | scatter 64 1 0.3 5.88 54 | halloc 64 1 0.31 799.17 55 | scatter 64 1 0.31 4.85 56 | halloc 64 1 0.32 799.17 57 | scatter 64 1 0.32 4.65 58 | halloc 64 1 0.33 802.99 59 | scatter 64 1 0.33 3.95 60 | halloc 64 1 0.34 802.42 61 | scatter 64 1 0.34 3.67 62 | -------------------------------------------------------------------------------- /tst/exp/halloc-vs-scatter/exp-log-priv-2.csv: -------------------------------------------------------------------------------- 1 | allocator alloc_sz nallocs ffree thru 2 | halloc 16 4 0.2 965.75 3 | scatter 16 4 0.2 269.91 4 | halloc 16 4 0.21 982.14 5 | scatter 16 4 0.21 268.22 6 | halloc 16 4 0.22 981.53 7 | scatter 16 4 0.22 267.53 8 | halloc 16 4 0.23 981.10 9 | scatter 16 4 0.23 266.25 10 | halloc 16 4 0.24 983.11 11 | scatter 16 4 0.24 267.94 12 | halloc 16 4 0.25 976.61 13 | scatter 16 4 0.25 270.16 14 | halloc 16 4 0.26 975.87 15 | scatter 16 4 0.26 270.08 16 | halloc 16 4 0.27 979.27 17 | scatter 16 4 0.27 268.75 18 | halloc 16 4 0.28 975.02 19 | scatter 16 4 0.28 12.77 20 | halloc 16 4 0.29 977.26 21 | scatter 16 4 0.29 8.99 22 | halloc 16 4 0.3 976.81 23 | scatter 16 4 0.3 8.29 24 | halloc 16 4 0.31 972.75 25 | scatter 16 4 0.31 7.46 26 | halloc 16 4 0.32 975.86 27 | scatter 16 4 0.32 6.54 28 | halloc 16 4 0.33 968.50 29 | scatter 16 4 0.33 6.10 30 | halloc 16 4 0.34 975.61 31 | scatter 16 4 0.34 5.73 32 | halloc 64 1 0.2 800.17 33 | scatter 64 1 0.2 190.13 34 | halloc 64 1 0.21 805.85 35 | scatter 64 1 0.21 188.04 36 | halloc 64 1 0.22 808.36 37 | scatter 64 1 0.22 118.72 38 | halloc 64 1 0.23 804.07 39 | scatter 64 1 0.23 41.38 40 | halloc 64 1 0.24 806.42 41 | scatter 64 1 0.24 30.69 42 | halloc 64 1 0.25 803.27 43 | scatter 64 1 0.25 20.10 44 | halloc 64 1 0.26 801.75 45 | scatter 64 1 0.26 9.43 46 | halloc 64 1 0.27 804.59 47 | scatter 64 1 0.27 8.74 48 | halloc 64 1 0.28 804.18 49 | scatter 64 1 0.28 8.61 50 | halloc 64 1 0.29 798.36 51 | scatter 64 1 0.29 6.93 52 | halloc 64 1 0.3 807.24 53 | scatter 64 1 0.3 6.17 54 | halloc 64 1 0.31 801.66 55 | scatter 64 1 0.31 5.29 56 | halloc 64 1 0.32 801.73 57 | scatter 64 1 0.32 4.42 58 | halloc 64 1 0.33 802.09 59 | scatter 64 1 0.33 3.90 60 | halloc 64 1 0.34 804.09 61 | scatter 64 1 0.34 3.81 
62 | -------------------------------------------------------------------------------- /tst/exp/halloc-vs-scatter/exp-log-priv-3.csv: -------------------------------------------------------------------------------- 1 | allocator alloc_sz nallocs ntries thru 2 | halloc 16 4 0.25 929.48 3 | scatter 16 4 0.25 252.56 4 | cuda 16 4 0.25 1.13 5 | halloc 16 4 0.26 932.18 6 | scatter 16 4 0.26 251.80 7 | cuda 16 4 0.26 1.11 8 | halloc 16 4 0.27 934.27 9 | scatter 16 4 0.27 253.12 10 | cuda 16 4 0.27 1.10 11 | halloc 16 4 0.28 927.89 12 | scatter 16 4 0.28 255.17 13 | cuda 16 4 0.28 1.10 14 | halloc 16 4 0.29 923.13 15 | scatter 16 4 0.29 251.95 16 | cuda 16 4 0.29 1.10 17 | halloc 16 4 0.3 931.29 18 | scatter 16 4 0.3 8.72 19 | cuda 16 4 0.3 1.08 20 | halloc 16 4 0.31 924.89 21 | scatter 16 4 0.31 9.54 22 | cuda 16 4 0.31 1.08 23 | halloc 16 4 0.32 929.67 24 | scatter 16 4 0.32 7.07 25 | cuda 16 4 0.32 1.08 26 | halloc 16 4 0.33 928.99 27 | scatter 16 4 0.33 6.33 28 | cuda 16 4 0.33 1.04 29 | halloc 16 4 0.34 930.59 30 | scatter 16 4 0.34 6.00 31 | cuda 16 4 0.34 1.02 32 | halloc 64 1 0.25 755.22 33 | scatter 64 1 0.25 169.06 34 | cuda 64 1 0.25 1.39 35 | halloc 64 1 0.26 761.69 36 | scatter 64 1 0.26 171.18 37 | cuda 64 1 0.26 1.40 38 | halloc 64 1 0.27 760.99 39 | scatter 64 1 0.27 168.87 40 | cuda 64 1 0.27 1.40 41 | halloc 64 1 0.28 758.68 42 | scatter 64 1 0.28 169.78 43 | cuda 64 1 0.28 1.41 44 | halloc 64 1 0.29 756.84 45 | scatter 64 1 0.29 167.94 46 | cuda 64 1 0.29 1.32 47 | halloc 64 1 0.3 756.97 48 | scatter 64 1 0.3 155.65 49 | cuda 64 1 0.3 1.36 50 | halloc 64 1 0.31 759.72 51 | scatter 64 1 0.31 165.55 52 | cuda 64 1 0.31 1.41 53 | halloc 64 1 0.32 761.54 54 | scatter 64 1 0.32 69.02 55 | cuda 64 1 0.32 1.40 56 | halloc 64 1 0.33 755.98 57 | scatter 64 1 0.33 32.53 58 | cuda 64 1 0.33 1.33 59 | halloc 64 1 0.34 759.43 60 | scatter 64 1 0.34 25.85 61 | cuda 64 1 0.34 1.32 62 | -------------------------------------------------------------------------------- /tst/exp/halloc-vs-scatter/exp-log-priv.csv: -------------------------------------------------------------------------------- 1 | allocator alloc_sz nallocs ntries thru 2 | halloc 16 4 65536 1038.58 3 | scatter 16 4 65536 312.89 4 | cuda 16 4 65536 1.22 5 | halloc 16 4 131072 777.01 6 | scatter 16 4 131072 288.14 7 | cuda 16 4 131072 1.21 8 | halloc 16 4 196608 950.07 9 | scatter 16 4 196608 266.85 10 | cuda 16 4 196608 1.15 11 | halloc 16 4 262144 970.49 12 | scatter 16 4 262144 262.20 13 | cuda 16 4 262144 1.13 14 | halloc 16 4 327680 980.67 15 | scatter 16 4 327680 260.94 16 | cuda 16 4 327680 1.10 17 | halloc 16 4 393216 982.32 18 | scatter 16 4 393216 258.07 19 | cuda 16 4 393216 1.11 20 | halloc 16 4 458752 985.25 21 | scatter 16 4 458752 8.09 22 | cuda 16 4 458752 1.09 23 | halloc 16 4 524288 990.09 24 | scatter 16 4 524288 5.23 25 | cuda 16 4 524288 1.09 26 | halloc 16 4 589824 981.62 27 | scatter 16 4 589824 3.81 28 | cuda 16 4 589824 1.06 29 | halloc 16 4 655360 990.06 30 | scatter 16 4 655360 3.15 31 | cuda 16 4 655360 1.09 32 | halloc 64 1 65536 843.04 33 | scatter 64 1 65536 200.29 34 | cuda 64 1 65536 1.35 35 | halloc 64 1 131072 631.26 36 | scatter 64 1 131072 185.52 37 | cuda 64 1 131072 1.42 38 | halloc 64 1 196608 792.66 39 | scatter 64 1 196608 183.51 40 | cuda 64 1 196608 1.42 41 | halloc 64 1 262144 813.74 42 | scatter 64 1 262144 180.21 43 | cuda 64 1 262144 1.40 44 | halloc 64 1 327680 822.22 45 | scatter 64 1 327680 179.32 46 | cuda 64 1 327680 1.37 47 | halloc 64 1 393216 825.92 48 | scatter 64 1 393216 
178.73 49 | cuda 64 1 393216 1.41 50 | halloc 64 1 458752 834.72 51 | scatter 64 1 458752 97.93 52 | cuda 64 1 458752 1.40 53 | halloc 64 1 524288 833.25 54 | scatter 64 1 524288 5.73 55 | cuda 64 1 524288 1.39 56 | halloc 64 1 589824 821.45 57 | scatter 64 1 589824 3.27 58 | cuda 64 1 589824 1.38 59 | halloc 64 1 655360 841.57 60 | scatter 64 1 655360 2.60 61 | cuda 64 1 655360 1.41 62 | -------------------------------------------------------------------------------- /tst/exp/halloc-vs-scatter/exp-log-spree.csv: -------------------------------------------------------------------------------- 1 | allocator alloc_sz nallocs nthreads thru thru_malloc thru_free 2 | halloc 16 4 65536 558.85 1058.06 1166.94 3 | scatter 16 4 65536 180.58 244.43 673.03 4 | cuda 16 4 65536 0.31 0.33 7.29 5 | halloc 16 4 131072 647.81 1181.17 1412.47 6 | scatter 16 4 131072 138.49 166.11 798.98 7 | cuda 16 4 131072 0.31 0.33 7.14 8 | halloc 16 4 196608 658.35 1165.55 1488.63 9 | scatter 16 4 196608 128.39 149.57 863.38 10 | cuda 16 4 196608 0.30 0.31 7.23 11 | halloc 16 4 262144 683.10 1225.46 1519.11 12 | scatter 16 4 262144 102.87 116.73 817.47 13 | cuda 16 4 262144 0.28 0.29 7.27 14 | halloc 16 4 327680 701.69 1268.07 1546.45 15 | scatter 16 4 327680 7.80 7.82 772.93 16 | cuda 16 4 327680 0.27 0.28 7.27 17 | halloc 16 4 393216 716.42 1302.52 1567.44 18 | scatter 16 4 393216 5.62 5.62 777.60 19 | cuda 16 4 393216 0.25 0.26 7.36 20 | halloc 16 4 458752 714.80 1289.26 1579.07 21 | scatter 16 4 458752 2.43 2.42 774.32 22 | cuda 16 4 458752 0.23 0.24 7.19 23 | halloc 16 4 524288 723.18 1306.52 1594.38 24 | scatter 16 4 524288 1.13 1.13 711.65 25 | cuda 16 4 524288 0.22 0.22 7.24 26 | halloc 16 4 589824 740.49 1359.65 1601.15 27 | scatter 16 4 589824 0.98 0.97 714.50 28 | cuda 16 4 589824 0.20 0.20 7.15 29 | halloc 16 4 655360 736.18 1333.57 1617.76 30 | scatter 16 4 655360 0.76 0.75 680.68 31 | cuda 16 4 655360 0.19 0.19 7.18 32 | halloc 64 1 65536 303.24 572.84 634.75 33 | scatter 64 1 65536 105.39 152.90 331.62 34 | cuda 64 1 65536 0.29 0.30 5.90 35 | halloc 64 1 131072 365.00 644.84 827.53 36 | scatter 64 1 131072 61.70 73.22 374.97 37 | cuda 64 1 131072 0.31 0.33 6.66 38 | halloc 64 1 196608 408.28 744.14 890.58 39 | scatter 64 1 196608 47.34 53.84 370.44 40 | cuda 64 1 196608 0.32 0.33 7.02 41 | halloc 64 1 262144 428.33 760.95 964.29 42 | scatter 64 1 262144 48.93 55.29 400.87 43 | cuda 64 1 262144 0.31 0.33 7.08 44 | halloc 64 1 327680 450.79 804.79 1008.55 45 | scatter 64 1 327680 47.26 53.25 395.21 46 | cuda 64 1 327680 0.30 0.31 7.13 47 | halloc 64 1 393216 456.52 804.09 1039.07 48 | scatter 64 1 393216 44.96 50.00 416.80 49 | cuda 64 1 393216 0.29 0.30 7.29 50 | halloc 64 1 458752 467.69 824.36 1063.50 51 | scatter 64 1 458752 3.45 3.46 421.65 52 | cuda 64 1 458752 0.29 0.30 7.32 53 | halloc 64 1 524288 475.88 838.48 1082.67 54 | scatter 64 1 524288 1.34 1.34 378.90 55 | cuda 64 1 524288 0.29 0.30 7.27 56 | halloc 64 1 589824 481.29 847.43 1095.94 57 | scatter 64 1 589824 0.93 0.92 371.99 58 | cuda 64 1 589824 0.28 0.29 7.36 59 | halloc 64 1 655360 496.02 884.87 1110.82 60 | scatter 64 1 655360 0.88 0.87 377.13 61 | cuda 64 1 655360 0.29 0.30 7.35 62 | -------------------------------------------------------------------------------- /tst/exp/halloc-vs-scatter/exp-plot.gpl: -------------------------------------------------------------------------------- 1 | # fixed slab size, varying occupancy 2 | set terminal pdf enhanced color font ",10" 3 | set pointsize 0.65 4 | set style data linespoints 5 | 6 | set 
output "vs-priv.pdf" 7 | set logscale y 2 8 | set xlabel "Allocation fraction" 9 | set ylabel "Throughput, Mops/s" 10 | plot [0.19:0.35][2:4096] \ 11 | "", $ocsv_name) 25 | || die "cannot open file $ocsv_name for writing"; 26 | $oline = "allocator alloc_sz nallocs ntries thru\n"; 27 | print $oline; 28 | print OCSV $oline; 29 | foreach $alloc_sz (16, 64) { 30 | # foreach $alloc_sz (16) { 31 | # for($nallocs = 1; $nallocs < 16; $nallocs++) { 32 | # for($ffree = 0.25; $ffree <= 0.35; $ffree += 0.01) { 33 | for($nthreads = 64 * 1024; $nthreads <= $max_nthreads; 34 | $nthreads += 64 * 1024) { 35 | # for($ntries = 2; $ntries <= 32; $ntries += 2) { 36 | # $falloc = $ffree + $fexec - 0.01; 37 | $falloc = 0.9; 38 | $nallocs = $alloc_sz == 16 ? 4 : 1; 39 | # $nthreads = $max_nthreads; 40 | # foreach $allocator ("halloc", "scatter", "cuda") { 41 | foreach $allocator ("halloc", "scatter", "cuda") { 42 | $args = "-a$allocator -n$nthreads -l$nallocs -s$alloc_sz " . 43 | "-f$falloc -F$ffree -e$fexec"; 44 | # private speed 45 | $niters = 16; 46 | $ntries = $total_niters / $niters; 47 | if($allocator eq "cuda") { 48 | $ntries = 1; 49 | } 50 | runtest("throughput", $common, $args, "-i$niters -t$ntries"); 51 | $oline = "$allocator $alloc_sz $nallocs $nthreads $thru_pair\n"; 52 | # $oline = "$allocator $alloc_sz $nallocs $ffree $thru_pair\n"; 53 | print OCSV $oline; 54 | print $oline; 55 | } 56 | } 57 | } # foreach $alloc_sz 58 | close OCSV; 59 | } # sub priv_test 60 | 61 | # spree test: fractions fixed, nthreads varies 62 | sub spree_test { 63 | $ocsv_name = "./exp-log-spree.csv"; 64 | $OCSV = 100; 65 | $falloc = 0.9; $ffree = 0.2; $fexec = 0.71; 66 | $total_niters = 16; 67 | $common = "-f$falloc -F$ffree -e$fexec -m$memory -g$group"; 68 | open(OCSV, ">", $ocsv_name) 69 | || die "cannot open file $ocsv_name for writing"; 70 | $oline = "allocator alloc_sz nallocs nthreads thru thru_malloc thru_free\n"; 71 | print $oline; 72 | print OCSV $oline; 73 | foreach $alloc_sz (16, 64) { 74 | for($nthreads = 64 * 1024; $nthreads <= $max_nthreads; 75 | $nthreads += 64 * 1024) { 76 | $nallocs = $alloc_sz == 16 ? 4 : 1; 77 | foreach $allocator ("halloc", "scatter", "cuda") { 78 | $args = "-a$allocator -n$nthreads -l$nallocs -s$alloc_sz"; 79 | # private speed 80 | $niters = 1; 81 | $ntries = $total_niters / $niters; 82 | runtest("throughput", $common, $args, "-i$niters -t$ntries"); 83 | $oline = "$allocator $alloc_sz $nallocs $nthreads $thru_pair " 84 | . 
"$thru_malloc $thru_free\n"; 85 | print OCSV $oline; 86 | print $oline; 87 | } 88 | } 89 | } # foreach $alloc_sz 90 | close OCSV; 91 | } # sub spree_test 92 | 93 | # main 94 | priv_test(); 95 | spree_test(); 96 | # run gnuplot 97 | system('gnuplot', './exp-plot.gpl'); 98 | -------------------------------------------------------------------------------- /tst/exp/halloc-vs-scatter/graph.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import numpy as np #numerical stuff 3 | import sys 4 | import os 5 | 6 | import prettyplotlib as ppl # makes nicer colors and generally better to look at graphs 7 | import matplotlib.pyplot as plt 8 | import matplotlib as mpl 9 | from prettyplotlib import brewer2mpl 10 | 11 | def funlink(path): 12 | try: 13 | os.unlink(path) 14 | except: 15 | pass 16 | 17 | # filtering the numpy array for specific sz and l values 18 | def np_filter(data, sz): 19 | nps = data.shape[0] 20 | return np.array([data[i,:] for i in range(nps) if 21 | data[i,0]==sz]) 22 | 23 | inputFileName = "exp-log-priv.csv" 24 | data = np.loadtxt(inputFileName, skiprows=1, usecols=[1,2,3,4]) 25 | 26 | allocators = ['Halloc', 'ScatterAlloc', 'CUDA'] 27 | #allocators = ['Halloc'] 28 | 29 | nps0 = data.shape[0] 30 | fig = plt.figure(figsize=(12,7)) 31 | ax = fig.add_subplot(111) 32 | ax.set_yscale('log') 33 | #ymin = 1 34 | ymin = np.amin(data[:,3]) / 1.5 35 | ymax = np.amax(data[:,3]) * 1.5 36 | #ymin = 0.1 37 | #ymax = 5 * 10**3 38 | for ialloc in range(len(allocators)): 39 | for sz in [16, 64]: 40 | l = 1 41 | if(sz == 16): 42 | l = 4 43 | alloc = allocators[ialloc]; 44 | curData = data[np.array(range(nps0/3))*3 + ialloc, :] 45 | curData = np_filter(curData, sz) 46 | xs = range(curData.shape[0]) 47 | # allocation throughput for different sizes 48 | ppl.plot(ax, xs, curData[:,3], '-o', 49 | label=('%dx%d B %s' % (l,sz,alloc)), linewidth=2) 50 | ax.set_xlabel('#threads, x 1024') 51 | ax.set_ylabel('Throughput, Mops/s') 52 | if(ialloc == len(allocators) - 1 and sz == 64): 53 | ax.set_xticks(xs) 54 | ax.set_xticklabels(['%.0lf' % d for d in curData[:,2] / 1024.0]) 55 | ax.axis(xmin=-1, xmax=len(xs), ymin=ymin, ymax=ymax) 56 | ax.grid(axis='y', color='0.3', linestyle=':', antialiased=True) 57 | 58 | ax.set_title('Private Test Pair Throughput') 59 | ppl.legend(ax, loc=0) 60 | plt.tick_params(axis='both', which='major', direction='in', bottom=True) 61 | outputfilename = 'vs-priv-pair.pdf' 62 | funlink(outputfilename) 63 | fig.savefig(outputfilename, dpi=300, bbox_inches='tight') 64 | 65 | #plt.show() 66 | 67 | 68 | inputFileName = "exp-log-spree.csv" 69 | data = np.loadtxt(inputFileName, skiprows=1, usecols=[1,2,3,4,5,6]) 70 | 71 | allocators = ['Halloc', 'ScatterAlloc', 'CUDA'] 72 | nps0 = data.shape[0] 73 | fig = plt.figure(figsize=(12,7)) 74 | ax = fig.add_subplot(111) 75 | ax.set_yscale('log') 76 | ymin = np.amin(data[:,4]) / 1.5 77 | ymax = np.amax(data[:,4]) * 1.5 78 | for ialloc in range(len(allocators)): 79 | for sz in [16, 64]: 80 | l = 1 81 | if(sz == 16): 82 | l = 4 83 | alloc = allocators[ialloc]; 84 | curData = data[np.array(range(nps0/3))*3 + ialloc, :] 85 | curData = np_filter(curData, sz) 86 | xs = range(curData.shape[0]) 87 | # allocation throughput for different sizes 88 | ppl.plot(ax, xs, curData[:,4], '-o', 89 | label=('%dx%d B %s' % (l,sz,alloc)), linewidth=2) 90 | ax.set_xlabel('#threads, x 1024') 91 | ax.set_ylabel('Throughput, Mops/s') 92 | if(ialloc == len(allocators) - 1 and sz == 64): 93 | ax.set_xticks(xs) 94 | 
ax.set_xticklabels(['%.0lf' % d for d in curData[:,2] / 1024.0]) 95 | ax.axis(xmin=-1, xmax=len(xs), ymin=ymin, ymax=ymax) 96 | ax.grid(axis='y', color='0.3', linestyle=':', antialiased=True) 97 | 98 | ax.set_title('Spree Test malloc() Throughput') 99 | ppl.legend(ax, loc=0) 100 | plt.tick_params(axis='both', which='major', direction='in', bottom=True) 101 | outputfilename = 'vs-spree-malloc.pdf' 102 | funlink(outputfilename) 103 | fig.savefig(outputfilename, dpi=300, bbox_inches='tight') 104 | 105 | #plt.show() 106 | -------------------------------------------------------------------------------- /tst/exp/run-all-exps.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | cd scaling 4 | ./exp-run.pl 5 | cd .. 6 | 7 | cd settings 8 | ./exp-run.pl 9 | cd .. 10 | 11 | cd speed 12 | ./exp-run.pl 13 | cd .. 14 | 15 | cd halloc-vs-scatter 16 | ./exp-run.pl 17 | cd .. 18 | -------------------------------------------------------------------------------- /tst/exp/run-scaling-speed.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | 4 | cd scaling 5 | ./exp-run.pl 6 | #gnuplot ./exp-plot.gpl 7 | cd .. 8 | 9 | cd settings 10 | ./exp-run.pl 11 | #gnuplot ./exp-plot.gpl 12 | cd .. 13 | 14 | cd speed 15 | ./exp-run.pl 16 | #gnuplot ./exp-plot.gpl 17 | cd .. 18 | -------------------------------------------------------------------------------- /tst/exp/scaling/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.png 5 | *.pdf 6 | -------------------------------------------------------------------------------- /tst/exp/scaling/exp-log-lat.csv: -------------------------------------------------------------------------------- 1 | alloc_sz nallocs nthreads malloc_min malloc_avg malloc_max free_min free_avg free_max 2 | 16 1 1 2669.00 3046.49 5233.00 1966.00 2053.32 2606.00 3 | 256 1 1 2664.00 3020.17 5285.00 1970.00 2051.83 2578.00 4 | 16 1 2 2665.00 3034.24 5190.00 1970.00 2075.22 2574.00 5 | 256 1 2 2672.00 3050.47 5385.00 1970.00 2072.81 2542.00 6 | 16 1 4 2705.00 3068.02 5213.00 2042.00 2117.30 2578.00 7 | 256 1 4 2689.00 3048.25 5297.00 2006.00 2085.87 2582.00 8 | 16 1 8 2757.00 3106.10 5337.00 2090.00 2175.38 2602.00 9 | 256 1 8 2685.00 3063.93 5233.00 2010.00 2091.18 2490.00 10 | 16 1 16 2753.00 3170.95 5445.00 2094.00 2218.83 2730.00 11 | 256 1 16 2729.00 3117.48 5329.00 2018.00 2112.97 2594.00 12 | 16 1 32 2897.00 3181.90 4801.00 2182.00 2295.02 2758.00 13 | 256 1 32 2720.00 3056.85 4717.00 2050.00 2155.36 2622.00 14 | 16 1 64 2905.00 3174.13 4585.00 2182.00 2282.11 2682.00 15 | 256 1 64 2713.00 3011.81 4672.00 2046.00 2135.31 2506.00 16 | 16 1 128 2885.00 3151.48 4584.00 2166.00 2269.20 2646.00 17 | 256 1 128 2697.00 2964.06 4572.00 2034.00 2119.09 2586.00 18 | 16 1 256 2885.00 3141.89 4556.00 2166.00 2262.80 2790.00 19 | 256 1 256 2693.00 2948.81 4548.00 2030.00 2123.28 2562.00 20 | 16 1 512 2825.00 3157.58 4636.00 2162.00 2272.25 2958.00 21 | 256 1 512 2689.00 2969.11 4600.00 2030.00 2129.53 2790.00 22 | 16 1 1024 2809.00 3108.37 4576.00 2146.00 2256.25 2946.00 23 | 256 1 1024 2677.00 2950.17 4536.00 2010.00 2111.54 2866.00 24 | 16 1 2048 2796.00 3090.98 4589.00 2126.00 2248.39 2942.00 25 | 256 1 2048 2661.00 2950.79 4501.00 1994.00 2104.90 2910.00 26 | 16 1 4096 2792.00 3133.48 4643.00 2129.00 2278.27 3025.00 27 | 256 1 4096 2650.00 2994.24 4517.00 1994.00 2129.01 2942.00 28 | 16 1 8192 2806.00 3269.68 5114.00 2134.00 
2349.96 3266.00 29 | 256 1 8192 2666.00 3104.65 5758.00 1994.00 2192.73 3172.00 30 | 16 1 16384 2805.00 3993.63 9289.00 2130.00 2522.31 5405.00 31 | 256 1 16384 2665.00 3771.82 25824.00 1994.00 2434.00 5683.00 32 | 16 1 32768 2794.00 3699.55 9389.00 2128.00 2488.10 5508.00 33 | 256 1 32768 2652.00 3631.28 26175.00 1994.00 2362.26 5788.00 34 | 16 1 65536 2796.00 3543.83 9365.00 2126.00 2468.92 5542.00 35 | 256 1 65536 2649.00 12856.61 66737.00 1986.00 2472.33 5755.00 36 | 16 1 131072 2805.00 3488.33 9691.00 2134.00 2463.11 5517.00 37 | 256 1 131072 2676.00 14862.38 124004.00 1990.00 2500.34 6001.00 38 | 16 1 262144 2801.00 3472.84 10822.00 2129.00 2458.63 5435.00 39 | 256 1 262144 2667.00 17748.94 169232.00 1993.00 2556.95 5812.00 40 | 16 1 524288 2797.00 3510.06 10846.00 2130.00 2456.23 5655.00 41 | 256 1 524288 2681.00 30596.38 197553.00 2012.00 3071.61 7347.00 42 | 16 1 1048576 2805.00 3629.24 14415.00 2130.00 2455.04 5298.00 43 | 256 1 1048576 2683.00 42801.76 250025.00 2002.00 3734.92 9445.00 44 | -------------------------------------------------------------------------------- /tst/exp/scaling/exp-log-thru.csv: -------------------------------------------------------------------------------- 1 | alloc_sz nallocs nthreads priv_pair spree_pair spree_malloc spree_free 2 | 16 1 32 3.11 0.61 1.17 1.25 3 | 16 4 32 4.06 1.74 3.27 3.66 4 | 256 1 32 3.21 0.60 1.16 1.24 5 | 16 1 64 5.97 1.19 2.30 2.43 6 | 16 4 64 7.97 3.33 6.28 7.01 7 | 256 1 64 6.32 1.20 2.31 2.47 8 | 16 1 128 11.79 2.40 4.66 4.90 9 | 16 4 128 15.42 6.76 12.75 14.22 10 | 256 1 128 12.63 2.68 5.15 5.51 11 | 16 1 256 23.68 5.37 10.36 11.00 12 | 16 4 256 30.63 14.45 27.15 30.53 13 | 256 1 256 24.64 5.37 10.19 11.20 14 | 16 1 512 44.59 7.66 15.00 15.49 15 | 16 4 512 59.58 22.65 43.36 46.87 16 | 256 1 512 46.30 7.58 14.56 15.62 17 | 16 1 1024 89.53 15.43 30.34 31.04 18 | 16 4 1024 118.87 45.02 86.98 92.29 19 | 256 1 1024 92.89 15.51 30.41 31.31 20 | 16 1 2048 177.10 30.63 60.00 61.87 21 | 16 4 2048 236.00 89.26 172.32 183.08 22 | 256 1 2048 184.24 30.54 59.76 61.75 23 | 16 1 4096 347.55 59.95 119.07 119.41 24 | 16 4 4096 458.70 172.43 335.65 350.60 25 | 256 1 4096 361.17 61.05 120.40 122.48 26 | 16 1 8192 667.23 118.77 234.67 237.82 27 | 16 4 8192 879.54 336.12 650.60 687.53 28 | 256 1 8192 692.55 120.26 237.31 241.12 29 | 16 1 16384 758.85 205.24 392.85 424.84 30 | 16 4 16384 932.55 501.49 928.16 1077.97 31 | 256 1 16384 788.59 190.48 351.94 410.25 32 | 16 1 32768 1012.49 347.03 654.60 729.97 33 | 16 4 32768 1218.44 753.21 1373.81 1647.22 34 | 256 1 32768 621.78 312.41 557.77 701.38 35 | 16 1 65536 1214.44 546.46 1026.08 1155.39 36 | 16 4 65536 1431.13 887.71 1712.50 1822.22 37 | 256 1 65536 454.22 419.25 675.25 1089.95 38 | 16 1 131072 1334.22 677.16 1276.19 1425.88 39 | 16 4 131072 1552.65 1064.33 2039.86 2200.11 40 | 256 1 131072 506.78 530.69 862.80 1359.21 41 | 16 1 262144 1366.82 792.88 1562.20 1592.19 42 | 16 4 262144 1544.41 1151.00 2176.99 2413.93 43 | 256 1 262144 421.84 610.48 998.63 1548.64 44 | 16 1 524288 1380.35 895.23 1767.78 1793.64 45 | 16 4 524288 1457.55 1193.52 2213.27 2559.74 46 | 256 1 524288 268.46 626.00 974.51 1723.85 47 | 16 1 1048576 1366.76 940.35 1818.48 1925.30 48 | 16 4 1048576 1396.75 1220.92 2269.06 2611.85 49 | 256 1 1048576 190.58 557.76 807.69 1770.94 50 | -------------------------------------------------------------------------------- /tst/exp/scaling/exp-plot.gpl: -------------------------------------------------------------------------------- 1 | # fixed slab size, varying occupancy 2 | set 
terminal pdf enhanced color font ",10" 3 | set pointsize 0.65 4 | set style data linespoints 5 | 6 | set output "1x16b-thru.pdf" 7 | set xlabel "#threads" 8 | set logscale x 2 9 | set ylabel "Throughput, Mops/s" 10 | plot [1024:2*1024*1024][0:2000] \ 11 | "", $ocsv_name) 24 | || die "cannot open file $ocsv_name for writing"; 25 | $oline = "alloc_sz nallocs nthreads priv_pair spree_pair spree_malloc spree_free\n"; 26 | print $oline; 27 | print OCSV $oline; 28 | for($nthreads = 32; $nthreads <= 1024 * 1024; $nthreads *= 2) { 29 | foreach $alloc_sz (16, 256) { 30 | foreach $nallocs (1, 4) { 31 | if($nallocs == 4 && $alloc_sz > 64) { 32 | next; 33 | } 34 | # spree test 35 | $ntries = $total_niters; 36 | $args = "-n$nthreads -l$nallocs -s$alloc_sz"; 37 | runtest("throughput", $common, $args, "-i1 -t$ntries"); 38 | $spree_pair = $thru_pair; 39 | $spree_malloc = $thru_malloc; 40 | $spree_free = $thru_free; 41 | # private test 42 | $niters = 32; 43 | $ntries = $total_niters / $niters; 44 | runtest("throughput", $common, $args, "-i$niters -t$ntries"); 45 | $priv_pair = $thru_pair; 46 | $oline = "$alloc_sz $nallocs $nthreads $priv_pair " . 47 | "$spree_pair $spree_malloc $spree_free\n"; 48 | print OCSV $oline; 49 | print $oline; 50 | } # foreach $nallocs 51 | } # foreach $alloc_sz 52 | } # for($nthreads) 53 | close OCSV; 54 | } # sub thru_test 55 | 56 | # latency test 57 | sub lat_test { 58 | $total_niters = 128; 59 | $ocsv_name = "./exp-log-lat.csv"; 60 | $OCSV = 100; 61 | open(OCSV, ">", $ocsv_name) 62 | || die "cannot open file $ocsv_name for writing"; 63 | $oline = "alloc_sz nallocs nthreads malloc_min malloc_avg malloc_max " . 64 | "free_min free_avg free_max\n"; 65 | print $oline; 66 | print OCSV $oline; 67 | for($nthreads = 1; $nthreads <= 1024 * 1024; $nthreads *= 2) { 68 | foreach $alloc_sz (16, 256) { 69 | $nallocs = 1; 70 | #foreach $nallocs (1, 4) { 71 | # if($nallocs == 4 && $alloc_sz > 64) { 72 | # next; 73 | # } 74 | # private test 75 | $niters = 16; 76 | $ntries = $total_niters / $niters; 77 | $args = "-n$nthreads -l$nallocs -s$alloc_sz"; 78 | runtest("latency", $common, $args, "-i$niters -t$ntries"); 79 | $priv_pair = $thru_pair; 80 | $oline = "$alloc_sz $nallocs $nthreads " . 81 | "$lat_malloc_min $lat_malloc_avg $lat_malloc_max " . 
82 | "$lat_free_min $lat_free_avg $lat_free_max\n"; 83 | print OCSV $oline; 84 | print $oline; 85 | #} # foreach $nallocs 86 | } # foreach $alloc_sz 87 | } # for($nthreads) 88 | close OCSV; 89 | } # sub lat_test 90 | 91 | # main 92 | thru_test(); 93 | lat_test(); 94 | # run gnuplot 95 | system('gnuplot', './exp-plot.gpl'); 96 | -------------------------------------------------------------------------------- /tst/exp/scaling/graph.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import numpy as np #numerical stuff 3 | import sys 4 | import os 5 | 6 | import prettyplotlib as ppl # makes nicer colors and generally better to look at graphs 7 | import matplotlib.pyplot as plt 8 | import matplotlib as mpl 9 | from prettyplotlib import brewer2mpl 10 | 11 | def funlink(path): 12 | try: 13 | os.unlink(path) 14 | except: 15 | pass 16 | 17 | inputFileName = "exp-log-thru.csv" 18 | data = np.loadtxt(inputFileName, skiprows=1) 19 | 20 | # filtering the numpy array for specific sz and l values 21 | def np_filter(data, l, sz): 22 | nps = data.shape[0] 23 | return np.array([data[i,:] for i in range(nps) if 24 | data[i,0]==sz and data[i,1]==l and data[i,2]>=1024]) 25 | 26 | for l in [1, 4]: 27 | for sz in [16, 256]: 28 | if(sz == 256 and l == 4): 29 | continue 30 | curData = np_filter(data, l, sz) 31 | xs = range(curData.shape[0]) 32 | 33 | # allocation throughput for different sizes 34 | fig = plt.figure(figsize=(12,7)) 35 | ax = fig.add_subplot(111) 36 | ppl.plot(ax, xs, curData[:,3], '-o', label="Private", linewidth=2) 37 | ppl.plot(ax, xs, curData[:,4], '-o', label="Spree", linewidth=2) 38 | ppl.plot(ax, xs, curData[:,5], '-o', label="Spree malloc", linewidth=2) 39 | ppl.plot(ax, xs, curData[:,6], '-o', label="Spree free", linewidth=2) 40 | ax.set_title("%dx%d B Throughput" % (l,sz)); 41 | ax.set_xlabel("#threads, x 1024") 42 | ax.set_ylabel("Throughput, Mops/s") 43 | ax.set_xticks(xs) 44 | ax.set_xticklabels(['%.0lf' % d for d in curData[:,2] / 1024]) 45 | ax.axis(xmin=-1, xmax=len(xs), ymin=0) 46 | ax.grid(axis='y', color='0.3', linestyle=':', antialiased=True) 47 | ppl.legend(ax, loc=0) 48 | plt.tick_params(axis='both', which='major', direction='in', bottom=True) 49 | outputfilename = '%dx%db-thru.pdf' % (l,sz) 50 | funlink(outputfilename) 51 | fig.savefig(outputfilename, dpi=300, bbox_inches='tight') 52 | 53 | 54 | inputFileName = "exp-log-lat.csv" 55 | data = np.loadtxt(inputFileName, skiprows=1) 56 | l = 1 57 | for sz in [16, 256]: 58 | for iaction in [0, 1]: 59 | actions = ['Malloc', 'Free'] 60 | action = actions[iaction] 61 | curData = np_filter(data, l, sz) 62 | xs = range(curData.shape[0]) 63 | # allocation throughput for different sizes 64 | fig = plt.figure(figsize=(12,7)) 65 | ax = fig.add_subplot(111) 66 | divd=0.732*1000 67 | ppl.plot(ax, xs, curData[:,3+3*iaction]/divd, '-o', label="Min", linewidth=2) 68 | ppl.plot(ax, xs, curData[:,4+3*iaction]/divd, '-o', label="Avg", linewidth=2) 69 | ppl.plot(ax, xs, curData[:,5+3*iaction]/divd, '-o', label="Max", linewidth=2) 70 | ax.set_title("%s Latency" % (action)); 71 | ax.set_xlabel("#threads, x 1024") 72 | ax.set_ylabel("Latency, us") 73 | ax.set_xticks(xs) 74 | ax.set_xticklabels(['%.0lf' % d for d in curData[:,2] / 1024]) 75 | ax.set_yscale('log') 76 | ax.axis(xmin=-1, xmax=len(xs), ymin=1) 77 | ax.grid(axis='y', color='0.3', linestyle=':', antialiased=True) 78 | ppl.legend(ax, loc=0) 79 | plt.tick_params(axis='both', which='major', direction='in', bottom=True) 80 | 
outputfilename = '1x%dB-lat-%s.pdf' % (sz,action.lower()) 81 | funlink(outputfilename) 82 | fig.savefig(outputfilename, dpi=300, bbox_inches='tight') 83 | -------------------------------------------------------------------------------- /tst/exp/settings/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.png 5 | *.pdf 6 | -------------------------------------------------------------------------------- /tst/exp/settings/exp-log.csv: -------------------------------------------------------------------------------- 1 | slab_size busy nallocs alloc_sz throughput speed 2 | 1 0.745 4 16 879.20 13.10 3 | 1 0.745 1 256 78.70 18.76 4 | 1 0.755 4 16 840.40 12.52 5 | 1 0.755 1 256 77.48 18.47 6 | 1 0.765 4 16 812.89 12.11 7 | 1 0.765 1 256 76.54 18.25 8 | 1 0.775 4 16 776.14 11.57 9 | 1 0.775 1 256 75.02 17.89 10 | 1 0.785 4 16 744.02 11.09 11 | 1 0.785 1 256 73.82 17.60 12 | 1 0.795 4 16 732.39 10.91 13 | 1 0.795 1 256 72.82 17.36 14 | 1 0.805 4 16 714.30 10.64 15 | 1 0.805 1 256 71.24 16.98 16 | 1 0.815 4 16 665.75 9.92 17 | 1 0.815 1 256 70.58 16.83 18 | 1 0.825 4 16 613.18 9.14 19 | 1 0.825 1 256 69.28 16.52 20 | 1 0.835 4 16 612.29 9.12 21 | 1 0.835 1 256 68.53 16.34 22 | 1 0.845 4 16 598.90 8.92 23 | 1 0.845 1 256 67.58 16.11 24 | 1 0.855 4 16 575.33 8.57 25 | 1 0.855 1 256 66.76 15.92 26 | 1 0.865 4 16 542.84 8.09 27 | 1 0.865 1 256 65.74 15.67 28 | 1 0.875 4 16 565.86 8.43 29 | 1 0.875 1 256 64.81 15.45 30 | 1 0.885 4 16 541.23 8.06 31 | 1 0.885 1 256 63.81 15.21 32 | 1 0.895 4 16 521.58 7.77 33 | 1 0.895 1 256 62.99 15.02 34 | 1 0.905 4 16 544.74 8.12 35 | 1 0.905 1 256 62.01 14.79 36 | 1 0.915 4 16 510.70 7.61 37 | 1 0.915 1 256 60.87 14.51 38 | 1 0.925 4 16 438.11 6.53 39 | 1 0.925 1 256 59.94 14.29 40 | 1 0.935 4 16 471.63 7.03 41 | 1 0.935 1 256 59.21 14.12 42 | 1 0.945 4 16 409.59 6.10 43 | 1 0.945 1 256 57.93 13.81 44 | 2 0.745 4 16 1443.03 21.50 45 | 2 0.745 1 256 156.03 37.20 46 | 2 0.755 4 16 1445.88 21.55 47 | 2 0.755 1 256 153.99 36.71 48 | 2 0.765 4 16 1442.89 21.50 49 | 2 0.765 1 256 153.37 36.57 50 | 2 0.775 4 16 1432.73 21.35 51 | 2 0.775 1 256 151.29 36.07 52 | 2 0.785 4 16 1425.57 21.24 53 | 2 0.785 1 256 149.80 35.72 54 | 2 0.795 4 16 1421.52 21.18 55 | 2 0.795 1 256 147.74 35.22 56 | 2 0.805 4 16 1421.90 21.19 57 | 2 0.805 1 256 145.90 34.79 58 | 2 0.815 4 16 1417.70 21.13 59 | 2 0.815 1 256 143.77 34.28 60 | 2 0.825 4 16 1367.36 20.38 61 | 2 0.825 1 256 142.19 33.90 62 | 2 0.835 4 16 1366.65 20.36 63 | 2 0.835 1 256 140.04 33.39 64 | 2 0.845 4 16 1304.42 19.44 65 | 2 0.845 1 256 138.25 32.96 66 | 2 0.855 4 16 1313.96 19.58 67 | 2 0.855 1 256 135.85 32.39 68 | 2 0.865 4 16 1279.31 19.06 69 | 2 0.865 1 256 133.39 31.80 70 | 2 0.875 4 16 1210.28 18.03 71 | 2 0.875 1 256 131.32 31.31 72 | 2 0.885 4 16 1142.57 17.03 73 | 2 0.885 1 256 129.17 30.80 74 | 2 0.895 4 16 1110.86 16.55 75 | 2 0.895 1 256 125.69 29.97 76 | 2 0.905 4 16 1037.18 15.46 77 | 2 0.905 1 256 123.12 29.35 78 | 2 0.915 4 16 964.39 14.37 79 | 2 0.915 1 256 119.22 28.42 80 | 2 0.925 4 16 908.65 13.54 81 | 2 0.925 1 256 116.32 27.73 82 | 2 0.935 4 16 843.35 12.57 83 | 2 0.935 1 256 114.11 27.21 84 | 2 0.945 4 16 798.92 11.90 85 | 2 0.945 1 256 112.85 26.91 86 | 4 0.745 4 16 1441.52 21.48 87 | 4 0.745 1 256 348.23 83.02 88 | 4 0.755 4 16 1420.42 21.17 89 | 4 0.755 1 256 339.54 80.95 90 | 4 0.765 4 16 1427.30 21.27 91 | 4 0.765 1 256 343.84 81.98 92 | 4 0.775 4 16 1463.45 21.81 93 | 4 0.775 1 256 338.31 80.66 94 | 4 0.785 4 16 1465.73 21.84 95 | 4 
0.785 1 256 339.53 80.95 96 | 4 0.795 4 16 1442.23 21.49 97 | 4 0.795 1 256 323.45 77.12 98 | 4 0.805 4 16 1402.25 20.90 99 | 4 0.805 1 256 333.15 79.43 100 | 4 0.815 4 16 1383.01 20.61 101 | 4 0.815 1 256 329.53 78.57 102 | 4 0.825 4 16 1344.60 20.04 103 | 4 0.825 1 256 329.00 78.44 104 | 4 0.835 4 16 1369.94 20.41 105 | 4 0.835 1 256 319.74 76.23 106 | 4 0.845 4 16 1407.73 20.98 107 | 4 0.845 1 256 332.08 79.17 108 | 4 0.855 4 16 1428.62 21.29 109 | 4 0.855 1 256 323.09 77.03 110 | 4 0.865 4 16 1361.36 20.29 111 | 4 0.865 1 256 319.26 76.12 112 | 4 0.875 4 16 1335.62 19.90 113 | 4 0.875 1 256 317.13 75.61 114 | 4 0.885 4 16 1274.91 19.00 115 | 4 0.885 1 256 309.58 73.81 116 | 4 0.895 4 16 1253.91 18.68 117 | 4 0.895 1 256 310.67 74.07 118 | 4 0.905 4 16 1309.75 19.52 119 | 4 0.905 1 256 300.99 71.76 120 | 4 0.915 4 16 1287.82 19.19 121 | 4 0.915 1 256 294.47 70.21 122 | 4 0.925 4 16 1238.84 18.46 123 | 4 0.925 1 256 301.45 71.87 124 | 4 0.935 4 16 1086.71 16.19 125 | 4 0.935 1 256 280.21 66.81 126 | 4 0.945 4 16 936.88 13.96 127 | 4 0.945 1 256 291.00 69.38 128 | 8 0.745 4 16 1443.02 21.50 129 | 8 0.745 1 256 909.74 216.90 130 | 8 0.755 4 16 1441.99 21.49 131 | 8 0.755 1 256 889.23 212.01 132 | 8 0.765 4 16 1445.51 21.54 133 | 8 0.765 1 256 874.22 208.43 134 | 8 0.775 4 16 1448.12 21.58 135 | 8 0.775 1 256 788.44 187.98 136 | 8 0.785 4 16 1451.03 21.62 137 | 8 0.785 1 256 830.44 197.99 138 | 8 0.795 4 16 1457.40 21.72 139 | 8 0.795 1 256 838.23 199.85 140 | 8 0.805 4 16 1459.32 21.75 141 | 8 0.805 1 256 833.81 198.80 142 | 8 0.815 4 16 1457.17 21.71 143 | 8 0.815 1 256 817.50 194.91 144 | 8 0.825 4 16 1453.37 21.66 145 | 8 0.825 1 256 812.52 193.72 146 | 8 0.835 4 16 1443.71 21.51 147 | 8 0.835 1 256 814.79 194.26 148 | 8 0.845 4 16 1315.38 19.60 149 | 8 0.845 1 256 809.40 192.98 150 | 8 0.855 4 16 1235.75 18.41 151 | 8 0.855 1 256 800.73 190.91 152 | 8 0.865 4 16 1227.53 18.29 153 | 8 0.865 1 256 782.43 186.55 154 | 8 0.875 4 16 1193.70 17.79 155 | 8 0.875 1 256 776.16 185.05 156 | 8 0.885 4 16 1237.99 18.45 157 | 8 0.885 1 256 757.69 180.65 158 | 8 0.895 4 16 1266.08 18.87 159 | 8 0.895 1 256 743.97 177.38 160 | 8 0.905 4 16 1277.63 19.04 161 | 8 0.905 1 256 719.34 171.50 162 | 8 0.915 4 16 1290.66 19.23 163 | 8 0.915 1 256 729.92 174.03 164 | 8 0.925 4 16 1273.92 18.98 165 | 8 0.925 1 256 672.34 160.30 166 | 8 0.935 4 16 1235.11 18.40 167 | 8 0.935 1 256 672.28 160.28 168 | 8 0.945 4 16 1176.06 17.52 169 | 8 0.945 1 256 661.98 157.83 170 | -------------------------------------------------------------------------------- /tst/exp/settings/exp-plot.gpl: -------------------------------------------------------------------------------- 1 | # fixed slab size, varying occupancy 2 | set terminal pdf enhanced color font ",10" 3 | set pointsize 0.65 4 | set style data linespoints 5 | 6 | set output "var-occup.pdf" 7 | set xlabel "Busy threshold" 8 | set ylabel "Speed, GiB/s" 9 | plot [][0:140] \ 10 | "", $ocsv_name) 23 | || die "cannot open file $ocsv_name for writing"; 24 | $oline = "slab_size busy nallocs alloc_sz throughput speed\n"; 25 | print OCSV $oline; 26 | print $oline; 27 | foreach $slab_size (20, 21, 22, 23) { 28 | #foreach $slab_size (22) { 29 | # foreach $busy (0.75, 0.835, 0.9, 0.95) { 30 | # foreach $busy (0.835) { 31 | for($busy = 0.745; $busy <= 0.955; $busy += 0.01) { 32 | foreach $alloc_sz (16, 256) { 33 | # foreach $alloc_sz (16) { 34 | my $nallocs = $alloc_sz == 16 ? 
4 : 1; 35 | runtest("throughput", $common, "-b$slab_size", "-B$busy", 36 | "-s$alloc_sz", "-l$nallocs"); 37 | my $slab_sz = 2 ** ($slab_size - 20); 38 | $oline = 39 | "$slab_sz $busy $nallocs $alloc_sz $thru_pair $speed_pair\n"; 40 | print OCSV $oline; 41 | print $oline; 42 | } # foreach alloc_sz 43 | } # foreach busy 44 | } # foreach slab_size 45 | 46 | close OCSV; 47 | # run gnuplot 48 | system('gnuplot', './exp-plot.gpl'); 49 | -------------------------------------------------------------------------------- /tst/exp/speed/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.png 5 | *.pdf 6 | -------------------------------------------------------------------------------- /tst/exp/speed/exp-log-combi.csv: -------------------------------------------------------------------------------- 1 | block priv_speed priv_thru spree_speed spree_speed_malloc speed_thru speed_thru_malloc 2 | 8..32 25.17 1351.07 21.56 38.48 1157.69 2065.96 3 | 8..64 32.00 954.48 37.15 65.65 1108.01 1958.13 4 | 8..256 113.98 927.18 97.75 177.29 795.18 1442.11 5 | 8..3072 93.01 64.85 126.69 137.21 88.33 95.66 6 | -------------------------------------------------------------------------------- /tst/exp/speed/exp-log-single.csv: -------------------------------------------------------------------------------- 1 | block priv_speed priv_thru spree_speed spree_speed_malloc speed_thru speed_thru_malloc 2 | 16 20.89 1401.70 18.18 33.79 1220.15 2267.83 3 | 24 25.11 1123.53 25.83 46.64 1155.82 2086.62 4 | 32 41.83 1403.64 35.65 65.45 1196.23 2196.17 5 | 48 22.70 507.83 47.93 84.00 1072.26 1879.02 6 | 64 23.26 390.20 59.86 102.81 1004.20 1724.82 7 | 96 36.88 412.49 67.94 106.74 759.91 1193.91 8 | 128 35.26 295.80 82.85 131.64 695.02 1104.25 9 | 192 53.44 298.88 53.32 66.25 298.19 370.48 10 | 256 46.47 194.92 60.05 70.94 251.85 297.56 11 | 384 47.71 133.39 79.04 92.03 221.00 257.33 12 | 512 60.11 126.06 89.41 102.16 187.50 214.25 13 | 768 66.74 93.31 78.01 85.12 109.07 119.01 14 | 1024 68.13 71.44 92.00 99.52 96.47 104.36 15 | 1536 65.66 45.90 82.36 86.91 57.57 60.76 16 | 2048 60.71 31.83 100.74 106.33 52.82 55.75 17 | 3072 56.30 19.68 85.90 89.04 30.02 31.12 18 | -------------------------------------------------------------------------------- /tst/exp/speed/exp-plot.gpl: -------------------------------------------------------------------------------- 1 | # fixed slab size, varying occupancy 2 | set terminal pdf enhanced color font ",10" 3 | set pointsize 0.65 4 | set style data linespoints 5 | 6 | set output "single-speed.pdf" 7 | set xlabel "Allocation size, B" 8 | set logscale x 2 9 | set ylabel "Speed, GiB/s" 10 | plot [12:4096][0:120] \ 11 | "exp-log-single.csv" u 1:2 title "Private",\ 12 | "exp-log-single.csv" u 1:4 title "Spree",\ 13 | "exp-log-single.csv" u 1:5 title "Spree malloc" 14 | 15 | set output "single-thru.pdf" 16 | set xlabel "Allocation size, B" 17 | set logscale xy 2 18 | set ylabel "Throughput, Mops/s" 19 | plot [12:4096][10:2000] \ 20 | "exp-log-single.csv" u 1:3 title "Private",\ 21 | "exp-log-single.csv" u 1:6 title "Spree",\ 22 | "exp-log-single.csv" u 1:7 title "Spree malloc" 23 | 24 | set style data boxes 25 | set boxwidth 0.225 26 | set style fill solid 0.6 27 | 28 | set output "combi-speed.pdf" 29 | set xlabel "Allocation size range, B" 30 | unset logscale xy 31 | set ylabel "Speed, GiB/s" 32 | plot [][0:160] \ 33 | "exp-log-combi.csv" u 0:2 title "Private",\ 34 | "exp-log-combi.csv" u ($0+0.25):4:xticlabels(1) title "Spree",\ 35 | 
"exp-log-combi.csv" u ($0+0.5):5 title "Spree malloc" 36 | 37 | set output "combi-thru.pdf" 38 | set xlabel "Allocation size range, B" 39 | set logscale y 2 40 | set ylabel "Throughput, Mops/s" 41 | plot [][16:2048] \ 42 | "exp-log-combi.csv" u 0:3 title "Private",\ 43 | "exp-log-combi.csv" u ($0+0.25):6:xticlabels(1) title "Spree",\ 44 | "exp-log-combi.csv" u ($0+0.5):7 title "Spree malloc" 45 | -------------------------------------------------------------------------------- /tst/exp/speed/exp-run.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env perl 2 | 3 | # a script to perform experiment with settings 4 | use POSIX; 5 | 6 | #include common functions 7 | do '../common.pl'; 8 | 9 | $falloc = 0.95; 10 | $ffree = 0.05; 11 | $fexec = 0.91; 12 | $memory = 512 * 1024 * 1024; 13 | $group = 5; 14 | $max_nthreads = 1024 * 1024; 15 | $mem_fraction = 0.4; 16 | $total_niters = 512; 17 | $common = "-f$falloc -F$ffree -e$fexec -m$memory -g$group"; 18 | 19 | # running a speed test 20 | sub run_speedtest { 21 | # spree speed 22 | $ntries = $total_niters; 23 | runtest("throughput", $common, $_[0], "-i1 -t$ntries"); 24 | $spree_speed = $speed_pair; 25 | $spree_speed_malloc = $speed_malloc; 26 | $spree_thru = $thru_pair; 27 | $spree_thru_malloc = $thru_malloc; 28 | # private speed 29 | $niters = 32; 30 | $ntries = $total_niters / $niters; 31 | runtest("throughput", $common, $_[0], "-i$niters -t$ntries"); 32 | $priv_speed = $speed_pair; 33 | $priv_thru = $thru_pair; 34 | } # run_speedtest 35 | 36 | # single-size test 37 | sub single_size { 38 | @alloc_szs = (16, 24, 32, 48, 64, 96, 128, 192, 256, 384, 512, 768, 1024, 1536, 39 | 2048, 3072); 40 | $ocsv_name = "./exp-log-single.csv"; 41 | $OCSV = 100; 42 | open(OCSV, ">", $ocsv_name) 43 | || die "cannot open file $ocsv_name for writing"; 44 | $oline = "block priv_speed priv_thru spree_speed spree_speed_malloc " . 45 | "speed_thru speed_thru_malloc\n"; 46 | print $oline; 47 | print OCSV $oline; 48 | foreach $alloc_sz (@alloc_szs) { 49 | $nallocs = 1; 50 | if($alloc_sz <= 64) { 51 | $nallocs = 4; 52 | } elsif($alloc_sz <= 128) { 53 | $nallocs = 2; 54 | } 55 | $nthreads = floor($mem_fraction * $memory / ($alloc_sz * $nallocs)); 56 | $nthreads = $nthreads > $max_nthreads ? $max_nthreads : $nthreads; 57 | run_speedtest("-n$nthreads -l$nallocs -s$alloc_sz"); 58 | $oline = "$alloc_sz $priv_speed $priv_thru " . 59 | "$spree_speed $spree_speed_malloc $spree_thru $spree_thru_malloc\n"; 60 | print OCSV $oline; 61 | print $oline; 62 | } # foreach $alloc_sz 63 | 64 | close OCSV; 65 | } # sub single_size 66 | 67 | # combined-size tests 68 | sub combi_size { 69 | @min_alloc_szs = (8, 8, 8, 8); 70 | @max_alloc_szs = (32, 64, 256, 3072); 71 | $ocsv_name = "./exp-log-combi.csv"; 72 | $OCSV = 100; 73 | open(OCSV, ">", $ocsv_name) 74 | || die "cannot open file $ocsv_name for writing"; 75 | $oline = "block priv_speed priv_thru spree_speed spree_speed_malloc " . 76 | "speed_thru speed_thru_malloc\n"; 77 | print $oline; 78 | print OCSV $oline; 79 | for($isz = 0; $isz < @min_alloc_szs; $isz++) { 80 | $min_sz = $min_alloc_szs[$isz]; 81 | $max_sz = $max_alloc_szs[$isz]; 82 | $nallocs = 1; 83 | if($max_sz <= 64) { 84 | $nallocs = 4; 85 | } 86 | $avg_sz = ($min_sz + $max_sz) / 2; 87 | $nthreads = floor($mem_fraction * $memory / ($avg_sz * $nallocs)); 88 | $nthreads = $nthreads > $max_nthreads ? 
$max_nthreads : $nthreads; 89 | run_speedtest("-n$nthreads -l$nallocs -s$min_sz -S$max_sz"); 90 | $oline = "$min_sz..$max_sz $priv_speed $priv_thru " . 91 | "$spree_speed $spree_speed_malloc $spree_thru $spree_thru_malloc\n"; 92 | print OCSV $oline; 93 | print $oline; 94 | } # foreach $alloc_sz 95 | 96 | close OCSV; 97 | } # sub combi_size 98 | 99 | 100 | # main 101 | single_size(); 102 | combi_size(); 103 | # run gnuplot 104 | system('gnuplot', './exp-plot.gpl'); 105 | -------------------------------------------------------------------------------- /tst/exp/speed/graph.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import numpy as np #numerical stuff 3 | import sys 4 | import os 5 | 6 | import prettyplotlib as ppl # makes nicer colors and generally better to look at graphs 7 | import matplotlib.pyplot as plt 8 | import matplotlib as mpl 9 | from prettyplotlib import brewer2mpl 10 | 11 | def funlink(path): 12 | try: 13 | os.unlink(path) 14 | except: 15 | pass 16 | 17 | # change font to Open Sans (has some kerning issues, though) 18 | #mpl.rcParams.update({'font.family':'Open Sans'}) 19 | 20 | inputFileName = "exp-log-single.csv" 21 | 22 | data = np.loadtxt(inputFileName, skiprows=1) 23 | xs = range(data.shape[0]) 24 | 25 | # allocation speed for different sizes 26 | fig = plt.figure(figsize=(12,7)) 27 | ax = fig.add_subplot(111) 28 | ppl.plot(ax, xs, data[:,1], '-o', label="Private", linewidth=2) 29 | ppl.plot(ax, xs, data[:,3], '-o', label="Spree", linewidth=2) 30 | ppl.plot(ax, xs, data[:,4], '-o', label="Spree malloc", linewidth=2) 31 | ax.set_title("Allocation Speed for Different Sizes"); 32 | ax.set_xlabel("Allocation size, B") 33 | ax.set_ylabel("Speed, GiB/s") 34 | ax.set_xticks(xs) 35 | ax.set_xticklabels(['%.0lf' % d for d in data[:, 0]]) 36 | ax.axis(xmin=-1, ymin=0) 37 | ax.grid(axis='y', color='0.3', linestyle=':', antialiased=True) 38 | ppl.legend(ax, loc=0) 39 | plt.tick_params(axis='both', which='major', direction='in', bottom=True) 40 | outputfilename = 'single-speed.pdf' 41 | funlink(outputfilename) 42 | fig.savefig(outputfilename, dpi=300, bbox_inches='tight') 43 | 44 | # allocation throughput for different sizes 45 | fig = plt.figure(figsize=(12,7)) 46 | ax = fig.add_subplot(111) 47 | ppl.plot(ax, xs, data[:,2], '-o', label="Private", linewidth=2) 48 | ppl.plot(ax, xs, data[:,5], '-o', label="Spree", linewidth=2) 49 | ppl.plot(ax, xs, data[:,6], '-o', label="Spree malloc", linewidth=2) 50 | ax.set_title("Allocation Throughput for Different Sizes"); 51 | ax.set_xlabel("Allocation size, B") 52 | ax.set_ylabel("Throughput, Mop/s") 53 | ax.set_yscale('log') 54 | ax.set_xticks(xs) 55 | ax.set_xticklabels(['%.0lf' % d for d in data[:, 0]]) 56 | ax.axis(xmin=-1, ymin=1) 57 | ax.grid(axis='y', color='0.3', linestyle=':', antialiased=True) 58 | ppl.legend(ax, loc=0) 59 | plt.tick_params(axis='both', which='major', direction='in', bottom=True) 60 | outputfilename = 'single-thru.pdf' 61 | funlink(outputfilename) 62 | fig.savefig(outputfilename, dpi=300, bbox_inches='tight') 63 | 64 | # size combinations 65 | data = np.loadtxt('exp-log-combi.csv', skiprows=1, usecols=[1, 2, 3, 4, 5, 6]) 66 | labels = ['8..32', '8..64', '8..256', '8..3072'] 67 | xs = np.array(range(data.shape[0])) * 2 68 | step=0.3 69 | width=0.25 70 | 71 | # allocation speed for different size combinations 72 | fig = plt.figure(figsize=(12,7)) 73 | ax = fig.add_subplot(111) 74 | ppl.bar(ax, xs, data[:,0], color='b', width=width, label="Private") 75 
| ppl.bar(ax, xs + step, data[:,2], color='g', width=width, label="Spree") 76 | ppl.bar(ax, xs + 2*step, data[:,3], color='r', width=width, label="Spree malloc") 77 | ax.set_title("Allocation Speed for Combinations of Sizes"); 78 | ax.set_xlabel("Allocation size, B") 79 | ax.set_ylabel("Speed, GiB/s") 80 | ax.set_xticks(xs + 0.45) 81 | ax.set_xticklabels(labels) 82 | ax.axis(xmin=-0.5, ymin=1) 83 | ax.grid(axis='y', color='0.3', linestyle=':', antialiased=True) 84 | ppl.legend(ax, loc=0) 85 | outputfilename = 'combi-speed.pdf' 86 | funlink(outputfilename) 87 | fig.savefig(outputfilename, dpi=300, bbox_inches='tight') 88 | 89 | # allocation throughput for different size combinations 90 | fig = plt.figure(figsize=(12,7)) 91 | ax = fig.add_subplot(111) 92 | ppl.bar(ax, xs, data[:,1], color='b', width=width, label="Private") 93 | ppl.bar(ax, xs + step, data[:,4], color='g', width=width, label="Spree") 94 | ppl.bar(ax, xs + 2*step, data[:,5], color='r', width=width, label="Spree malloc") 95 | ax.set_title("Allocation Throughput for Combinations of Sizes"); 96 | ax.set_xlabel("Allocation size, B") 97 | ax.set_ylabel("Throughput, Mop/s") 98 | ax.set_xticks(xs + 0.45) 99 | ax.set_xticklabels(labels) 100 | ax.axis(xmin=-0.5, ymin=1) 101 | ax.grid(axis='y', color='0.3', linestyle=':', antialiased=True) 102 | ppl.legend(ax, loc=0) 103 | outputfilename = 'combi-thru.pdf' 104 | funlink(outputfilename) 105 | fig.savefig(outputfilename, dpi=300, bbox_inches='tight') 106 | 107 | #plt.show() 108 | -------------------------------------------------------------------------------- /tst/include/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.o 5 | libscatteralloc.a 6 | scatter-alloc.h 7 | -------------------------------------------------------------------------------- /tst/include/halloc.h: -------------------------------------------------------------------------------- 1 | ../../src/halloc.h -------------------------------------------------------------------------------- /tst/perf/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.o 5 | -------------------------------------------------------------------------------- /tst/perf/bin/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | -------------------------------------------------------------------------------- /tst/perf/latency/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.o 5 | -------------------------------------------------------------------------------- /tst/perf/latency/latency.cu: -------------------------------------------------------------------------------- 1 | /** @file latency.cu latency test for various memory allocators */ 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | /** measures malloc latencies; note that latencies are averaged per-thread, 14 | per-allocation latencies are not preserved; latencies here are measured in cycles */ 15 | template 16 | __global__ void latency_malloc_k 17 | (CommonOpts opts, void **ptrs, double *latencies) { 18 | int n = opts.nthreads, i = threadIdx.x + blockIdx.x * blockDim.x; 19 | if(opts.is_thread_inactive(i)) 20 | return; 21 | for(int ialloc = 0; ialloc < opts.nallocs; ialloc++) { 22 | uint sz = opts.next_alloc_sz(); 23 | uint64 t1 = clock64(); 24 | ptrs[i + n * ialloc] 
= T::malloc(sz); 25 | uint64 t2 = clock64(), latency = t2 - t1; 26 | latencies[i + ialloc * n] = (double)latency; 27 | } 28 | } // latency_malloc_k 29 | 30 | // TODO: verify that all pointers are non-zero 31 | 32 | template 33 | __global__ void latency_free_k 34 | (CommonOpts opts, void **ptrs, double *latencies) { 35 | int n = opts.nthreads, i = threadIdx.x + blockIdx.x * blockDim.x; 36 | if(opts.is_thread_inactive(i)) 37 | return; 38 | for(int ialloc = 0; ialloc < opts.nallocs; ialloc++) { 39 | uint64 t1 = clock64(); 40 | T::free(ptrs[i + n * ialloc]); 41 | uint64 t2 = clock64(), latency = t2 - t1; 42 | latencies[i + ialloc * n] = (double)latency; 43 | } 44 | } // latency_free_k 45 | 46 | template class LatencyTest { 47 | 48 | public: 49 | void operator()(CommonOpts opts, bool warmup) { 50 | opts.niters = 1; 51 | // allocate memory 52 | if(warmup) { 53 | opts.nthreads = min(4 * opts.bs, opts.nthreads); 54 | opts.ntries = 1; 55 | } 56 | if(!warmup) 57 | printf("latency test\n"); 58 | int n = opts.nthreads, bs = opts.bs, grid = divup(n, bs); 59 | int nptrs = n * opts.nallocs; 60 | size_t ptrs_sz = nptrs * sizeof(void *); 61 | size_t lat_sz = nptrs * sizeof(double); 62 | void **d_ptrs; 63 | cucheck(cudaMalloc((void **)&d_ptrs, ptrs_sz)); 64 | cucheck(cudaMemset(d_ptrs, 0, ptrs_sz)); 65 | double *h_malloc_latencies = (double *)malloc(lat_sz); 66 | double *h_free_latencies = (double *)malloc(lat_sz); 67 | double *d_malloc_latencies, *d_free_latencies; 68 | cucheck(cudaMalloc((void **)&d_malloc_latencies, lat_sz)); 69 | cucheck(cudaMalloc((void **)&d_free_latencies, lat_sz)); 70 | cucheck(cudaMemset(d_malloc_latencies, 0, lat_sz)); 71 | cucheck(cudaMemset(d_free_latencies, 0, lat_sz)); 72 | 73 | // latency variables 74 | double avg_malloc_latency = 0, avg_free_latency = 0; 75 | double min_malloc_latency = FLT_MAX, min_free_latency = FLT_MAX; 76 | double max_malloc_latency = FLT_MIN, max_free_latency = FLT_MIN; 77 | 78 | // do testing 79 | for(int itry = 0; itry < opts.ntries; itry++) { 80 | // allocate 81 | latency_malloc_k <<>>(opts, d_ptrs, d_malloc_latencies); 82 | cucheck(cudaGetLastError()); 83 | cucheck(cudaStreamSynchronize(0)); 84 | // check that pointers are correct 85 | if(!check_nz(d_ptrs, 0, nptrs, opts)) { 86 | fprintf(stderr, "cannot allocate enough memory\n"); 87 | exit(-1); 88 | } 89 | // free 90 | latency_free_k <<>>(opts, d_ptrs, d_free_latencies); 91 | cucheck(cudaGetLastError()); 92 | cucheck(cudaStreamSynchronize(0)); 93 | // collect latency infos 94 | if(!warmup) { 95 | cucheck(cudaMemcpy(h_malloc_latencies, d_malloc_latencies, lat_sz, 96 | cudaMemcpyDeviceToHost)); 97 | cucheck(cudaMemcpy(h_free_latencies, d_free_latencies, lat_sz, 98 | cudaMemcpyDeviceToHost)); 99 | for(int i = 0; i < n; i += opts.period_mask + 1) { 100 | for(int ialloc = 0; ialloc < opts.nallocs; ialloc++) { 101 | double malloc_latency = h_malloc_latencies[ialloc * n + i]; 102 | double free_latency = h_free_latencies[ialloc * n + i]; 103 | avg_malloc_latency += malloc_latency; 104 | avg_free_latency += free_latency; 105 | min_malloc_latency = min(min_malloc_latency, malloc_latency); 106 | min_free_latency = min(min_free_latency, free_latency); 107 | max_malloc_latency = max(max_malloc_latency, malloc_latency); 108 | max_free_latency = max(max_free_latency, free_latency); 109 | } 110 | } 111 | } // if(not warmup) 112 | } // for(itry) 113 | 114 | // output latency infos 115 | if(!warmup) { 116 | avg_malloc_latency /= opts.total_nallocs(); 117 | avg_free_latency /= opts.total_nallocs(); 118 | printf("min 
malloc latency %.2lf cycles\n", min_malloc_latency); 119 | printf("avg malloc latency %.2lf cycles\n", avg_malloc_latency); 120 | printf("max malloc latency %.2lf cycles\n", max_malloc_latency); 121 | printf("min free latency %.2lf cycles\n", min_free_latency); 122 | printf("avg free latency %.2lf cycles\n", avg_free_latency); 123 | printf("max free latency %.2lf cycles\n", max_free_latency); 124 | printf("avg pair latency %.2lf cycles\n", 125 | avg_malloc_latency + avg_free_latency); 126 | } // output latency infos 127 | 128 | // free memory 129 | free(h_malloc_latencies); 130 | free(h_free_latencies); 131 | cucheck(cudaFree(d_malloc_latencies)); 132 | cucheck(cudaFree(d_free_latencies)); 133 | cucheck(cudaFree(d_ptrs)); 134 | } // operator() 135 | 136 | }; // LatencyTest 137 | 138 | int main(int argc, char **argv) { 139 | CommonOpts opts(true); 140 | run_test (argc, argv, opts); 141 | return 0; 142 | } // main 143 | -------------------------------------------------------------------------------- /tst/perf/latency/makefile: -------------------------------------------------------------------------------- 1 | NAME=latency 2 | 3 | include ../../common.mk 4 | -------------------------------------------------------------------------------- /tst/perf/make-all.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # runs specific make target for each performance test 4 | ls -1 | grep -vE 'bin|tmp|make|run' | xargs -IXA_TEST -P0 make -C XA_TEST $1 5 | -------------------------------------------------------------------------------- /tst/perf/makefile: -------------------------------------------------------------------------------- 1 | TMP=*~ 2 | 3 | build: 4 | ./make-all.sh build 5 | 6 | clean: 7 | rm -f $(TMP) 8 | ./make-all.sh clean 9 | -------------------------------------------------------------------------------- /tst/perf/phase-alloc-write/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.o 5 | -------------------------------------------------------------------------------- /tst/perf/phase-alloc-write/makefile: -------------------------------------------------------------------------------- 1 | NAME=phase-alloc-write 2 | 3 | include ../../common.mk 4 | -------------------------------------------------------------------------------- /tst/perf/phase-alloc-write/phase-alloc-write.cu: -------------------------------------------------------------------------------- 1 | /** @file prob-throughput.cu probabalitized throughput test */ 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | /** global counters for number of allocations, frees and total size allocated 13 | */ 14 | __device__ uint64 nmallocs_g = 0; 15 | 16 | /** the kernel of the probability throughput test */ 17 | template 18 | __global__ void prob_throughput_k 19 | (void **ptrs, uint *ctrs, uint itry) { 20 | uint i = threadIdx.x + blockIdx.x * blockDim.x; 21 | uint n = opts_g.nthreads; 22 | //uint nallocs = opts_g.nallocs; 23 | if(opts_g.is_thread_inactive(i)) 24 | return; 25 | uint ctr = ctrs[i]; 26 | //uint nmallocs = 0; 27 | 28 | // iterate 29 | for(uint iter = 0; iter < opts_g.niters; iter++) { 30 | // perform the action 31 | switch(opts_g.next_action(ctr > 0, itry, iter)) { 32 | //switch(ctr > 0 ? 
ActionFree : ActionAlloc) { 33 | case ActionAlloc: 34 | for(uint ialloc = 0; ialloc < opts_g.nallocs; ialloc++) { 35 | // allocate 36 | uint alloc_sz = opts_g.next_alloc_sz(); 37 | uint64 *p = (uint64 *)T::malloc(alloc_sz); 38 | for(int iword = 0; iword < alloc_sz / (uint)sizeof(uint64); iword++) 39 | p[iword] = 123ull; 40 | ptrs[ialloc * n + i] = p; 41 | } 42 | ctr = opts_g.nallocs; 43 | //nmallocs += nallocs; 44 | break; 45 | case ActionFree: 46 | for(uint ialloc = 0; ialloc < opts_g.nallocs; ialloc++) 47 | T::free(ptrs[ialloc * n + i]); 48 | ctr = 0; 49 | break; 50 | case ActionNone: 51 | //printf("no action taken\n"); 52 | break; 53 | } 54 | } // for(each iteration) 55 | ctrs[i] = ctr; 56 | } // prob_throughput_k 57 | 58 | /** measures malloc throughput */ 59 | template class PhaseThroughputTest { 60 | 61 | public: 62 | void operator()(CommonOpts opts, bool warmup) { 63 | // allocate memory 64 | if(warmup) { 65 | opts.nthreads = min(4 * opts.bs, opts.nthreads); 66 | opts.ntries = 1; 67 | opts.niters = 1; 68 | } 69 | if(!warmup) 70 | printf("two-phase throuhgput test\n"); 71 | cuset(opts_g, CommonOpts, opts); 72 | int n = opts.nthreads, bs = opts.bs, grid = divup(n, bs); 73 | int nptrs = n * opts.nallocs; 74 | size_t ptrs_sz = nptrs * sizeof(void *); 75 | uint ctrs_sz = n * sizeof(uint); 76 | void **d_ptrs; 77 | cucheck(cudaMalloc((void **)&d_ptrs, ptrs_sz)); 78 | cucheck(cudaMemset(d_ptrs, 0, ptrs_sz)); 79 | uint *d_ctrs; 80 | cucheck(cudaMalloc((void **)&d_ctrs, ctrs_sz)); 81 | cucheck(cudaMemset(d_ctrs, 0, ctrs_sz)); 82 | 83 | double t_malloc = 0, t_free = 0, t_pair = 0; 84 | cuset(nmallocs_g, uint64, 0); 85 | 86 | // do testing 87 | for(int itry = 0; itry < opts.ntries; itry++) { 88 | // run the kernel 89 | double t_start = omp_get_wtime(); 90 | prob_throughput_k <<>>(/* opts, */ d_ptrs, d_ctrs, itry); 91 | cucheck(cudaGetLastError()); 92 | cucheck(cudaStreamSynchronize(0)); 93 | double t_end = omp_get_wtime(), dt = t_end - t_start; 94 | t_pair += dt; 95 | if(itry % 2) 96 | t_free += dt; 97 | else 98 | t_malloc += dt; 99 | // check that pointers are correct 100 | if(!check_nz(d_ptrs, d_ctrs, nptrs, opts)) { 101 | fprintf(stderr, "cannot allocate enough memory\n"); 102 | exit(-1); 103 | } 104 | } // for(itry) 105 | 106 | // free the rest 107 | { 108 | double t_start = omp_get_wtime(); 109 | free_rest_k <<>> (/* opts, */ d_ptrs, d_ctrs); 110 | cucheck(cudaGetLastError()); 111 | cucheck(cudaStreamSynchronize(0)); 112 | double t_end = omp_get_wtime(), dt = t_end - t_start; 113 | t_pair += dt; 114 | t_free += dt; 115 | } 116 | 117 | // output throughput infos 118 | if(!warmup) { 119 | //uint64 nallocs; 120 | //cuget(&nallocs, nmallocs_g); 121 | 122 | //double malloc_throughput = opts.total_nallocs() / t_malloc * 1e-6; 123 | //double free_throughput = opts.total_nallocs() / t_free * 1e-6; 124 | double npairs = 0.5 * opts.total_nallocs() * opts.exec_fraction; 125 | double nmallocs = 0.25 * opts.total_nallocs() * 126 | (opts.exec_fraction + opts.alloc_fraction - opts.free_fraction); 127 | double nfrees = 0.25 * opts.total_nallocs() * 128 | (opts.exec_fraction + opts.alloc_fraction - opts.free_fraction); 129 | double pair_throughput = npairs / t_pair * 1e-6; 130 | double malloc_throughput = nmallocs / t_malloc * 1e-6; 131 | double free_throughput = nfrees / t_free * 1e-6; 132 | double malloc_speed = nmallocs * opts.expected_sz() / t_malloc / 133 | NBYTES_IN_GIB; 134 | double pair_speed = npairs * opts.expected_sz() / t_pair / 135 | NBYTES_IN_GIB; 136 | if(opts.niters == 1) { 137 | 
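// per-phase malloc/free figures are reported only for single-iteration runs,
// where each kernel launch is (almost) entirely an allocation phase or a free phase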
printf("malloc throughput %.2lf Mmallocs/s\n", malloc_throughput); 138 | printf("free throughput %.2lf Mfrees/s\n", free_throughput); 139 | } 140 | //printf("total test time %.2lf ms\n", t_pair * 1e3); 141 | printf("pair throughput %.2lf Mpairs/s\n", pair_throughput); 142 | if(opts.niters == 1) { 143 | printf("malloc speed %.2lf GiB/s\n", malloc_speed); 144 | } 145 | printf("pair speed %.2lf GiB/s\n", pair_speed); 146 | } // output latency infos 147 | 148 | // free memory 149 | cucheck(cudaFree(d_ptrs)); 150 | cucheck(cudaFree(d_ctrs)); 151 | } // operator() 152 | 153 | }; // PhaseThroughputTest 154 | 155 | int main(int argc, char **argv) { 156 | CommonOpts opts(true); 157 | run_test(argc, argv, opts); 158 | return 0; 159 | } // main 160 | -------------------------------------------------------------------------------- /tst/perf/phase-extfrag/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.o 5 | -------------------------------------------------------------------------------- /tst/perf/phase-extfrag/makefile: -------------------------------------------------------------------------------- 1 | NAME=phase-extfrag 2 | 3 | include ../../common.mk 4 | -------------------------------------------------------------------------------- /tst/perf/phase-extfrag/phase-extfrag.cu: -------------------------------------------------------------------------------- 1 | /** @file phase-extfrag.cu probabalitized external fragmentation test */ 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | /** global counters for number of allocations, frees and total size allocated 13 | */ 14 | __device__ uint64 nmallocs_g = 0; 15 | 16 | /** the kernel of the probability throughput test */ 17 | template 18 | __global__ void prob_throughput_k 19 | (void **ptrs, uint *ctrs, uint itry) { 20 | uint i = threadIdx.x + blockIdx.x * blockDim.x; 21 | uint n = opts_g.nthreads; 22 | //uint nallocs = opts_g.nallocs; 23 | if(opts_g.is_thread_inactive(i)) 24 | return; 25 | uint ctr = ctrs[i]; 26 | //uint nmallocs = 0; 27 | 28 | // iterate 29 | for(uint iter = 0; iter < opts_g.niters; iter++) { 30 | // perform the action 31 | switch(opts_g.next_action(ctr > 0, itry, iter)) { 32 | //switch(ctr > 0 ? 
ActionFree : ActionAlloc) { 33 | case ActionAlloc: 34 | for(uint ialloc = 0; ialloc < opts_g.nallocs; ialloc++) 35 | ptrs[ialloc * n + i] = T::malloc(opts_g.next_alloc_sz()); 36 | ctr = opts_g.nallocs; 37 | //nmallocs += nallocs; 38 | break; 39 | case ActionFree: 40 | for(uint ialloc = 0; ialloc < opts_g.nallocs; ialloc++) 41 | T::free(ptrs[ialloc * n + i]); 42 | ctr = 0; 43 | break; 44 | case ActionNone: 45 | //printf("no action taken\n"); 46 | break; 47 | } 48 | } // for(each iteration) 49 | ctrs[i] = ctr; 50 | } // prob_throughput_k 51 | 52 | /** measures malloc throughput */ 53 | template class PhaseExtFragTest { 54 | 55 | public: 56 | void operator()(CommonOpts opts, bool warmup) { 57 | // allocate memory 58 | if(warmup) { 59 | opts.nthreads = min(4 * opts.bs, opts.nthreads); 60 | opts.ntries = 1; 61 | opts.niters = 1; 62 | } 63 | if(!warmup) 64 | printf("two-phase throuhgput test\n"); 65 | cuset(opts_g, CommonOpts, opts); 66 | int n = opts.nthreads, bs = opts.bs, grid = divup(n, bs); 67 | int nptrs = n * opts.nallocs; 68 | size_t ptrs_sz = nptrs * sizeof(void *); 69 | uint ctrs_sz = n * sizeof(uint); 70 | void **d_ptrs; 71 | cucheck(cudaMalloc((void **)&d_ptrs, ptrs_sz)); 72 | cucheck(cudaMemset(d_ptrs, 0, ptrs_sz)); 73 | uint *d_ctrs; 74 | cucheck(cudaMalloc((void **)&d_ctrs, ctrs_sz)); 75 | cucheck(cudaMemset(d_ctrs, 0, ctrs_sz)); 76 | 77 | // do testing 78 | for(int itry = 0; itry < opts.ntries; itry++) { 79 | // run the kernel 80 | prob_throughput_k <<>>(d_ptrs, d_ctrs, itry); 81 | cucheck(cudaGetLastError()); 82 | cucheck(cudaStreamSynchronize(0)); 83 | // check that pointers are correct 84 | if(!check_nz(d_ptrs, d_ctrs, nptrs, opts)) { 85 | fprintf(stderr, "cannot allocate enough memory\n"); 86 | exit(-1); 87 | } 88 | if(!warmup) 89 | printf("external fragmentation %d %.2lf %.2lf\n", itry, T::extfrag(false), 90 | T::extfrag(true)); 91 | } // for(itry) 92 | 93 | // free the rest 94 | { 95 | free_rest_k <<>> (/* opts, */ d_ptrs, d_ctrs); 96 | cucheck(cudaGetLastError()); 97 | cucheck(cudaStreamSynchronize(0)); 98 | } 99 | 100 | // free memory 101 | cucheck(cudaFree(d_ptrs)); 102 | cucheck(cudaFree(d_ctrs)); 103 | } // operator() 104 | 105 | }; // PhaseExtFragTest 106 | 107 | int main(int argc, char **argv) { 108 | CommonOpts opts(true); 109 | run_test(argc, argv, opts); 110 | return 0; 111 | } // main 112 | -------------------------------------------------------------------------------- /tst/perf/phase-latency/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.o 5 | -------------------------------------------------------------------------------- /tst/perf/phase-latency/makefile: -------------------------------------------------------------------------------- 1 | NAME=phase-latency 2 | 3 | include ../../common.mk 4 | -------------------------------------------------------------------------------- /tst/perf/phase-latency/phase-latency.cu: -------------------------------------------------------------------------------- 1 | /** @file prob-throughput.cu probabalitized throughput test */ 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | /** global counters for number of allocations, frees and total size allocated 14 | */ 15 | __device__ uint64 nmallocs_g = 0; 16 | 17 | /** the kernel of the probability throughput test */ 18 | template 19 | __global__ void phase_latency_k 20 | (void **ptrs, uint *ctrs, uint itry, ActionType *actions, uint 
*latencies) { 21 | uint i = threadIdx.x + blockIdx.x * blockDim.x; 22 | uint n = opts_g.nthreads; 23 | uint nallocs = opts_g.nallocs; 24 | if(opts_g.is_thread_inactive(i)) 25 | return; 26 | uint ctr = ctrs[i]; 27 | //uint nmallocs = 0; 28 | 29 | // iterate 30 | for(uint iter = 0; iter < opts_g.niters; iter++) { 31 | // perform the action 32 | ActionType action = opts_g.next_action(ctr > 0, itry, iter); 33 | actions[iter * n + i] = action; 34 | switch(action) { 35 | //switch(ctr > 0 ? ActionFree : ActionAlloc) { 36 | case ActionAlloc: 37 | for(uint ialloc = 0; ialloc < opts_g.nallocs; ialloc++) { 38 | uint t1 = clock(); 39 | ptrs[ialloc * n + i] = T::malloc(opts_g.next_alloc_sz()); 40 | uint t2 = clock(); 41 | latencies[(iter * nallocs + ialloc) * n + i] = t2 - t1; 42 | } 43 | ctr = opts_g.nallocs; 44 | //nmallocs += nallocs; 45 | break; 46 | case ActionFree: 47 | for(uint ialloc = 0; ialloc < opts_g.nallocs; ialloc++) { 48 | uint t1 = clock(); 49 | T::free(ptrs[ialloc * n + i]); 50 | uint t2 = clock(); 51 | latencies[(iter * nallocs + ialloc) * n + i] = t2 - t1; 52 | } 53 | ctr = 0; 54 | break; 55 | case ActionNone: 56 | //printf("no action taken\n"); 57 | break; 58 | } 59 | } // for(each iteration) 60 | ctrs[i] = ctr; 61 | } // phase_latency_k 62 | 63 | /** measures malloc/free latency */ 64 | template class PhaseLatencyTest { 65 | 66 | public: 67 | void operator()(CommonOpts opts, bool warmup) { 68 | // allocate memory 69 | if(warmup) { 70 | opts.nthreads = min(4 * opts.bs, opts.nthreads); 71 | opts.ntries = 1; 72 | opts.niters = 1; 73 | } 74 | if(!warmup) 75 | printf("two-phase latency test\n"); 76 | cuset(opts_g, CommonOpts, opts); 77 | int n = opts.nthreads, bs = opts.bs, grid = divup(n, bs); 78 | int nptrs = n * opts.nallocs; 79 | size_t ptrs_sz = nptrs * sizeof(void *); 80 | uint ctrs_sz = n * sizeof(uint); 81 | size_t lat_sz = n * opts.niters * opts.nallocs * sizeof(uint); 82 | size_t act_sz = n * opts.niters * sizeof(ActionType); 83 | void **d_ptrs; 84 | cucheck(cudaMalloc((void **)&d_ptrs, ptrs_sz)); 85 | cucheck(cudaMemset(d_ptrs, 0, ptrs_sz)); 86 | uint *d_ctrs; 87 | cucheck(cudaMalloc((void **)&d_ctrs, ctrs_sz)); 88 | cucheck(cudaMemset(d_ctrs, 0, ctrs_sz)); 89 | uint *d_latencies; 90 | cucheck(cudaMalloc((void **)&d_latencies, lat_sz)); 91 | cucheck(cudaMemset(d_latencies, 0, lat_sz)); 92 | ActionType *d_actions; 93 | cucheck(cudaMalloc((void **)&d_actions, act_sz)); 94 | cucheck(cudaMemset(d_actions, 0, act_sz)); 95 | uint *h_latencies; 96 | cucheck(cudaMallocHost((void **)&h_latencies, lat_sz)); 97 | ActionType *h_actions; 98 | cucheck(cudaMallocHost((void **)&h_actions, act_sz)); 99 | 100 | //cuset(nmallocs_g, uint64, 0); 101 | 102 | // latency variables 103 | double avg_malloc_latency = 0, avg_free_latency = 0; 104 | double min_malloc_latency = FLT_MAX, min_free_latency = FLT_MAX; 105 | double max_malloc_latency = FLT_MIN, max_free_latency = FLT_MIN; 106 | double nmallocs = 0, nfrees = 0; 107 | 108 | // do testing 109 | for(int itry = 0; itry < opts.ntries; itry++) { 110 | // run the kernel 111 | phase_latency_k <<>> 112 | (d_ptrs, d_ctrs, itry, d_actions, d_latencies); 113 | cucheck(cudaGetLastError()); 114 | cucheck(cudaStreamSynchronize(0)); 115 | // check that pointers are correct 116 | if(!check_nz(d_ptrs, d_ctrs, nptrs, opts)) { 117 | fprintf(stderr, "cannot allocate enough memory\n"); 118 | exit(-1); 119 | } 120 | // compute the latencies 121 | if(!warmup) { 122 | cucheck(cudaMemcpy(h_latencies, d_latencies, lat_sz, 123 | cudaMemcpyDeviceToHost)); 124 | 
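// the recorded action types are copied back as well, so that each latency
// sample below can be attributed to either a malloc or a free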
cucheck(cudaMemcpy(h_actions, d_actions, act_sz, 125 | cudaMemcpyDeviceToHost)); 126 | for(int iter = 0; iter < opts.niters; iter++) { 127 | for(int i = 0; i < n; i += opts.period_mask + 1) { 128 | ActionType action = h_actions[iter * n + i]; 129 | for(int ialloc = 0; ialloc < opts.nallocs; ialloc++) { 130 | uint latency = 131 | h_latencies[(iter * opts.nallocs + ialloc) * n + i]; 132 | //double malloc_latency = h_malloc_latencies[ialloc * n + i]; 133 | //double free_latency = h_free_latencies[ialloc * n + i]; 134 | switch(action) { 135 | case ActionAlloc: 136 | nmallocs++; 137 | avg_malloc_latency += (double)latency; 138 | min_malloc_latency = min(min_malloc_latency, (double)latency); 139 | max_malloc_latency = max(max_malloc_latency, (double)latency); 140 | break; 141 | case ActionFree: 142 | nfrees++; 143 | avg_free_latency += (double)latency; 144 | min_free_latency = min(min_free_latency, (double)latency); 145 | max_free_latency = max(max_free_latency, (double)latency); 146 | break; 147 | // otherwise, do nothing 148 | } 149 | } // for(ialloc) 150 | } // for(i) 151 | } // for(iter) 152 | } // if(not warmup) 153 | 154 | } // for(itry) 155 | 156 | // free the rest - this is not timed for latency 157 | { 158 | free_rest_k <<>> (/* opts, */ d_ptrs, d_ctrs); 159 | cucheck(cudaGetLastError()); 160 | cucheck(cudaStreamSynchronize(0)); 161 | } 162 | 163 | // output throughput infos 164 | if(!warmup) { 165 | avg_malloc_latency /= nmallocs; 166 | avg_free_latency /= nfrees; 167 | printf("min malloc latency %.2lf cycles\n", min_malloc_latency); 168 | printf("avg malloc latency %.2lf cycles\n", avg_malloc_latency); 169 | printf("max malloc latency %.2lf cycles\n", max_malloc_latency); 170 | printf("min free latency %.2lf cycles\n", min_free_latency); 171 | printf("avg free latency %.2lf cycles\n", avg_free_latency); 172 | printf("max free latency %.2lf cycles\n", max_free_latency); 173 | printf("avg pair latency %.2lf cycles\n", 174 | avg_malloc_latency + avg_free_latency); 175 | } // output latency infos 176 | 177 | // free memory 178 | cucheck(cudaFree(d_ptrs)); 179 | cucheck(cudaFree(d_ctrs)); 180 | } // operator() 181 | }; // PhaseLatencyTest 182 | 183 | int main(int argc, char **argv) { 184 | CommonOpts opts(true); 185 | run_test(argc, argv, opts); 186 | return 0; 187 | } // main 188 | -------------------------------------------------------------------------------- /tst/perf/phase-throughput/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.o 5 | -------------------------------------------------------------------------------- /tst/perf/phase-throughput/makefile: -------------------------------------------------------------------------------- 1 | NAME=phase-throughput 2 | 3 | include ../../common.mk 4 | -------------------------------------------------------------------------------- /tst/perf/phase-throughput/phase-throughput.cu: -------------------------------------------------------------------------------- 1 | /** @file prob-throughput.cu probabalitized throughput test */ 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | /** global counters for number of allocations, frees and total size allocated 13 | */ 14 | __device__ uint64 nmallocs_g = 0; 15 | 16 | /** the kernel of the probability throughput test */ 17 | template 18 | __global__ void prob_throughput_k 19 | (void **ptrs, uint *ctrs, uint itry) { 20 | uint i = threadIdx.x + blockIdx.x * blockDim.x; 21 | 
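// test options are read from the device-global opts_g, which the host side
// sets via cuset() before launching the kernel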
uint n = opts_g.nthreads; 22 | //uint nallocs = opts_g.nallocs; 23 | if(opts_g.is_thread_inactive(i)) 24 | return; 25 | uint ctr = ctrs[i]; 26 | //uint nmallocs = 0; 27 | 28 | // iterate 29 | for(uint iter = 0; iter < opts_g.niters; iter++) { 30 | // perform the action 31 | switch(opts_g.next_action(ctr > 0, itry, iter)) { 32 | //switch(ctr > 0 ? ActionFree : ActionAlloc) { 33 | case ActionAlloc: 34 | for(uint ialloc = 0; ialloc < opts_g.nallocs; ialloc++) 35 | ptrs[ialloc * n + i] = T::malloc(opts_g.next_alloc_sz()); 36 | ctr = opts_g.nallocs; 37 | //nmallocs += nallocs; 38 | break; 39 | case ActionFree: 40 | for(uint ialloc = 0; ialloc < opts_g.nallocs; ialloc++) 41 | T::free(ptrs[ialloc * n + i]); 42 | ctr = 0; 43 | break; 44 | case ActionNone: 45 | //printf("no action taken\n"); 46 | break; 47 | } 48 | } // for(each iteration) 49 | ctrs[i] = ctr; 50 | } // prob_throughput_k 51 | 52 | /** measures malloc throughput */ 53 | template class PhaseThroughputTest { 54 | 55 | public: 56 | void operator()(CommonOpts opts, bool warmup) { 57 | // allocate memory 58 | if(warmup) { 59 | opts.nthreads = min(4 * opts.bs, opts.nthreads); 60 | opts.ntries = 1; 61 | opts.niters = 1; 62 | } 63 | if(!warmup) 64 | printf("two-phase throuhgput test\n"); 65 | cuset(opts_g, CommonOpts, opts); 66 | int n = opts.nthreads, bs = opts.bs, grid = divup(n, bs); 67 | int nptrs = n * opts.nallocs; 68 | size_t ptrs_sz = nptrs * sizeof(void *); 69 | uint ctrs_sz = n * sizeof(uint); 70 | void **d_ptrs; 71 | cucheck(cudaMalloc((void **)&d_ptrs, ptrs_sz)); 72 | cucheck(cudaMemset(d_ptrs, 0, ptrs_sz)); 73 | uint *d_ctrs; 74 | cucheck(cudaMalloc((void **)&d_ctrs, ctrs_sz)); 75 | cucheck(cudaMemset(d_ctrs, 0, ctrs_sz)); 76 | 77 | double t_malloc = 0, t_free = 0, t_pair = 0; 78 | cuset(nmallocs_g, uint64, 0); 79 | 80 | // do testing 81 | for(int itry = 0; itry < opts.ntries; itry++) { 82 | // run the kernel 83 | double t_start = omp_get_wtime(); 84 | prob_throughput_k <<>>(/* opts, */ d_ptrs, d_ctrs, itry); 85 | cucheck(cudaGetLastError()); 86 | cucheck(cudaStreamSynchronize(0)); 87 | double t_end = omp_get_wtime(), dt = t_end - t_start; 88 | t_pair += dt; 89 | if(itry % 2) 90 | t_free += dt; 91 | else 92 | t_malloc += dt; 93 | // check that pointers are correct 94 | if(!check_nz(d_ptrs, d_ctrs, nptrs, opts)) { 95 | fprintf(stderr, "cannot allocate enough memory\n"); 96 | exit(-1); 97 | } 98 | } // for(itry) 99 | 100 | // free the rest 101 | { 102 | double t_start = omp_get_wtime(); 103 | free_rest_k <<>> (/* opts, */ d_ptrs, d_ctrs); 104 | cucheck(cudaGetLastError()); 105 | cucheck(cudaStreamSynchronize(0)); 106 | double t_end = omp_get_wtime(), dt = t_end - t_start; 107 | t_pair += dt; 108 | t_free += dt; 109 | } 110 | 111 | // output throughput infos 112 | if(!warmup) { 113 | //uint64 nallocs; 114 | //cuget(&nallocs, nmallocs_g); 115 | 116 | //double malloc_throughput = opts.total_nallocs() / t_malloc * 1e-6; 117 | //double free_throughput = opts.total_nallocs() / t_free * 1e-6; 118 | double npairs = 0.5 * opts.total_nallocs() * opts.exec_fraction; 119 | double nmallocs = 0.25 * opts.total_nallocs() * 120 | (opts.exec_fraction + opts.alloc_fraction - opts.free_fraction); 121 | double nfrees = 0.25 * opts.total_nallocs() * 122 | (opts.exec_fraction + opts.alloc_fraction - opts.free_fraction); 123 | double pair_throughput = npairs / t_pair * 1e-6; 124 | double malloc_throughput = nmallocs / t_malloc * 1e-6; 125 | double free_throughput = nfrees / t_free * 1e-6; 126 | double malloc_speed = nmallocs * opts.expected_sz() / 
t_malloc / 127 | NBYTES_IN_GIB; 128 | double pair_speed = npairs * opts.expected_sz() / t_pair / 129 | NBYTES_IN_GIB; 130 | if(opts.niters == 1) { 131 | printf("malloc throughput %.2lf Mmallocs/s\n", malloc_throughput); 132 | printf("free throughput %.2lf Mfrees/s\n", free_throughput); 133 | } 134 | //printf("total test time %.2lf ms\n", t_pair * 1e3); 135 | printf("pair throughput %.2lf Mpairs/s\n", pair_throughput); 136 | if(opts.niters == 1) { 137 | printf("malloc speed %.2lf GiB/s\n", malloc_speed); 138 | } 139 | printf("pair speed %.2lf GiB/s\n", pair_speed); 140 | } // output latency infos 141 | 142 | // free memory 143 | cucheck(cudaFree(d_ptrs)); 144 | cucheck(cudaFree(d_ctrs)); 145 | } // operator() 146 | 147 | }; // PhaseThroughputTest 148 | 149 | int main(int argc, char **argv) { 150 | CommonOpts opts(true); 151 | run_test(argc, argv, opts); 152 | return 0; 153 | } // main 154 | -------------------------------------------------------------------------------- /tst/perf/priv-throughput/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.o 5 | -------------------------------------------------------------------------------- /tst/perf/priv-throughput/makefile: -------------------------------------------------------------------------------- 1 | NAME=priv-throughput 2 | 3 | include ../../common.mk 4 | -------------------------------------------------------------------------------- /tst/perf/priv-throughput/priv-throughput.cu: -------------------------------------------------------------------------------- 1 | /** @file throughput.cu throughput test for various memory allocators */ 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | /** measures malloc throughput */ 13 | template 14 | __global__ void throughput_malloc_free_k 15 | (CommonOpts opts, void **ptrs) { 16 | int n = opts.nthreads, i = threadIdx.x + blockIdx.x * blockDim.x; 17 | if(opts.is_thread_inactive(i)) 18 | return; 19 | for(int iter = 0; iter < opts.niters; iter++) { 20 | // first allocate 21 | for(int ialloc = 0; ialloc < opts.nallocs; ialloc++) 22 | ptrs[i + n * ialloc] = T::malloc(opts.next_alloc_sz()); 23 | // then free 24 | for(int ialloc = 0; ialloc < opts.nallocs; ialloc++) 25 | T::free(ptrs[i + n * ialloc]); 26 | } 27 | } // throughput_malloc_k 28 | 29 | template class PrivThroughputTest { 30 | 31 | public: 32 | void operator()(CommonOpts opts, bool warmup) { 33 | // allocate memory 34 | if(warmup) { 35 | opts.nthreads = min(4 * opts.bs, opts.nthreads); 36 | opts.ntries = 1; 37 | } 38 | if(!warmup) 39 | printf("private throughput test\n"); 40 | int n = opts.nthreads, bs = opts.bs, grid = divup(n, bs); 41 | int nptrs = n * opts.nallocs; 42 | size_t ptrs_sz = nptrs * sizeof(void *); 43 | void **d_ptrs; 44 | cucheck(cudaMalloc((void **)&d_ptrs, ptrs_sz)); 45 | cucheck(cudaMemset(d_ptrs, 0, ptrs_sz)); 46 | 47 | double t_pair = 0; 48 | 49 | // do testing 50 | for(int itry = 0; itry < opts.ntries; itry++) { 51 | // allocate 52 | double t_pair_start = omp_get_wtime(); 53 | throughput_malloc_free_k <<>>(opts, d_ptrs); 54 | cucheck(cudaGetLastError()); 55 | cucheck(cudaStreamSynchronize(0)); 56 | double t_pair_end = omp_get_wtime(); 57 | t_pair += t_pair_end - t_pair_start; 58 | // as pointers have not been zeroed out, check them nevertheless 59 | if(!check_nz(d_ptrs, 0, nptrs, opts)) { 60 | fprintf(stderr, "cannot allocate enough memory\n"); 61 | exit(-1); 62 | } 63 | } // for(itry) 64 | 65 | // 
output throughput infos; no individual malloc/free throughput can be 66 | // estimated 67 | if(!warmup) { 68 | double pair_throughput = opts.total_nallocs() / t_pair * 1e-6; 69 | double pair_speed = opts.total_sz() / t_pair / NBYTES_IN_GIB; 70 | printf("pair throughput %.2lf Mpairs/s\n", pair_throughput); 71 | printf("pair speed %.2lf GiB/s\n", pair_speed); 72 | } // output latency infos 73 | 74 | // free memory 75 | cucheck(cudaFree(d_ptrs)); 76 | } // operator() 77 | }; // PrivThroughputTest 78 | 79 | int main(int argc, char **argv) { 80 | CommonOpts opts(true); 81 | run_test(argc, argv, opts); 82 | return 0; 83 | } // main 84 | -------------------------------------------------------------------------------- /tst/perf/run-test.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # usage: 4 | # ./run-test.sh 5 | 6 | TEST_NAME=$1 7 | shift 1 8 | TEST_EXE=./bin/$TEST_NAME 9 | 10 | # run the test 11 | echo $TEST_EXE $@ 12 | $TEST_EXE $@ 13 | -------------------------------------------------------------------------------- /tst/perf/throughput/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | \#* 3 | ._* 4 | *.o 5 | -------------------------------------------------------------------------------- /tst/perf/throughput/makefile: -------------------------------------------------------------------------------- 1 | NAME=throughput 2 | 3 | include ../../common.mk 4 | 5 | throughput-all: $(TGT) 6 | cd .. && ./throughput/throughput-all.sh 7 | -------------------------------------------------------------------------------- /tst/perf/throughput/throughput-all.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # runs throughput test for all sizes 4 | ./run-test.sh throughput -s16 -l4 5 | ./run-test.sh throughput -s24 -l4 6 | ./run-test.sh throughput -s32 -l4 7 | ./run-test.sh throughput -s48 -l4 8 | ./run-test.sh throughput -s64 -l4 9 | ./run-test.sh throughput -s96 -l2 10 | ./run-test.sh throughput -s128 -l2 11 | ./run-test.sh throughput -s192 -l1 12 | ./run-test.sh throughput -s256 -l1 13 | ./run-test.sh throughput -s384 -l1 -n$((512*1024)) 14 | ./run-test.sh throughput -s512 -l1 -n$((512*1024)) 15 | ./run-test.sh throughput -s768 -l1 -n$((256*1024)) 16 | ./run-test.sh throughput -s1024 -l1 -n$((256*1024)) 17 | ./run-test.sh throughput -s1536 -l1 -n$((128*1024)) 18 | ./run-test.sh throughput -s2048 -l1 -n$((128*1024)) 19 | ./run-test.sh throughput -s3072 -l1 -n$((64*1024)) 20 | 21 | -------------------------------------------------------------------------------- /tst/perf/throughput/throughput.cu: -------------------------------------------------------------------------------- 1 | /** @file throughput.cu throughput test for various memory allocators */ 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | /** measures malloc throughput */ 13 | 14 | template class ThroughputTest { 15 | 16 | public: 17 | void operator()(CommonOpts opts, bool warmup) { 18 | opts.niters = 1; 19 | // allocate memory 20 | if(warmup) { 21 | opts.nthreads = min(4 * opts.bs, opts.nthreads); 22 | opts.ntries = 1; 23 | } 24 | if(!warmup) 25 | printf("throughput test\n"); 26 | int n = opts.nthreads, bs = opts.bs, grid = divup(n, bs); 27 | int nptrs = n * opts.nallocs; 28 | size_t ptrs_sz = nptrs * sizeof(void *); 29 | void **d_ptrs; 30 | cucheck(cudaMalloc((void **)&d_ptrs, ptrs_sz)); 31 | 
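// zero the pointer array first, so that check_nz() below can treat NULL
// entries as failed allocations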
cucheck(cudaMemset(d_ptrs, 0, ptrs_sz)); 32 | 33 | double t_malloc = 0, t_free = 0; 34 | 35 | // do testing 36 | for(int itry = 0; itry < opts.ntries; itry++) { 37 | // allocate 38 | double t_malloc_start = omp_get_wtime(); 39 | malloc_k <<>>(opts, d_ptrs); 40 | cucheck(cudaGetLastError()); 41 | cucheck(cudaStreamSynchronize(0)); 42 | double t_malloc_end = omp_get_wtime(); 43 | t_malloc += t_malloc_end - t_malloc_start; 44 | // check that pointers are correct 45 | if(!check_nz(d_ptrs, 0, nptrs, opts)) { 46 | fprintf(stderr, "cannot allocate enough memory\n"); 47 | exit(-1); 48 | } 49 | // free 50 | double t_free_start = omp_get_wtime(); 51 | free_k <<>>(opts, d_ptrs); 52 | cucheck(cudaGetLastError()); 53 | cucheck(cudaStreamSynchronize(0)); 54 | double t_free_end = omp_get_wtime(); 55 | t_free += t_free_end - t_free_start; 56 | } // for(itry) 57 | 58 | // output latency infos 59 | if(!warmup) { 60 | double malloc_throughput = opts.total_nallocs() / t_malloc * 1e-6; 61 | double free_throughput = opts.total_nallocs() / t_free * 1e-6; 62 | double pair_throughput = opts.total_nallocs() / (t_malloc + t_free) 63 | * 1e-6; 64 | double malloc_speed = opts.total_sz() / t_malloc / NBYTES_IN_GIB; 65 | double pair_speed = opts.total_sz() / (t_malloc + t_free) / NBYTES_IN_GIB; 66 | printf("malloc throughput %.2lf Mmallocs/s\n", malloc_throughput); 67 | printf("free throughput %.2lf Mfrees/s\n", free_throughput); 68 | printf("pair throughput %.2lf Mpairs/s\n", pair_throughput); 69 | printf("malloc speed %.2lf GiB/s\n", malloc_speed); 70 | printf("pair speed %.2lf GiB/s\n", pair_speed); 71 | } // output latency infos 72 | 73 | // free memory 74 | cucheck(cudaFree(d_ptrs)); 75 | } // operator() 76 | 77 | }; // LatencyTest 78 | 79 | int main(int argc, char **argv) { 80 | CommonOpts opts(true); 81 | run_test(argc, argv, opts); 82 | return 0; 83 | } // main 84 | -------------------------------------------------------------------------------- /tst/perf/tmp/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | --------------------------------------------------------------------------------
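As a reference for extending the harness, below is a minimal sketch of a new performance test written against the same pattern as the tests above. It is a sketch only, assuming that the tst/common helpers (CommonOpts, run_test, cucheck, divup, check_nz) behave as they are used in throughput.cu and latency.cu, that run_test takes the test functor as a template argument, and that the allocator wrapper T exposes static device-side malloc/free; the file name simple.cu and the kernel and class names are illustrative, not part of the repository.

/** @file simple.cu minimal allocate-then-free test following the harness pattern */

#include <cstdio>
#include <cstdlib>
#include "common.h"   // assumed location of CommonOpts, run_test, cucheck, divup, check_nz

/** allocate one block per (active) thread and per allocation slot */
template<class T>
__global__ void simple_malloc_k(CommonOpts opts, void **ptrs) {
	int n = opts.nthreads, i = threadIdx.x + blockIdx.x * blockDim.x;
	if(opts.is_thread_inactive(i))
		return;
	for(int ialloc = 0; ialloc < opts.nallocs; ialloc++)
		ptrs[i + n * ialloc] = T::malloc(opts.next_alloc_sz());
}  // simple_malloc_k

/** free everything allocated by simple_malloc_k */
template<class T>
__global__ void simple_free_k(CommonOpts opts, void **ptrs) {
	int n = opts.nthreads, i = threadIdx.x + blockIdx.x * blockDim.x;
	if(opts.is_thread_inactive(i))
		return;
	for(int ialloc = 0; ialloc < opts.nallocs; ialloc++)
		T::free(ptrs[i + n * ialloc]);
}  // simple_free_k

template<class T> class SimpleTest {
public:
	void operator()(CommonOpts opts, bool warmup) {
		if(warmup) {
			opts.nthreads = min(4 * opts.bs, opts.nthreads);
			opts.ntries = 1;
		}
		int n = opts.nthreads, bs = opts.bs, grid = divup(n, bs);
		int nptrs = n * opts.nallocs;
		size_t ptrs_sz = nptrs * sizeof(void *);
		void **d_ptrs;
		cucheck(cudaMalloc((void **)&d_ptrs, ptrs_sz));
		cucheck(cudaMemset(d_ptrs, 0, ptrs_sz));
		for(int itry = 0; itry < opts.ntries; itry++) {
			// allocate, then verify that no thread got a NULL pointer
			simple_malloc_k<T> <<<grid, bs>>>(opts, d_ptrs);
			cucheck(cudaGetLastError());
			cucheck(cudaStreamSynchronize(0));
			if(!check_nz(d_ptrs, 0, nptrs, opts)) {
				fprintf(stderr, "cannot allocate enough memory\n");
				exit(-1);
			}
			// free everything again before the next try
			simple_free_k<T> <<<grid, bs>>>(opts, d_ptrs);
			cucheck(cudaGetLastError());
			cucheck(cudaStreamSynchronize(0));
		}
		cucheck(cudaFree(d_ptrs));
	}  // operator()
};  // SimpleTest

int main(int argc, char **argv) {
	CommonOpts opts(true);
	run_test<SimpleTest>(argc, argv, opts);
	return 0;
}  // main

Such a test would get its own subdirectory under tst/perf with a two-line makefile that sets NAME and includes ../../common.mk, matching the existing tests.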