├── Makefile.am ├── bootstrap ├── .gitmodules ├── benchmark ├── serial │ └── Makefile.am ├── Makefile.am ├── cilkplus │ └── Makefile.am └── tbb │ └── Makefile.am ├── src ├── stats.c ├── pool.h ├── deque.h ├── stack.h ├── serial.h ├── mutex.h ├── param.h ├── Makefile.am ├── cilkplus.h ├── deque.c ├── mutex.c ├── safe.h ├── tbb.h ├── sync.h ├── stats.h ├── param.c ├── fibrile.h ├── fibril.h ├── fibrili.h ├── runtime.c ├── pool.c ├── stack.c ├── debug.h ├── fibrili.c └── fork.h ├── m4 ├── fibril.m4 └── acx_pthread.m4 ├── test ├── Makefile.am ├── fib.c ├── nqueens.c ├── quicksort.c ├── integrate.c ├── test.h ├── matmul.c ├── knapsack.c ├── heat.c ├── fft.c ├── rectmul.c ├── lu.c ├── cholesky.c └── strassen.c ├── README.md ├── .gitignore ├── LICENSE └── configure.ac /Makefile.am: -------------------------------------------------------------------------------- 1 | ACLOCAL_AMFLAGS = -I m4 2 | SUBDIRS = src test benchmark 3 | -------------------------------------------------------------------------------- /bootstrap: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | autoreconf --install 4 | automake --add-missing --copy 5 | 6 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "Hoard"] 2 | path = Hoard 3 | url = https://github.com/chaoran/Hoard 4 | -------------------------------------------------------------------------------- /benchmark/serial/Makefile.am: -------------------------------------------------------------------------------- 1 | include $(srcdir)/../Makefile.am 2 | AM_CPPFLAGS += -DFIBRIL_SERIAL 3 | -------------------------------------------------------------------------------- /benchmark/Makefile.am: -------------------------------------------------------------------------------- 1 | VPATH = $(top_srcdir)/test 2 | include $(top_srcdir)/test/Makefile.am 3 | AM_CPPFLAGS 
+= -DBENCHMARK 4 | -------------------------------------------------------------------------------- /src/stats.c: -------------------------------------------------------------------------------- 1 | #include "stats.h" 2 | 3 | #ifdef FIBRIL_STATS 4 | 5 | struct _stats_counter_t _stats_table[STATS_LAST_ENTRY]; 6 | 7 | #endif 8 | -------------------------------------------------------------------------------- /benchmark/cilkplus/Makefile.am: -------------------------------------------------------------------------------- 1 | include $(srcdir)/../Makefile.am 2 | AM_CPPFLAGS += -DFIBRIL_CILKPLUS 3 | AM_CFLAGS = -fcilkplus 4 | AM_LDFLAGS += -lcilkrts 5 | -------------------------------------------------------------------------------- /src/pool.h: -------------------------------------------------------------------------------- 1 | #ifndef POOL_H 2 | #define POOL_H 3 | 4 | void pool_put(void * stack); 5 | void * pool_take(); 6 | 7 | #endif /* end of include guard: POOL_H */ 8 | -------------------------------------------------------------------------------- /benchmark/tbb/Makefile.am: -------------------------------------------------------------------------------- 1 | include $(srcdir)/../Makefile.am 2 | AM_CFLAGS = -std=c++11 3 | AM_CPPFLAGS += -x c++ -DFIBRIL_TBB -fpermissive 4 | AM_LDFLAGS += -ltbb -lstdc++ 5 | -------------------------------------------------------------------------------- /src/deque.h: -------------------------------------------------------------------------------- 1 | #ifndef DEQUE_H 2 | #define DEQUE_H 3 | 4 | #include "fibrili.h" 5 | 6 | typedef struct _fibrili_deque_t deque_t; 7 | 8 | struct _fibril_t * deque_steal(deque_t * deq); 9 | 10 | #endif /* end of include guard: DEQUE_H */ 11 | -------------------------------------------------------------------------------- /src/stack.h: -------------------------------------------------------------------------------- 1 | #ifndef STACK_H 2 | #define STACK_H 3 | 4 | #include "fibrili.h" 5 | 6 | void 
#ifndef FIBRIL_SERIAL_H
#define FIBRIL_SERIAL_H

/**
 * Serial back end of the fibril API: every fork is an ordinary function
 * call, and init/join/runtime setup expand to nothing.
 */

/** Function annotation; expands to nothing in the serial build. */
#define fibril
/** Frame handle; a placeholder int that is allowed to go unused. */
#define fibril_t __attribute__((unused)) int
#define fibril_init(fp)
#define fibril_join(fp)

/** Fork without a return value: call fn with the parenthesized list ag. */
#define fibril_fork_nrt(fp, fn, ag) (fn ag)
/**
 * Fork with a return value: store fn's result through rtp.
 * rtp is parenthesized so any pointer expression (e.g. `c ? &a : &b`)
 * is dereferenced as a whole.
 */
#define fibril_fork_wrt(fp, rtp, fn, ag) (*(rtp) = fn ag)

#define fibril_rt_init(n)
#define fibril_rt_exit()
/** The serial build always reports one processor. */
#define fibril_rt_nprocs(n) (1)

#endif /* end of include guard: FIBRIL_SERIAL_H */
#ifndef PARAM_H
#define PARAM_H

#include <stddef.h>

/**
 * Runtime parameters. They are defined in param.c and assigned by
 * param_init().
 */
extern size_t PARAM_PAGE_SIZE;
extern void * PARAM_STACK_ADDR;
extern size_t PARAM_STACK_SIZE;
extern int PARAM_NPROCS;

/*
 * Page-alignment helpers built on a (PARAM_PAGE_SIZE - 1) mask.
 * NOTE(review): the mask is only meaningful if PARAM_PAGE_SIZE is a power
 * of two -- confirm for the supported platforms.
 */
#define PAGE_ALIGN_DOWN(x) ((void *) ((size_t) (x) & ~(PARAM_PAGE_SIZE - 1)))
#define PAGE_ALIGNED(x) (0 == ((size_t) (x) & (PARAM_PAGE_SIZE - 1)))

/** Resolve the number of worker processors to use for a request of n. */
extern int param_nprocs(int n);
/**
 * Initialize all PARAM_* values. The prototype now matches the definition
 * in param.c, which takes the requested processor count.
 */
extern void param_init(int n);

#endif /* end of include guard: PARAM_H */
(__cilkrts_get_nworkers()) 17 | 18 | #endif /* end of include guard: CILKPLUS_H */ 19 | -------------------------------------------------------------------------------- /test/Makefile.am: -------------------------------------------------------------------------------- 1 | AM_CPPFLAGS = -I$(includedir) 2 | AM_LDFLAGS = -L$(libdir) -l$(PACKAGE) 3 | 4 | check_PROGRAMS = \ 5 | cholesky \ 6 | fft \ 7 | fib \ 8 | heat \ 9 | integrate \ 10 | knapsack \ 11 | lu \ 12 | matmul \ 13 | nqueens \ 14 | quicksort \ 15 | rectmul \ 16 | strassen 17 | 18 | cholesky_LDADD = -lm 19 | fft_LDADD = -lm 20 | heat_LDADD = -lm 21 | lu_LDADD = -lm 22 | strassen_LDADD = -lm 23 | 24 | TESTS = $(check_PROGRAMS) 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # fibril 2 | 3 | ## Install 4 | 5 | ``` 6 | ./bootstrap 7 | ./configure 8 | make 9 | make install 10 | ``` 11 | 12 | ## Test 13 | ``` 14 | make check 15 | ``` 16 | 17 | ## Benchmark 18 | By default, `make check` will run standard tests AND benchmarks. To run benchmarks only, 19 | 20 | ``` 21 | cd benchmark 22 | make check 23 | ``` 24 | 25 | To run the benchmarks with serial version, do 26 | ``` 27 | cd benchmark/serial 28 | make check 29 | ``` 30 | 31 | You can also compare the performance of **fibril** with **Intel CilkPlus**, or **Intel Threading Building Blocks**. To run these versions, you have to have a compiler that supports these frameworks. GCC 5+ supports Intel CilkPlus natively. 
/**
 * Try to steal the oldest entry (at `head`) from another thread's deque.
 *
 * @param deq  The victim deque.
 * @return     The stolen frame, or NULL if the deque had nothing to steal.
 *
 * On success the frame's own lock is still held when this function
 * returns; it is not released anywhere in this function.
 * NOTE(review): presumably the caller releases frptr->lock after it has
 * finished setting up the steal -- confirm against the call site.
 */
struct _fibril_t * deque_steal(deque_t * deq)
{
  /* Cheap unlocked check: skip the lock when the deque looks empty. */
  if (deq->head >= deq->tail) return NULL;

  sync_lock(deq->lock);

  /* Claim the head slot first, then re-check against tail after a fence. */
  int head = deq->head++;

  sync_fence();

  if (head >= deq->tail) {
    /* Nothing left at this slot: undo the claim and give up. */
    deq->head--;
    sync_unlock(deq->lock);

    return NULL;
  }

  struct _fibril_t * frptr = deq->buff[head];
  DEBUG_ASSERT(frptr != NULL);

  /* The frame lock is taken while the deque lock is still held. */
  sync_lock(frptr->lock);
  int count = frptr->count;

  if (count < 0) {
    /*
     * A negative count is the value fibril_init() leaves behind, so this
     * is the first steal from this frame: start the count at 1 and record
     * the victim deque's stack pointer in the frame.
     */
    frptr->count = 1;
    frptr->stack.ptr = deq->stack;
  }
  else {
    /* Frame was stolen from before: one more outstanding child. */
    frptr->count = count + 1;
  }

  /* Only the deque lock is dropped here; frptr->lock stays held. */
  sync_unlock(deq->lock);
  return frptr;
}
29 | if (node == sync_cas(mutex, node, NULL)) { 30 | return; 31 | } 32 | 33 | spin_wait(node->next); 34 | } 35 | 36 | node->next->flag = MUTEX_LOCKED; 37 | } 38 | 39 | -------------------------------------------------------------------------------- /src/safe.h: -------------------------------------------------------------------------------- 1 | #ifndef SAFE_H 2 | #define SAFE_H 3 | 4 | #include "debug.h" 5 | 6 | #ifndef DISABLE_SAFE 7 | 8 | #define SAFE_STRINGIFY(x) #x 9 | #define SAFE_TOSTRING(x) SAFE_STRINGIFY(x) 10 | #define SAFE_AT __FILE__ ":" SAFE_TOSTRING(__LINE__) ": " 11 | 12 | #define SAFE_ASSERT(cond) do { \ 13 | if (!(cond)) { \ 14 | DEBUG_DUMP(0, "error: " SAFE_AT "%m"); \ 15 | DEBUG_BREAK(!(cond)); \ 16 | } \ 17 | } while (0) 18 | 19 | #else 20 | #define SAFE_ASSERT(...) 21 | #endif 22 | 23 | #include 24 | 25 | #define SAFE_NNCALL(call) do { \ 26 | intptr_t ret = (intptr_t) (call); \ 27 | SAFE_ASSERT(ret >= 0); \ 28 | } while (0) 29 | 30 | #define SAFE_NZCALL(call) do { \ 31 | intptr_t ret = (intptr_t) (call); \ 32 | SAFE_ASSERT(ret != 0); \ 33 | } while (0) 34 | 35 | #define SAFE_RZCALL(call) do { \ 36 | intptr_t ret = (intptr_t) (call); \ 37 | SAFE_ASSERT(ret == 0); \ 38 | } while (0) 39 | 40 | #endif /* end of include guard: SAFE_H */ 41 | -------------------------------------------------------------------------------- /src/tbb.h: -------------------------------------------------------------------------------- 1 | #ifndef TBB_H 2 | #define TBB_H 3 | 4 | #include 5 | #include 6 | 7 | #define fibril 8 | #define fibril_t tbb::task_group 9 | #define fibril_init(fp) 10 | #define fibril_join(fp) (fp)->wait() 11 | 12 | #define fibril_fork_nrt(fp, fn, ag) (fp)->run([=]{ fn ag; }) 13 | #define fibril_fork_wrt(fp, rtp, fn, ag) do { \ 14 | __typeof__(rtp) pt = rtp; \ 15 | (fp)->run([=]{ *pt = fn ag; }); \ 16 | } while (0) 17 | 18 | extern "C" { 19 | extern int PARAM_NPROCS; 20 | extern int fibril_rt_nprocs(); 21 | } 22 | 23 | #define fibril_rt_init(n) \ 24 | 
do { \ 25 | int max_nprocs = fibril_rt_nprocs(); \ 26 | if (n > 0 && n <= max_nprocs) { \ 27 | PARAM_NPROCS = n; \ 28 | } else { \ 29 | PARAM_NPROCS = max_nprocs; \ 30 | } \ 31 | } while(0); \ 32 | tbb::task_scheduler_init _fibril_rt_init(PARAM_NPROCS) 33 | 34 | #define fibril_rt_exit() 35 | 36 | #endif /* end of include guard: TBB_H */ 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Chaoran Yang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /test/nqueens.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "test.h" 3 | 4 | int n = 14; 5 | int m; 6 | 7 | fibril static int nqueens(const int * a, int n, int d, int i) 8 | { 9 | int aa[d + 1]; 10 | int j; 11 | 12 | for (j = 0; j < d; ++j) { 13 | aa[j] = a[j]; 14 | 15 | int diff = a[j] - i; 16 | int dist = d - j; 17 | 18 | if (diff == 0 || dist == diff || dist + diff == 0) return 0; 19 | } 20 | 21 | if (d >= 0) aa[d] = i; 22 | if (++d == n) return 1; 23 | 24 | int res[n]; 25 | a = aa; 26 | 27 | fibril_t fr; 28 | fibril_init(&fr); 29 | 30 | for (i = 0; i < n; ++i) { 31 | fibril_fork(&fr, &res[i], nqueens, (a, n, d, i)); 32 | } 33 | 34 | fibril_join(&fr); 35 | 36 | int sum = 0; 37 | 38 | for (i = 0; i < n; ++i) { 39 | sum += res[i]; 40 | } 41 | 42 | return sum; 43 | } 44 | 45 | void init() {} 46 | void prep() {} 47 | 48 | void test() 49 | { 50 | m = nqueens(NULL, n, -1, 0); 51 | } 52 | 53 | int verify() 54 | { 55 | static int res[16] = { 56 | 1, 0, 0, 2, 10, 4, 40, 92, 352, 724, 2680, 57 | 14200, 73712, 365596, 2279184, 14772512 58 | }; 59 | 60 | int failed; 61 | 62 | if (failed = (m != res[n - 1])) { 63 | printf("nqueens(%d)=%d (expected %d)\n", n, m, res[n - 1]); 64 | } 65 | 66 | return failed; 67 | } 68 | 69 | -------------------------------------------------------------------------------- /src/sync.h: -------------------------------------------------------------------------------- 1 | #ifndef SYNC_H 2 | #define SYNC_H 3 | 4 | #include "fibrili.h" 5 | 6 | #define sync_fence() fibrili_fence() 7 | #define sync_lock(lock) fibrili_lock(lock) 8 | #define sync_unlock(lock) fibrili_unlock(lock) 9 | 10 | #if defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ > 7 11 | 12 | #define sync_fadd(val, n) __atomic_fetch_add(&(val), n, __ATOMIC_ACQ_REL) 13 | #define sync_cas(ptr, cmp, val) __sync_val_compare_and_swap(ptr, cmp, 
/**
 * Spin until `nprocs` threads have called this function (sense-reversing
 * barrier).
 *
 * @param nprocs  Number of threads that must arrive before any is released.
 *
 * NOTE(review): the counters are function-local statics of a `static
 * inline` function defined in a header, so each translation unit that
 * calls this gets its own barrier state -- all participants must call it
 * from the same translation unit.
 */
static inline void sync_barrier(int nprocs)
{
  static volatile int _count;                 /* arrivals in this round */
  static volatile int _sense;                 /* sense of the last completed round */
  static __thread volatile int _local_sense;  /* per-thread: sense this thread last passed */

  /* This round completes when the shared sense flips to this value. */
  int sense = !_local_sense;

  /* sync_fadd returns the previous count, so the last arriver sees nprocs - 1. */
  if (sync_fadd(_count, 1) == nprocs - 1) {
    /* Reset the counter before publishing the new sense to the waiters. */
    _count = 0;
    _sense = sense;
  }

  /* Busy-wait until the last arriver publishes the new sense. */
  while (_sense != sense);
  _local_sense = sense;
  sync_fence();
}
/** Add n to the running total kept in the `peak` field of counter e. */
#define STATS_COUNT(e, n) do { \
  sync_fadd(_stats_table[e].peak, n); \
} while (0)

/**
 * Add n to the current value of counter e and raise its recorded peak to
 * the new current value if that is higher.
 *
 * The new current value is (old + n), not (old + 1), and the CAS has
 * succeeded only when it returns the value we expected; on any other
 * return another thread changed the peak and the comparison is retried.
 */
#define STATS_INC(e, n) do { \
  long _stats_now = sync_fadd(_stats_table[e].curr, n) + (n); \
  while (1) { \
    size_t _stats_peak = _stats_table[e].peak; \
    if (_stats_now < 0 || _stats_peak >= (size_t) _stats_now) break; \
    if (sync_cas(&_stats_table[e].peak, _stats_peak, (size_t) _stats_now) \
        == _stats_peak) break; \
  } \
} while (0)

/** Subtract n from the current value of counter e. */
#define STATS_DEC(e, n) do { \
  sync_fadd(_stats_table[e].curr, -(n)); \
} while (0)

/**
 * Publish the peak of counter e as environment variable FIBRIL_<e>.
 * `peak` is a size_t, so it is formatted with %zu.
 */
#define STATS_EXPORT(e) do { \
  char tmp[32]; \
  snprintf(tmp, sizeof tmp, "%zu", _stats_table[e].peak); \
  setenv("FIBRIL_" #e, tmp, 1); \
} while (0)
/**
 * Resolve how many processors the runtime should use.
 *
 * @param n  > 0: use exactly n (capped at the number of online CPUs).
 *           0 (FIBRIL_NPROCS): take the value from the FIBRIL_NPROCS
 *             environment variable.
 *           < 0 (FIBRIL_NPROCS_ONLN): use every online CPU; the
 *             environment is not consulted.
 * @return   A value in [1, number of online CPUs]. Any request that is
 *           missing, non-positive or too large yields the number of
 *           online CPUs.
 */
int param_nprocs(int n) {
  int max_nprocs = (int) sysconf(_SC_NPROCESSORS_ONLN);

  /* sysconf() reports failure as -1; never hand that back to the caller. */
  if (max_nprocs < 1) max_nprocs = 1;

  int nprocs = n;

  /* Only an explicit 0 defers to the environment. */
  if (n == 0) {
    const char * env = getenv("FIBRIL_NPROCS");

    if (env) {
      /*
       * strtol() instead of atoi(): out-of-range input is well defined.
       * Text without a leading number parses as 0 and is ignored below.
       */
      long val = strtol(env, NULL, 10);

      if (val > 0) {
        nprocs = (val > max_nprocs) ? max_nprocs : (int) val;
      }
    }
  }

  /**
   * Make sure nprocs is positive and less than or equal to
   * _SC_NPROCESSORS_ONLN.
   */
  if (nprocs <= 0 || nprocs > max_nprocs) {
    nprocs = max_nprocs;
  }

  return nprocs;
}
AC_FUNC_MMAP
# AC_CHECK_FUNCS takes a whitespace-separated list; with a comma the first
# function probed would be the non-existent "mmap,".
AC_CHECK_FUNCS([mmap madvise])

AC_CONFIG_FILES([Makefile
                 src/Makefile
                 test/Makefile
                 benchmark/Makefile
                 benchmark/cilkplus/Makefile
                 benchmark/tbb/Makefile
                 benchmark/serial/Makefile])
AC_OUTPUT
/**
 * fibril_fork_wrt: fork a call to `fn` with the parenthesized argument
 * list `ag` and store its return value through `rtp`.
 *
 * @param fp   Pointer to the fibril_t frame of the forking function.
 * @param rtp  Pointer that receives fn's return value.
 * @param fn   Function to call.
 * @param ag   Parenthesized argument list for fn.
 *
 * The macro defines a nested function named _fibril_<fn>_fork inside the
 * do-block and calls it immediately through fibrili_membar(). The nested
 * function pushes the frame with fibrili_push(), runs fn and writes the
 * result through p, then calls fibrili_pop(); when that returns 0 it calls
 * fibrili_resume(f) instead of returning normally.
 *
 * NOTE(review): _fibril_defs, _fibril_args and _fibril_expand are not
 * defined in this header; presumably they come from fork.h and turn `ag`
 * into a parameter list, an argument list and a forwarded argument list
 * respectively -- confirm there.
 */
#define fibril_fork_wrt(fp, rtp, fn, ag) do { \
  __attribute__((noinline, hot, optimize(3))) \
  void _fibril_##fn##_fork(_fibril_defs ag fibril_t * f, __typeof__(rtp) p) { \
    fibrili_push(f); \
    *p = fn(_fibril_args ag); \
    if (!fibrili_pop()) fibrili_resume(f); \
  } \
  fibrili_membar(_fibril_##fn##_fork(_fibril_expand ag fp, rtp)); \
} while (0)
/** Convergence threshold: refinement stops once two successive estimates agree to within this. */
static const double epsilon = 1.0e-9;

/** Integrand: (x^2 + 1) * x. */
static double f(double x)
{
  return (x * x + 1.0) * x;
}

/**
 * Serial adaptive trapezoid integration of f over [x1, x2].
 *
 * @param x1, y1  Left end point and f(x1).
 * @param x2, y2  Right end point and f(x2).
 * @param area    Estimate of the integral from the previous, coarser level.
 * @return        The two-panel estimate once it agrees with `area` to
 *                within epsilon; otherwise the sum of both refined halves.
 */
static
double integrate_serial(double x1, double y1, double x2, double y2, double area)
{
  const double step = (x2 - x1) / 2;
  const double xm = x1 + step;
  const double ym = f(xm);

  /* Trapezoid estimates for the left and right halves. */
  double left = (y1 + ym) / 2 * step;
  double right = (ym + y2) / 2 * step;
  const double refined = left + right;

  const int converged =
      (refined - area < epsilon) && (area - refined < epsilon);

  if (!converged) {
    /* Refine each half, seeding it with its own coarse estimate. */
    left = integrate_serial(x1, y1, xm, ym, left);
    right = integrate_serial(xm, ym, x2, y2, right);
    return left + right;
  }

  return refined;
}
/**
 * Memory fence and spin-lock primitives.
 *
 * fibrili_fence()   full (sequentially consistent) memory fence.
 * fibrili_lock(l)   spin on the byte lvalue l: execute "pause", then
 *                   test-and-set, repeating while the previous value was set.
 * fibrili_unlock(l) clear l with release semantics.
 *
 * The __atomic builtins are used on GCC 4.8 and newer. The version test must
 * compare the major number first: checking only __GNUC_MINOR__ > 7 would
 * reject e.g. GCC 5.0 .. 5.7 and every later x.0 .. x.7 release. Other
 * compilers fall back to the legacy __sync builtins on x86-64.
 */
#if defined(__GNUC__) && \
  (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 7))

#define fibrili_fence() __atomic_thread_fence(__ATOMIC_SEQ_CST)
#define fibrili_lock(l) do { \
  __asm__ ( "pause" : : : "memory" ); \
} while (__atomic_test_and_set(&(l), __ATOMIC_ACQUIRE))
#define fibrili_unlock(l) __atomic_clear(&(l), __ATOMIC_RELEASE)

#else
#if defined(__x86_64__) || defined(_M_X64_)

#define fibrili_fence() __sync_synchronize()
#define fibrili_lock(l) do { \
  __asm__ ( "pause" ::: "memory" ); \
} while (__sync_lock_test_and_set(&(l), 1))
#define fibrili_unlock(l) __sync_lock_release(&(l))

#endif
#endif
/**
 * Try to remove the entry at the tail of the calling thread's deque
 * (fibrili_deq).
 * @return 1 if the tail entry was claimed; 0 if the deque was empty, in which
 *         case head and tail may have been reset to 0.
 */
__attribute__((hot)) static
int fibrili_pop(void)
{
  int tail = fibrili_deq.tail;

  /** Empty deque: nothing to pop, no fence or lock needed. */
  if (tail == 0) return 0;

  /** Tentatively claim the last slot by publishing the decremented tail. */
  fibrili_deq.tail = --tail;

  /** Order the tail store above before the head load below. */
  fibrili_fence();

  if (fibrili_deq.head > tail) {
    /**
     * head has moved past the claimed slot: undo the claim and re-check
     * under the deque lock.
     * NOTE(review): presumably head is advanced by thieves in deque_steal,
     * which is not visible here -- confirm.
     */
    fibrili_deq.tail = tail + 1;

    fibrili_lock(fibrili_deq.lock);

    if (fibrili_deq.head > tail) {
      /** Still past the slot while holding the lock: reset to empty. */
      fibrili_deq.head = 0;
      fibrili_deq.tail = 0;

      fibrili_unlock(fibrili_deq.lock);
      return 0;
    }

    /** The slot is still ours: commit the decremented tail. */
    fibrili_deq.tail = tail;
    fibrili_unlock(fibrili_deq.lock);
  }

  return 1;
}
PARAM_NPROCS; 35 | } 36 | } 37 | 38 | int fibril_rt_init(int n) 39 | { 40 | param_init(n); 41 | 42 | int nprocs = PARAM_NPROCS; 43 | if (nprocs <= 0) return -1; 44 | 45 | size_t stacksize = PARAM_STACK_SIZE; 46 | 47 | _procs = malloc(sizeof(pthread_t [nprocs])); 48 | _stacks = malloc(sizeof(void * [nprocs])); 49 | 50 | pthread_attr_t attrs[nprocs]; 51 | int i; 52 | 53 | for (i = 1; i < nprocs; ++i) { 54 | SAFE_RZCALL(posix_memalign(&_stacks[i], PARAM_PAGE_SIZE, stacksize)); 55 | pthread_attr_init(&attrs[i]); 56 | pthread_attr_setstack(&attrs[i], _stacks[i], stacksize); 57 | pthread_create(&_procs[i], &attrs[i], __main, (void *) (intptr_t) i); 58 | pthread_attr_destroy(&attrs[i]); 59 | } 60 | 61 | _procs[0] = pthread_self(); 62 | SAFE_RZCALL(posix_memalign(&_stacks[0], PARAM_PAGE_SIZE, stacksize)); 63 | 64 | register void * rsp asm ("r15"); 65 | rsp = _stacks[0] + stacksize; 66 | 67 | #ifdef FIBRIL_STATS 68 | register void * top asm ("rsp"); 69 | MAIN_STACK_TOP = PAGE_ALIGN_DOWN(top); 70 | #endif 71 | 72 | __asm__ ( "xchg\t%0,%%rsp" : "+r" (rsp) :: "memory" ); 73 | __main((void *) 0); 74 | __asm__ ( "xchg\t%0,%%rsp" : : "r" (rsp) : "memory" ); 75 | 76 | return 0; 77 | } 78 | 79 | int fibril_rt_exit() 80 | { 81 | fibrili_exit(_tid, PARAM_NPROCS); 82 | 83 | int i; 84 | 85 | for (i = 1; i < PARAM_NPROCS; ++i) { 86 | pthread_join(_procs[i], NULL); 87 | free(_stacks[i]); 88 | } 89 | 90 | free(_procs); 91 | free(_stacks); 92 | 93 | STATS_EXPORT(N_STEALS); 94 | STATS_EXPORT(N_SUSPENSIONS); 95 | STATS_EXPORT(N_STACKS); 96 | STATS_EXPORT(N_PAGES); 97 | 98 | return 0; 99 | } 100 | 101 | -------------------------------------------------------------------------------- /src/pool.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "safe.h" 4 | #include "sync.h" 5 | #include "mutex.h" 6 | #include "param.h" 7 | #include "stats.h" 8 | 9 | #ifndef POOL_GLOBAL_SIZE 10 | #define POOL_GLOBAL_SIZE (2048 - 3) 11 | 
#endif 12 | 13 | #ifndef POOL_LOCAL_SIZE 14 | #define POOL_LOCAL_SIZE 7 15 | #endif 16 | 17 | #ifndef POOL_CACHE_SIZE 18 | #define POOL_CACHE_SIZE 4 19 | #endif 20 | 21 | static struct { 22 | mutex_t * volatile lock; 23 | size_t volatile avail; 24 | void * buff[POOL_GLOBAL_SIZE]; 25 | } _pg __attribute__((aligned(128))); 26 | 27 | static __thread struct { 28 | size_t volatile avail; 29 | void * buff[POOL_LOCAL_SIZE]; 30 | } _pl __attribute__((aligned(128))); 31 | 32 | /** 33 | * Take a stack from the pool or allocate from heap if the pool is empty. 34 | * @return Return a stack or NULL if the pool has reached its limit. 35 | */ 36 | void * pool_take() 37 | { 38 | void * stack = NULL; 39 | 40 | /** Take a stack from the available stacks. */ 41 | if (_pl.avail > 0) { 42 | stack = _pl.buff[--_pl.avail]; 43 | } else { 44 | /** Take a stack from the parent pool. */ 45 | if (_pg.avail > 0) { 46 | mutex_t mutex; 47 | mutex_lock(&_pg.lock, &mutex); 48 | 49 | if (_pg.avail > 0) { 50 | stack = _pg.buff[--_pg.avail]; 51 | } 52 | 53 | mutex_unlock(&_pg.lock, &mutex); 54 | } 55 | 56 | if (!stack) { 57 | SAFE_RZCALL(posix_memalign(&stack, PARAM_PAGE_SIZE, PARAM_STACK_SIZE)); 58 | STATS_INC(N_STACKS, 1); 59 | 60 | #ifdef FIBRIL_STATS 61 | SAFE_NNCALL(mprotect(stack, PARAM_STACK_SIZE, PROT_NONE)); 62 | #endif 63 | } 64 | } 65 | 66 | SAFE_ASSERT(stack); 67 | return stack; 68 | } 69 | 70 | /** 71 | * Put a stack back into pool. 72 | * @param p The pool to put back into. 73 | * @param stack The stack to put back. 74 | */ 75 | void pool_put(void * stack) 76 | { 77 | SAFE_ASSERT(stack); 78 | 79 | /** If local pool does not have space, */ 80 | if (_pl.avail >= POOL_LOCAL_SIZE) { 81 | /** Try moving stacks to parent pool. */ 82 | if (_pg.avail < POOL_GLOBAL_SIZE) { 83 | mutex_t mutex; 84 | mutex_lock(&_pg.lock, &mutex); 85 | 86 | /** Keep only POOL_CACHE_SIZE stacks. 
*/ 87 | while (_pl.avail > POOL_CACHE_SIZE && _pg.avail < POOL_GLOBAL_SIZE) { 88 | _pg.buff[_pg.avail++] = _pl.buff[--_pl.avail]; 89 | } 90 | 91 | mutex_unlock(&_pg.lock, &mutex); 92 | } 93 | 94 | /** Free local pool for space. */ 95 | while (_pl.avail >= POOL_LOCAL_SIZE) { 96 | free(_pl.buff[--_pl.avail]); 97 | STATS_DEC(N_STACKS, 1); 98 | } 99 | } 100 | 101 | /** Invariant: we always put stack into local pool. */ 102 | _pl.buff[_pl.avail++] = stack; 103 | } 104 | 105 | -------------------------------------------------------------------------------- /src/stack.c: -------------------------------------------------------------------------------- 1 | #define _GNU_SOURCE 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "pool.h" 7 | #include "safe.h" 8 | #include "sync.h" 9 | #include "mutex.h" 10 | #include "param.h" 11 | #include "stack.h" 12 | #include "stats.h" 13 | 14 | #ifdef FIBRIL_STATS 15 | extern void * MAIN_STACK_TOP; 16 | 17 | void handle_segfault(int s, siginfo_t * si, void * unused) 18 | { 19 | if (si->si_code != SEGV_ACCERR) { 20 | struct sigaction default_action = { 21 | .sa_handler = SIG_DFL, 22 | .sa_sigaction = NULL, 23 | .sa_mask = 0, 24 | .sa_flags = 0, 25 | .sa_restorer = NULL 26 | }; 27 | sigaction(SIGSEGV, &default_action, NULL); 28 | return; 29 | } 30 | 31 | void * stack = fibrili_deq.stack; 32 | void * addr = PAGE_ALIGN_DOWN(si->si_addr); 33 | if (addr < stack || addr >= stack + PARAM_STACK_SIZE) return; 34 | 35 | STATS_COUNT(N_PAGES, 1); 36 | SAFE_NNCALL(mprotect(addr, PARAM_PAGE_SIZE, PROT_READ | PROT_WRITE)); 37 | } 38 | #endif 39 | 40 | void stack_init(int id) 41 | { 42 | #ifdef FIBRIL_STATS 43 | stack_t altstack = { 44 | .ss_flags = 0, 45 | .ss_size = PARAM_STACK_SIZE 46 | }; 47 | SAFE_RZCALL(posix_memalign(&altstack.ss_sp, PARAM_PAGE_SIZE, 48 | PARAM_STACK_SIZE)); 49 | SAFE_NNCALL(sigaltstack(&altstack, NULL)); 50 | 51 | struct sigaction sa = { 52 | .sa_flags = SA_SIGINFO | SA_STACK, 53 | .sa_sigaction = handle_segfault, 
54 | }; 55 | SAFE_NNCALL(sigaction(SIGSEGV, &sa, NULL)); 56 | #endif 57 | 58 | if (id == 0) { 59 | fibrili_deq.stack = PARAM_STACK_ADDR; 60 | 61 | #ifdef FIBRIL_STATS 62 | SAFE_ASSERT(MAIN_STACK_TOP >= PARAM_STACK_ADDR); 63 | SAFE_ASSERT(MAIN_STACK_TOP < (PARAM_STACK_ADDR + PARAM_STACK_SIZE)); 64 | size_t size = MAIN_STACK_TOP - PARAM_STACK_ADDR; 65 | 66 | STATS_COUNT(N_PAGES, ((PARAM_STACK_ADDR + PARAM_STACK_SIZE) - 67 | MAIN_STACK_TOP) / PARAM_PAGE_SIZE); 68 | 69 | int flags = MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE; 70 | SAFE_NNCALL(mmap(PARAM_STACK_ADDR, size, PROT_NONE, flags, -1, 0)); 71 | #endif 72 | } 73 | } 74 | 75 | void * stack_setup(struct _fibril_t * frptr) 76 | { 77 | void ** rsp = fibrili_deq.stack + PARAM_STACK_SIZE; 78 | 79 | /** Reserve 128 byte at the bottom. */ 80 | rsp -= 16; 81 | return rsp; 82 | } 83 | 84 | int stack_uninstall(struct _fibril_t * frptr) 85 | { 86 | DEBUG_ASSERT(frptr != NULL); 87 | 88 | void * addr = frptr->stack.ptr; 89 | fibrili_deq.stack = NULL; 90 | 91 | if (addr != PARAM_STACK_ADDR) { 92 | size_t size = PAGE_ALIGN_DOWN(frptr->stack.top) - addr; 93 | SAFE_NNCALL(madvise(addr, size, MADV_DONTNEED)); 94 | } 95 | 96 | return 1; 97 | } 98 | 99 | void stack_reinstall(struct _fibril_t * frptr) 100 | { 101 | DEBUG_ASSERT(frptr != NULL); 102 | 103 | void * addr = fibrili_deq.stack; 104 | SAFE_ASSERT(addr != PARAM_STACK_ADDR); 105 | 106 | if (addr) pool_put(addr); 107 | 108 | fibrili_deq.stack = frptr->stack.ptr; 109 | } 110 | 111 | -------------------------------------------------------------------------------- /src/debug.h: -------------------------------------------------------------------------------- 1 | #ifndef DEBUG_H 2 | #define DEBUG_H 3 | 4 | #if HAVE_CONFIG_H 5 | #include "config.h" 6 | #ifdef FIBRIL_DEBUG 7 | #define ENABLE_DEBUG 8 | #define DEBUG_LEVEL FIBRIL_DEBUG 9 | #endif 10 | #endif 11 | 12 | extern __thread int _tid; 13 | #define DEBUG_TID _tid 14 | 15 | #ifndef DEBUG_LEVEL 16 | #define 
/** Paste two tokens together. */
#define DEBUG_CONCAT(left, right) left##right

/**
 * DEBUG_NARG(...) expands to the number of arguments it was given (0 to 8).
 * Every argument is listed twice so that the GNU ", ## __VA_ARGS__" comma
 * deletion makes the empty case select the trailing 0.
 */
#define DEBUG_NARG(...) DEBUG_NARG_(__VA_ARGS__, ## __VA_ARGS__, \
    8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 0)
#define DEBUG_NARG_(...) DEBUG_ARG_N(__VA_ARGS__)
#define DEBUG_ARG_N(_1, _11, _2, _22, _3, _33, _4, _44, _5, _55, \
    _6, _66, _7, _77, _8, _88, N, ...) N

/**
 * DEBUG_FORMAT((var, spec), ...) expands to the string literal
 * " var=spec" repeated for each pair, for use inside a printf format.
 * One DEBUG_FORMAT_<N> exists for every count DEBUG_NARG can produce;
 * DEBUG_FORMAT_8 was missing although DEBUG_NARG counts up to 8.
 * Dumping 8 variables also needs a DEBUG_VARS_8 counterpart, which is
 * defined separately from this group.
 */
#define DEBUG_FORMAT(...) DEBUG_FORMAT_(DEBUG_NARG(__VA_ARGS__), ##__VA_ARGS__)
#define DEBUG_FORMAT_(N, ...) DEBUG_CONCAT(DEBUG_FORMAT_, N)(__VA_ARGS__)
#define DEBUG_FORMAT_0(...)
#define DEBUG_FORMAT_1(p, ...) DEBUG_FORM p
#define DEBUG_FORMAT_2(p, ...) DEBUG_FORM p DEBUG_FORMAT_1(__VA_ARGS__)
#define DEBUG_FORMAT_3(p, ...) DEBUG_FORM p DEBUG_FORMAT_2(__VA_ARGS__)
#define DEBUG_FORMAT_4(p, ...) DEBUG_FORM p DEBUG_FORMAT_3(__VA_ARGS__)
#define DEBUG_FORMAT_5(p, ...) DEBUG_FORM p DEBUG_FORMAT_4(__VA_ARGS__)
#define DEBUG_FORMAT_6(p, ...) DEBUG_FORM p DEBUG_FORMAT_5(__VA_ARGS__)
#define DEBUG_FORMAT_7(p, ...) DEBUG_FORM p DEBUG_FORMAT_6(__VA_ARGS__)
#define DEBUG_FORMAT_8(p, ...) DEBUG_FORM p DEBUG_FORMAT_7(__VA_ARGS__)
/** Format fragment for one (var, spec) pair. */
#define DEBUG_FORM(var, spec) " " #var "=" spec
/**
 * Sort an array of floats into ascending order, in place.
 * Repeated adjacent-swap passes are made until a pass performs no swap.
 * @param a Array to sort.
 * @param n Number of elements in a.
 */
static void sort(float * a, int n)
{
  int k;
  int swapped;

  do {
    swapped = 0;

    for (k = 0; k + 1 < n; ++k) {
      if (a[k + 1] < a[k]) {
        float tmp = a[k + 1];
        a[k + 1] = a[k];
        a[k] = tmp;
        swapped = 1;
      }
    }
  } while (swapped);
}
return t.tv_sec * 1000000 + t.tv_usec - val; 49 | } 50 | 51 | static void bench(const char * name, int nprocs) 52 | { 53 | static int iter = 10; 54 | float times[iter]; 55 | 56 | printf("===========================================\n"); 57 | printf(" Benchmark: %s\n", strrchr(name, '/') + 1); 58 | printf(" Input size: %d\n", n); 59 | printf(" Number of iterations: %d\n", iter); 60 | printf(" Number of processors: %d\n", nprocs); 61 | 62 | struct rusage ru; 63 | getrusage(RUSAGE_SELF, &ru); 64 | long rss = ru.ru_maxrss; 65 | long flt = ru.ru_minflt; 66 | 67 | int i; 68 | for (i = 0; i < iter; ++i) { 69 | prep(); 70 | size_t usecs = time_elapsed(0); 71 | test(); 72 | usecs = time_elapsed(usecs); 73 | times[i] = usecs / 1000000.0; 74 | printf(" #%d execution time: %f s\n", i, times[i]); 75 | } 76 | 77 | sort(times, iter); 78 | 79 | float p10 = times[1]; 80 | float p90 = times[8]; 81 | float med = times[5]; 82 | 83 | getrusage(RUSAGE_SELF, &ru); 84 | rss = ru.ru_maxrss - rss; 85 | flt = ru.ru_minflt - flt; 86 | 87 | printf(" Execution time summary:\n"); 88 | printf(" Median: %f s\n", med); 89 | printf(" 10th %%: %f s\n", p10); 90 | printf(" 90th %%: %f s\n", p90); 91 | printf(" Resources summary: \n"); 92 | printf(" Max RSS: %ld (KB)\n", ru.ru_maxrss); 93 | printf(" Runtime RSS: %ld (KB)\n", rss); 94 | printf(" # of page faults: %ld\n", flt); 95 | } 96 | 97 | #endif 98 | 99 | #include 100 | 101 | int main(int argc, const char * argv[]) 102 | { 103 | if (argc > 1 && (argc = atoi(argv[1])) > 0) { 104 | n = argc; 105 | } 106 | 107 | init(); 108 | 109 | fibril_rt_init(0); 110 | int nprocs = fibril_rt_nprocs(); 111 | 112 | #ifdef BENCHMARK 113 | bench(argv[0], nprocs); 114 | #else 115 | prep(); 116 | test(); 117 | #endif 118 | 119 | fibril_rt_exit(); 120 | 121 | #ifdef BENCHMARK 122 | #ifdef FIBRIL_STATS 123 | printf(" Statistics summary:\n"); 124 | printf(" # of steals: %s\n", getenv("FIBRIL_N_STEALS")); 125 | printf(" # of suspensions: %s\n", 
/**
 * Transfer control to the continuation recorded in a fibril frame.
 * Releases frptr->lock, then loads rsp with the given value, loads rbp with
 * frptr->stack.btm and jumps to frptr->pc. Does not return.
 * @param frptr Frame whose lock is released and whose stack.btm and pc are
 *              installed.
 * @param rsp   Value to install as the stack pointer.
 */
__attribute__((noreturn)) static
void longjmp(fibril_t * frptr, void * rsp)
{
  DEBUG_DUMP(3, "jump:", (frptr->pc, "%p"), (rsp, "%p"));
  /**
   * NOTE(review): the lock is released unconditionally; presumably every
   * caller either holds it or tolerates the clear -- confirm.
   */
  sync_unlock(frptr->lock);
  /**
   * NOTE(review): the three inputs are plain "r" operands; this assumes the
   * compiler never places %0 or %2 in rsp/rbp -- confirm.
   */
  __asm__ ( "mov\t%1,%%rsp\n\t"
            "mov\t%0,%%rbp\n\t"
            "jmp\t*%2\n\t"
            : : "r" (frptr->stack.btm), "r" (rsp), "r" (frptr->pc) : "memory");
  __builtin_unreachable();
}
/**
 * Ask the runtime to stop by publishing a stop frame in _stop, then free the
 * table of deque pointers.
 * @param id     Id of the worker calling this function.
 * @param nprocs Number of workers, passed to sync_barrier().
 */
void fibrili_exit(int id, int nprocs)
{
  fibril_t fr;

  if (id != 0) {
    /**
     * Not on worker 0: publish an initialized frame as _stop and enter
     * fibrili_join, which records the return address in fr.pc and calls
     * fibrili_resume.
     * NOTE(review): presumably execution continues here after worker 0 runs
     * longjmp(_stop, ...) at the end of schedule -- confirm.
     */
    fibril_init(&fr);
    _stop = &fr;
    DEBUG_DUMP(2, "proc_stop:", (_stop, "%p"), (fibrili_deq.stack, "%p"));
    fibrili_membar(fibrili_join(_stop));
  } else {
    /**
     * On worker 0: a non-NULL _stop ends the steal loop in schedule; then
     * wait at the barrier. fr is not initialized in this branch.
     */
    _stop = &fr;
    sync_barrier(nprocs);
  }

  free(_deqs);
}
/**
 * _fibril_defs(args...) expands to one declaration per argument, each of the
 * form "__typeof__(arg) aK," (note the trailing comma), where K counts down
 * from the number of arguments to 1. For example, _fibril_defs(x, y) expands
 * to "__typeof__(x) a2, __typeof__(y) a1,". With no arguments it expands to
 * nothing. The argument count comes from _fibril_nth and selects the
 * matching _fibril_defs_<N> via _fibril_concat; up to 16 arguments are
 * supported.
 */
#define _fibril_defs(...) \
  _fibril_defs_(_fibril_nth(__VA_ARGS__), ## __VA_ARGS__)
#define _fibril_defs_(n, ...) \
  _fibril_concat(_fibril_defs_, n)(__VA_ARGS__)
#define _fibril_defs_16(a,...) __typeof__(a) a16,_fibril_defs_15(__VA_ARGS__)
#define _fibril_defs_15(a,...) __typeof__(a) a15,_fibril_defs_14(__VA_ARGS__)
#define _fibril_defs_14(a,...) __typeof__(a) a14,_fibril_defs_13(__VA_ARGS__)
#define _fibril_defs_13(a,...) __typeof__(a) a13,_fibril_defs_12(__VA_ARGS__)
#define _fibril_defs_12(a,...) __typeof__(a) a12,_fibril_defs_11(__VA_ARGS__)
#define _fibril_defs_11(a,...) __typeof__(a) a11,_fibril_defs_10(__VA_ARGS__)
#define _fibril_defs_10(a,...) __typeof__(a) a10,_fibril_defs_9 (__VA_ARGS__)
#define _fibril_defs_9(a, ...) __typeof__(a) a9, _fibril_defs_8 (__VA_ARGS__)
#define _fibril_defs_8(a, ...) __typeof__(a) a8, _fibril_defs_7 (__VA_ARGS__)
#define _fibril_defs_7(a, ...) __typeof__(a) a7, _fibril_defs_6 (__VA_ARGS__)
#define _fibril_defs_6(a, ...) __typeof__(a) a6, _fibril_defs_5 (__VA_ARGS__)
#define _fibril_defs_5(a, ...) __typeof__(a) a5, _fibril_defs_4 (__VA_ARGS__)
#define _fibril_defs_4(a, ...) __typeof__(a) a4, _fibril_defs_3 (__VA_ARGS__)
#define _fibril_defs_3(a, ...) __typeof__(a) a3, _fibril_defs_2 (__VA_ARGS__)
#define _fibril_defs_2(a, ...) __typeof__(a) a2, _fibril_defs_1 (__VA_ARGS__)
#define _fibril_defs_1(a) __typeof__(a) a1,
#define _fibril_defs_0()
a15,_fibril_args_14(__VA_ARGS__) 32 | #define _fibril_args_14(a,...) a14,_fibril_args_13(__VA_ARGS__) 33 | #define _fibril_args_13(a,...) a13,_fibril_args_12(__VA_ARGS__) 34 | #define _fibril_args_12(a,...) a12,_fibril_args_11(__VA_ARGS__) 35 | #define _fibril_args_11(a,...) a11,_fibril_args_10(__VA_ARGS__) 36 | #define _fibril_args_10(a,...) a10,_fibril_args_9 (__VA_ARGS__) 37 | #define _fibril_args_9(a, ...) a9, _fibril_args_8 (__VA_ARGS__) 38 | #define _fibril_args_8(a, ...) a8, _fibril_args_7 (__VA_ARGS__) 39 | #define _fibril_args_7(a, ...) a7, _fibril_args_6 (__VA_ARGS__) 40 | #define _fibril_args_6(a, ...) a6, _fibril_args_5 (__VA_ARGS__) 41 | #define _fibril_args_5(a, ...) a5, _fibril_args_4 (__VA_ARGS__) 42 | #define _fibril_args_4(a, ...) a4, _fibril_args_3 (__VA_ARGS__) 43 | #define _fibril_args_3(a, ...) a3, _fibril_args_2 (__VA_ARGS__) 44 | #define _fibril_args_2(a, ...) a2, _fibril_args_1 (__VA_ARGS__) 45 | #define _fibril_args_1(a) a1 46 | #define _fibril_args_0() 47 | 48 | #define _fibril_expand(...) \ 49 | _fibril_expand_(_fibril_nth(__VA_ARGS__), ## __VA_ARGS__) 50 | #define _fibril_expand_(n, ...) \ 51 | _fibril_concat(_fibril_expand_, n)(__VA_ARGS__) 52 | #define _fibril_expand_16(...) __VA_ARGS__, 53 | #define _fibril_expand_15(...) __VA_ARGS__, 54 | #define _fibril_expand_14(...) __VA_ARGS__, 55 | #define _fibril_expand_13(...) __VA_ARGS__, 56 | #define _fibril_expand_12(...) __VA_ARGS__, 57 | #define _fibril_expand_11(...) __VA_ARGS__, 58 | #define _fibril_expand_10(...) __VA_ARGS__, 59 | #define _fibril_expand_9( ...) __VA_ARGS__, 60 | #define _fibril_expand_8( ...) __VA_ARGS__, 61 | #define _fibril_expand_7( ...) __VA_ARGS__, 62 | #define _fibril_expand_6( ...) __VA_ARGS__, 63 | #define _fibril_expand_5( ...) __VA_ARGS__, 64 | #define _fibril_expand_4( ...) __VA_ARGS__, 65 | #define _fibril_expand_3( ...) __VA_ARGS__, 66 | #define _fibril_expand_2( ...) __VA_ARGS__, 67 | #define _fibril_expand_1( ...) 
/**
 * Run the two compute() calls that accumulate into the block of c at
 * (ci, cj): the first uses a and b at their given offsets, the second
 * shifts a's column offset and b's row offset by n.
 */
static void compute00(float * a, int ai, int aj, float * b, int bi, int bj,
    float ** c, int ci, int cj, int n)
{
  const int a_col = aj + n;
  const int b_row = bi + n;

  compute(a, ai, aj, b, bi, bj, c, ci, cj, n);
  compute(a, ai, a_col, b, b_row, bj, c, ci, cj, n);
}
* b[b0 + bj + 1]; 59 | s11 += a[a1 + aj] * b[b0 + bj + 1]; 60 | 61 | s00 += a[a0 + aj + 1] * b[b1 + bj]; 62 | s10 += a[a1 + aj + 1] * b[b1 + bj]; 63 | s01 += a[a0 + aj + 1] * b[b1 + bj + 1]; 64 | s11 += a[a1 + aj + 1] * b[b1 + bj + 1]; 65 | 66 | c[ci] [cj] += s00; 67 | c[ci] [cj + 1] += s01; 68 | c[ci + 1][cj] += s10; 69 | c[ci + 1][cj + 1] += s11; 70 | } 71 | 72 | fibril static void compute(float * a, int ai, int aj, float * b, int bi, int bj, 73 | float ** c, int ci, int cj, int n) 74 | { 75 | if (n == 2) { 76 | multiply(a, ai, aj, b, bi, bj, c, ci, cj); 77 | } else { 78 | int h = n / 2; 79 | 80 | fibril_t fr; 81 | fibril_init(&fr); 82 | 83 | fibril_fork(&fr, compute00, (a, ai, aj, b, bi, bj, c, ci, cj, h)); 84 | fibril_fork(&fr, compute10, (a, ai, aj, b, bi, bj, c, ci, cj, h)); 85 | fibril_fork(&fr, compute01, (a, ai, aj, b, bi, bj, c, ci, cj, h)); 86 | compute11(a, ai, aj, b, bi, bj, c, ci, cj, h); 87 | 88 | fibril_join(&fr); 89 | } 90 | } 91 | 92 | void init() 93 | { 94 | a = malloc(sizeof(float [n * n])); 95 | b = malloc(sizeof(float [n * n])); 96 | c = malloc(sizeof(float * [n])); 97 | 98 | int i, j; 99 | for (i = 0; i < n; ++i) { 100 | c[i] = malloc(sizeof(float [n])); 101 | } 102 | 103 | for (i = 0; i < n * n; ++i) { 104 | a[i] = 1.0F; 105 | } 106 | 107 | for (i = 0; i < n * n; ++i) { 108 | b[i] = 1.0F; 109 | } 110 | } 111 | 112 | void prep() 113 | { 114 | int i, j; 115 | 116 | for (i = 0; i < n; ++i) { 117 | for (j = 0; j < n; ++j) { 118 | c[i][j] = 0; 119 | } 120 | } 121 | } 122 | 123 | void test() 124 | { 125 | compute(a, 0, 0, b, 0, 0, c, 0, 0, n); 126 | } 127 | 128 | int verify() { 129 | int i, j; 130 | 131 | for (i = 0; i < n; ++i) { 132 | for (j = 0; j < n; j++) { 133 | if (c[i][j] != n) { 134 | printf("c[%d][%d]=%f (expected %f)\n", i, j, c[i][j], n); 135 | return 1; 136 | } 137 | } 138 | } 139 | 140 | return 0; 141 | } 142 | -------------------------------------------------------------------------------- /test/knapsack.c: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Cilk program to solve the 0-1 knapsack problem using a branch-and-bound 3 | * technique. 4 | * 5 | * Author: Matteo Frigo 6 | */ 7 | /* 8 | * Copyright (c) 2000 Massachusetts Institute of Technology 9 | * Copyright (c) 2000 Matteo Frigo 10 | * 11 | * This program is free software; you can redistribute it and/or modify 12 | * it under the terms of the GNU General Public License as published by 13 | * the Free Software Foundation; either version 2 of the License, or 14 | * (at your option) any later version. 15 | * 16 | * This program is distributed in the hope that it will be useful, 17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | * GNU General Public License for more details. 20 | * 21 | * You should have received a copy of the GNU General Public License 22 | * along with this program; if not, write to the Free Software 23 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 24 | * 25 | */ 26 | 27 | #include 28 | #include 29 | #include 30 | #include "test.h" 31 | 32 | struct item { 33 | int value; 34 | int weight; 35 | }; 36 | 37 | int n = 32; 38 | static int capacity = 900; 39 | static int sol; 40 | 41 | static struct item items[] = { 42 | { 15, 23 }, 43 | { 22, 12 }, 44 | { 17, 42 }, 45 | { 1, 13 }, 46 | { 32, 21 }, 47 | { 65, 43 }, 48 | { 23, 56 }, 49 | { 4, 7 }, 50 | { 4, 8 }, 51 | { 32, 42 }, 52 | { 51, 32 }, 53 | { 22, 12 }, 54 | { 17, 24 }, 55 | { 12, 13 }, 56 | { 23, 21 }, 57 | { 56, 47 }, 58 | { 23, 65 }, 59 | { 6, 7 }, 60 | { 4, 7 }, 61 | { 32, 42 }, 62 | { 22, 42 }, 63 | { 59, 32 }, 64 | { 23, 12 }, 65 | { 12, 24 }, 66 | { 12, 13 }, 67 | { 23, 21 }, 68 | { 39, 48 }, 69 | { 22, 65 }, 70 | { 6, 7 }, 71 | { 4, 7 }, 72 | { 33, 42 }, 73 | { 18, 53 } 74 | }; 75 | 76 | static int best_so_far = INT_MIN; 77 | 78 | static int compare(struct item *a, struct 
item *b) 79 | { 80 | double c = ((double) a->value / a->weight) - 81 | ((double) b->value / b->weight); 82 | 83 | if (c > 0) 84 | return -1; 85 | if (c < 0) 86 | return 1; 87 | return 0; 88 | } 89 | 90 | /* 91 | * return the optimal solution for n items (first is e) and 92 | * capacity c. Value so far is v. 93 | */ 94 | fibril static int knapsack(struct item *e, int c, int n, int v) 95 | { 96 | int with, without, best; 97 | double ub; 98 | 99 | /* base case: full knapsack or no items */ 100 | if (c < 0) 101 | return INT_MIN; 102 | 103 | if (n == 0 || c == 0) 104 | return v; /* feasible solution, with value v */ 105 | 106 | ub = (double) v + c * e->value / e->weight; 107 | 108 | if (ub < best_so_far) { 109 | /* prune ! */ 110 | return INT_MIN; 111 | } 112 | 113 | fibril_t fr; 114 | fibril_init(&fr); 115 | /* 116 | * compute the best solution without the current item in the knapsack 117 | */ 118 | fibril_fork(&fr, &without, knapsack, (e + 1, c, n - 1, v)); 119 | 120 | /* compute the best solution with the current item in the knapsack */ 121 | with = knapsack(e + 1, c - e->weight, n - 1, v + e->value); 122 | 123 | fibril_join(&fr); 124 | 125 | best = with > without ? with : without; 126 | 127 | /* 128 | * notice the race condition here. The program is still 129 | * correct, in the sense that the best solution so far 130 | * is at least best_so_far. Moreover best_so_far gets updated 131 | * when returning, so eventually it should get the right 132 | * value. The program is highly non-deterministic. 
133 | */ 134 | if (best > best_so_far) 135 | best_so_far = best; 136 | 137 | return best; 138 | } 139 | 140 | void init() 141 | { 142 | /* sort the items on decreasing order of value/weight */ 143 | qsort(items, n, sizeof(struct item), 144 | (int (*)(const void *, const void *)) compare); 145 | } 146 | 147 | void prep() {} 148 | 149 | void test() 150 | { 151 | sol = knapsack(items, capacity, n, 0); 152 | } 153 | 154 | int verify() 155 | { 156 | int expected = 733; 157 | 158 | if (sol != expected) { 159 | printf("sol: %d (expected: %d)\n", sol, expected); 160 | return 1; 161 | } 162 | 163 | return 0; 164 | } 165 | 166 | -------------------------------------------------------------------------------- /test/heat.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Heat diffusion (Jacobi-type iteration) 3 | * 4 | * Volker Strumpen, Boston August 1996 5 | * 6 | * Copyright (c) 1996 Massachusetts Institute of Technology 7 | * 8 | * This program is free software; you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation; either version 2 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 
17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program; if not, write to the Free Software 20 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 21 | */ 22 | #include 23 | #include 24 | #include 25 | #include "test.h" 26 | 27 | #define f(x,y) (sin(x)*sin(y)) 28 | #define randa(x,t) (0.0) 29 | #define randb(x,t) (exp(-2*(t))*sin(x)) 30 | #define randc(y,t) (0.0) 31 | #define randd(y,t) (exp(-2*(t))*sin(y)) 32 | #define solu(x,y,t) (exp(-2*(t))*sin(x)*sin(y)) 33 | 34 | int n = 4096; 35 | 36 | int nx, ny, nt; 37 | double xu, xo, yu, yo, tu, to; 38 | 39 | double dx, dy, dt; 40 | double dtdxsq, dtdysq; 41 | 42 | double ** odd; 43 | double ** even; 44 | 45 | fibril static void heat(double ** m, int il, int iu) 46 | { 47 | if (iu - il > 1) { 48 | int im = (il + iu) / 2; 49 | 50 | fibril_t fr; 51 | fibril_init(&fr); 52 | 53 | fibril_fork(&fr, heat, (m, il, im)); 54 | heat(m, im, iu); 55 | 56 | fibril_join(&fr); 57 | return; 58 | } 59 | 60 | int i = il; 61 | int j; 62 | double * row = m[i]; 63 | 64 | if (i == 0) { 65 | for (j = 0; j < ny; ++j) { 66 | row[j] = randc(yu + j * dy, 0); 67 | } 68 | } else if (i == nx - 1) { 69 | for (j = 0; j < ny; ++j) { 70 | row[j] = randd(yu + j * dy, 0); 71 | } 72 | } else { 73 | row[0] = randa(xu + i * dx, 0); 74 | for (j = 1; j < ny - 1; ++j) { 75 | row[j] = f(xu + i * dx, yu + j * dy); 76 | } 77 | row[ny - 1] = randb(xu + i * dx, 0); 78 | } 79 | } 80 | 81 | fibril void diffuse(double ** out, double ** in, int il, int iu, double t) 82 | { 83 | if (iu - il > 1) { 84 | int im = (il + iu) / 2; 85 | 86 | fibril_t fr; 87 | fibril_init(&fr); 88 | 89 | fibril_fork(&fr, diffuse, (out, in, il, im, t)); 90 | diffuse(out, in, im, iu, t); 91 | 92 | fibril_join(&fr); 93 | return; 94 | } 95 | 96 | int i = il; 97 | int j; 98 | double * row = out[i]; 99 | 100 | if (i == 0) { 101 | for (j = 0; j < ny; ++j) { 102 | row[j] = randc(yu + j * dy, t); 103 | } 104 | } else if (i == 
nx - 1) { 105 | for (j = 0; j < ny; ++j) { 106 | row[j] = randd(yu + j * dy, t); 107 | } 108 | } else { 109 | row[0] = randa(xu + i * dx, t); 110 | for (j = 1; j < ny - 1; ++j) { 111 | row[j] = in[i][j] 112 | + dtdysq * (in[i][j + 1] - 2 * in[i][j] + in[i][j - 1]) 113 | + dtdxsq * (in[i + 1][j] - 2 * in[i][j] + in[i - 1][j]); 114 | } 115 | row[ny - 1] = randb(xu + i * dx, t); 116 | } 117 | } 118 | 119 | void init() 120 | { 121 | nx = n; 122 | ny = 1024; 123 | nt = 100; 124 | xu = 0.0; 125 | xo = 1.570796326794896558; 126 | yu = 0.0; 127 | yo = 1.570796326794896558; 128 | tu = 0.0; 129 | to = 0.0000001; 130 | 131 | dx = (xo - xu) / (nx - 1); 132 | dy = (yo - yu) / (ny - 1); 133 | dt = (to - tu) / nt; 134 | 135 | dtdxsq = dt / (dx * dx); 136 | dtdysq = dt / (dy * dy); 137 | 138 | even = malloc(sizeof(double * [nx])); 139 | odd = malloc(sizeof(double * [nx])); 140 | 141 | int i; 142 | for (i = 0; i < nx; ++i) { 143 | even[i] = malloc(sizeof(double [ny])); 144 | odd [i] = malloc(sizeof(double [ny])); 145 | } 146 | } 147 | 148 | void prep() 149 | { 150 | heat(even, 0, nx); 151 | } 152 | 153 | void test() 154 | { 155 | double t = tu; 156 | int i; 157 | 158 | for (i = 1; i <= nt; i += 2) { 159 | diffuse(odd, even, 0, nx, t += dt); 160 | diffuse(even, odd, 0, nx, t += dt); 161 | } 162 | 163 | if (nt % 2) { 164 | diffuse(odd, even, 0, nx, t += dt); 165 | } 166 | } 167 | 168 | int verify() 169 | { 170 | double **mat; 171 | double mae = 0.0; 172 | double mre = 0.0; 173 | double me = 0.0; 174 | 175 | mat = nt % 2 ? 
odd : even; 176 | 177 | int a, b; 178 | 179 | for (a = 0; a < nx; ++a) { 180 | for (b = 0; b < ny; ++b) { 181 | double tmp = fabs(mat[a][b] - solu(xu + a * dx, yu + b * dy, to)); 182 | 183 | me += tmp; 184 | if (tmp > mae) mae = tmp; 185 | if (mat[a][b] != 0.0) tmp = tmp / mat[a][b]; 186 | if (tmp > mre) mre = tmp; 187 | } 188 | } 189 | 190 | me = me / (nx * ny); 191 | 192 | if (mae > 1e-12) { 193 | printf("Local maximal absolute error %10e\n", mae); 194 | return 1; 195 | } if (mre > 1e-12) { 196 | printf("Local maximal relative error %10e\n", mre); 197 | return 1; 198 | } if (me > 1e-12) { 199 | printf("Global Mean absolute error %10e\n", me); 200 | return 1; 201 | } 202 | 203 | return 0; 204 | } 205 | 206 | -------------------------------------------------------------------------------- /test/fft.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2000 Massachusetts Institute of Technology 3 | * Copyright (c) 2000 Matteo Frigo 4 | * 5 | * This program is free software; you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation; either version 2 of the License, or 8 | * (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | * GNU General Public License for more details. 
14 | * 15 | * You should have received a copy of the GNU General Public License 16 | * along with this program; if not, write to the Free Software 17 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 18 | */ 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include "test.h" 26 | #include "fft.h" 27 | 28 | #ifdef BENCHMARK 29 | int n = 26; 30 | #else 31 | int n = 12; 32 | #endif 33 | 34 | static int size; 35 | static COMPLEX *in, *out, *cp, *W; 36 | static const REAL pi = 3.1415926535897932384626434; 37 | 38 | /* 39 | * compute the W coefficients (that is, powers of the root of 1) 40 | * and store them into an array. 41 | */ 42 | fibril static void compute_w_coefficients(int n, int a, int b, COMPLEX * W) 43 | { 44 | register double twoPiOverN; 45 | register int k; 46 | register REAL s, c; 47 | 48 | if (b - a < 128) { 49 | twoPiOverN = 2.0 * pi / n; 50 | for (k = a; k <= b; ++k) { 51 | c = cos(twoPiOverN * k); 52 | c_re(W[k]) = c_re(W[n - k]) = c; 53 | s = sin(twoPiOverN * k); 54 | c_im(W[k]) = -s; 55 | c_im(W[n - k]) = s; 56 | } 57 | } else { 58 | int ab = (a + b) / 2; 59 | 60 | fibril_t fr; 61 | fibril_init(&fr); 62 | 63 | fibril_fork(&fr, compute_w_coefficients, (n, a, ab, W)); 64 | compute_w_coefficients(n, ab + 1, b, W); 65 | 66 | fibril_join(&fr); 67 | } 68 | } 69 | 70 | /* 71 | * Determine (in a stupid way) if n is divisible by eight, then by four, else 72 | * find the smallest prime factor of n. 
73 | */ 74 | static int factor(int n) 75 | { 76 | int r; 77 | 78 | if (n < 2) 79 | return 1; 80 | 81 | if (n == 64 || n == 128 || n == 256 || n == 1024 || n == 2048 82 | || n == 4096) 83 | return 8; 84 | if ((n & 15) == 0) 85 | return 16; 86 | if ((n & 7) == 0) 87 | return 8; 88 | if ((n & 3) == 0) 89 | return 4; 90 | if ((n & 1) == 0) 91 | return 2; 92 | 93 | #if 0 94 | /* radix-32 is too big --- wait for processors with more registers 95 | * :-) */ 96 | if ((n & 31) == 0 && n > 256) 97 | return 32; 98 | #endif 99 | 100 | /* try odd numbers up to n (computing the sqrt may be slower) */ 101 | for (r = 3; r < n; r += 2) 102 | if (n % r == 0) 103 | return r; 104 | 105 | /* n is prime */ 106 | return n; 107 | } 108 | 109 | fibril static void unshuffle(int a, int b, 110 | COMPLEX * in, COMPLEX * out, int r, int m) 111 | { 112 | int i, j; 113 | int r4 = r & (~0x3); 114 | const COMPLEX *ip; 115 | COMPLEX *jp; 116 | 117 | if (b - a < 16) { 118 | ip = in + a * r; 119 | for (i = a; i < b; ++i) { 120 | jp = out + i; 121 | for (j = 0; j < r4; j += 4) { 122 | jp[0] = ip[0]; 123 | jp[m] = ip[1]; 124 | jp[2 * m] = ip[2]; 125 | jp[3 * m] = ip[3]; 126 | jp += 4 * m; 127 | ip += 4; 128 | } 129 | for (; j < r; ++j) { 130 | *jp = *ip; 131 | ip++; 132 | jp += m; 133 | } 134 | } 135 | } else { 136 | int ab = (a + b) / 2; 137 | 138 | fibril_t fr; 139 | fibril_init(&fr); 140 | 141 | fibril_fork(&fr, unshuffle, (a, ab, in, out, r, m)); 142 | unshuffle(ab, b, in, out, r, m); 143 | 144 | fibril_join(&fr); 145 | } 146 | } 147 | 148 | /* 149 | * Recursive complex FFT on the n complex components of the array in: 150 | * basic Cooley-Tukey algorithm, with some improvements for 151 | * n power of two. The result is placed in the array out. n is arbitrary. 152 | * The algorithm runs in time O(n*(r1 + ... + rk)) where r1, ..., rk 153 | * are prime numbers, and r1 * r2 * ... * rk = n. 
154 | * 155 | * n: size of the input 156 | * in: pointer to input 157 | * out: pointer to output 158 | * factors: list of factors of n, precomputed 159 | * W: twiddle factors 160 | * nW: size of W, that is, size of the original transform 161 | * 162 | */ 163 | fibril static void fft_aux(int n, COMPLEX * in, COMPLEX * out, int *factors, 164 | COMPLEX * W, int nW) 165 | { 166 | int r, m; 167 | 168 | /* special cases */ 169 | if (n == 32) { 170 | fft_base_32(in, out); 171 | return; 172 | } 173 | if (n == 16) { 174 | fft_base_16(in, out); 175 | return; 176 | } 177 | if (n == 8) { 178 | fft_base_8(in, out); 179 | return; 180 | } 181 | if (n == 4) { 182 | fft_base_4(in, out); 183 | return; 184 | } 185 | if (n == 2) { 186 | fft_base_2(in, out); 187 | return; 188 | } 189 | /* the cases n == 3, n == 5, and maybe 7 should be implemented as well */ 190 | 191 | r = *factors; 192 | m = n / r; 193 | 194 | if (r < n) { 195 | /* split the DFT of length n into r DFTs of length n/r, and recurse */ 196 | if (r == 32) 197 | fft_unshuffle_32(0, m, in, out, m); 198 | else if (r == 16) 199 | fft_unshuffle_16(0, m, in, out, m); 200 | else if (r == 8) 201 | fft_unshuffle_8(0, m, in, out, m); 202 | else if (r == 4) 203 | fft_unshuffle_4(0, m, in, out, m); 204 | else if (r == 2) 205 | fft_unshuffle_2(0, m, in, out, m); 206 | else 207 | unshuffle(0, m, in, out, r, m); 208 | 209 | fibril_t fr; 210 | fibril_init(&fr); 211 | 212 | int k; 213 | for(k = 0; k < n; k += m) { 214 | fibril_fork(&fr, fft_aux, (m, out + k, in + k, factors + 1, W, nW)); 215 | } 216 | 217 | fibril_join(&fr); 218 | } 219 | 220 | /* now multiply by the twiddle factors, and perform m FFTs of length r */ 221 | if (r == 2) 222 | fft_twiddle_2(0, m, in, out, W, nW, nW / n, m); 223 | else if (r == 4) 224 | fft_twiddle_4(0, m, in, out, W, nW, nW / n, m); 225 | else if (r == 8) 226 | fft_twiddle_8(0, m, in, out, W, nW, nW / n, m); 227 | else if (r == 16) 228 | fft_twiddle_16(0, m, in, out, W, nW, nW / n, m); 229 | else if (r == 
32) 230 | fft_twiddle_32(0, m, in, out, W, nW, nW / n, m); 231 | else 232 | fft_twiddle_gen(0, m, in, out, W, nW, nW / n, r, m); 233 | 234 | return; 235 | } 236 | 237 | /* 238 | * user interface for fft_aux 239 | */ 240 | static void fft(int n, COMPLEX * in, COMPLEX * out) 241 | { 242 | int factors[40]; /* allows FFTs up to at least 3^40 */ 243 | int *p = factors; 244 | int l = n; 245 | int r; 246 | 247 | compute_w_coefficients(n, 0, n / 2, W); 248 | 249 | /** 250 | * find factors of n, first 8, then 4 and then primes in ascending 251 | * order. 252 | */ 253 | do { 254 | r = factor(l); 255 | *p++ = r; 256 | l /= r; 257 | } while (l > 1); 258 | 259 | fft_aux(n, in, out, factors, W, n); 260 | return; 261 | } 262 | 263 | /**************************************************************** 264 | * END OF FFT ALGORITHM 265 | ****************************************************************/ 266 | 267 | /* tests */ 268 | 269 | static void fft_alt(int n, COMPLEX * in, COMPLEX * out) 270 | { 271 | int i, j; 272 | COMPLEX sum; 273 | COMPLEX w; 274 | 275 | for (j = 0; j < n; ++j) { 276 | c_re(sum) = c_im(sum) = 0.0; 277 | 278 | for (i = 0; i < n; ++i) { 279 | c_re(w) = cos((2.0 * pi * (i * j % n)) / n); 280 | c_im(w) = -sin((2.0 * pi * (i * j % n)) / n); 281 | c_re(sum) += c_re(in[i]) * c_re(w) - c_im(in[i]) * c_im(w); 282 | c_im(sum) += c_im(in[i]) * c_re(w) + c_re(in[i]) * c_im(w); 283 | } 284 | 285 | out[j] = sum; 286 | } 287 | 288 | return; 289 | } 290 | 291 | void init() 292 | { 293 | size = (1 << n); 294 | out = malloc(sizeof(COMPLEX [size])); 295 | in = malloc(sizeof(COMPLEX [size])); 296 | W = malloc(sizeof(COMPLEX [size + 1])); 297 | 298 | int i; 299 | for (i = 0; i < size; ++i) { 300 | c_re(in[i]) = drand48(); 301 | c_im(in[i]) = drand48(); 302 | } 303 | } 304 | 305 | void prep() 306 | { 307 | if (cp == NULL) 308 | cp = malloc(sizeof(COMPLEX [size])); 309 | 310 | memcpy(cp, in, sizeof(COMPLEX [size])); 311 | } 312 | 313 | void test() 314 | { 315 | fft(size, cp, out); 
316 | } 317 | 318 | #ifdef BENCHMARK 319 | int verify(void) { return 0; } 320 | #else 321 | int verify(void) 322 | { 323 | COMPLEX * expect = malloc(sizeof(COMPLEX [size])); 324 | 325 | fft_alt(size, in, expect); 326 | 327 | /* compute the relative error */ 328 | double error = 0.0; 329 | 330 | int i; 331 | for (i = 0; i < size; ++i) { 332 | double a = sqrt( 333 | (c_re(out[i]) - c_re(expect[i])) * (c_re(out[i]) - c_re(expect[i])) + 334 | (c_im(out[i]) - c_im(expect[i])) * (c_im(out[i]) - c_im(expect[i]))); 335 | double d = sqrt( 336 | c_re(expect[i]) * c_re(expect[i]) + c_im(expect[i]) * c_im(expect[i])); 337 | 338 | if (d < -1.0e-10 || d > 1.0e-10) a /= d; 339 | if (a > error) error = a; 340 | } 341 | 342 | if (error > 1e-3) { 343 | printf("size=%d error=%e\n", size, error); 344 | return 1; 345 | } else { 346 | return 0; 347 | } 348 | } 349 | #endif 350 | 351 | -------------------------------------------------------------------------------- /m4/acx_pthread.m4: -------------------------------------------------------------------------------- 1 | dnl @synopsis ACX_PTHREAD([ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]]) 2 | dnl 3 | dnl @summary figure out how to build C programs using POSIX threads 4 | dnl 5 | dnl This macro figures out how to build C programs using POSIX threads. 6 | dnl It sets the PTHREAD_LIBS output variable to the threads library and 7 | dnl linker flags, and the PTHREAD_CFLAGS output variable to any special 8 | dnl C compiler flags that are needed. (The user can also force certain 9 | dnl compiler flags/libs to be tested by setting these environment 10 | dnl variables.) 11 | dnl 12 | dnl Also sets PTHREAD_CC to any special C compiler that is needed for 13 | dnl multi-threaded programs (defaults to the value of CC otherwise). 14 | dnl (This is necessary on AIX to use the special cc_r compiler alias.) 15 | dnl 16 | dnl NOTE: You are assumed to not only compile your program with these 17 | dnl flags, but also link it with them as well. e.g. 
you should link
18 | dnl with $PTHREAD_CC $CFLAGS $PTHREAD_CFLAGS $LDFLAGS ... $PTHREAD_LIBS
19 | dnl $LIBS
20 | dnl
21 | dnl If you are only building threads programs, you may wish to use
22 | dnl these variables in your default LIBS, CFLAGS, and CC:
23 | dnl
24 | dnl LIBS="$PTHREAD_LIBS $LIBS"
25 | dnl CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
26 | dnl CC="$PTHREAD_CC"
27 | dnl
28 | dnl In addition, if the PTHREAD_CREATE_JOINABLE thread-attribute
29 | dnl constant has a nonstandard name, defines PTHREAD_CREATE_JOINABLE to
30 | dnl that name (e.g. PTHREAD_CREATE_UNDETACHED on AIX).
31 | dnl
32 | dnl ACTION-IF-FOUND is a list of shell commands to run if a threads
33 | dnl library is found, and ACTION-IF-NOT-FOUND is a list of commands to
34 | dnl run it if it is not found. If ACTION-IF-FOUND is not specified, the
35 | dnl default action will define HAVE_PTHREAD.
36 | dnl
37 | dnl Please let the authors know if this macro fails on any platform, or
38 | dnl if you have any other suggestions or comments. This macro was based
39 | dnl on work by SGJ on autoconf scripts for FFTW (www.fftw.org) (with
40 | dnl help from M. Frigo), as well as ac_pthread and hb_pthread macros
41 | dnl posted by Alejandro Forero Cuervo to the autoconf macro repository.
42 | dnl We are also grateful for the helpful feedback of numerous users.
43 | dnl
44 | dnl @category InstalledPackages
45 | dnl @author Steven G. Johnson
46 | dnl @version 2006-05-29
47 | dnl @license GPLWithACException
48 |
AC_DEFUN([ACX_PTHREAD], [
AC_REQUIRE([AC_CANONICAL_HOST])
AC_LANG_SAVE
AC_LANG_C
acx_pthread_ok=no

# We used to check for pthread.h first, but this fails if pthread.h
# requires special compiler flags (e.g. on True64 or Sequent).
# It gets checked for in the link test anyway.

# First of all, check if the user has set any of the PTHREAD_LIBS,
# etcetera environment variables, and if threads linking works using
# them:
if test x"$PTHREAD_LIBS$PTHREAD_CFLAGS" != x; then
        save_CFLAGS="$CFLAGS"
        CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
        save_LIBS="$LIBS"
        LIBS="$PTHREAD_LIBS $LIBS"
        AC_MSG_CHECKING([for pthread_join in LIBS=$PTHREAD_LIBS with CFLAGS=$PTHREAD_CFLAGS])
        AC_TRY_LINK_FUNC(pthread_join, acx_pthread_ok=yes)
        AC_MSG_RESULT($acx_pthread_ok)
        if test x"$acx_pthread_ok" = xno; then
                PTHREAD_LIBS=""
                PTHREAD_CFLAGS=""
        fi
        LIBS="$save_LIBS"
        CFLAGS="$save_CFLAGS"
fi

# We must check for the threads library under a number of different
# names; the ordering is very important because some systems
# (e.g. DEC) have both -lpthread and -lpthreads, where one of the
# libraries is broken (non-POSIX).

# Create a list of thread flags to try. Items starting with a "-" are
# C compiler flags, and other items are library names, except for "none"
# which indicates that we try without any flags at all, and "pthread-config"
# which is a program returning the flags for the Pth emulation library.

acx_pthread_flags="pthreads none -Kthread -kthread lthread -pthread -pthreads -mthreads pthread --thread-safe -mt pthread-config"

# The ordering *is* (sometimes) important. Some notes on the
# individual items follow:

# pthreads: AIX (must check this before -lpthread)
# none: in case threads are in libc; should be tried before -Kthread and
#       other compiler flags to prevent continual compiler warnings
# -Kthread: Sequent (threads in libc, but -Kthread needed for pthread.h)
# -kthread: FreeBSD kernel threads (preferred to -pthread since SMP-able)
# lthread: LinuxThreads port on FreeBSD (also preferred to -pthread)
# -pthread: Linux/gcc (kernel threads), BSD/gcc (userland threads)
# -pthreads: Solaris/gcc
# -mthreads: Mingw32/gcc, Lynx/gcc
# -mt: Sun Workshop C (may only link SunOS threads [-lthread], but it
#      doesn't hurt to check since this sometimes defines pthreads too;
#      also defines -D_REENTRANT)
#      ... -mt is also the pthreads flag for HP/aCC
# pthread: Linux, etcetera
# --thread-safe: KAI C++
# pthread-config: use pthread-config program (for GNU Pth library)

case "${host_cpu}-${host_os}" in
        *solaris*)

        # On Solaris (at least, for some versions), libc contains stubbed
        # (non-functional) versions of the pthreads routines, so link-based
        # tests will erroneously succeed. (We need to link with -pthreads/-mt/
        # -lpthread.) (The stubs are missing pthread_cleanup_push, or rather
        # a function called by this macro, so we could check for that, but
        # who knows whether they'll stub that too in a future libc.) So,
        # we'll just look for -pthreads and -lpthread first:

        acx_pthread_flags="-pthreads pthread -mt -pthread $acx_pthread_flags"
        ;;
esac

if test x"$acx_pthread_ok" = xno; then
for flag in $acx_pthread_flags; do

        case $flag in
                none)
                AC_MSG_CHECKING([whether pthreads work without any flags])
                ;;

                -*)
                AC_MSG_CHECKING([whether pthreads work with $flag])
                PTHREAD_CFLAGS="$flag"
                ;;

                pthread-config)
                AC_CHECK_PROG(acx_pthread_config, pthread-config, yes, no)
                if test x"$acx_pthread_config" = xno; then continue; fi
                PTHREAD_CFLAGS="`pthread-config --cflags`"
                PTHREAD_LIBS="`pthread-config --ldflags` `pthread-config --libs`"
                ;;

                *)
                AC_MSG_CHECKING([for the pthreads library -l$flag])
                PTHREAD_LIBS="-l$flag"
                ;;
        esac

        save_LIBS="$LIBS"
        save_CFLAGS="$CFLAGS"
        LIBS="$PTHREAD_LIBS $LIBS"
        CFLAGS="$CFLAGS $PTHREAD_CFLAGS"

        # Check for various functions. We must include pthread.h,
        # since some functions may be macros. (On the Sequent, we
        # need a special flag -Kthread to make this header compile.)
        # We check for pthread_join because it is in -lpthread on IRIX
        # while pthread_create is in libc. We check for pthread_attr_init
        # due to DEC craziness with -lpthreads. We check for
        # pthread_cleanup_push because it is one of the few pthread
        # functions on Solaris that doesn't have a non-functional libc stub.
        # We try pthread_create on general principles.
        AC_TRY_LINK([#include <pthread.h>],
                    [pthread_t th; pthread_join(th, 0);
                     pthread_attr_init(0); pthread_cleanup_push(0, 0);
                     pthread_create(0,0,0,0); pthread_cleanup_pop(0); ],
                    [acx_pthread_ok=yes])

        LIBS="$save_LIBS"
        CFLAGS="$save_CFLAGS"

        AC_MSG_RESULT($acx_pthread_ok)
        if test "x$acx_pthread_ok" = xyes; then
                break;
        fi

        PTHREAD_LIBS=""
        PTHREAD_CFLAGS=""
done
fi

# Various other checks:
if test "x$acx_pthread_ok" = xyes; then
        save_LIBS="$LIBS"
        LIBS="$PTHREAD_LIBS $LIBS"
        save_CFLAGS="$CFLAGS"
        CFLAGS="$CFLAGS $PTHREAD_CFLAGS"

        # Detect AIX lossage: JOINABLE attribute is called UNDETACHED.
        AC_MSG_CHECKING([for joinable pthread attribute])
        attr_name=unknown
        for attr in PTHREAD_CREATE_JOINABLE PTHREAD_CREATE_UNDETACHED; do
            AC_TRY_LINK([#include <pthread.h>], [int attr=$attr; return attr;],
                        [attr_name=$attr; break])
        done
        AC_MSG_RESULT($attr_name)
        if test "$attr_name" != PTHREAD_CREATE_JOINABLE; then
            AC_DEFINE_UNQUOTED(PTHREAD_CREATE_JOINABLE, $attr_name,
                               [Define to necessary symbol if this constant
                                uses a non-standard name on your system.])
        fi

        AC_MSG_CHECKING([if more special flags are required for pthreads])
        flag=no
        case "${host_cpu}-${host_os}" in
            *-aix* | *-freebsd* | *-darwin*) flag="-D_THREAD_SAFE";;
            *solaris* | *-osf* | *-hpux*) flag="-D_REENTRANT";;
        esac
        AC_MSG_RESULT(${flag})
        if test "x$flag" != xno; then
            PTHREAD_CFLAGS="$flag $PTHREAD_CFLAGS"
        fi

        LIBS="$save_LIBS"
        CFLAGS="$save_CFLAGS"

        # More AIX lossage: must compile with xlc_r or cc_r
        if test x"$GCC" != xyes; then
          AC_CHECK_PROGS(PTHREAD_CC, xlc_r cc_r, ${CC})
        else
          PTHREAD_CC=$CC
        fi
else
        PTHREAD_CC="$CC"
fi

AC_SUBST(PTHREAD_LIBS)
AC_SUBST(PTHREAD_CFLAGS)
AC_SUBST(PTHREAD_CC)

# Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND:
if test x"$acx_pthread_ok" = xyes; then
        ifelse([$1],,AC_DEFINE(HAVE_PTHREAD,1,[Define if you have POSIX threads libraries and header files.]),[$1])
        :
else
        acx_pthread_ok=no
        $2
fi
AC_LANG_RESTORE
])dnl ACX_PTHREAD

--------------------------------------------------------------------------------
/test/rectmul.c:
--------------------------------------------------------------------------------
1 | /*
2 | * Program to multiply two rectangualar matrizes A(n,m) * B(m,n), where
3 | * (n < m) and (n mod 16 = 0) and (m mod n = 0). (Otherwise fill with 0s
4 | * to fit the shape.)
5 | *
6 | * written by Harald Prokop (prokop@mit.edu) Fall 97.
7 | */
8 | /*
9 | * Copyright (c) 2003 Massachusetts Institute of Technology
10 | *
11 | * This program is free software; you can redistribute it and/or modify
12 | * it under the terms of the GNU General Public License as published by
13 | * the Free Software Foundation; either version 2 of the License, or
14 | * (at your option) any later version.
15 | *
16 | * This program is distributed in the hope that it will be useful,
17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 | * GNU General Public License for more details.
20 | * 21 | * You should have received a copy of the GNU General Public License 22 | * along with this program; if not, write to the Free Software 23 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 24 | * 25 | */ 26 | 27 | #include 28 | #include 29 | #include "test.h" 30 | 31 | #define BLOCK_EDGE 16 32 | #define BLOCK_SIZE (BLOCK_EDGE * BLOCK_EDGE) 33 | 34 | typedef double block[BLOCK_SIZE]; 35 | 36 | #ifndef BENCHMARK 37 | int n = 512; 38 | #else 39 | int n = 4096; 40 | #endif 41 | 42 | static block * A, * B, * R; 43 | static int x, y, z; 44 | 45 | /* compute R = R+AB, where R,A,B are BLOCK_EDGE x BLOCK_EDGE matricies 46 | */ 47 | static void mult_add_block(block * A, block * B, block * R) 48 | { 49 | int i, j; 50 | 51 | for (j = 0; j < 16; j += 2) { /* 2 columns at a time */ 52 | double *bp = &((double *) B)[j]; 53 | for (i = 0; i < 16; i += 2) { /* 2 rows at a time */ 54 | double *ap = &((double *) A)[i * 16]; 55 | double *rp = &((double *) R)[j + i * 16]; 56 | register double s0_0, s0_1; 57 | register double s1_0, s1_1; 58 | s0_0 = rp[0]; 59 | s0_1 = rp[1]; 60 | s1_0 = rp[16]; 61 | s1_1 = rp[17]; 62 | s0_0 += ap[0] * bp[0]; 63 | s0_1 += ap[0] * bp[1]; 64 | s1_0 += ap[16] * bp[0]; 65 | s1_1 += ap[16] * bp[1]; 66 | s0_0 += ap[1] * bp[16]; 67 | s0_1 += ap[1] * bp[17]; 68 | s1_0 += ap[17] * bp[16]; 69 | s1_1 += ap[17] * bp[17]; 70 | s0_0 += ap[2] * bp[32]; 71 | s0_1 += ap[2] * bp[33]; 72 | s1_0 += ap[18] * bp[32]; 73 | s1_1 += ap[18] * bp[33]; 74 | s0_0 += ap[3] * bp[48]; 75 | s0_1 += ap[3] * bp[49]; 76 | s1_0 += ap[19] * bp[48]; 77 | s1_1 += ap[19] * bp[49]; 78 | s0_0 += ap[4] * bp[64]; 79 | s0_1 += ap[4] * bp[65]; 80 | s1_0 += ap[20] * bp[64]; 81 | s1_1 += ap[20] * bp[65]; 82 | s0_0 += ap[5] * bp[80]; 83 | s0_1 += ap[5] * bp[81]; 84 | s1_0 += ap[21] * bp[80]; 85 | s1_1 += ap[21] * bp[81]; 86 | s0_0 += ap[6] * bp[96]; 87 | s0_1 += ap[6] * bp[97]; 88 | s1_0 += ap[22] * bp[96]; 89 | s1_1 += ap[22] * bp[97]; 90 | s0_0 += ap[7] * bp[112]; 
91 | s0_1 += ap[7] * bp[113]; 92 | s1_0 += ap[23] * bp[112]; 93 | s1_1 += ap[23] * bp[113]; 94 | s0_0 += ap[8] * bp[128]; 95 | s0_1 += ap[8] * bp[129]; 96 | s1_0 += ap[24] * bp[128]; 97 | s1_1 += ap[24] * bp[129]; 98 | s0_0 += ap[9] * bp[144]; 99 | s0_1 += ap[9] * bp[145]; 100 | s1_0 += ap[25] * bp[144]; 101 | s1_1 += ap[25] * bp[145]; 102 | s0_0 += ap[10] * bp[160]; 103 | s0_1 += ap[10] * bp[161]; 104 | s1_0 += ap[26] * bp[160]; 105 | s1_1 += ap[26] * bp[161]; 106 | s0_0 += ap[11] * bp[176]; 107 | s0_1 += ap[11] * bp[177]; 108 | s1_0 += ap[27] * bp[176]; 109 | s1_1 += ap[27] * bp[177]; 110 | s0_0 += ap[12] * bp[192]; 111 | s0_1 += ap[12] * bp[193]; 112 | s1_0 += ap[28] * bp[192]; 113 | s1_1 += ap[28] * bp[193]; 114 | s0_0 += ap[13] * bp[208]; 115 | s0_1 += ap[13] * bp[209]; 116 | s1_0 += ap[29] * bp[208]; 117 | s1_1 += ap[29] * bp[209]; 118 | s0_0 += ap[14] * bp[224]; 119 | s0_1 += ap[14] * bp[225]; 120 | s1_0 += ap[30] * bp[224]; 121 | s1_1 += ap[30] * bp[225]; 122 | s0_0 += ap[15] * bp[240]; 123 | s0_1 += ap[15] * bp[241]; 124 | s1_0 += ap[31] * bp[240]; 125 | s1_1 += ap[31] * bp[241]; 126 | rp[0] = s0_0; 127 | rp[1] = s0_1; 128 | rp[16] = s1_0; 129 | rp[17] = s1_1; 130 | } 131 | } 132 | } 133 | 134 | 135 | /* compute R = AB, where R,A,B are BLOCK_EDGE x BLOCK_EDGE matricies 136 | */ 137 | static void multiply_block(block * A, block * B, block * R) 138 | { 139 | int i, j; 140 | 141 | for (j = 0; j < 16; j += 2) { /* 2 columns at a time */ 142 | double *bp = &((double *) B)[j]; 143 | for (i = 0; i < 16; i += 2) { /* 2 rows at a time */ 144 | double *ap = &((double *) A)[i * 16]; 145 | double *rp = &((double *) R)[j + i * 16]; 146 | register double s0_0, s0_1; 147 | register double s1_0, s1_1; 148 | s0_0 = ap[0] * bp[0]; 149 | s0_1 = ap[0] * bp[1]; 150 | s1_0 = ap[16] * bp[0]; 151 | s1_1 = ap[16] * bp[1]; 152 | s0_0 += ap[1] * bp[16]; 153 | s0_1 += ap[1] * bp[17]; 154 | s1_0 += ap[17] * bp[16]; 155 | s1_1 += ap[17] * bp[17]; 156 | s0_0 += ap[2] * bp[32]; 157 | 
s0_1 += ap[2] * bp[33]; 158 | s1_0 += ap[18] * bp[32]; 159 | s1_1 += ap[18] * bp[33]; 160 | s0_0 += ap[3] * bp[48]; 161 | s0_1 += ap[3] * bp[49]; 162 | s1_0 += ap[19] * bp[48]; 163 | s1_1 += ap[19] * bp[49]; 164 | s0_0 += ap[4] * bp[64]; 165 | s0_1 += ap[4] * bp[65]; 166 | s1_0 += ap[20] * bp[64]; 167 | s1_1 += ap[20] * bp[65]; 168 | s0_0 += ap[5] * bp[80]; 169 | s0_1 += ap[5] * bp[81]; 170 | s1_0 += ap[21] * bp[80]; 171 | s1_1 += ap[21] * bp[81]; 172 | s0_0 += ap[6] * bp[96]; 173 | s0_1 += ap[6] * bp[97]; 174 | s1_0 += ap[22] * bp[96]; 175 | s1_1 += ap[22] * bp[97]; 176 | s0_0 += ap[7] * bp[112]; 177 | s0_1 += ap[7] * bp[113]; 178 | s1_0 += ap[23] * bp[112]; 179 | s1_1 += ap[23] * bp[113]; 180 | s0_0 += ap[8] * bp[128]; 181 | s0_1 += ap[8] * bp[129]; 182 | s1_0 += ap[24] * bp[128]; 183 | s1_1 += ap[24] * bp[129]; 184 | s0_0 += ap[9] * bp[144]; 185 | s0_1 += ap[9] * bp[145]; 186 | s1_0 += ap[25] * bp[144]; 187 | s1_1 += ap[25] * bp[145]; 188 | s0_0 += ap[10] * bp[160]; 189 | s0_1 += ap[10] * bp[161]; 190 | s1_0 += ap[26] * bp[160]; 191 | s1_1 += ap[26] * bp[161]; 192 | s0_0 += ap[11] * bp[176]; 193 | s0_1 += ap[11] * bp[177]; 194 | s1_0 += ap[27] * bp[176]; 195 | s1_1 += ap[27] * bp[177]; 196 | s0_0 += ap[12] * bp[192]; 197 | s0_1 += ap[12] * bp[193]; 198 | s1_0 += ap[28] * bp[192]; 199 | s1_1 += ap[28] * bp[193]; 200 | s0_0 += ap[13] * bp[208]; 201 | s0_1 += ap[13] * bp[209]; 202 | s1_0 += ap[29] * bp[208]; 203 | s1_1 += ap[29] * bp[209]; 204 | s0_0 += ap[14] * bp[224]; 205 | s0_1 += ap[14] * bp[225]; 206 | s1_0 += ap[30] * bp[224]; 207 | s1_1 += ap[30] * bp[225]; 208 | s0_0 += ap[15] * bp[240]; 209 | s0_1 += ap[15] * bp[241]; 210 | s1_0 += ap[31] * bp[240]; 211 | s1_1 += ap[31] * bp[241]; 212 | rp[0] = s0_0; 213 | rp[1] = s0_1; 214 | rp[16] = s1_0; 215 | rp[17] = s1_1; 216 | } 217 | } 218 | } 219 | 220 | 221 | int check_matrix(block * R, long x, long y, long o, double v) 222 | { 223 | int a, b; 224 | 225 | if (x * y == 1) { 226 | /** 227 | * Checks if each A[i,j] 
of a martix A of size nb x nb blocks has 228 | * value v. 229 | */ 230 | int i; 231 | for (i = 0; i < BLOCK_SIZE; i++) 232 | if (((double *) R)[i] != v) 233 | return 1; 234 | 235 | return 0; 236 | } 237 | 238 | if (x>y) { 239 | a = check_matrix(R, x / 2, y, o, v); 240 | b = check_matrix(R + (x / 2) * o,(x + 1) / 2, y, o, v); 241 | } else { 242 | a = check_matrix(R, x, y / 2, o, v); 243 | b = check_matrix(R + (y / 2), x, (y + 1) / 2, o, v); 244 | } 245 | 246 | return a + b; 247 | } 248 | 249 | /* Add matrix T into matrix R, where T and R are bl blocks in size 250 | * 251 | */ 252 | fibril void add_matrix(block * T, long ot, block * R, long oR, long x, long y) 253 | { 254 | if (x + y == 2) { 255 | long i; 256 | for (i = 0; i < BLOCK_SIZE; i += 4) { 257 | ((double *) R)[i + 0] += ((double *) T)[i + 0]; 258 | ((double *) R)[i + 1] += ((double *) T)[i + 1]; 259 | ((double *) R)[i + 2] += ((double *) T)[i + 2]; 260 | ((double *) R)[i + 3] += ((double *) T)[i + 3]; 261 | } 262 | return; 263 | } 264 | 265 | fibril_t fr; 266 | fibril_init(&fr); 267 | 268 | if (x > y) { 269 | fibril_fork(&fr, add_matrix, (T, ot, R, oR, x/2, y)); 270 | add_matrix(T+(x/2)*ot, ot, R+(x/2)*oR, oR, (x+1)/2, y); 271 | } else { 272 | fibril_fork(&fr, add_matrix, (T, ot, R, oR, x, y/2)); 273 | add_matrix(T+(y/2), ot, R+(y/2), oR, x, (y+1)/2); 274 | } 275 | 276 | fibril_join(&fr); 277 | } 278 | 279 | void init_matrix(block * R, long x, long y, long o, double v) 280 | { 281 | if (x + y ==2) { 282 | int i; 283 | for (i = 0; i < BLOCK_SIZE; i++) 284 | ((double *) R)[i] = v; 285 | return; 286 | } 287 | 288 | if (x > y) { 289 | init_matrix(R, x/2, y, o, v); 290 | init_matrix(R+(x/2) * o, (x+1)/2, y, o, v); 291 | } else { 292 | init_matrix(R, x, y/2, o, v); 293 | init_matrix(R+(y/2), x, (y+1)/2, o, v); 294 | } 295 | } 296 | 297 | fibril static void multiply_matrix(block * A, long oa, block * B, long ob, 298 | long x, long y, long z, block * R, long oR, int add) 299 | { 300 | if (x + y + z == 3) { 301 | if 
(add) 302 | return mult_add_block(A, B, R); 303 | else 304 | return multiply_block(A, B, R); 305 | } 306 | 307 | fibril_t fr; 308 | fibril_init(&fr); 309 | 310 | if (x >= y && x >= z) { 311 | fibril_fork(&fr, multiply_matrix, (A, oa, B, ob, x/2, y, z, R, oR, add)); 312 | multiply_matrix(A+(x/2)*oa, oa, B, ob, (x+1)/2, y, z, R+(x/2)*oR, oR, add); 313 | fibril_join(&fr); 314 | } else if (y > x && y > z) { 315 | fibril_fork(&fr, multiply_matrix, 316 | (A+(y/2), oa, B+(y/2)*ob, ob, x, (y+1)/2, z, R, oR, add)); 317 | 318 | block * tmp = malloc(x * z * sizeof(block)); 319 | multiply_matrix(A, oa, B, ob, x, y/2, z, tmp, z, 0); 320 | fibril_join(&fr); 321 | 322 | add_matrix(tmp, z, R, oR, x, z); 323 | free(tmp); 324 | } else { 325 | fibril_fork(&fr, multiply_matrix, (A, oa, B, ob, x, y, z/2, R, oR, add)); 326 | multiply_matrix(A, oa, B+(z/2), ob, x, y, (z+1)/2, R+(z/2), oR, add); 327 | fibril_join(&fr); 328 | } 329 | } 330 | 331 | void init() { 332 | x = n / BLOCK_EDGE; 333 | y = n / BLOCK_EDGE; 334 | z = n / BLOCK_EDGE; 335 | 336 | A = malloc(x * y * sizeof(block)); 337 | B = malloc(y * z * sizeof(block)); 338 | R = malloc(x * z * sizeof(block)); 339 | 340 | init_matrix(A, x, y, y, 1.0); 341 | init_matrix(B, y, z, z, 1.0); 342 | } 343 | 344 | void prep() { 345 | init_matrix(R, x, z, z, 0.0); 346 | } 347 | 348 | void test() { 349 | multiply_matrix(A, y, B, z, x, y, z, R, z, 0); 350 | } 351 | 352 | int verify() { 353 | #ifndef BENCHMARK 354 | if (check_matrix(R, x, z, z, y * 16)) { 355 | printf("WRONG RESULT!\n"); 356 | return 1; 357 | }; 358 | #endif 359 | 360 | return 0; 361 | } 362 | -------------------------------------------------------------------------------- /test/lu.c: -------------------------------------------------------------------------------- 1 | /****************************************************************************\ 2 | * LU decomposition 3 | * Robert Blumofe 4 | * 5 | * Copyright (c) 1996, Robert Blumofe. All rights reserved. 
6 | * This program is free software; you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation; either version 2 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program; if not, write to the Free Software 18 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 19 | * 20 | \****************************************************************************/ 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include "test.h" 27 | 28 | /* Define the size of a block. */ 29 | #ifndef BLOCK_SIZE 30 | #define BLOCK_SIZE 16 31 | #endif 32 | 33 | /* Define the default matrix size. */ 34 | #ifndef DEFAULT_SIZE 35 | #ifndef BENCHMARK 36 | #define DEFAULT_SIZE (16 * BLOCK_SIZE) 37 | #else 38 | #define DEFAULT_SIZE 4096 39 | #endif 40 | #endif 41 | 42 | /* A block is a 2D array of doubles. */ 43 | typedef double Block[BLOCK_SIZE][BLOCK_SIZE]; 44 | #define BLOCK(B,I,J) (B[I][J]) 45 | 46 | /* A matrix is a 1D array of blocks. */ 47 | typedef Block * Matrix; 48 | #define MATRIX(M,I,J) ((M)[(I)*nBlocks+(J)]) 49 | 50 | /** Matrix size. */ 51 | int n = DEFAULT_SIZE; 52 | 53 | /** The global matrix and a copy of the matrix. */ 54 | static Matrix M, Msave; 55 | 56 | /* Matrix size in blocks. */ 57 | static int nBlocks; 58 | 59 | /****************************************************************************\ 60 | * Utility routines. 61 | \****************************************************************************/ 62 | 63 | /* 64 | * init_matrix - Fill in matrix M with random values. 
65 | */ 66 | static void init_matrix(Matrix M, int nb) 67 | { 68 | int I, J, K, i, j, k; 69 | 70 | /* Initialize random number generator. */ 71 | srand(1); 72 | 73 | /* For each element of each block, fill in random value. */ 74 | for (I = 0; I < nb; I++) 75 | for (J = 0; J < nb; J++) 76 | for (i = 0; i < BLOCK_SIZE; i++) 77 | for (j = 0; j < BLOCK_SIZE; j++) 78 | BLOCK(MATRIX(M, I, J), i, j) = ((double)rand()) / (double)RAND_MAX; 79 | 80 | /* Inflate diagonal entries. */ 81 | for (K = 0; K < nb; K++) 82 | for (k = 0; k < BLOCK_SIZE; k++) 83 | BLOCK(MATRIX(M, K, K), k, k) *= 10.0; 84 | } 85 | 86 | /* 87 | * print_matrix - Print matrix M. 88 | */ 89 | static void print_matrix(Matrix M, int nb) 90 | { 91 | int i, j; 92 | 93 | /* Print out matrix. */ 94 | for (i = 0; i < nb * BLOCK_SIZE; i++) { 95 | for (j = 0; j < nb * BLOCK_SIZE; j++) 96 | printf(" %6.4f", 97 | BLOCK(MATRIX(M, i / BLOCK_SIZE, j / BLOCK_SIZE), 98 | i % BLOCK_SIZE, j % BLOCK_SIZE)); 99 | printf("\n"); 100 | } 101 | } 102 | 103 | /* 104 | * test_result - Check that matrix LU contains LU decomposition of M. 105 | */ 106 | static int test_result(Matrix LU, Matrix M, int nb) 107 | { 108 | int I, J, K, i, j, k; 109 | double diff, max_diff; 110 | double v; 111 | 112 | /* Initialize test. */ 113 | max_diff = 0.0; 114 | 115 | /* Find maximum difference between any element of LU and M. 
*/ 116 | for (i = 0; i < nb * BLOCK_SIZE; i++) 117 | for (j = 0; j < nb * BLOCK_SIZE; j++) { 118 | I = i / BLOCK_SIZE; 119 | J = j / BLOCK_SIZE; 120 | v = 0.0; 121 | for (k = 0; k < i && k <= j; k++) { 122 | K = k / BLOCK_SIZE; 123 | v += BLOCK(MATRIX(LU, I, K), i % BLOCK_SIZE, 124 | k % BLOCK_SIZE) * 125 | BLOCK(MATRIX(LU, K, J), k % BLOCK_SIZE, 126 | j % BLOCK_SIZE); 127 | } 128 | if (k == i && k <= j) { 129 | K = k / BLOCK_SIZE; 130 | v += BLOCK(MATRIX(LU, K, J), k % BLOCK_SIZE, 131 | j % BLOCK_SIZE); 132 | } 133 | diff = fabs(BLOCK(MATRIX(M, I, J), i % BLOCK_SIZE, 134 | j % BLOCK_SIZE) - v); 135 | if (diff > max_diff) 136 | max_diff = diff; 137 | } 138 | 139 | /* Check maximum difference against threshold. */ 140 | return (max_diff > 0.00001); 141 | } 142 | 143 | /****************************************************************************\ 144 | * Element operations. 145 | \****************************************************************************/ 146 | /* 147 | * elem_daxmy - Compute y' = y - ax where a is a double and x and y are 148 | * vectors of doubles. 149 | */ 150 | static void elem_daxmy(double a, double *x, double *y, int n) 151 | { 152 | for (n--; n >= 0; n--) y[n] -= a * x[n]; 153 | } 154 | 155 | /****************************************************************************\ 156 | * Block operations. 157 | \****************************************************************************/ 158 | 159 | /* 160 | * block_lu - Factor block B. 161 | */ 162 | static void block_lu(Block B) 163 | { 164 | int i, k; 165 | 166 | /* Factor block. */ 167 | for (k = 0; k < BLOCK_SIZE; k++) 168 | for (i = k + 1; i < BLOCK_SIZE; i++) { 169 | BLOCK(B, i, k) /= BLOCK(B, k, k); 170 | elem_daxmy(BLOCK(B, i, k), &BLOCK(B, k, k + 1), 171 | &BLOCK(B, i, k + 1), BLOCK_SIZE - k - 1); 172 | } 173 | } 174 | 175 | /* 176 | * block_lower_solve - Perform forward substitution to solve for B' in 177 | * LB' = B. 
178 | */ 179 | static void block_lower_solve(Block B, Block L) 180 | { 181 | int i, k; 182 | 183 | /* Perform forward substitution. */ 184 | for (i = 1; i < BLOCK_SIZE; i++) 185 | for (k = 0; k < i; k++) 186 | elem_daxmy(BLOCK(L, i, k), &BLOCK(B, k, 0), 187 | &BLOCK(B, i, 0), BLOCK_SIZE); 188 | } 189 | 190 | /* 191 | * block_upper_solve - Perform forward substitution to solve for B' in 192 | * B'U = B. 193 | */ 194 | static void block_upper_solve(Block B, Block U) 195 | { 196 | int i, k; 197 | 198 | /* Perform forward substitution. */ 199 | for (i = 0; i < BLOCK_SIZE; i++) 200 | for (k = 0; k < BLOCK_SIZE; k++) { 201 | BLOCK(B, i, k) /= BLOCK(U, k, k); 202 | elem_daxmy(BLOCK(B, i, k), &BLOCK(U, k, k + 1), 203 | &BLOCK(B, i, k + 1), BLOCK_SIZE - k - 1); 204 | } 205 | } 206 | 207 | /* 208 | * block_schur - Compute Schur complement B' = B - AC. 209 | */ 210 | static void block_schur(Block B, Block A, Block C) 211 | { 212 | int i, k; 213 | 214 | /* Compute Schur complement. */ 215 | for (i = 0; i < BLOCK_SIZE; i++) 216 | for (k = 0; k < BLOCK_SIZE; k++) 217 | elem_daxmy(BLOCK(A, i, k), &BLOCK(C, k, 0), 218 | &BLOCK(B, i, 0), BLOCK_SIZE); 219 | } 220 | 221 | 222 | /****************************************************************************\ 223 | * Divide-and-conquer matrix LU decomposition. 224 | \****************************************************************************/ 225 | 226 | /** 227 | * schur - Compute M' = M - VW. 228 | */ 229 | fibril static void schur(Matrix M, Matrix V, Matrix W, int nb) 230 | { 231 | Matrix M00, M01, M10, M11; 232 | Matrix V00, V01, V10, V11; 233 | Matrix W00, W01, W10, W11; 234 | int hnb; 235 | 236 | /* Check base case. */ 237 | if (nb == 1) { 238 | block_schur(*M, *V, *W); 239 | return; 240 | } 241 | 242 | /* Break matrices into 4 pieces. 
*/ 243 | hnb = nb / 2; 244 | M00 = &MATRIX(M, 0, 0); 245 | M01 = &MATRIX(M, 0, hnb); 246 | M10 = &MATRIX(M, hnb, 0); 247 | M11 = &MATRIX(M, hnb, hnb); 248 | V00 = &MATRIX(V, 0, 0); 249 | V01 = &MATRIX(V, 0, hnb); 250 | V10 = &MATRIX(V, hnb, 0); 251 | V11 = &MATRIX(V, hnb, hnb); 252 | W00 = &MATRIX(W, 0, 0); 253 | W01 = &MATRIX(W, 0, hnb); 254 | W10 = &MATRIX(W, hnb, 0); 255 | W11 = &MATRIX(W, hnb, hnb); 256 | 257 | /* Form Schur complement with recursive calls. */ 258 | fibril_t fr; 259 | fibril_init(&fr); 260 | 261 | fibril_fork(&fr, schur, (M00, V00, W00, hnb)); 262 | fibril_fork(&fr, schur, (M01, V00, W01, hnb)); 263 | fibril_fork(&fr, schur, (M10, V10, W00, hnb)); 264 | schur(M11, V10, W01, hnb); 265 | fibril_join(&fr); 266 | 267 | fibril_fork(&fr, schur, (M00, V01, W10, hnb)); 268 | fibril_fork(&fr, schur, (M01, V01, W11, hnb)); 269 | fibril_fork(&fr, schur, (M10, V11, W10, hnb)); 270 | schur(M11, V11, W11, hnb); 271 | fibril_join(&fr); 272 | 273 | return; 274 | } 275 | 276 | /* 277 | * lower_solve - Compute M' where LM' = M. 278 | */ 279 | fibril static void lower_solve(Matrix M, Matrix L, int nb); 280 | 281 | static void aux_lower_solve(Matrix Ma, Matrix Mb, Matrix L, int nb) 282 | { 283 | Matrix L00, L01, L10, L11; 284 | 285 | /* Break L matrix into 4 pieces. */ 286 | L00 = &MATRIX(L, 0, 0); 287 | L01 = &MATRIX(L, 0, nb); 288 | L10 = &MATRIX(L, nb, 0); 289 | L11 = &MATRIX(L, nb, nb); 290 | 291 | /* Solve with recursive calls. */ 292 | lower_solve(Ma, L00, nb); 293 | schur(Mb, L10, Ma, nb); 294 | lower_solve(Mb, L11, nb); 295 | } 296 | 297 | fibril static void lower_solve(Matrix M, Matrix L, int nb) 298 | { 299 | Matrix M00, M01, M10, M11; 300 | int hnb; 301 | 302 | /* Check base case. */ 303 | if (nb == 1) { 304 | block_lower_solve(*M, *L); 305 | return; 306 | } 307 | 308 | /* Break matrices into 4 pieces. 
*/ 309 | hnb = nb / 2; 310 | M00 = &MATRIX(M, 0, 0); 311 | M01 = &MATRIX(M, 0, hnb); 312 | M10 = &MATRIX(M, hnb, 0); 313 | M11 = &MATRIX(M, hnb, hnb); 314 | 315 | /* Solve with recursive calls. */ 316 | fibril_t fr; 317 | fibril_init(&fr); 318 | 319 | fibril_fork(&fr, aux_lower_solve, (M00, M10, L, hnb)); 320 | aux_lower_solve(M01, M11, L, hnb); 321 | 322 | fibril_join(&fr); 323 | 324 | return; 325 | } 326 | 327 | /* 328 | * upper_solve - Compute M' where M'U = M. 329 | */ 330 | fibril static void upper_solve(Matrix M, Matrix U, int nb); 331 | 332 | static void aux_upper_solve(Matrix Ma, Matrix Mb, Matrix U, int nb) 333 | { 334 | Matrix U00, U01, U10, U11; 335 | 336 | /* Break U matrix into 4 pieces. */ 337 | U00 = &MATRIX(U, 0, 0); 338 | U01 = &MATRIX(U, 0, nb); 339 | U10 = &MATRIX(U, nb, 0); 340 | U11 = &MATRIX(U, nb, nb); 341 | 342 | /* Solve with recursive calls. */ 343 | upper_solve(Ma, U00, nb); 344 | schur(Mb, Ma, U01, nb); 345 | upper_solve(Mb, U11, nb); 346 | 347 | return; 348 | } 349 | 350 | fibril static void upper_solve(Matrix M, Matrix U, int nb) 351 | { 352 | Matrix M00, M01, M10, M11; 353 | int hnb; 354 | 355 | /* Check base case. */ 356 | if (nb == 1) { 357 | block_upper_solve(*M, *U); 358 | return; 359 | } 360 | 361 | /* Break matrices into 4 pieces. */ 362 | hnb = nb / 2; 363 | M00 = &MATRIX(M, 0, 0); 364 | M01 = &MATRIX(M, 0, hnb); 365 | M10 = &MATRIX(M, hnb, 0); 366 | M11 = &MATRIX(M, hnb, hnb); 367 | 368 | /* Solve with recursive calls. */ 369 | fibril_t fr; 370 | fibril_init(&fr); 371 | 372 | fibril_fork(&fr, aux_upper_solve, (M00, M01, U, hnb)); 373 | aux_upper_solve(M10, M11, U, hnb); 374 | 375 | fibril_join(&fr); 376 | 377 | return; 378 | } 379 | 380 | /* 381 | * lu - Perform LU decomposition of matrix M. 382 | */ 383 | fibril void lu(Matrix M, int nb) 384 | { 385 | Matrix M00, M01, M10, M11; 386 | int hnb; 387 | 388 | /* Check base case. 
*/ 389 | if (nb == 1) { 390 | block_lu(*M); 391 | return; 392 | } 393 | 394 | /* Break matrix into 4 pieces. */ 395 | hnb = nb / 2; 396 | M00 = &MATRIX(M, 0, 0); 397 | M01 = &MATRIX(M, 0, hnb); 398 | M10 = &MATRIX(M, hnb, 0); 399 | M11 = &MATRIX(M, hnb, hnb); 400 | 401 | /* Decompose upper left. */ 402 | lu(M00, hnb); 403 | 404 | /* Solve for upper right and lower left. */ 405 | fibril_t fr; 406 | fibril_init(&fr); 407 | 408 | fibril_fork(&fr, lower_solve, (M01, M00, hnb)); 409 | upper_solve(M10, M00, hnb); 410 | 411 | fibril_join(&fr); 412 | 413 | /* Compute Schur complement of lower right. */ 414 | schur(M11, M10, M01, hnb); 415 | 416 | /* Decompose lower right. */ 417 | lu(M11, hnb); 418 | 419 | return; 420 | } 421 | 422 | void init() 423 | { 424 | nBlocks = n / BLOCK_SIZE; 425 | M = (Matrix) malloc(n * n * sizeof(double)); 426 | init_matrix(M, nBlocks); 427 | #ifndef BENCHMARK 428 | Msave = (Matrix) malloc(n * n * sizeof(double)); 429 | memcpy((void *) Msave, (void *) M, n * n * sizeof(double)); 430 | #endif 431 | 432 | } 433 | 434 | void prep() 435 | { 436 | #ifndef BENCHMARK 437 | memcpy((void *) M, (void *) Msave, n * n * sizeof(double)); 438 | #endif 439 | } 440 | 441 | void test() 442 | { 443 | lu(M, nBlocks); 444 | } 445 | 446 | int verify() 447 | { 448 | #ifndef BENCHMARK 449 | return test_result(M, Msave, nBlocks); 450 | #else 451 | return 0; 452 | #endif 453 | } 454 | -------------------------------------------------------------------------------- /test/cholesky.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Sparse Cholesky code with little blocks at the leaves of the Quad tree 3 | * Keith Randall -- Aske Plaat 4 | * 5 | * This code should run with any square sparse real symmetric matrix 6 | * from MatrixMarket (http://math.nist.gov/MatrixMarket) 7 | * 8 | * run with `cholesky -f george-liu.mtx' for a given matrix, or 9 | * `cholesky -n 1000 -z 10000' for a 1000x1000 random matrix with 10000 10 | * 
nonzeros (caution: random matrices produce lots of fill). 11 | */ 12 | /* 13 | * Copyright (c) 2000 Massachusetts Institute of Technology 14 | * Copyright (c) 2000 Matteo Frigo 15 | * 16 | * This program is free software; you can redistribute it and/or modify 17 | * it under the terms of the GNU General Public License as published by 18 | * the Free Software Foundation; either version 2 of the License, or 19 | * (at your option) any later version. 20 | * 21 | * This program is distributed in the hope that it will be useful, 22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 24 | * GNU General Public License for more details. 25 | * 26 | * You should have received a copy of the GNU General Public License 27 | * along with this program; if not, write to the Free Software 28 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 29 | * 30 | */ 31 | 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include "test.h" 37 | 38 | /*************************************************************\ 39 | * Basic types 40 | \*************************************************************/ 41 | 42 | typedef double Real; 43 | 44 | #define BLOCK_DEPTH 2 /* logarithm base 2 of BLOCK_SIZE */ 45 | #define BLOCK_SIZE (1<= 0; n--) 93 | y[n] -= a * x[n]; 94 | } 95 | 96 | /* 97 | * block_schur - Compute Schur complement B' = B - AC. 98 | */ 99 | static void block_schur_full(Block B, Block A, Block C) 100 | { 101 | int i, j, k; 102 | for (i = 0; i < BLOCK_SIZE; i++) { 103 | for (j = 0; j < BLOCK_SIZE; j++) { 104 | for (k = 0; k < BLOCK_SIZE; k++) { 105 | BLOCK(B, i, j) -= BLOCK(A, i, k) * BLOCK(C, j, k); 106 | } 107 | } 108 | } 109 | } 110 | 111 | /* 112 | * block_schur - Compute Schur complement B' = B - AC. 
113 | */ 114 | static void block_schur_half(Block B, Block A, Block C) 115 | { 116 | int i, j, k; 117 | 118 | /* 119 | * printf("schur half\n"); 120 | */ 121 | /* Compute Schur complement. */ 122 | for (i = 0; i < BLOCK_SIZE; i++) { 123 | for (j = 0; j <= i /* BLOCK_SIZE */ ; j++) { 124 | for (k = 0; k < BLOCK_SIZE; k++) { 125 | BLOCK(B, i, j) -= BLOCK(A, i, k) * BLOCK(C, j, k); 126 | } 127 | } 128 | } 129 | } 130 | 131 | /* 132 | * block_upper_solve - Perform substitution to solve for B' in 133 | * B'U = B. 134 | */ 135 | static void block_backsub(Block B, Block U) 136 | { 137 | int i, j, k; 138 | 139 | /* Perform backward substitution. */ 140 | for (i = 0; i < BLOCK_SIZE; i++) { 141 | for (j = 0; j < BLOCK_SIZE; j++) { 142 | for (k = 0; k < i; k++) { 143 | BLOCK(B, j, i) -= BLOCK(U, i, k) * BLOCK(B, j, k); /* transpose? */ 144 | } 145 | BLOCK(B, j, i) /= BLOCK(U, i, i); 146 | } 147 | } 148 | } 149 | 150 | /* 151 | * block_lower_solve - Perform forward substitution to solve for B' in 152 | * LB' = B. 153 | */ 154 | static void xblock_backsub(Block B, Block L) 155 | { 156 | int i, k; 157 | 158 | /* Perform forward substitution. */ 159 | for (i = 0; i < BLOCK_SIZE; i++) 160 | for (k = 0; k <= i; k++) { 161 | BLOCK(B, i, k) /= BLOCK(L, k, k); 162 | elem_daxmy(BLOCK(L, i, k), &BLOCK(B, k, 0), 163 | &BLOCK(B, i, 0), BLOCK_SIZE - k); 164 | } 165 | } 166 | 167 | /* 168 | * block_cholesky - Factor block B. 
169 | */ 170 | static void block_cholesky(Block B) 171 | { 172 | int i, j, k; 173 | 174 | for (k = 0; k < BLOCK_SIZE; k++) { 175 | Real x; 176 | if (BLOCK(B, k, k) < 0.0) { 177 | printf("sqrt error: %f\n", BLOCK(B, k, k)); 178 | printf("matrix is probably not numerically stable\n"); 179 | exit(9); 180 | } 181 | x = sqrt(BLOCK(B, k, k)); 182 | for (i = k; i < BLOCK_SIZE; i++) { 183 | BLOCK(B, i, k) /= x; 184 | } 185 | for (j = k + 1; j < BLOCK_SIZE; j++) { 186 | for (i = j; i < BLOCK_SIZE; i++) { 187 | BLOCK(B, i, j) -= BLOCK(B, i, k) * BLOCK(B, j, k); 188 | if (j > i && BLOCK(B, i, j) != 0.0) { 189 | printf("Upper not empty\n"); 190 | } 191 | } 192 | } 193 | } 194 | } 195 | 196 | /* 197 | * block_zero - zero block B. 198 | */ 199 | static void block_zero(Block B) 200 | { 201 | int i, k; 202 | 203 | for (i = 0; i < BLOCK_SIZE; i++) { 204 | for (k = 0; k < BLOCK_SIZE; k++) { 205 | BLOCK(B, i, k) = 0.0; 206 | } 207 | } 208 | } 209 | 210 | /*************************************************************\ 211 | * Allocation and initialization 212 | \*************************************************************/ 213 | 214 | /* 215 | * Create new leaf nodes (BLOCK_SIZE x BLOCK_SIZE submatrices) 216 | */ 217 | static inline InternalNode *new_block_leaf(void) 218 | { 219 | LeafNode *leaf = malloc(sizeof(LeafNode)); 220 | if (leaf == NULL) { 221 | printf("out of memory!\n"); 222 | exit(1); 223 | } 224 | return (InternalNode *) leaf; 225 | } 226 | 227 | /* 228 | * Create internal node in quadtree representation 229 | */ 230 | static inline InternalNode *new_internal(InternalNode * a00, InternalNode * a01, 231 | InternalNode * a10, InternalNode * a11) 232 | { 233 | InternalNode *node = malloc(sizeof(InternalNode)); 234 | if (node == NULL) { 235 | printf("out of memory!\n"); 236 | exit(1); 237 | } 238 | node->child[_00] = a00; 239 | node->child[_01] = a01; 240 | node->child[_10] = a10; 241 | node->child[_11] = a11; 242 | return node; 243 | } 244 | 245 | /* 246 | * Duplicate 
matrix. Resulting matrix may be laid out in memory 247 | * better than source matrix. 248 | */ 249 | fibril static Matrix copy_matrix(int depth, Matrix a) 250 | { 251 | Matrix r; 252 | 253 | if (!a) 254 | return a; 255 | 256 | if (depth == BLOCK_DEPTH) { 257 | LeafNode *A = (LeafNode *) a; 258 | LeafNode *R; 259 | r = new_block_leaf(); 260 | R = (LeafNode *) r; 261 | memcpy(R->block, A->block, sizeof(Block)); 262 | } else { 263 | Matrix r00, r01, r10, r11; 264 | 265 | depth--; 266 | 267 | fibril_t fr; 268 | fibril_init(&fr); 269 | 270 | fibril_fork(&fr, &r00, copy_matrix, (depth, a->child[_00])); 271 | fibril_fork(&fr, &r01, copy_matrix, (depth, a->child[_01])); 272 | fibril_fork(&fr, &r10, copy_matrix, (depth, a->child[_10])); 273 | r11 = copy_matrix(depth, a->child[_11]); 274 | fibril_join(&fr); 275 | 276 | r = new_internal(r00, r01, r10, r11); 277 | } 278 | return r; 279 | } 280 | 281 | /* 282 | * Deallocate matrix. 283 | */ 284 | void free_matrix(int depth, Matrix a) 285 | { 286 | if (a == NULL) 287 | return; 288 | if (depth == BLOCK_DEPTH) { 289 | free(a); 290 | } else { 291 | depth--; 292 | free_matrix(depth, a->child[_00]); 293 | free_matrix(depth, a->child[_01]); 294 | free_matrix(depth, a->child[_10]); 295 | free_matrix(depth, a->child[_11]); 296 | free(a); 297 | } 298 | } 299 | 300 | /*************************************************************\ 301 | * Simple matrix operations 302 | \*************************************************************/ 303 | 304 | /* 305 | * Get matrix element at row r, column c. 
306 | */ 307 | static Real get_matrix(int depth, Matrix a, int r, int c) 308 | { 309 | if (a == NULL) 310 | return 0.0; 311 | 312 | if (depth == BLOCK_DEPTH) { 313 | LeafNode *A = (LeafNode *) a; 314 | return BLOCK(A->block, r, c); 315 | } else { 316 | int mid; 317 | 318 | depth--; 319 | mid = 1 << depth; 320 | 321 | if (r < mid) { 322 | if (c < mid) 323 | return get_matrix(depth, a->child[_00], r, c); 324 | else 325 | return get_matrix(depth, a->child[_01], r, c - mid); 326 | } else { 327 | if (c < mid) 328 | return get_matrix(depth, a->child[_10], r - mid, c); 329 | else 330 | return get_matrix(depth, a->child[_11], r - mid, c - mid); 331 | } 332 | } 333 | } 334 | 335 | /* 336 | * Set matrix element at row r, column c to value. 337 | */ 338 | static Matrix set_matrix(int depth, Matrix a, int r, int c, Real value) 339 | { 340 | if (depth == BLOCK_DEPTH) { 341 | LeafNode *A; 342 | if (a == NULL) { 343 | a = new_block_leaf(); 344 | A = (LeafNode *) a; 345 | block_zero(A->block); 346 | } else { 347 | A = (LeafNode *) a; 348 | } 349 | BLOCK(A->block, r, c) = value; 350 | } else { 351 | int mid; 352 | 353 | if (a == NULL) 354 | a = new_internal(NULL, NULL, NULL, NULL); 355 | 356 | depth--; 357 | mid = 1 << depth; 358 | 359 | if (r < mid) { 360 | if (c < mid) 361 | a->child[_00] = set_matrix(depth, a->child[_00], 362 | r, c, value); 363 | else 364 | a->child[_01] = set_matrix(depth, a->child[_01], 365 | r, c - mid, value); 366 | } else { 367 | if (c < mid) 368 | a->child[_10] = set_matrix(depth, a->child[_10], 369 | r - mid, c, value); 370 | else 371 | a->child[_11] = set_matrix(depth, a->child[_11], 372 | r - mid, c - mid, value); 373 | } 374 | } 375 | return a; 376 | } 377 | 378 | /* 379 | * Compute sum of squares of elements of matrix 380 | */ 381 | static Real mag(int depth, Matrix a) 382 | { 383 | Real res = 0.0; 384 | if (!a) 385 | return res; 386 | 387 | if (depth == BLOCK_DEPTH) { 388 | LeafNode *A = (LeafNode *) a; 389 | int i, j; 390 | for (i = 0; i < 
BLOCK_SIZE; i++) 391 | for (j = 0; j < BLOCK_SIZE; j++) 392 | res += BLOCK(A->block, i, j) * BLOCK(A->block, i, j); 393 | } else { 394 | depth--; 395 | res += mag(depth, a->child[_00]); 396 | res += mag(depth, a->child[_01]); 397 | res += mag(depth, a->child[_10]); 398 | res += mag(depth, a->child[_11]); 399 | } 400 | return res; 401 | } 402 | 403 | /*************************************************************\ 404 | * Cholesky algorithm 405 | \*************************************************************/ 406 | 407 | /* 408 | * Perform R -= A * Transpose(B) 409 | * if lower==1, update only lower-triangular part of R 410 | */ 411 | fibril static 412 | Matrix mul_and_subT(int depth, int lower, Matrix a, Matrix b, Matrix r) 413 | { 414 | if (depth == BLOCK_DEPTH) { 415 | LeafNode *A = (LeafNode *) a; 416 | LeafNode *B = (LeafNode *) b; 417 | LeafNode *R; 418 | 419 | if (r == NULL) { 420 | r = new_block_leaf(); 421 | R = (LeafNode *) r; 422 | block_zero(R->block); 423 | } else 424 | R = (LeafNode *) r; 425 | 426 | if (lower) 427 | block_schur_half(R->block, A->block, B->block); 428 | else 429 | block_schur_full(R->block, A->block, B->block); 430 | } else { 431 | Matrix r00, r01, r10, r11; 432 | 433 | depth--; 434 | 435 | if (r != NULL) { 436 | r00 = r->child[_00]; 437 | r01 = r->child[_01]; 438 | r10 = r->child[_10]; 439 | r11 = r->child[_11]; 440 | } else { 441 | r00 = NULL; 442 | r01 = NULL; 443 | r10 = NULL; 444 | r11 = NULL; 445 | } 446 | 447 | fibril_t fr; 448 | fibril_init(&fr); 449 | 450 | if (a->child[_00] && b->child[TR_00]) 451 | fibril_fork(&fr, &r00, mul_and_subT, (depth, lower, 452 | a->child[_00], b->child[TR_00], 453 | r00)); 454 | 455 | if (!lower && a->child[_00] && b->child[TR_01]) 456 | fibril_fork(&fr, &r01, mul_and_subT, (depth, 0, 457 | a->child[_00], b->child[TR_01], 458 | r01)); 459 | 460 | if (a->child[_10] && b->child[TR_00]) 461 | fibril_fork(&fr, &r10, mul_and_subT, (depth, 0, 462 | a->child[_10], b->child[TR_00], 463 | r10)); 464 | 465 | 
if (a->child[_10] && b->child[TR_01]) 466 | fibril_fork(&fr, &r11, mul_and_subT, (depth, lower, 467 | a->child[_10], b->child[TR_01], 468 | r11)); 469 | 470 | fibril_join(&fr); 471 | 472 | if (a->child[_01] && b->child[TR_10]) 473 | fibril_fork(&fr, &r00, mul_and_subT, (depth, lower, 474 | a->child[_01], b->child[TR_10], 475 | r00)); 476 | 477 | if (!lower && a->child[_01] && b->child[TR_11]) 478 | fibril_fork(&fr, &r01, mul_and_subT, (depth, 0, 479 | a->child[_01], b->child[TR_11], 480 | r01)); 481 | 482 | if (a->child[_11] && b->child[TR_10]) 483 | fibril_fork(&fr, &r10, mul_and_subT, (depth, 0, 484 | a->child[_11], b->child[TR_10], 485 | r10)); 486 | 487 | if (a->child[_11] && b->child[TR_11]) 488 | fibril_fork(&fr, &r11, mul_and_subT, (depth, lower, 489 | a->child[_11], b->child[TR_11], 490 | r11)); 491 | 492 | fibril_join(&fr); 493 | 494 | if (r == NULL) { 495 | if (r00 || r01 || r10 || r11) 496 | r = new_internal(r00, r01, r10, r11); 497 | } else { 498 | r->child[_00] = r00; 499 | r->child[_01] = r01; 500 | r->child[_10] = r10; 501 | r->child[_11] = r11; 502 | } 503 | } 504 | return r; 505 | } 506 | 507 | /* 508 | * Perform substitution to solve for B in BL = A 509 | * Returns B in place of A. 
510 | */ 511 | fibril static Matrix backsub(int depth, Matrix a, Matrix l) 512 | { 513 | if (depth == BLOCK_DEPTH) { 514 | LeafNode *A = (LeafNode *) a; 515 | LeafNode *L = (LeafNode *) l; 516 | block_backsub(A->block, L->block); 517 | } else { 518 | Matrix a00, a01, a10, a11; 519 | Matrix l00, l10, l11; 520 | 521 | depth--; 522 | 523 | a00 = a->child[_00]; 524 | a01 = a->child[_01]; 525 | a10 = a->child[_10]; 526 | a11 = a->child[_11]; 527 | 528 | l00 = l->child[_00]; 529 | l10 = l->child[_10]; 530 | l11 = l->child[_11]; 531 | 532 | fibril_t fr; 533 | fibril_init(&fr); 534 | 535 | if (a00) 536 | fibril_fork(&fr, &a00, backsub, (depth, a00, l00)); 537 | if (a10) 538 | fibril_fork(&fr, &a10, backsub, (depth, a10, l00)); 539 | 540 | fibril_join(&fr); 541 | 542 | if (a00 && l10) 543 | fibril_fork(&fr, &a01, mul_and_subT, (depth, 0, a00, l10, a01)); 544 | if (a10 && l10) 545 | fibril_fork(&fr, &a11, mul_and_subT, (depth, 0, a10, l10, a11)); 546 | 547 | fibril_join(&fr); 548 | 549 | if (a01) 550 | fibril_fork(&fr, &a01, backsub, (depth, a01, l11)); 551 | if (a11) 552 | fibril_fork(&fr, &a11, backsub, (depth, a11, l11)); 553 | 554 | fibril_join(&fr); 555 | 556 | a->child[_00] = a00; 557 | a->child[_01] = a01; 558 | a->child[_10] = a10; 559 | a->child[_11] = a11; 560 | } 561 | 562 | return a; 563 | } 564 | 565 | /* 566 | * Compute Cholesky factorization of A. 
567 | */ 568 | fibril static Matrix cholesky(int depth, Matrix a) 569 | { 570 | if (depth == BLOCK_DEPTH) { 571 | LeafNode *A = (LeafNode *) a; 572 | block_cholesky(A->block); 573 | } else { 574 | Matrix a00, a10, a11; 575 | 576 | depth--; 577 | 578 | a00 = a->child[_00]; 579 | a10 = a->child[_10]; 580 | a11 = a->child[_11]; 581 | 582 | if (!a10) { 583 | fibril_t fr; 584 | fibril_init(&fr); 585 | fibril_fork(&fr, &a00, cholesky, (depth, a00)); 586 | a11 = cholesky(depth, a11); 587 | fibril_join(&fr); 588 | } else { 589 | a00 = cholesky(depth, a00); 590 | a10 = backsub(depth, a10, a00); 591 | a11 = mul_and_subT(depth, 1, a10, a10, a11); 592 | a11 = cholesky(depth, a11); 593 | } 594 | a->child[_00] = a00; 595 | a->child[_10] = a10; 596 | a->child[_11] = a11; 597 | } 598 | return a; 599 | } 600 | 601 | static int logarithm(int size) 602 | { 603 | int k = 0; 604 | 605 | while ((1 << k) < size) 606 | k++; 607 | return k; 608 | } 609 | 610 | void init() 611 | { 612 | /* generate random matrix */ 613 | depth = logarithm(n); 614 | 615 | /* diagonal elements */ 616 | int i; 617 | for (i = 0; i < n; i++) 618 | A = set_matrix(depth, A, i, i, 1.0); 619 | 620 | /* off-diagonal elements */ 621 | for (i = 0; i < nonzeros - n; i++) { 622 | int r, c; 623 | 624 | do { 625 | r = rand() % n; 626 | c = rand() % n; 627 | } while (r <= c || get_matrix(depth, A, r, c) != 0.0); 628 | 629 | A = set_matrix(depth, A, r, c, 0.1); 630 | } 631 | 632 | /* extend to power of two n with identity matrix */ 633 | for (i = n; i < (1 << depth); i++) { 634 | A = set_matrix(depth, A, i, i, 1.0); 635 | } 636 | } 637 | 638 | void prep() 639 | { 640 | free_matrix(depth, R); 641 | R = copy_matrix(depth, A); 642 | } 643 | 644 | void test() 645 | { 646 | R = cholesky(depth, R); 647 | } 648 | 649 | int verify() 650 | { 651 | int fail = 0; 652 | 653 | #ifndef BENCHMARK 654 | /* test - make sure R * Transpose(R) == A */ 655 | /* compute || A - R * Transpose(R) || */ 656 | A = mul_and_subT(depth, 1, R, R, A); 657 
| Real error = mag(depth, A); 658 | fail = (error > 0.00001); 659 | #endif 660 | 661 | free_matrix(depth, A); 662 | free_matrix(depth, R); 663 | return fail; 664 | } 665 | -------------------------------------------------------------------------------- /test/strassen.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 1996 Massachusetts Institute of Technology 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining 5 | * a copy of this software and associated documentation files (the 6 | * "Software"), to use, copy, modify, and distribute the Software without 7 | * restriction, provided the Software, including any modified copies made 8 | * under this license, is not distributed for a fee, subject to 9 | * the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be 12 | * included in all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 17 | * IN NO EVENT SHALL THE MASSACHUSETTS INSTITUTE OF TECHNOLOGY BE LIABLE 18 | * FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF 19 | * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | * 22 | * Except as contained in this notice, the name of the Massachusetts 23 | * Institute of Technology shall not be used in advertising or otherwise 24 | * to promote the sale, use or other dealings in this Software without 25 | * prior written authorization from the Massachusetts Institute of 26 | * Technology. 
27 | * 28 | */ 29 | 30 | #include 31 | #include 32 | #include 33 | #include "test.h" 34 | 35 | #define SizeAtWhichDivideAndConquerIsMoreEfficient 64 36 | #define SizeAtWhichNaiveAlgorithmIsMoreEfficient 16 37 | #define CacheBlockSizeInBytes 32 38 | 39 | /* The real numbers we are using --- either double or float */ 40 | typedef double REAL; 41 | typedef unsigned long PTR; 42 | 43 | /* maximum tolerable relative error (for the checking routine) */ 44 | #define EPSILON (1.0E-6) 45 | 46 | /* 47 | * Matrices are stored in row-major order; A is a pointer to 48 | * the first element of the matrix, and an is the number of elements 49 | * between two rows. This macro produces the element A[i,j] 50 | * given A, an, i and j 51 | */ 52 | #define ELEM(A, an, i, j) (A[(i) * (an) + (j)]) 53 | 54 | #ifndef BENCHMARK 55 | int n = 512; 56 | #else 57 | int n = 4096; 58 | #endif 59 | 60 | static REAL * A, * B, * C; 61 | 62 | /* 63 | * Naive sequential algorithm, for comparison purposes 64 | */ 65 | void matrixmul(int n, REAL * A, int an, REAL * B, int bn, REAL * C, int cn) 66 | { 67 | int i, j, k; 68 | REAL s; 69 | 70 | for (i = 0; i < n; ++i) 71 | for (j = 0; j < n; ++j) { 72 | s = 0.0; 73 | for (k = 0; k < n; ++k) 74 | s += ELEM(A, an, i, k) * ELEM(B, bn, k, j); 75 | 76 | ELEM(C, cn, i, j) = s; 77 | } 78 | } 79 | 80 | /***************************************************************************** 81 | ** 82 | ** FastNaiveMatrixMultiply 83 | ** 84 | ** For small to medium sized matrices A, B, and C of size 85 | ** MatrixSize * MatrixSize this function performs the operation 86 | ** C = A x B efficiently. 87 | ** 88 | ** Note MatrixSize must be divisible by 8. 89 | ** 90 | ** INPUT: 91 | ** C = (*C WRITE) Address of top left element of matrix C. 92 | ** A = (*A IS READ ONLY) Address of top left element of matrix A. 93 | ** B = (*B IS READ ONLY) Address of top left element of matrix B. 
94 | ** MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n) 95 | ** RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1] 96 | ** RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1] 97 | ** RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1] 98 | ** 99 | ** OUTPUT: 100 | ** C = (*C WRITE) Matrix C contains A x B. (Initial value of *C undefined.) 101 | ** 102 | *****************************************************************************/ 103 | static void FastNaiveMatrixMultiply( 104 | REAL * C, REAL * A, REAL * B, unsigned MatrixSize, 105 | unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB) 106 | { 107 | /* Assumes size of real is 8 bytes */ 108 | PTR RowWidthBInBytes = RowWidthB << 3; 109 | PTR RowWidthAInBytes = RowWidthA << 3; 110 | PTR MatrixWidthInBytes = MatrixSize << 3; 111 | PTR RowIncrementC = ( RowWidthC - MatrixSize) << 3; 112 | unsigned Horizontal, Vertical; 113 | 114 | REAL *ARowStart = A; 115 | for (Vertical = 0; Vertical < MatrixSize; Vertical++) { 116 | for (Horizontal = 0; Horizontal < MatrixSize; Horizontal += 8) { 117 | REAL *BColumnStart = B + Horizontal; 118 | REAL FirstARowValue = *ARowStart++; 119 | 120 | REAL Sum0 = FirstARowValue * (*BColumnStart); 121 | REAL Sum1 = FirstARowValue * (*(BColumnStart+1)); 122 | REAL Sum2 = FirstARowValue * (*(BColumnStart+2)); 123 | REAL Sum3 = FirstARowValue * (*(BColumnStart+3)); 124 | REAL Sum4 = FirstARowValue * (*(BColumnStart+4)); 125 | REAL Sum5 = FirstARowValue * (*(BColumnStart+5)); 126 | REAL Sum6 = FirstARowValue * (*(BColumnStart+6)); 127 | REAL Sum7 = FirstARowValue * (*(BColumnStart+7)); 128 | 129 | unsigned Products; 130 | for (Products = 1; Products < MatrixSize; Products++) { 131 | REAL ARowValue = *ARowStart++; 132 | BColumnStart = (REAL*) (((PTR) BColumnStart) + RowWidthBInBytes); 133 | 134 | Sum0 += ARowValue * (*BColumnStart); 135 | Sum1 += ARowValue * (*(BColumnStart+1)); 136 | Sum2 += ARowValue * (*(BColumnStart+2)); 
137 | Sum3 += ARowValue * (*(BColumnStart+3)); 138 | Sum4 += ARowValue * (*(BColumnStart+4)); 139 | Sum5 += ARowValue * (*(BColumnStart+5)); 140 | Sum6 += ARowValue * (*(BColumnStart+6)); 141 | Sum7 += ARowValue * (*(BColumnStart+7)); 142 | } 143 | ARowStart = (REAL*) ( ((PTR) ARowStart) - MatrixWidthInBytes); 144 | 145 | *(C) = Sum0; 146 | *(C+1) = Sum1; 147 | *(C+2) = Sum2; 148 | *(C+3) = Sum3; 149 | *(C+4) = Sum4; 150 | *(C+5) = Sum5; 151 | *(C+6) = Sum6; 152 | *(C+7) = Sum7; 153 | C+=8; 154 | } 155 | 156 | ARowStart = (REAL*) ( ((PTR) ARowStart) + RowWidthAInBytes ); 157 | C = (REAL*) ( ((PTR) C) + RowIncrementC ); 158 | } 159 | } 160 | 161 | /***************************************************************************** 162 | ** 163 | ** FastAdditiveNaiveMatrixMultiply 164 | ** 165 | ** For small to medium sized matrices A, B, and C of size 166 | ** MatrixSize * MatrixSize this function performs the operation 167 | ** C += A x B efficiently. 168 | ** 169 | ** Note MatrixSize must be divisible by 8. 170 | ** 171 | ** INPUT: 172 | ** C = (*C READ/WRITE) Address of top left element of matrix C. 173 | ** A = (*A IS READ ONLY) Address of top left element of matrix A. 174 | ** B = (*B IS READ ONLY) Address of top left element of matrix B. 175 | ** MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n) 176 | ** RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1] 177 | ** RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1] 178 | ** RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1] 179 | ** 180 | ** OUTPUT: 181 | ** C = (*C READ/WRITE) Matrix C contains C + A x B. 
182 | ** 183 | *****************************************************************************/ 184 | static void FastAdditiveNaiveMatrixMultiply( 185 | REAL * C, REAL * A, REAL * B, unsigned MatrixSize, 186 | unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB) 187 | { 188 | /* Assumes size of real is 8 bytes */ 189 | PTR RowWidthBInBytes = RowWidthB << 3; 190 | PTR RowWidthAInBytes = RowWidthA << 3; 191 | PTR MatrixWidthInBytes = MatrixSize << 3; 192 | PTR RowIncrementC = ( RowWidthC - MatrixSize) << 3; 193 | unsigned Horizontal, Vertical; 194 | 195 | REAL *ARowStart = A; 196 | for (Vertical = 0; Vertical < MatrixSize; Vertical++) { 197 | for (Horizontal = 0; Horizontal < MatrixSize; Horizontal += 8) { 198 | REAL *BColumnStart = B + Horizontal; 199 | 200 | REAL Sum0 = *C; 201 | REAL Sum1 = *(C+1); 202 | REAL Sum2 = *(C+2); 203 | REAL Sum3 = *(C+3); 204 | REAL Sum4 = *(C+4); 205 | REAL Sum5 = *(C+5); 206 | REAL Sum6 = *(C+6); 207 | REAL Sum7 = *(C+7); 208 | 209 | unsigned Products; 210 | for (Products = 0; Products < MatrixSize; Products++) { 211 | REAL ARowValue = *ARowStart++; 212 | 213 | Sum0 += ARowValue * (*BColumnStart); 214 | Sum1 += ARowValue * (*(BColumnStart+1)); 215 | Sum2 += ARowValue * (*(BColumnStart+2)); 216 | Sum3 += ARowValue * (*(BColumnStart+3)); 217 | Sum4 += ARowValue * (*(BColumnStart+4)); 218 | Sum5 += ARowValue * (*(BColumnStart+5)); 219 | Sum6 += ARowValue * (*(BColumnStart+6)); 220 | Sum7 += ARowValue * (*(BColumnStart+7)); 221 | 222 | BColumnStart = (REAL*) (((PTR) BColumnStart) + RowWidthBInBytes); 223 | 224 | } 225 | ARowStart = (REAL*) ( ((PTR) ARowStart) - MatrixWidthInBytes); 226 | 227 | *(C) = Sum0; 228 | *(C+1) = Sum1; 229 | *(C+2) = Sum2; 230 | *(C+3) = Sum3; 231 | *(C+4) = Sum4; 232 | *(C+5) = Sum5; 233 | *(C+6) = Sum6; 234 | *(C+7) = Sum7; 235 | C+=8; 236 | } 237 | 238 | ARowStart = (REAL*) ( ((PTR) ARowStart) + RowWidthAInBytes ); 239 | C = (REAL*) ( ((PTR) C) + RowIncrementC ); 240 | } 241 | } 242 | 243 | 244 | 
/***************************************************************************** 245 | ** 246 | ** MultiplyByDivideAndConquer 247 | ** 248 | ** For medium to medium-large (would you like fries with that) sized 249 | ** matrices A, B, and C of size MatrixSize * MatrixSize this function 250 | ** efficiently performs the operation 251 | ** C = A x B (if AdditiveMode == 0) 252 | ** C += A x B (if AdditiveMode != 0) 253 | ** 254 | ** Note MatrixSize must be divisible by 16. 255 | ** 256 | ** INPUT: 257 | ** C = (*C READ/WRITE) Address of top left element of matrix C. 258 | ** A = (*A IS READ ONLY) Address of top left element of matrix A. 259 | ** B = (*B IS READ ONLY) Address of top left element of matrix B. 260 | ** MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n) 261 | ** RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1] 262 | ** RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1] 263 | ** RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1] 264 | ** AdditiveMode = 0 if we want C = A x B, otherwise we'll do C += A x B 265 | ** 266 | ** OUTPUT: 267 | ** C (+)= A x B. 
(+ if AdditiveMode != 0) 268 | ** 269 | *****************************************************************************/ 270 | void MultiplyByDivideAndConquer( 271 | REAL * C, REAL * A, REAL * B, unsigned MatrixSize, 272 | unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, 273 | int AdditiveMode) 274 | { 275 | #define A00 A 276 | #define B00 B 277 | #define C00 C 278 | 279 | REAL *A01, *A10, *A11, *B01, *B10, *B11, *C01, *C10, *C11; 280 | unsigned QuadrantSize = MatrixSize >> 1; 281 | 282 | /* partition the matrix */ 283 | A01 = A00 + QuadrantSize; 284 | A10 = A00 + RowWidthA * QuadrantSize; 285 | A11 = A10 + QuadrantSize; 286 | 287 | B01 = B00 + QuadrantSize; 288 | B10 = B00 + RowWidthB * QuadrantSize; 289 | B11 = B10 + QuadrantSize; 290 | 291 | C01 = C00 + QuadrantSize; 292 | C10 = C00 + RowWidthC * QuadrantSize; 293 | C11 = C10 + QuadrantSize; 294 | 295 | if (QuadrantSize > SizeAtWhichNaiveAlgorithmIsMoreEfficient) { 296 | MultiplyByDivideAndConquer(C00, A00, B00, QuadrantSize, 297 | RowWidthC, RowWidthA, RowWidthB, AdditiveMode); 298 | MultiplyByDivideAndConquer(C01, A00, B01, QuadrantSize, 299 | RowWidthC, RowWidthA, RowWidthB, AdditiveMode); 300 | MultiplyByDivideAndConquer(C11, A10, B01, QuadrantSize, 301 | RowWidthC, RowWidthA, RowWidthB, AdditiveMode); 302 | MultiplyByDivideAndConquer(C10, A10, B00, QuadrantSize, 303 | RowWidthC, RowWidthA, RowWidthB, AdditiveMode); 304 | MultiplyByDivideAndConquer(C00, A01, B10, QuadrantSize, 305 | RowWidthC, RowWidthA, RowWidthB, 1); 306 | MultiplyByDivideAndConquer(C01, A01, B11, QuadrantSize, 307 | RowWidthC, RowWidthA, RowWidthB, 1); 308 | MultiplyByDivideAndConquer(C11, A11, B11, QuadrantSize, 309 | RowWidthC, RowWidthA, RowWidthB, 1); 310 | MultiplyByDivideAndConquer(C10, A11, B10, QuadrantSize, 311 | RowWidthC, RowWidthA, RowWidthB, 1); 312 | } else { 313 | if (AdditiveMode) { 314 | FastAdditiveNaiveMatrixMultiply(C00, A00, B00, QuadrantSize, 315 | RowWidthC, RowWidthA, RowWidthB); 316 | 
FastAdditiveNaiveMatrixMultiply(C01, A00, B01, QuadrantSize, 317 | RowWidthC, RowWidthA, RowWidthB); 318 | FastAdditiveNaiveMatrixMultiply(C11, A10, B01, QuadrantSize, 319 | RowWidthC, RowWidthA, RowWidthB); 320 | FastAdditiveNaiveMatrixMultiply(C10, A10, B00, QuadrantSize, 321 | RowWidthC, RowWidthA, RowWidthB); 322 | } else { 323 | FastNaiveMatrixMultiply(C00, A00, B00, QuadrantSize, 324 | RowWidthC, RowWidthA, RowWidthB); 325 | 326 | FastNaiveMatrixMultiply(C01, A00, B01, QuadrantSize, 327 | RowWidthC, RowWidthA, RowWidthB); 328 | 329 | FastNaiveMatrixMultiply(C11, A10, B01, QuadrantSize, 330 | RowWidthC, RowWidthA, RowWidthB); 331 | 332 | FastNaiveMatrixMultiply(C10, A10, B00, QuadrantSize, 333 | RowWidthC, RowWidthA, RowWidthB); 334 | } 335 | 336 | FastAdditiveNaiveMatrixMultiply(C00, A01, B10, QuadrantSize, 337 | RowWidthC, RowWidthA, RowWidthB); 338 | FastAdditiveNaiveMatrixMultiply(C01, A01, B11, QuadrantSize, 339 | RowWidthC, RowWidthA, RowWidthB); 340 | FastAdditiveNaiveMatrixMultiply(C11, A11, B11, QuadrantSize, 341 | RowWidthC, RowWidthA, RowWidthB); 342 | FastAdditiveNaiveMatrixMultiply(C10, A11, B10, QuadrantSize, 343 | RowWidthC, RowWidthA, RowWidthB); 344 | } 345 | 346 | return; 347 | } 348 | 349 | 350 | /***************************************************************************** 351 | ** 352 | ** OptimizedStrassenMultiply 353 | ** 354 | ** For large matrices A, B, and C of size MatrixSize * MatrixSize this 355 | ** function performs the operation C = A x B efficiently. 356 | ** 357 | ** INPUT: 358 | ** C = (*C WRITE) Address of top left element of matrix C. 359 | ** A = (*A IS READ ONLY) Address of top left element of matrix A. 360 | ** B = (*B IS READ ONLY) Address of top left element of matrix B. 
361 | ** MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n) 362 | ** RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1] 363 | ** RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1] 364 | ** RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1] 365 | ** OUTPUT: 366 | ** C = (*C WRITE) Matrix C contains A x B. (Initial value of *C undefined.) 367 | ** 368 | *****************************************************************************/ 369 | fibril static void OptimizedStrassenMultiply( 370 | REAL * C, REAL * A, REAL * B, unsigned MatrixSize, 371 | unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB) 372 | { 373 | unsigned QuadrantSize = MatrixSize >> 1; /* MatixSize / 2 */ 374 | unsigned QuadrantSizeInBytes = sizeof(REAL) * QuadrantSize * 375 | QuadrantSize + 32; 376 | unsigned Column, Row; 377 | 378 | /************************************************************************ 379 | ** For each matrix A, B, and C, we'll want pointers to each quandrant 380 | ** in the matrix. 
These quandrants will be addressed as follows: 381 | ** -- -- 382 | ** | A11 A12 | 383 | ** | | 384 | ** | A21 A22 | 385 | ** -- -- 386 | ************************************************************************/ 387 | REAL /**A11, *B11, *C11,*/ *A12, *B12, *C12, 388 | *A21, *B21, *C21, *A22, *B22, *C22; 389 | 390 | REAL *S1,*S2,*S3,*S4,*S5,*S6,*S7,*S8,*M2,*M5,*T1sMULT; 391 | #define NumberOfVariables 11 392 | 393 | PTR TempMatrixOffset = 0; 394 | PTR MatrixOffsetA = 0; 395 | PTR MatrixOffsetB = 0; 396 | 397 | char *Heap; 398 | void *StartHeap; 399 | 400 | /* Distance between the end of a matrix row and the start of the next row */ 401 | PTR RowIncrementA = ( RowWidthA - QuadrantSize ) << 3; 402 | PTR RowIncrementB = ( RowWidthB - QuadrantSize ) << 3; 403 | PTR RowIncrementC = ( RowWidthC - QuadrantSize ) << 3; 404 | 405 | if (MatrixSize <= SizeAtWhichDivideAndConquerIsMoreEfficient) { 406 | MultiplyByDivideAndConquer(C, A, B, MatrixSize, 407 | RowWidthC, RowWidthA, RowWidthB, 0); 408 | return; 409 | } 410 | 411 | /* Initialize quandrant matrices */ 412 | #define A11 A 413 | #define B11 B 414 | #define C11 C 415 | A12 = A11 + QuadrantSize; 416 | B12 = B11 + QuadrantSize; 417 | C12 = C11 + QuadrantSize; 418 | A21 = A + (RowWidthA * QuadrantSize); 419 | B21 = B + (RowWidthB * QuadrantSize); 420 | C21 = C + (RowWidthC * QuadrantSize); 421 | A22 = A21 + QuadrantSize; 422 | B22 = B21 + QuadrantSize; 423 | C22 = C21 + QuadrantSize; 424 | 425 | /* Allocate Heap Space Here */ 426 | StartHeap = Heap = malloc(QuadrantSizeInBytes * NumberOfVariables); 427 | /* ensure that heap is on cache boundary */ 428 | if ( ((PTR) Heap) & 31) 429 | Heap = (char*) ( ((PTR) Heap) + 32 - ( ((PTR) Heap) & 31) ); 430 | 431 | /* Distribute the heap space over the variables */ 432 | S1 = (REAL*) Heap; Heap += QuadrantSizeInBytes; 433 | S2 = (REAL*) Heap; Heap += QuadrantSizeInBytes; 434 | S3 = (REAL*) Heap; Heap += QuadrantSizeInBytes; 435 | S4 = (REAL*) Heap; Heap += QuadrantSizeInBytes; 436 | 
S5 = (REAL*) Heap; Heap += QuadrantSizeInBytes; 437 | S6 = (REAL*) Heap; Heap += QuadrantSizeInBytes; 438 | S7 = (REAL*) Heap; Heap += QuadrantSizeInBytes; 439 | S8 = (REAL*) Heap; Heap += QuadrantSizeInBytes; 440 | M2 = (REAL*) Heap; Heap += QuadrantSizeInBytes; 441 | M5 = (REAL*) Heap; Heap += QuadrantSizeInBytes; 442 | T1sMULT = (REAL*) Heap; Heap += QuadrantSizeInBytes; 443 | 444 | /*************************************************************************** 445 | ** Step through all columns row by row (vertically) 446 | ** (jumps in memory by RowWidth => bad locality) 447 | ** (but we want the best locality on the innermost loop) 448 | ***************************************************************************/ 449 | for (Row = 0; Row < QuadrantSize; Row++) { 450 | 451 | /************************************************************************* 452 | ** Step through each row horizontally (addressing elements in each column) 453 | ** (jumps linearly througn memory => good locality) 454 | *************************************************************************/ 455 | for (Column = 0; Column < QuadrantSize; Column++) { 456 | 457 | /*********************************************************** 458 | ** Within this loop, the following holds for MatrixOffset: 459 | ** MatrixOffset = (Row * RowWidth) + Column 460 | ** (note: that the unit of the offset is number of reals) 461 | ***********************************************************/ 462 | /* Element of Global Matrix, such as A, B, C */ 463 | #define E(Matrix) (* (REAL*) ( ((PTR) Matrix) + TempMatrixOffset ) ) 464 | #define EA(Matrix) (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetA ) ) 465 | #define EB(Matrix) (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetB ) ) 466 | 467 | /* FIXME - may pay to expand these out - got higher speed-ups below */ 468 | /* S4 = A12 - ( S2 = ( S1 = A21 + A22 ) - A11 ) */ 469 | E(S4) = EA(A12) - ( E(S2) = ( E(S1) = EA(A21) + EA(A22) ) - EA(A11) ); 470 | 471 | /* S8 = (S6 = B22 - ( S5 = B12 - B11 
) ) - B21 */ 472 | E(S8) = ( E(S6) = EB(B22) - ( E(S5) = EB(B12) - EB(B11) ) ) - EB(B21); 473 | 474 | /* S3 = A11 - A21 */ 475 | E(S3) = EA(A11) - EA(A21); 476 | 477 | /* S7 = B22 - B12 */ 478 | E(S7) = EB(B22) - EB(B12); 479 | 480 | TempMatrixOffset += sizeof(REAL); 481 | MatrixOffsetA += sizeof(REAL); 482 | MatrixOffsetB += sizeof(REAL); 483 | } /* end row loop*/ 484 | 485 | MatrixOffsetA += RowIncrementA; 486 | MatrixOffsetB += RowIncrementB; 487 | } /* end column loop */ 488 | 489 | fibril_t fr; 490 | fibril_init(&fr); 491 | 492 | /* M2 = A11 x B11 */ 493 | fibril_fork(&fr, OptimizedStrassenMultiply, 494 | (M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB)); 495 | 496 | /* M5 = S1 * S5 */ 497 | fibril_fork(&fr, OptimizedStrassenMultiply, 498 | (M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize)); 499 | 500 | /* Step 1 of T1 = S2 x S6 + M2 */ 501 | fibril_fork(&fr, OptimizedStrassenMultiply, 502 | (T1sMULT, S2, S6, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize)); 503 | 504 | /* Step 1 of T2 = T1 + S3 x S7 */ 505 | fibril_fork(&fr, OptimizedStrassenMultiply, 506 | (C22, S3, S7, QuadrantSize, RowWidthC, QuadrantSize, QuadrantSize)); 507 | 508 | /* Step 1 of C11 = M2 + A12 * B21 */ 509 | fibril_fork(&fr, OptimizedStrassenMultiply, 510 | (C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB)); 511 | 512 | /* Step 1 of C12 = S4 x B22 + T1 + M5 */ 513 | fibril_fork(&fr, OptimizedStrassenMultiply, 514 | (C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB)); 515 | 516 | /* Step 1 of C21 = T2 - A22 * S8 */ 517 | OptimizedStrassenMultiply(C21, A22, S8, QuadrantSize, RowWidthC, 518 | RowWidthA, QuadrantSize); 519 | 520 | fibril_join(&fr); 521 | 522 | for (Row = 0; Row < QuadrantSize; Row++) { 523 | for (Column = 0; Column < QuadrantSize; Column += 4) { 524 | REAL LocalM5_0 = *(M5); 525 | REAL LocalM5_1 = *(M5+1); 526 | REAL LocalM5_2 = *(M5+2); 527 | REAL LocalM5_3 = *(M5+3); 528 | REAL LocalM2_0 = *(M2); 529 | 
REAL LocalM2_1 = *(M2+1); 530 | REAL LocalM2_2 = *(M2+2); 531 | REAL LocalM2_3 = *(M2+3); 532 | REAL T1_0 = *(T1sMULT) + LocalM2_0; 533 | REAL T1_1 = *(T1sMULT+1) + LocalM2_1; 534 | REAL T1_2 = *(T1sMULT+2) + LocalM2_2; 535 | REAL T1_3 = *(T1sMULT+3) + LocalM2_3; 536 | REAL T2_0 = *(C22) + T1_0; 537 | REAL T2_1 = *(C22+1) + T1_1; 538 | REAL T2_2 = *(C22+2) + T1_2; 539 | REAL T2_3 = *(C22+3) + T1_3; 540 | (*(C11)) += LocalM2_0; 541 | (*(C11+1)) += LocalM2_1; 542 | (*(C11+2)) += LocalM2_2; 543 | (*(C11+3)) += LocalM2_3; 544 | (*(C12)) += LocalM5_0 + T1_0; 545 | (*(C12+1)) += LocalM5_1 + T1_1; 546 | (*(C12+2)) += LocalM5_2 + T1_2; 547 | (*(C12+3)) += LocalM5_3 + T1_3; 548 | (*(C22)) = LocalM5_0 + T2_0; 549 | (*(C22+1)) = LocalM5_1 + T2_1; 550 | (*(C22+2)) = LocalM5_2 + T2_2; 551 | (*(C22+3)) = LocalM5_3 + T2_3; 552 | (*(C21 )) = (- *(C21 )) + T2_0; 553 | (*(C21+1)) = (- *(C21+1)) + T2_1; 554 | (*(C21+2)) = (- *(C21+2)) + T2_2; 555 | (*(C21+3)) = (- *(C21+3)) + T2_3; 556 | M5 += 4; 557 | M2 += 4; 558 | T1sMULT += 4; 559 | C11 += 4; 560 | C12 += 4; 561 | C21 += 4; 562 | C22 += 4; 563 | } 564 | 565 | C11 = (REAL*) ( ((PTR) C11 ) + RowIncrementC); 566 | C12 = (REAL*) ( ((PTR) C12 ) + RowIncrementC); 567 | C21 = (REAL*) ( ((PTR) C21 ) + RowIncrementC); 568 | C22 = (REAL*) ( ((PTR) C22 ) + RowIncrementC); 569 | } 570 | 571 | free(StartHeap); 572 | } 573 | 574 | static void strassen(int n, REAL * A, int an, REAL * B, int bn, 575 | REAL * C, int cn) { 576 | OptimizedStrassenMultiply(C, A, B, n, cn, bn, an); 577 | } 578 | 579 | /* 580 | * Set an n by n matrix A to random values. The distance between 581 | * rows is an 582 | */ 583 | void init_matrix(int n, REAL *A, int an) 584 | { 585 | int i, j; 586 | 587 | for (i = 0; i < n; ++i) 588 | for (j = 0; j < n; ++j) 589 | ELEM(A, an, i, j) = ((double) rand()) / (double) RAND_MAX; 590 | } 591 | 592 | /* 593 | * Compare two matrices. Print an error message if they differ by 594 | * more than EPSILON. 
595 | */ 596 | int compare_matrix(int n, REAL *A, int an, REAL *B, int bn) 597 | { 598 | int i, j; 599 | REAL c; 600 | 601 | for (i = 0; i < n; ++i) 602 | for (j = 0; j < n; ++j) { 603 | /* compute the relative error c */ 604 | c = ELEM(A, an, i, j) - ELEM(B, bn, i, j); 605 | if (c < 0.0) 606 | c = -c; 607 | 608 | c = c / ELEM(A, an, i, j); 609 | if (c > EPSILON) { 610 | return 1; 611 | } 612 | } 613 | 614 | return 0; 615 | } 616 | 617 | void init() { 618 | A = malloc(n * n * sizeof(REAL)); 619 | B = malloc(n * n * sizeof(REAL)); 620 | C = malloc(n * n * sizeof(REAL)); 621 | 622 | init_matrix(n, A, n); 623 | init_matrix(n, B, n); 624 | } 625 | 626 | void prep() { 627 | } 628 | 629 | void test() { 630 | strassen(n, A, n, B, n, C, n); 631 | } 632 | 633 | int verify() { 634 | int fail = 0; 635 | 636 | #ifndef BENCHMARK 637 | REAL * E = malloc(n * n * sizeof(REAL)); 638 | matrixmul(n, A, n, B, n, E, n); 639 | fail = compare_matrix(n, E, n, C, n); 640 | if (fail > 0) printf("WRONG RESULT!\n"); 641 | #endif 642 | 643 | return fail; 644 | } 645 | --------------------------------------------------------------------------------