├── Makefile.am
├── bootstrap
├── .gitmodules
├── benchmark
    ├── serial
    │   └── Makefile.am
    ├── Makefile.am
    ├── cilkplus
    │   └── Makefile.am
    └── tbb
    │   └── Makefile.am
├── src
    ├── stats.c
    ├── pool.h
    ├── deque.h
    ├── stack.h
    ├── serial.h
    ├── mutex.h
    ├── param.h
    ├── Makefile.am
    ├── cilkplus.h
    ├── deque.c
    ├── mutex.c
    ├── safe.h
    ├── tbb.h
    ├── sync.h
    ├── stats.h
    ├── param.c
    ├── fibrile.h
    ├── fibril.h
    ├── fibrili.h
    ├── runtime.c
    ├── pool.c
    ├── stack.c
    ├── debug.h
    ├── fibrili.c
    └── fork.h
├── m4
    ├── fibril.m4
    └── acx_pthread.m4
├── test
    ├── Makefile.am
    ├── fib.c
    ├── nqueens.c
    ├── quicksort.c
    ├── integrate.c
    ├── test.h
    ├── matmul.c
    ├── knapsack.c
    ├── heat.c
    ├── fft.c
    ├── rectmul.c
    ├── lu.c
    ├── cholesky.c
    └── strassen.c
├── README.md
├── .gitignore
├── LICENSE
└── configure.ac


/Makefile.am:
--------------------------------------------------------------------------------
1 | ACLOCAL_AMFLAGS = -I m4
2 | SUBDIRS = src test benchmark
3 | 


--------------------------------------------------------------------------------
/bootstrap:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | autoreconf --install
4 | automake --add-missing --copy
5 | 
6 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "Hoard"]
2 | 	path = Hoard
3 | 	url = https://github.com/chaoran/Hoard
4 | 


--------------------------------------------------------------------------------
/benchmark/serial/Makefile.am:
--------------------------------------------------------------------------------
1 | include $(srcdir)/../Makefile.am
2 | AM_CPPFLAGS += -DFIBRIL_SERIAL
3 | 


--------------------------------------------------------------------------------
/benchmark/Makefile.am:
--------------------------------------------------------------------------------
1 | VPATH = $(top_srcdir)/test
2 | include $(top_srcdir)/test/Makefile.am
3 | AM_CPPFLAGS += -DBENCHMARK
4 | 


--------------------------------------------------------------------------------
/src/stats.c:
--------------------------------------------------------------------------------
1 | #include "stats.h"
2 | 
3 | #ifdef FIBRIL_STATS
4 | /** Storage for the counters declared extern in stats.h: one slot per stats_t entry. */
5 | struct _stats_counter_t _stats_table[STATS_LAST_ENTRY];
6 | 
7 | #endif
8 | 


--------------------------------------------------------------------------------
/benchmark/cilkplus/Makefile.am:
--------------------------------------------------------------------------------
1 | include $(srcdir)/../Makefile.am
2 | AM_CPPFLAGS += -DFIBRIL_CILKPLUS
3 | AM_CFLAGS = -fcilkplus
4 | AM_LDFLAGS += -lcilkrts
5 | 


--------------------------------------------------------------------------------
/src/pool.h:
--------------------------------------------------------------------------------
1 | #ifndef POOL_H
2 | #define POOL_H
3 | /* Pool interface; NOTE(review): presumably a pool of stacks (see the `stack` parameter) — confirm in pool.c. */
4 | void pool_put(void * stack);
5 | void * pool_take();
6 | 
7 | #endif /* end of include guard: POOL_H */
8 | 


--------------------------------------------------------------------------------
/benchmark/tbb/Makefile.am:
--------------------------------------------------------------------------------
1 | include $(srcdir)/../Makefile.am
2 | AM_CFLAGS = -std=c++11
3 | AM_CPPFLAGS += -x c++ -DFIBRIL_TBB -fpermissive
4 | AM_LDFLAGS += -ltbb -lstdc++
5 | 


--------------------------------------------------------------------------------
/src/deque.h:
--------------------------------------------------------------------------------
 1 | #ifndef DEQUE_H
 2 | #define DEQUE_H
 3 | 
 4 | #include "fibrili.h"
 5 | /** Short alias for struct _fibrili_deque_t (presumably defined in fibrili.h, included above). */
 6 | typedef struct _fibrili_deque_t deque_t;
 7 | /** Tries to take one fibril from `deq`; returns NULL when nothing could be stolen (see deque.c). */
 8 | struct _fibril_t * deque_steal(deque_t * deq);
 9 | 
10 | #endif /* end of include guard: DEQUE_H */
11 | 


--------------------------------------------------------------------------------
/src/stack.h:
--------------------------------------------------------------------------------
 1 | #ifndef STACK_H
 2 | #define STACK_H
 3 | 
 4 | #include "fibrili.h"
 5 | /* Stack management entry points; NOTE(review): their semantics live in stack.c, which is not visible here. */
 6 | void stack_init(int id);
 7 | void * stack_setup(struct _fibril_t * frptr);
 8 | void stack_reinstall(struct _fibril_t * frptr);
 9 | int stack_uninstall(struct _fibril_t * frptr);
10 | 
11 | #endif /* end of include guard: STACK_H */
12 | 


--------------------------------------------------------------------------------
/m4/fibril.m4:
--------------------------------------------------------------------------------
 1 | AC_DEFUN([FIBRIL_IF_ENABLED_NOHELP],[
 2 | case "$enable_[]patsubst([$1], -, _)" in
 3 |   '' | no) :
 4 |       $3 ;;
 5 |   *)  $2 ;;
 6 | esac
 7 | ])
 8 | 
 9 | AC_DEFUN([FIBRIL_IF_ENABLED],[
10 | AC_MSG_CHECKING(whether to enable $1)
11 | AC_ARG_ENABLE($1,AS_HELP_STRING(--enable-$1,[$2]))
12 | FIBRIL_IF_ENABLED_NOHELP([$1],[$3],[$4])
13 | AC_MSG_RESULT($enable_[]patsubst([$1], -, _))
14 | ])
15 | 
16 | 


--------------------------------------------------------------------------------
/src/serial.h:
--------------------------------------------------------------------------------
 1 | #ifndef FIBRIL_SERIAL_H
 2 | #define FIBRIL_SERIAL_H
 3 | /** Serial build: the fibril API collapses to plain, in-place function calls. */
 4 | #define fibril
 5 | #define fibril_t __attribute__((unused)) int
 6 | #define fibril_init(fp)
 7 | #define fibril_join(fp)
 8 | /** A fork is an ordinary call; `ag` is the parenthesized argument list and `rtp` may be any pointer expression. */
 9 | #define fibril_fork_nrt(fp, fn, ag) (fn ag)
10 | #define fibril_fork_wrt(fp, rtp, fn, ag) (*(rtp) = fn ag)
11 | /** There is no runtime to start or stop, and the processor count is always 1. */
12 | #define fibril_rt_init(n)
13 | #define fibril_rt_exit()
14 | #define fibril_rt_nprocs(n) (1)
15 | 
16 | #endif /* end of include guard: FIBRIL_SERIAL_H */
17 | 


--------------------------------------------------------------------------------
/src/mutex.h:
--------------------------------------------------------------------------------
 1 | #ifndef MUTEX_H
 2 | #define MUTEX_H
 3 | /** Value mutex_unlock() writes to a waiter's `flag` when it hands the lock over. */
 4 | #define MUTEX_LOCKED 1
 5 | /** Queue node of the lock; every contender passes in its own node (see mutex.c). */
 6 | typedef struct _mutex_t {
 7 |   struct _mutex_t * volatile next;
 8 |   volatile char flag;
 9 | } mutex_t __attribute__((aligned(128)));
10 | /** `mutex` points at the queue tail (NULL when free); `node` is the caller's own queue node. */
11 | void mutex_lock   (mutex_t * volatile * mutex, mutex_t * node);
12 | int  mutex_trylock(mutex_t * volatile * mutex, mutex_t * node);
13 | void mutex_unlock (mutex_t * volatile * mutex, mutex_t * node);
14 | 
15 | #endif /* end of include guard: MUTEX_H */
16 | 


--------------------------------------------------------------------------------
/src/param.h:
--------------------------------------------------------------------------------
 1 | #ifndef PARAM_H
 2 | #define PARAM_H
 3 | 
 4 | #include <stddef.h>
 5 | /** Runtime parameters; all four are assigned by param_init() in param.c. */
 6 | extern size_t PARAM_PAGE_SIZE;
 7 | extern void * PARAM_STACK_ADDR;
 8 | extern size_t PARAM_STACK_SIZE;
 9 | extern int PARAM_NPROCS;
10 | /** Page helpers; the masks are only meaningful when PARAM_PAGE_SIZE is a power of two. */
11 | #define PAGE_ALIGN_DOWN(x) ((void *) ((size_t) (x) & ~(PARAM_PAGE_SIZE - 1)))
12 | #define PAGE_ALIGNED(x) (0 == ((size_t) (x) & (PARAM_PAGE_SIZE - 1)))
13 | /** Full prototypes: param_init() takes the requested processor count, exactly as defined in param.c. */
14 | extern int param_nprocs(int n);
15 | extern void param_init(int n);
16 | 
17 | #endif /* end of include guard: PARAM_H */
18 | 


--------------------------------------------------------------------------------
/src/Makefile.am:
--------------------------------------------------------------------------------
 1 | lib_LTLIBRARIES = libfibril.la
 2 | 
 3 | include_HEADERS = fibril.h
 4 | 
 5 | pkginclude_HEADERS = \
 6 |                      cilkplus.h \
 7 |                      fibrile.h \
 8 |                      fibrili.h \
 9 |                      fork.h \
10 |                      serial.h \
11 |                      tbb.h
12 | 
13 | libfibril_la_SOURCES = deque.c \
14 |                        fibrili.c \
15 |                        param.c \
16 | 											 pool.c \
17 |                        runtime.c \
18 |                        stack.c \
19 |                        stats.c \
20 | 											 mutex.c
21 | 


--------------------------------------------------------------------------------
/src/cilkplus.h:
--------------------------------------------------------------------------------
 1 | #ifndef CILKPLUS_H
 2 | #define CILKPLUS_H
 3 | 
 4 | #include <cilk/cilk.h>
 5 | /** Cilk Plus build: the fibril API maps onto cilk_spawn / cilk_sync; fibril_t is a dummy int. */
 6 | #define fibril
 7 | #define fibril_t __attribute__((unused)) int
 8 | #define fibril_init(fp)
 9 | #define fibril_join(fp) cilk_sync
10 | /** Forks become cilk_spawn; `ag` is the parenthesized argument list, `rt` the address that receives the result. */
11 | #define fibril_fork_nrt(fp, fn, ag)     cilk_spawn fn ag
12 | #define fibril_fork_wrt(fp, rt, fn, ag) *rt = cilk_spawn fn ag
13 | /** fibril_rt_init ignores `n` and only sets the Cilk "stack size" parameter to 0x800000. */
14 | #define fibril_rt_init(n) (__cilkrts_set_param("stack size", "0x800000"))
15 | #define fibril_rt_exit() (__cilkrts_end_cilk())
16 | #define fibril_rt_nprocs() (__cilkrts_get_nworkers())
17 | 
18 | #endif /* end of include guard: CILKPLUS_H */
19 | 


--------------------------------------------------------------------------------
/test/Makefile.am:
--------------------------------------------------------------------------------
 1 | AM_CPPFLAGS = -I$(includedir)
 2 | AM_LDFLAGS = -L$(libdir) -l$(PACKAGE)
 3 | 
 4 | check_PROGRAMS = \
 5 |                  cholesky \
 6 |                  fft \
 7 |                  fib \
 8 |                  heat \
 9 |                  integrate \
10 |                  knapsack \
11 |                  lu \
12 |                  matmul \
13 |                  nqueens \
14 |                  quicksort \
15 |                  rectmul \
16 |                  strassen
17 | 
18 | cholesky_LDADD = -lm
19 | fft_LDADD = -lm
20 | heat_LDADD = -lm
21 | lu_LDADD = -lm
22 | strassen_LDADD = -lm
23 | 
24 | TESTS = $(check_PROGRAMS)
25 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # fibril
 2 | 
 3 | ## Install
 4 | 
 5 | ```
 6 | ./bootstrap
 7 | ./configure
 8 | make
 9 | make install
10 | ```
11 | 
12 | ## Test
13 | ```
14 | make check
15 | ```
16 | 
17 | ## Benchmark
18 | By default, `make check` runs both the standard tests and the benchmarks. To run only the benchmarks:
19 | 
20 | ```
21 | cd benchmark
22 | make check
23 | ```
24 | 
25 | To run the serial version of the benchmarks:
26 | ```
27 | cd benchmark/serial
28 | make check
29 | ```
30 | 
31 | You can also compare the performance of **fibril** with **Intel CilkPlus** or **Intel Threading Building Blocks**. Running these versions requires a compiler that supports the respective framework; GCC 5+ supports Intel CilkPlus natively. To run these benchmarks:
32 | ```
33 | cd benchmark/[cilkplus or tbb]
34 | make check
35 | ```
36 | 
37 | 
38 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Object files
 2 | *.o
 3 | *.ko
 4 | *.obj
 5 | *.elf
 6 | *.d
 7 | *.d.*
 8 | 
 9 | # Precompiled Headers
10 | *.gch
11 | *.pch
12 | 
13 | # Libraries
14 | *.lib
15 | *.a
16 | *.la
17 | *.lo
18 | 
19 | # Shared objects (inc. Windows DLLs)
20 | *.dll
21 | *.so
22 | *.so.*
23 | *.dylib
24 | 
25 | # Executables
26 | *.exe
27 | *.out
28 | *.app
29 | *.i*86
30 | *.x86_64
31 | *.hex
32 | 
33 | # http://www.gnu.org/software/automake
34 | 
35 | Makefile.in
36 | 
37 | # http://www.gnu.org/software/autoconf
38 | 
39 | /autom4te.cache
40 | /aclocal.m4
41 | /compile
42 | /configure
43 | /depcomp
44 | /install-sh
45 | /missing
46 | /stamp-h1
47 | /config.guess
48 | /config.h.in
49 | /config.h.in~
50 | /config.sub
51 | /ltmain.sh
52 | /test-driver
53 | /m4/libtool.m4
54 | /m4/ltoptions.m4
55 | /m4/ltsugar.m4
56 | /m4/ltversion.m4
57 | /m4/lt~obsolete.m4
58 | /build
59 | 


--------------------------------------------------------------------------------
/src/deque.c:
--------------------------------------------------------------------------------
 1 | #include <stddef.h>
 2 | #include "sync.h"
 3 | #include "debug.h"
 4 | #include "deque.h"
 5 | 
 6 | __thread deque_t fibrili_deq;
 7 | /** Tries to steal the fibril at the head of `deq`; returns it, or NULL when there is nothing to take. */
 8 | struct _fibril_t * deque_steal(deque_t * deq)
 9 | {
10 |   if (deq->head >= deq->tail) return NULL;
11 |   /* The check above ran without the lock; it is repeated below once the lock is held. */
12 |   sync_lock(deq->lock);
13 |   /* Claim the head slot by advancing `head` before `tail` is read again. */
14 |   int head = deq->head++;
15 |   /* sync_fence() sits between the head update and the second read of `tail`. */
16 |   sync_fence();
17 |   /* The claimed slot is at or past `tail`: give it back and report failure. */
18 |   if (head >= deq->tail) {
19 |     deq->head--;
20 |     sync_unlock(deq->lock);
21 |     /* Nothing was taken. */
22 |     return NULL;
23 |   }
24 |   /* The slot is ours; it is expected to hold a fibril. */
25 |   struct _fibril_t * frptr = deq->buff[head];
26 |   DEBUG_ASSERT(frptr != NULL);
27 |   /* NOTE(review): frptr->lock is taken here and not released in this function — presumably the caller releases it. */
28 |   sync_lock(frptr->lock);
29 |   int count = frptr->count;
30 |   /* A negative count (fibril_init() stores -1) means no steal has touched this fibril before. */
31 |   if (count < 0) {
32 |     frptr->count = 1;
33 |     frptr->stack.ptr = deq->stack;
34 |   } else {
35 |     frptr->count = count + 1;
36 |   }
37 |   /* Only the deque lock is dropped before returning. */
38 |   sync_unlock(deq->lock);
39 |   return frptr;
40 | }
41 | 
42 | 


--------------------------------------------------------------------------------
/test/fib.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include "test.h"
 3 | 
 4 | int n = 42;
 5 | int m;
 6 | /** Iterative Fibonacci; verify() uses it as the reference value for fib(). */
 7 | static int fib_fast(int n)
 8 | {
 9 |   int prev = 0, curr = 1;
10 |   int step;
11 |   /* Values below 2 are returned unchanged. */
12 |   if (n < 2) {
13 |     return n;
14 |   }
15 |   /* Advance the (prev, curr) pair n - 1 times, from F(1) up to F(n). */
16 |   for (step = 1; step < n; ++step) {
17 |     int next = prev + curr; prev = curr; curr = next;
18 |   }
19 |   return curr;
20 | }
21 | /** Parallel Fibonacci: forks fib(n - 1), computes fib(n - 2) itself, joins, and adds the two. */
22 | fibril int fib(int n)
23 | {
24 |   if (n < 2) return n;
25 |   /* x is written by the forked call, y by the direct call below. */
26 |   int x, y;
27 |   fibril_t fr;
28 |   fibril_init(&fr);
29 |   /* Four-argument fibril_fork: the result of fib(n - 1) is stored through &x. */
30 |   fibril_fork(&fr, &x, fib, (n - 1));
31 |   /* The join below comes before x is read. */
32 |   y = fib(n - 2);
33 |   fibril_join(&fr);
34 |   /* Both halves are available here. */
35 |   return x + y;
36 | }
37 | /** Checks the parallel result `m` against fib_fast(n); returns 0 on a match and 1 otherwise. */
38 | int verify()
39 | {
40 |   const int reference = fib_fast(n);
41 |   /* Matching values mean success; nothing is printed in that case. */
42 |   if (m == reference) {
43 |     return 0;
44 |   }
45 |   /* Report the mismatch before signalling failure. */
46 |   printf("fib(%d)=%d (expected %d)\n", n, m, reference);
47 |   return 1;
48 | }
49 | /** fib needs no one-time setup and no per-run preparation (no stray ';' after the bodies: ISO C forbids empty file-scope declarations). */
50 | void init() {}
51 | void prep() {}
52 | /** Runs the parallel fib(n) and stores its result in `m` for verify(). */
53 | void test() {
54 |   m = fib(n);
55 | }
56 | 
57 | 


--------------------------------------------------------------------------------
/src/mutex.c:
--------------------------------------------------------------------------------
 1 | #include "mutex.h"
 2 | #include "sync.h"
 3 | /* Local NULL definition, and a busy-wait that executes "pause" until `x` becomes true. */
 4 | #define NULL ((void *) 0)
 5 | #define spin_wait(x) while (!(x)) __asm__ ( "pause" ::: "memory" )
 6 | /** Acquires the queue lock `*mutex` for `node`, spinning on node->flag when the lock is already held. */
 7 | void mutex_lock(mutex_t * volatile * mutex, mutex_t * node)
 8 | {
 9 |   node->next = NULL;
10 |   mutex_t * prev = sync_swap(mutex, node);
11 |   /* A non-NULL previous tail means someone is ahead: link behind it and wait until mutex_unlock() sets our flag. */
12 |   if (prev) {
13 |     node->flag = 0;
14 |     prev->next = node;
15 |     spin_wait(node->flag);
16 |   }
17 | }
18 | /** Takes the lock only if the queue is empty (tail is NULL); returns 1 on success and 0 otherwise, without waiting. */
19 | int mutex_trylock(mutex_t * volatile * mutex, mutex_t * node)
20 | {
21 |   node->next = NULL;
22 |   mutex_t * prev = sync_cas(mutex, NULL, node);
23 |   return (prev == NULL);
24 | }
25 | /** Releases the lock held through `node`, handing it to the next queued node if there is one. */
26 | void mutex_unlock(mutex_t * volatile * mutex, mutex_t * node)
27 | {
28 |   if (node->next == NULL) {
29 |     if (node == sync_cas(mutex, node, NULL)) {
30 |       return;
31 |     }
32 |     /* The CAS failed, so another node has swapped itself in as tail; wait until it has linked itself behind us. */
33 |     spin_wait(node->next);
34 |   }
35 |   /* Wake the successor, which is spinning on its own flag in mutex_lock(). */
36 |   node->next->flag = MUTEX_LOCKED;
37 | }
38 | 
39 | 


--------------------------------------------------------------------------------
/src/safe.h:
--------------------------------------------------------------------------------
 1 | #ifndef SAFE_H
 2 | #define SAFE_H
 3 | 
 4 | #include "debug.h"
 5 | 
 6 | #ifndef DISABLE_SAFE
 7 | /* Builds a "file:line: " prefix at compile time; two macro levels so __LINE__ is expanded before it is stringified. */
 8 | #define SAFE_STRINGIFY(x) #x
 9 | #define SAFE_TOSTRING(x) SAFE_STRINGIFY(x)
10 | #define SAFE_AT __FILE__ ":" SAFE_TOSTRING(__LINE__) ": "
11 | /* On a false `cond`: passes "error: <file>:<line>: %m" to DEBUG_DUMP, then invokes DEBUG_BREAK (both come from debug.h). */
12 | #define SAFE_ASSERT(cond) do { \
13 |   if (!(cond)) { \
14 |     DEBUG_DUMP(0, "error: " SAFE_AT "%m"); \
15 |     DEBUG_BREAK(!(cond)); \
16 |   } \
17 | } while (0)
18 | /* With DISABLE_SAFE defined the assertion expands to nothing. */
19 | #else
20 | #define SAFE_ASSERT(...)
21 | #endif
22 | 
23 | #include <stdint.h>
24 | /* SAFE_NNCALL: evaluates `call` once and asserts that its result is non-negative. */
25 | #define SAFE_NNCALL(call) do { \
26 |   intptr_t ret = (intptr_t) (call); \
27 |   SAFE_ASSERT(ret >= 0); \
28 | } while (0)
29 | /* SAFE_NZCALL: evaluates `call` once and asserts that its result is non-zero. */
30 | #define SAFE_NZCALL(call) do { \
31 |   intptr_t ret = (intptr_t) (call); \
32 |   SAFE_ASSERT(ret != 0); \
33 | } while (0)
34 | /* SAFE_RZCALL: evaluates `call` once and asserts that its result is zero. */
35 | #define SAFE_RZCALL(call) do { \
36 |   intptr_t ret = (intptr_t) (call); \
37 |   SAFE_ASSERT(ret == 0); \
38 | } while (0)
39 | 
40 | #endif /* end of include guard: SAFE_H */
41 | 


--------------------------------------------------------------------------------
/src/tbb.h:
--------------------------------------------------------------------------------
 1 | #ifndef TBB_H
 2 | #define TBB_H
 3 | 
 4 | #include <tbb/task_group.h>
 5 | #include <tbb/task_scheduler_init.h>
 6 | /* TBB build: a fibril_t is a tbb::task_group, and joining waits on that group. */
 7 | #define fibril
 8 | #define fibril_t tbb::task_group
 9 | #define fibril_init(fp)
10 | #define fibril_join(fp) (fp)->wait()
11 | /* Forks run a by-value lambda in the group; the _wrt form first copies the result pointer into `pt` so the lambda captures it. */
12 | #define fibril_fork_nrt(fp, fn, ag) (fp)->run([=]{ fn ag; })
13 | #define fibril_fork_wrt(fp, rtp, fn, ag) do { \
14 |   __typeof__(rtp) pt = rtp; \
15 |   (fp)->run([=]{ *pt = fn ag; }); \
16 | } while (0)
17 | /* PARAM_NPROCS and fibril_rt_nprocs() have C linkage; they are only declared here. */
18 | extern "C" {
19 |   extern int PARAM_NPROCS;
20 |   extern int fibril_rt_nprocs();
21 | }
22 | /* fibril_rt_init(n): uses `n` workers when 0 < n <= fibril_rt_nprocs(), that maximum otherwise, then creates the TBB scheduler object. */
23 | #define fibril_rt_init(n) \
24 |   do { \
25 |     int max_nprocs = fibril_rt_nprocs(); \
26 |     if ((n) > 0 && (n) <= max_nprocs) { \
27 |       PARAM_NPROCS = (n); \
28 |     } else { \
29 |       PARAM_NPROCS = max_nprocs; \
30 |     } \
31 |   } while(0); \
32 | tbb::task_scheduler_init _fibril_rt_init(PARAM_NPROCS)
33 | /* The TBB build has no explicit shutdown step. */
34 | #define fibril_rt_exit()
35 | 
36 | #endif /* end of include guard: TBB_H */
37 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2014 Chaoran Yang
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/test/nqueens.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include "test.h"
 3 | 
 4 | int n = 14;
 5 | int m;
 6 | /** Counts the solutions reachable when row `d` gets its queen in column `i`; `a` holds the columns chosen for rows 0..d-1. */
 7 | fibril static int nqueens(const int * a, int n, int d, int i)
 8 | {
 9 |   /* A variable-length array must have a positive size; the root call passes d == -1, so clamp the length to 1. */
10 |   int aa[d + 1 > 0 ? d + 1 : 1];
11 |   int j;
12 |   for (j = 0; j < d; ++j) {
13 |     aa[j] = a[j];
14 |     /* Column offset and row distance between queen j and the new queen. */
15 |     int diff = a[j] - i;
16 |     int dist = d - j;
17 |     /* Same column or same diagonal: this placement is a dead end. */
18 |     if (diff == 0 || dist == diff || dist + diff == 0) return 0;
19 |   }
20 |   /* The root call (d == -1) places no queen. */
21 |   if (d >= 0) {
22 |     aa[d] = i;
23 |   }
24 |   if (++d == n) return 1;
25 |   /* One result slot per column of the next row; the children read the placed queens through `a`. */
26 |   int res[n];
27 |   a = aa;
28 |   /* A single fibril frame covers all children of this call. */
29 |   fibril_t fr;
30 |   fibril_init(&fr);
31 |   /* Fork one search per column; each child writes its own res[i]. */
32 |   for (i = 0; i < n; ++i) {
33 |     fibril_fork(&fr, &res[i], nqueens, (a, n, d, i));
34 |   }
35 |   /* res[] is complete only after the join. */
36 |   fibril_join(&fr);
37 |   /* Add up the solution counts of all columns. */
38 |   int sum = 0;
39 |   for (i = 0; i < n; ++i) {
40 |     sum += res[i];
41 |   }
42 |   return sum;
43 | }
44 | /** nqueens needs no one-time setup and no per-run preparation. */
45 | void init() {}
46 | void prep() {}
47 | /** Runs the search from an empty board (depth -1, no placed queens) and stores the count in `m`. */
48 | void test()
49 | {
50 |   m = nqueens(NULL, n, -1, 0);
51 | }
52 | /** Compares `m` with the known solution count for an n x n board; returns 0 when they match, non-zero otherwise. */
53 | int verify()
54 | {
55 |   static const int res[16] = {
56 |     1, 0, 0, 2, 10, 4, 40, 92, 352, 724, 2680,
57 |     14200, 73712, 365596, 2279184, 14772512
58 |   };
59 |   /* The table covers board sizes 1..16 only; report a failure instead of reading outside of it. */
60 |   if (n < 1 || n > 16) {
61 |     printf("nqueens(%d): no reference value available\n", n);
62 |     return 1;
63 |   }
64 |   int failed = (m != res[n - 1]);
65 |   if (failed) printf("nqueens(%d)=%d (expected %d)\n", n, m, res[n - 1]);
66 |   return failed;
67 | }
68 | 
69 | 


--------------------------------------------------------------------------------
/src/sync.h:
--------------------------------------------------------------------------------
 1 | #ifndef SYNC_H
 2 | #define SYNC_H
 3 | 
 4 | #include "fibrili.h"
 5 | /* Fence and lock primitives are forwarded to the fibrili_* implementations. */
 6 | #define sync_fence() fibrili_fence()
 7 | #define sync_lock(lock) fibrili_lock(lock)
 8 | #define sync_unlock(lock) fibrili_unlock(lock)
 9 | /* Use the __atomic builtins on GCC 4.8 and on every later major release (the minor number restarts at 0 there). */
10 | #if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 7))
11 | /* Each macro evaluates to the value that was stored before the operation. */
12 | #define sync_fadd(val, n) __atomic_fetch_add(&(val), n, __ATOMIC_ACQ_REL)
13 | #define sync_cas(ptr, cmp, val) __sync_val_compare_and_swap(ptr, cmp, val)
14 | #define sync_swap(ptr, val) __atomic_exchange_n(ptr, val, __ATOMIC_ACQ_REL)
15 | /* Otherwise fall back to the legacy __sync builtins, on x86-64 only. */
16 | #else
17 | #if defined(__x86_64__) || defined(_M_X64_)
18 | /* Same contract as above: the previous value is returned. */
19 | #define sync_fadd(val, n) __sync_fetch_and_add(&(val), n)
20 | #define sync_cas(ptr, cmp, val) __sync_val_compare_and_swap(ptr, cmp, val)
21 | #define sync_swap(ptr, val) __sync_lock_test_and_set(ptr, val)
22 | /* NOTE(review): no definitions are provided for any other compiler/target combination. */
23 | #endif
24 | #endif
25 | /** Sense-reversing spin barrier: returns once `nprocs` threads have called it. */
26 | static inline void sync_barrier(int nprocs)
27 | {
28 |   static volatile int _count;
29 |   static volatile int _sense;
30 |   static __thread volatile int _local_sense;
31 |   /* Each round waits for the opposite of the sense this thread recorded last time. */
32 |   int sense = !_local_sense;
33 |   /* sync_fadd returns the old count, so nprocs - 1 identifies the last arrival; it resets the counter and flips the shared sense. */
34 |   if (sync_fadd(_count, 1) == nprocs - 1) {
35 |     _count = 0;
36 |     _sense = sense;
37 |   }
38 |   /* Every thread spins until the shared sense has this round's value. */
39 |   while (_sense != sense);
40 |   _local_sense = sense;
41 |   sync_fence();
42 | }
43 | 
44 | #endif /* end of include guard: SYNC_H */
45 | 


--------------------------------------------------------------------------------
/src/stats.h:
--------------------------------------------------------------------------------
 1 | #ifndef STATS_H
 2 | #define STATS_H
 3 | 
 4 | #if HAVE_CONFIG_H
 5 | #include "config.h"
 6 | #endif
 7 | 
 8 | //#define FIBRIL_STATS
 9 | 
10 | #ifndef FIBRIL_STATS
11 | 
12 | #define STATS_COUNT(...)
13 | #define STATS_INC(...)
14 | #define STATS_DEC(...)
15 | #define STATS_EXPORT(...)
16 | 
17 | #else // FIBRIL_STATS defined
18 | 
19 | #include <stdlib.h>
20 | #include "sync.h"
21 | /** Identifiers of the collected statistics; each one indexes _stats_table. */
22 | typedef enum _stats_t {
23 |   N_STEALS = 0,
24 |   N_SUSPENSIONS,
25 |   N_STACKS,
26 |   N_PAGES,
27 |   STATS_LAST_ENTRY /** No more enum entries after this. */
28 | } stats_t;
29 | /** One counter per statistic: `curr` is the running value, `peak` is the value STATS_EXPORT publishes. */
30 | extern struct _stats_counter_t {
31 |   volatile long curr;
32 |   volatile size_t peak;
33 | } _stats_table[STATS_LAST_ENTRY];
34 | /** STATS_COUNT: adds `n` directly to the exported (peak) value of statistic `e`. */
35 | #define STATS_COUNT(e, n) do { \
36 |   sync_fadd(_stats_table[e].peak, n); \
37 | } while (0)
38 | /** STATS_INC: raises the current value by `n` and lifts the peak to the new value when that is a new maximum; retries while the CAS loses a race. */
39 | #define STATS_INC(e, n) do { \
40 |   long curr = sync_fadd(_stats_table[e].curr, n) + (n); \
41 |   while (curr > 0) { \
42 |     size_t peak = _stats_table[e].peak; \
43 |     if (peak >= (size_t) curr) break; \
44 |     if (sync_cas(&_stats_table[e].peak, peak, (size_t) curr) == peak) break; \
45 |   } \
46 | } while (0)
47 | /** STATS_DEC: lowers the current value by `n`; the peak is left alone. */
48 | #define STATS_DEC(e, n) do { \
49 |   sync_fadd(_stats_table[e].curr, -(n)); \
50 | } while (0)
51 | /** STATS_EXPORT: publishes the peak of `e` as the environment variable FIBRIL_<e>; 32 bytes hold any size_t in decimal. */
52 | #define STATS_EXPORT(e) do { \
53 |   char tmp[32]; \
54 |   sprintf(tmp, "%zu", (size_t) _stats_table[e].peak); \
55 |   setenv("FIBRIL_" #e, tmp, 1); \
56 | } while (0)
57 | 
58 | #endif
59 | #endif /* end of include guard: STATS_H */
60 | 


--------------------------------------------------------------------------------
/test/quicksort.c:
--------------------------------------------------------------------------------
 1 | #include <math.h>
 2 | #include <stdlib.h>
 3 | #include "test.h"
 4 | 
 5 | int n = 8;
 6 | static int * a, * b;
 7 | static size_t size;
 8 | /** Sorts a[0..n) in place: partitions around the middle element, then sorts the two parts in parallel. */
 9 | fibril void quicksort(int * a, size_t n)
10 | {
11 |   if (n < 2) return;
12 |   /* The pivot value is copied out, so the swaps below cannot change it. */
13 |   int pivot = a[n / 2];
14 |   /* Two cursors walk towards each other from the ends of the range. */
15 |   int *left  = a;
16 |   int *right = a + n - 1;
17 |   /* Skip elements that are already on the correct side; swap a misplaced pair and step both cursors. */
18 |   while (left <= right) {
19 |     if (*left < pivot) {
20 |       left++;
21 |     } else if (*right > pivot) {
22 |       right--;
23 |     } else {
24 |       int tmp = *left;
25 |       *left = *right;
26 |       *right = tmp;
27 |       left++;
28 |       right--;
29 |     }
30 |   }
31 |   /* After the loop `right` is below `left`: [a, right] and [left, a + n) are the two parts. */
32 |   fibril_t fr;
33 |   fibril_init(&fr);
34 |   /* Three-argument fibril_fork: the first part is forked, the second is sorted by this call. */
35 |   fibril_fork(&fr, quicksort, (a, right - a + 1));
36 |   quicksort(left, a + n - left);
37 |   /* Wait for the forked part before returning. */
38 |   fibril_join(&fr);
39 | }
40 | /** Returns 0 when a[0..size) is in non-decreasing order and 1 at the first inversion. */
41 | int verify()
42 | {
43 |   /* size_t index: `size` is a size_t and can exceed INT_MAX for large n. */
44 |   size_t i;
45 |   /* Starting at 1 makes arrays with fewer than two elements trivially sorted. */
46 |   for (i = 1; i < size; ++i) {
47 |     if (a[i - 1] > a[i]) {
48 |       return 1;
49 |     }
50 |   }
51 |   /* No neighbouring pair was out of order. */
52 |   return 0;
53 | }
54 | /** Allocates two arrays of 10^n ints and fills `b` with the pseudo-random input; aborts when memory is unavailable. */
55 | void init()
56 | {
57 |   int digit;
58 |   size_t i;
59 |   /* size = 10^n. */
60 |   size = 1;
61 |   for (digit = 0; digit < n; ++digit) size *= 10;
62 |   a = malloc(size * sizeof *a);
63 |   b = malloc(size * sizeof *b);
64 |   /* Fail loudly: the loop below would otherwise write through a NULL pointer. */
65 |   if (a == NULL || b == NULL) abort();
66 |   /* `b` keeps the pristine input; prep() copies it into `a`. */
67 |   for (i = 0; i < size; ++i) {
68 |     b[i] = rand();
69 |   }
70 | }
71 | /** Restores the unsorted input by copying `b` over `a`. */
72 | void prep()
73 | {
74 |   size_t i; /* size_t to match `size`, which can exceed INT_MAX */
75 |   for (i = 0; i < size; ++i) {
76 |     a[i] = b[i];
77 |   }
78 | }
79 | /** Sorts the working array `a` (filled by prep()) with the parallel quicksort. */
80 | void test()
81 | {
82 |   quicksort(a, size);
83 | }
84 | 
85 | 


--------------------------------------------------------------------------------
/src/param.c:
--------------------------------------------------------------------------------
 1 | #define _GNU_SOURCE
 2 | #include <unistd.h>
 3 | #include <stdlib.h>
 4 | #include <pthread.h>
 5 | #include "safe.h"
 6 | #include "param.h"
 7 | 
 8 | size_t PARAM_PAGE_SIZE;
 9 | void * PARAM_STACK_ADDR;
10 | size_t PARAM_STACK_SIZE;
11 | int PARAM_NPROCS;
12 | /** Returns the page size reported by sysconf(_SC_PAGESIZE). NOTE(review): a -1 error result would wrap when converted to size_t. */
13 | static size_t get_page_size()
14 | {
15 |   int pagesize = sysconf(_SC_PAGESIZE);
16 |   return pagesize;
17 | }
18 | /** Stores the stack address and size of the calling thread in `*addr` and `*size`; a failing pthread call trips SAFE_ASSERT. */
19 | static void get_stack_size(void ** addr, size_t * size)
20 | {
21 |   pthread_attr_t attr;
22 |   SAFE_RZCALL(pthread_getattr_np(pthread_self(), &attr));
23 |   SAFE_RZCALL(pthread_attr_getstack(&attr, addr, size));
24 |   SAFE_RZCALL(pthread_attr_destroy(&attr)); /* the attribute object filled in above must be destroyed again */
25 | }
26 | /** Picks the worker count from `n`, the FIBRIL_NPROCS environment variable, or the number of online processors. */
27 | int param_nprocs(int n) {
28 |   int max_nprocs = sysconf(_SC_NPROCESSORS_ONLN);
29 |   int nprocs = 0;
30 |   /** If user provided a positive number, use that number. */
31 |   if (n > 0) {
32 |     nprocs = n;
33 |   }
34 |   /**
35 |    * Only n == 0 (FIBRIL_NPROCS in fibril.h) asks for the environment variable;
36 |    * a negative n (FIBRIL_NPROCS_ONLN) must end up with every online processor.
37 |    */
38 |   if (n == 0) {
39 |     char * env = getenv("FIBRIL_NPROCS");
40 |     if (env) nprocs = atoi(env);
41 |   }
42 |   /**
43 |    * Make sure nprocs is positive and less than or equal to
44 |    * _SC_NPROCESSORS_ONLN.
45 |    */
46 |   if (nprocs <= 0 || nprocs > max_nprocs) {
47 |     nprocs = max_nprocs;
48 |   }
49 |   /* Unset, unparsable and out-of-range requests have all been replaced by the maximum at this point. */
50 |   return nprocs;
51 | }
52 | /** Fills in the PARAM_* globals: page size, the calling thread's stack address and size, and the worker count derived from `n`. */
53 | void param_init(int n)
54 | {
55 |   PARAM_PAGE_SIZE = get_page_size();
56 |   DEBUG_DUMP(2, "init:", (PARAM_PAGE_SIZE, "0x%lx"));
57 |   /* The stack bounds are those of the thread that calls param_init() (get_stack_size uses pthread_self()). */
58 |   get_stack_size(&PARAM_STACK_ADDR, &PARAM_STACK_SIZE);
59 |   DEBUG_DUMP(2, "init:", (PARAM_STACK_ADDR, "%p"));
60 |   DEBUG_DUMP(2, "init:", (PARAM_STACK_SIZE, "0x%lx"));
61 |   /* param_nprocs() caps the result at the number of online processors. */
62 |   PARAM_NPROCS = param_nprocs(n);
63 |   DEBUG_DUMP(2, "init:", (PARAM_NPROCS, "%d"));
64 | }
65 | 
66 | 


--------------------------------------------------------------------------------
/configure.ac:
--------------------------------------------------------------------------------
 1 | #                                               -*- Autoconf -*-
 2 | # Process this file with autoconf to produce a configure script.
 3 | 
 4 | AC_PREREQ([2.69])
 5 | AC_INIT([fibril], [0.0.2], [chaoran@rice.edu])
 6 | AM_INIT_AUTOMAKE([-Wall -Wno-extra-portability -Werror foreign])
 7 | LT_PREREQ([2.2])
 8 | LT_INIT
 9 | AC_CONFIG_MACRO_DIR([m4])
10 | AC_CONFIG_SRCDIR([src/fibril.h])
11 | AC_CONFIG_HEADERS([config.h])
12 | 
13 | # Checks for programs.
14 | AC_PROG_CC
15 | 
16 | # Checks for command-line.
17 | FIBRIL_IF_ENABLED([debug], [Build fibril in debugging mode],
18 |   [
19 |    case "${enable_debug}" in 1|2|3) ;; *) enable_debug=0 ;; esac
20 |    AC_DEFINE_UNQUOTED([FIBRIL_DEBUG], [${enable_debug}], [Fibril debug enabled])
21 |   ])
22 | FIBRIL_IF_ENABLED([stats], [Enable statistics collection],
23 |   [ AC_DEFINE([FIBRIL_STATS], [1], [Enable statistics collection.]) ])
24 | 
25 | # Check for pthreads
26 | ACX_PTHREAD([LIBS="$PTHREAD_LIBS $LIBS"
27 |              CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
28 |              CC="$PTHREAD_CC"])
29 | 
30 | # Checks for libraries.
31 | 
32 | # Checks for header files.
33 | AC_CHECK_HEADERS([stddef.h stdint.h stdlib.h unistd.h pthread.h])
34 | 
35 | # Checks for typedefs, structures, and compiler characteristics.
36 | AC_C_INLINE
37 | AC_TYPE_SIZE_T
38 | 
39 | # Checks for library functions (AC_CHECK_FUNCS takes a whitespace-separated list, no commas).
40 | AC_FUNC_MMAP
41 | AC_CHECK_FUNCS([mmap madvise])
42 | 
43 | AC_CONFIG_FILES([Makefile
44 |                  src/Makefile
45 |                  test/Makefile
46 |                  benchmark/Makefile
47 |                  benchmark/cilkplus/Makefile
48 |                  benchmark/tbb/Makefile
49 |                  benchmark/serial/Makefile])
50 | AC_OUTPUT
51 | 


--------------------------------------------------------------------------------
/src/fibrile.h:
--------------------------------------------------------------------------------
 1 | #ifndef FIBRILE_H
 2 | #define FIBRILE_H
 3 | 
 4 | #include "fibrili.h"
 5 | 
 6 | /** fibril: attribute for functions that fork; it keeps the frame pointer, which fibril_init() reads from rbp. */
 7 | #define fibril __attribute__((optimize("no-omit-frame-pointer")))
 8 | /* The structure itself is not defined in this header (presumably in fibrili.h, included above). */
 9 | /** fibril_t: the per-frame record passed to fibril_init / fibril_fork / fibril_join. */
10 | typedef struct _fibril_t fibril_t;
11 | 
12 | /** fibril_init: marks the frame as not stolen (count = -1) and records the current rbp/rsp as its stack bottom/top. */
13 | __attribute__((always_inline)) extern inline
14 | void fibril_init(fibril_t * frptr)
15 | {
16 |   register void * rbp asm ("rbp");
17 |   register void * rsp asm ("rsp");
18 |   /* lock and unmapped start cleared; deque_steal() later raises count from -1. */
19 |   frptr->lock = 0;
20 |   frptr->unmapped = 0;
21 |   frptr->count = -1;
22 |   frptr->stack.btm = rbp;
23 |   frptr->stack.top = rsp;
24 | }
25 | 
26 | /** fibril_join: does nothing while count is still -1 (the value set by fibril_init); otherwise calls fibrili_join() inside fibrili_membar(). */
27 | __attribute__((always_inline)) extern inline
28 | void fibril_join(fibril_t * frptr)
29 | {
30 |   if (frptr->count > -1) {
31 |     fibrili_membar(fibrili_join(frptr));
32 |   }
33 | }
34 | 
35 | #include "fork.h"
36 | 
37 | /** fibril_fork_nrt: defines a nested, non-inlined wrapper that pushes the frame, calls `fn`, pops, and resumes the frame when the pop returns 0; then calls the wrapper. */
38 | #define fibril_fork_nrt(fp, fn, ag) do { \
39 |   __attribute__((noinline, hot, optimize(3))) \
40 |   void _fibril_##fn##_fork(_fibril_defs ag fibril_t * f) { \
41 |     fibrili_push(f); \
42 |     fn(_fibril_args ag); \
43 |     if (!fibrili_pop()) fibrili_resume(f); \
44 |   } \
45 |   fibrili_membar(_fibril_##fn##_fork(_fibril_expand ag fp)); \
46 | } while (0)
47 | /* _fibril_defs, _fibril_args and _fibril_expand are not defined in this header (presumably in fork.h, included above). */
48 | /** fibril_fork_wrt: same as fibril_fork_nrt, but the wrapper also stores fn's result through the pointer `rtp`. */
49 | #define fibril_fork_wrt(fp, rtp, fn, ag) do { \
50 |   __attribute__((noinline, hot, optimize(3))) \
51 |   void _fibril_##fn##_fork(_fibril_defs ag fibril_t * f, __typeof__(rtp) p) { \
52 |     fibrili_push(f); \
53 |     *p = fn(_fibril_args ag); \
54 |     if (!fibrili_pop()) fibrili_resume(f); \
55 |   } \
56 |   fibrili_membar(_fibril_##fn##_fork(_fibril_expand ag fp, rtp)); \
57 | } while (0)
58 | /** Runtime control functions; they are only declared here (definitions are not visible in this header). */
59 | extern int fibril_rt_init(int nprocs);
60 | extern int fibril_rt_exit();
61 | extern int fibril_rt_nprocs();
62 | 
63 | #endif /* end of include guard: FIBRILE_H */
64 | 


--------------------------------------------------------------------------------
/src/fibril.h:
--------------------------------------------------------------------------------
 1 | #ifndef FIBRIL_H
 2 | #define FIBRIL_H
 3 | 
 4 | #define FIBRIL_SUCCESS 0
 5 | #define FIBRIL_FAILURE -1
 6 | 
 7 | /**
 8 |  * These are special arguments to fibril_rt_init().
 9 |  * FIBRIL_NPROCS tells the runtime to fetch the number of processors
10 |  * from the environment variable FIBRIL_NPROCS (getenv("FIBRIL_NPROCS")).
11 |  * FIBRIL_NPROCS_ONLN tells the runtime to use all available processors
12 |  * in the system (sysconf(_SC_NPROCESSORS_ONLN)).
13 |  */
14 | #define FIBRIL_NPROCS 0
15 | #define FIBRIL_NPROCS_ONLN -1
16 | 
17 | /** Serial version. */
18 | #ifdef FIBRIL_SERIAL
19 | #include <fibril/serial.h>
20 | 
21 | /** Cilkplus version. */
22 | #elif FIBRIL_CILKPLUS
23 | #include <fibril/cilkplus.h>
24 | 
25 | /** TBB version. */
26 | #elif FIBRIL_TBB
27 | #include <fibril/tbb.h>
28 | 
29 | /** Fibril version. */
30 | #else
31 | #include <fibril/fibrile.h>
32 | #endif
33 | 
34 | /** fibril_fork counts its arguments and dispatches to _fibril_fork_<count>: 3 = no return value, 4 = with return value. */
35 | #define fibril_fork(...) _fibril_fork_(_fibril_nth(__VA_ARGS__), __VA_ARGS__)
36 | #define _fibril_fork_(n, ...) _fibril_concat(_fibril_fork_, n)(__VA_ARGS__)
37 | 
38 | /** If nargs is 3, use the no-return-value version. */
39 | #define _fibril_fork_3(...) fibril_fork_nrt(__VA_ARGS__)
40 | 
41 | /** If nargs is 4, use the with-return-value version. */
42 | #define _fibril_fork_4(...) fibril_fork_wrt(__VA_ARGS__)
43 | 
44 | /** Helper macros to count number of arguments. */
45 | #define _fibril_nth(...) _fibril_nth_(__VA_ARGS__, ## __VA_ARGS__, \
46 |     16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, \
47 |     8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 0)
48 | #define _fibril_nth_(_1, _1_, _2, _2_, _3, _3_, _4, _4_, _5, _5_, \
49 |     _6, _6_, _7, _7_, _8, _8_, _9, _9_, _10, _10_, _11, _11_, _12, _12_, \
50 |     _13, _13_, _14, _14_, _15, _15_, _16, _16_, N, ...) N
51 | #define _fibril_concat(left, right) left##right
52 | 
53 | #endif /* end of include guard: FIBRIL_H */
54 | 


--------------------------------------------------------------------------------
/test/integrate.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include "test.h"
 3 | 
 4 | int n = 10000;
 5 | 
 6 | static double m;
 7 | static const double epsilon = 1.0e-9;
 8 | 
 9 | static double f(double x)
10 | {
11 |   return (x * x + 1.0) * x;
12 | }
13 | 
14 | static
15 | double integrate_serial(double x1, double y1, double x2, double y2, double area)
16 | {
17 |   double half = (x2 - x1) / 2;
18 |   double x0 = x1 + half;
19 |   double y0 = f(x0);
20 | 
21 |   double area_x1x0 = (y1 + y0) / 2 * half;
22 |   double area_x0x2 = (y0 + y2) / 2 * half;
23 |   double area_x1x2 = area_x1x0 + area_x0x2;
24 | 
25 |   if (area_x1x2 - area < epsilon && area - area_x1x2 < epsilon) {
26 |     return area_x1x2;
27 |   }
28 | 
29 |   area_x1x0 = integrate_serial(x1, y1, x0, y0, area_x1x0);
30 |   area_x0x2 = integrate_serial(x0, y0, x2, y2, area_x0x2);
31 | 
32 |   return area_x1x0 + area_x0x2;
33 | }
34 | 
/**
 * Parallel counterpart of integrate_serial(): same subdivision and
 * convergence test, but the left half is forked with fibril_fork() while
 * the right half is computed by the calling code.
 * @param x1,y1 Left end of the interval and f(x1).
 * @param x2,y2 Right end of the interval and f(x2).
 * @param area  Estimate of the area computed by the caller.
 * @return The refined area under f over [x1, x2].
 */
static fibril
double integrate(double x1, double y1, double x2, double y2, double area)
{
  double half = (x2 - x1) / 2;
  double x0 = x1 + half;
  double y0 = f(x0);

  double area_x1x0 = (y1 + y0) / 2 * half;
  double area_x0x2 = (y0 + y2) / 2 * half;
  double area_x1x2 = area_x1x0 + area_x0x2;

  if (area_x1x2 - area < epsilon && area - area_x1x2 < epsilon) {
    return area_x1x2;
  }

  fibril_t fr;
  fibril_init(&fr);

  /** The forked call writes its result into area_x1x0. */
  fibril_fork(&fr, &area_x1x0, integrate, (x1, y1, x0, y0, area_x1x0));
  area_x0x2 = integrate(x0, y0, x2, y2, area_x0x2);

  /** area_x1x0 is only read after the join below. */
  fibril_join(&fr);
  return area_x1x0 + area_x0x2;
}
59 | 
/** Harness hooks declared in test.h; this test needs no setup. */
void init() {}
void prep() {}
62 | 
/** Run the parallel integration over [0, n] and keep the result in m. */
void test()
{
  m = integrate(0, f(0), n, f(n), 0);
}
67 | 
68 | int verify()
69 | {
70 |   double expect = integrate_serial(0, f(0), n, f(n), 0);
71 | 
72 |   if (m - expect < epsilon && expect - m < epsilon) {
73 |     return 0;
74 |   }
75 | 
76 |   printf("integrate(%d)=%lf (expected %lf)\n", n, m, expect);
77 |   return 1;
78 | }
79 | 
80 | 


--------------------------------------------------------------------------------
/src/fibrili.h:
--------------------------------------------------------------------------------
 1 | #ifndef FIBRILI_H
 2 | #define FIBRILI_H
 3 | 
/**
 * Bookkeeping record for one fork/join frame.
 * Field notes describe how the scheduler in fibrili.c uses them.
 */
struct _fibril_t {
  char lock;      /* taken and released by the scheduler around count updates */
  char unmapped;  /* NOTE(review): not referenced in the visible code */
  int count;      /* decremented by the scheduler; a join resumes when it was 0 */
  struct {
    void * btm;   /* loaded into rbp when the frame is resumed */
    void * top;   /* stack pointer used when the frame is resumed in place */
    void * ptr;   /* compared against fibrili_deq.stack to identify the stack */
  } stack;
  void * pc;      /* address execution jumps to when the frame is resumed */
};
15 | 
/**
 * Per-thread deque of forked frames.
 * fibrili_push() appends at `tail`; fibrili_pop() removes from `tail` and
 * checks `head` to detect entries taken from the other end.
 */
extern __thread struct _fibrili_deque_t {
  char lock;          /* guards the slow path of fibrili_pop() */
  int  head;
  int  tail;
  void * stack;       /* stack currently associated with this thread */
  void * buff[1000];  /* frame pointers; fibrili_push() does not bounds-check */
} fibrili_deq;
23 | 
/**
 * Select the fence and spin-lock primitives.
 * The first branch uses the __atomic builtins, available from GCC 4.8 on.
 * Major and minor versions must be compared together: testing
 * `__GNUC_MINOR__ > 7` on its own rejects every later major release whose
 * minor number is 7 or lower (for example GCC 9.3).
 * The fallback uses the legacy __sync builtins and is only provided for
 * x86-64.
 */
#if defined(__GNUC__) && \
    (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 7))

#define fibrili_fence() __atomic_thread_fence(__ATOMIC_SEQ_CST)
/** Spin, issuing `pause` before every attempt, until the flag is acquired. */
#define fibrili_lock(l) do { \
  __asm__ ( "pause" : : : "memory" ); \
} while (__atomic_test_and_set(&(l), __ATOMIC_ACQUIRE))
#define fibrili_unlock(l) __atomic_clear(&(l), __ATOMIC_RELEASE)

#else
#if defined(__x86_64__) || defined(_M_X64_)

#define fibrili_fence() __sync_synchronize()
/** Spin, issuing `pause` before every attempt, until the flag is acquired. */
#define fibrili_lock(l) do { \
  __asm__ ( "pause" ::: "memory" ); \
} while (__sync_lock_test_and_set(&(l), 1))
#define fibrili_unlock(l) __sync_lock_release(&(l))

#endif
#endif
43 | 
/** Record the caller's return address in the frame and enter the scheduler. */
__attribute__((noinline)) extern
void fibrili_join(struct _fibril_t * frptr);
/** Hand `frptr` to the scheduler; never returns to the caller. */
__attribute__((noreturn)) extern
void fibrili_resume(struct _fibril_t * frptr);
48 | 
/**
 * Publish a frame on this thread's deque.
 * Stores the return address of the enclosing function as the frame's
 * resume point, then appends the frame at the tail.
 * NOTE(review): no check against the 1000-entry capacity of buff.
 */
#define fibrili_push(frptr) do { \
  (frptr)->pc = __builtin_return_address(0); \
  fibrili_deq.buff[fibrili_deq.tail++] = (frptr); \
} while (0)
53 | 
/**
 * Remove the most recently pushed frame from this thread's deque.
 * @return 1 if the entry was still present, 0 if the deque was empty or
 *         `head` had already moved past the entry.
 */
__attribute__((hot)) static
int fibrili_pop(void)
{
  int tail = fibrili_deq.tail;

  if (tail == 0) return 0;

  /** Claim the entry first, then fence before reading head. */
  fibrili_deq.tail = --tail;

  fibrili_fence();

  if (fibrili_deq.head > tail) {
    /** Possible conflict: undo the claim and re-check under the lock. */
    fibrili_deq.tail = tail + 1;

    fibrili_lock(fibrili_deq.lock);

    if (fibrili_deq.head > tail) {
      /** The entry is gone; reset the deque to empty. */
      fibrili_deq.head = 0;
      fibrili_deq.tail = 0;

      fibrili_unlock(fibrili_deq.lock);
      return 0;
    }

    fibrili_deq.tail = tail;
    fibrili_unlock(fibrili_deq.lock);
  }

  return 1;
}
84 | 
/**
 * Perform `call`, then emit an asm statement that declares rbx, r12-r15
 * and memory as clobbered.
 * NOTE(review): presumably this stops the compiler from keeping values in
 * those registers or in cached memory across the call, since execution may
 * continue after it from a different context - confirm.
 */
#define fibrili_membar(call) do { \
  call; \
  __asm__ ( "nop" : : : "rbx", "r12", "r13", "r14", "r15", "memory" ); \
} while (0)
89 | 
90 | #endif /* end of include guard: FIBRILI_H */
91 | 


--------------------------------------------------------------------------------
/src/runtime.c:
--------------------------------------------------------------------------------
  1 | #include <stdlib.h>
  2 | #include <stdint.h>
  3 | #include <pthread.h>
  4 | #include "safe.h"
  5 | #include "debug.h"
  6 | #include "param.h"
  7 | #include "stats.h"
  8 | 
  9 | static pthread_t * _procs;
 10 | static void ** _stacks;
 11 | 
 12 | __thread int _tid;
 13 | 
 14 | extern void fibrili_init(int id, int nprocs);
 15 | extern void fibrili_exit(int id, int nprocs);
 16 | 
 17 | #ifdef FIBRIL_STATS
 18 | void * MAIN_STACK_TOP;
 19 | #endif
 20 | 
 21 | static void * __main(void * id)
 22 | {
 23 |   _tid = (int) (intptr_t) id;
 24 | 
 25 |   fibrili_init(_tid, PARAM_NPROCS);
 26 |   return NULL;
 27 | }
 28 | 
 29 | int fibril_rt_nprocs()
 30 | {
 31 |   if (PARAM_NPROCS == 0) {
 32 |     return param_nprocs(0);
 33 |   } else {
 34 |     return PARAM_NPROCS;
 35 |   }
 36 | }
 37 | 
 38 | int fibril_rt_init(int n)
 39 | {
 40 |   param_init(n);
 41 | 
 42 |   int nprocs = PARAM_NPROCS;
 43 |   if (nprocs <= 0) return -1;
 44 | 
 45 |   size_t stacksize = PARAM_STACK_SIZE;
 46 | 
 47 |   _procs = malloc(sizeof(pthread_t [nprocs]));
 48 |   _stacks = malloc(sizeof(void * [nprocs]));
 49 | 
 50 |   pthread_attr_t attrs[nprocs];
 51 |   int i;
 52 | 
 53 |   for (i = 1; i < nprocs; ++i) {
 54 |     SAFE_RZCALL(posix_memalign(&_stacks[i], PARAM_PAGE_SIZE, stacksize));
 55 |     pthread_attr_init(&attrs[i]);
 56 |     pthread_attr_setstack(&attrs[i], _stacks[i], stacksize);
 57 |     pthread_create(&_procs[i], &attrs[i], __main, (void *) (intptr_t) i);
 58 |     pthread_attr_destroy(&attrs[i]);
 59 |   }
 60 | 
 61 |   _procs[0] = pthread_self();
 62 |   SAFE_RZCALL(posix_memalign(&_stacks[0], PARAM_PAGE_SIZE, stacksize));
 63 | 
 64 |   register void * rsp asm ("r15");
 65 |   rsp = _stacks[0] + stacksize;
 66 | 
 67 | #ifdef FIBRIL_STATS
 68 |   register void * top asm ("rsp");
 69 |   MAIN_STACK_TOP = PAGE_ALIGN_DOWN(top);
 70 | #endif
 71 | 
 72 |   __asm__ ( "xchg\t%0,%%rsp" : "+r" (rsp) :: "memory" );
 73 |   __main((void *) 0);
 74 |   __asm__ ( "xchg\t%0,%%rsp" : : "r" (rsp) : "memory" );
 75 | 
 76 |   return 0;
 77 | }
 78 | 
 79 | int fibril_rt_exit()
 80 | {
 81 |   fibrili_exit(_tid, PARAM_NPROCS);
 82 | 
 83 |   int i;
 84 | 
 85 |   for (i = 1; i < PARAM_NPROCS; ++i) {
 86 |     pthread_join(_procs[i], NULL);
 87 |     free(_stacks[i]);
 88 |   }
 89 | 
 90 |   free(_procs);
 91 |   free(_stacks);
 92 | 
 93 |   STATS_EXPORT(N_STEALS);
 94 |   STATS_EXPORT(N_SUSPENSIONS);
 95 |   STATS_EXPORT(N_STACKS);
 96 |   STATS_EXPORT(N_PAGES);
 97 | 
 98 |   return 0;
 99 | }
100 | 
101 | 


--------------------------------------------------------------------------------
/src/pool.c:
--------------------------------------------------------------------------------
  1 | #include <stdlib.h>
  2 | #include <sys/mman.h>
  3 | #include "safe.h"
  4 | #include "sync.h"
  5 | #include "mutex.h"
  6 | #include "param.h"
  7 | #include "stats.h"
  8 | 
  9 | #ifndef POOL_GLOBAL_SIZE
 10 | #define POOL_GLOBAL_SIZE (2048 - 3)
 11 | #endif
 12 | 
 13 | #ifndef POOL_LOCAL_SIZE
 14 | #define POOL_LOCAL_SIZE 7
 15 | #endif
 16 | 
 17 | #ifndef POOL_CACHE_SIZE
 18 | #define POOL_CACHE_SIZE 4
 19 | #endif
 20 | 
/** Global pool of spare stacks, shared by all threads and guarded by `lock`. */
static struct {
  mutex_t * volatile lock;
  size_t volatile avail;          /* number of valid entries in buff */
  void * buff[POOL_GLOBAL_SIZE];
} _pg __attribute__((aligned(128)));

/** Per-thread pool of spare stacks; accessed without locking. */
static __thread struct {
  size_t volatile avail;          /* number of valid entries in buff */
  void * buff[POOL_LOCAL_SIZE];
} _pl __attribute__((aligned(128)));
 31 | 
/**
 * Take a stack from the pool or allocate from heap if the pool is empty.
 * The thread-local pool is tried first, then the global pool, and finally
 * a fresh page-aligned allocation of PARAM_STACK_SIZE bytes.
 * @return A stack. The result is checked with SAFE_ASSERT before it is
 *         returned; no visible path returns NULL deliberately.
 */
void * pool_take()
{
  void * stack = NULL;

  /** Take a stack from the available stacks. */
  if (_pl.avail > 0) {
    stack = _pl.buff[--_pl.avail];
  } else {
    /** Take a stack from the parent pool. */
    if (_pg.avail > 0) {
      mutex_t mutex;
      mutex_lock(&_pg.lock, &mutex);

      /** Re-check under the lock: the unlocked read above is only a hint. */
      if (_pg.avail > 0) {
        stack = _pg.buff[--_pg.avail];
      }

      mutex_unlock(&_pg.lock, &mutex);
    }

    if (!stack) {
      SAFE_RZCALL(posix_memalign(&stack, PARAM_PAGE_SIZE, PARAM_STACK_SIZE));
      STATS_INC(N_STACKS, 1);

#ifdef FIBRIL_STATS
      /** With statistics enabled, new stacks start out inaccessible. */
      SAFE_NNCALL(mprotect(stack, PARAM_STACK_SIZE, PROT_NONE));
#endif
    }
  }

  SAFE_ASSERT(stack);
  return stack;
}
 69 | 
/**
 * Put a stack back into the calling thread's local pool.
 * When the local pool is full, stacks are first moved to the global pool
 * (keeping POOL_CACHE_SIZE locally) and any remaining excess is freed.
 * @param stack The stack to put back; must not be NULL.
 */
void pool_put(void * stack)
{
  SAFE_ASSERT(stack);

  /** If local pool does not have space, */
  if (_pl.avail >= POOL_LOCAL_SIZE) {
    /** Try moving stacks to parent pool. */
    if (_pg.avail < POOL_GLOBAL_SIZE) {
      mutex_t mutex;
      mutex_lock(&_pg.lock, &mutex);

      /** Keep only POOL_CACHE_SIZE stacks. */
      while (_pl.avail > POOL_CACHE_SIZE && _pg.avail < POOL_GLOBAL_SIZE) {
        _pg.buff[_pg.avail++] = _pl.buff[--_pl.avail];
      }

      mutex_unlock(&_pg.lock, &mutex);
    }

    /** Free local pool for space. */
    while (_pl.avail >= POOL_LOCAL_SIZE) {
      free(_pl.buff[--_pl.avail]);
      STATS_DEC(N_STACKS, 1);
    }
  }

  /** Invariant: we always put stack into local pool. */
  _pl.buff[_pl.avail++] = stack;
}
104 | 
105 | 


--------------------------------------------------------------------------------
/src/stack.c:
--------------------------------------------------------------------------------
  1 | #define _GNU_SOURCE
  2 | #include <fcntl.h>
  3 | #include <signal.h>
  4 | #include <stdlib.h>
  5 | #include <sys/mman.h>
  6 | #include "pool.h"
  7 | #include "safe.h"
  8 | #include "sync.h"
  9 | #include "mutex.h"
 10 | #include "param.h"
 11 | #include "stack.h"
 12 | #include "stats.h"
 13 | 
 14 | #ifdef FIBRIL_STATS
 15 | extern void * MAIN_STACK_TOP;
 16 | 
 17 | void handle_segfault(int s, siginfo_t * si, void * unused)
 18 | {
 19 |   if (si->si_code != SEGV_ACCERR) {
 20 |     struct sigaction default_action = {
 21 |       .sa_handler = SIG_DFL,
 22 |       .sa_sigaction = NULL,
 23 |       .sa_mask = 0,
 24 |       .sa_flags = 0,
 25 |       .sa_restorer = NULL
 26 |     };
 27 |     sigaction(SIGSEGV, &default_action, NULL);
 28 |     return;
 29 |   }
 30 | 
 31 |   void * stack = fibrili_deq.stack;
 32 |   void * addr = PAGE_ALIGN_DOWN(si->si_addr);
 33 |   if (addr < stack || addr >= stack + PARAM_STACK_SIZE) return;
 34 | 
 35 |   STATS_COUNT(N_PAGES, 1);
 36 |   SAFE_NNCALL(mprotect(addr, PARAM_PAGE_SIZE, PROT_READ | PROT_WRITE));
 37 | }
 38 | #endif
 39 | 
/**
 * Per-thread stack set-up.
 * With FIBRIL_STATS, installs handle_segfault() on an alternate signal
 * stack. For thread 0 it records PARAM_STACK_ADDR as the current stack and,
 * with FIBRIL_STATS, counts the pages between MAIN_STACK_TOP and the end of
 * the stack and remaps the range below MAIN_STACK_TOP as inaccessible.
 * @param id Worker index; 0 selects the extra handling described above.
 */
void stack_init(int id)
{
#ifdef FIBRIL_STATS
  /** The handler needs its own stack: faults occur on the worker stack. */
  stack_t altstack = {
    .ss_flags = 0,
    .ss_size = PARAM_STACK_SIZE
  };
  SAFE_RZCALL(posix_memalign(&altstack.ss_sp, PARAM_PAGE_SIZE,
        PARAM_STACK_SIZE));
  SAFE_NNCALL(sigaltstack(&altstack, NULL));

  struct sigaction sa = {
    .sa_flags = SA_SIGINFO | SA_STACK,
    .sa_sigaction = handle_segfault,
  };
  SAFE_NNCALL(sigaction(SIGSEGV, &sa, NULL));
#endif

  if (id == 0) {
    fibrili_deq.stack = PARAM_STACK_ADDR;

#ifdef FIBRIL_STATS
    SAFE_ASSERT(MAIN_STACK_TOP >= PARAM_STACK_ADDR);
    SAFE_ASSERT(MAIN_STACK_TOP < (PARAM_STACK_ADDR + PARAM_STACK_SIZE));
    size_t size = MAIN_STACK_TOP - PARAM_STACK_ADDR;

    /** Pages from MAIN_STACK_TOP to the end of the stack count as used. */
    STATS_COUNT(N_PAGES, ((PARAM_STACK_ADDR + PARAM_STACK_SIZE) -
          MAIN_STACK_TOP)  / PARAM_PAGE_SIZE);

    int flags = MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE;
    SAFE_NNCALL(mmap(PARAM_STACK_ADDR, size, PROT_NONE, flags, -1, 0));
#endif
  }
}
 74 | 
 75 | void * stack_setup(struct _fibril_t * frptr)
 76 | {
 77 |   void ** rsp = fibrili_deq.stack + PARAM_STACK_SIZE;
 78 | 
 79 |   /** Reserve 128 byte at the bottom. */
 80 |   rsp -= 16;
 81 |   return rsp;
 82 | }
 83 | 
 84 | int stack_uninstall(struct _fibril_t * frptr)
 85 | {
 86 |   DEBUG_ASSERT(frptr != NULL);
 87 | 
 88 |   void * addr = frptr->stack.ptr;
 89 |   fibrili_deq.stack = NULL;
 90 | 
 91 |   if (addr != PARAM_STACK_ADDR) {
 92 |     size_t size = PAGE_ALIGN_DOWN(frptr->stack.top) - addr;
 93 |     SAFE_NNCALL(madvise(addr, size, MADV_DONTNEED));
 94 |   }
 95 | 
 96 |   return 1;
 97 | }
 98 | 
 99 | void stack_reinstall(struct _fibril_t * frptr)
100 | {
101 |   DEBUG_ASSERT(frptr != NULL);
102 | 
103 |   void * addr = fibrili_deq.stack;
104 |   SAFE_ASSERT(addr != PARAM_STACK_ADDR);
105 | 
106 |   if (addr) pool_put(addr);
107 | 
108 |   fibrili_deq.stack = frptr->stack.ptr;
109 | }
110 | 
111 | 


--------------------------------------------------------------------------------
/src/debug.h:
--------------------------------------------------------------------------------
  1 | #ifndef DEBUG_H
  2 | #define DEBUG_H
  3 | 
  4 | #if HAVE_CONFIG_H
  5 | #include "config.h"
  6 | #ifdef FIBRIL_DEBUG
  7 | #define ENABLE_DEBUG
  8 | #define DEBUG_LEVEL FIBRIL_DEBUG
  9 | #endif
 10 | #endif
 11 | 
 12 | extern __thread int _tid;
 13 | #define DEBUG_TID _tid
 14 | 
 15 | #ifndef DEBUG_LEVEL
 16 | #define DEBUG_LEVEL 0
 17 | #endif
 18 | 
 19 | #ifndef DEBUG_WAIT
 20 | #define DEBUG_WAIT 0
 21 | #endif
 22 | 
 23 | #define DEBUG_CRIT 1
 24 | #define DEBUG_INFO 2
 25 | #define DEBUG_STEP 3
 26 | 
 27 | #define DEBUG_CONCAT(left, right) left##right
 28 | #define DEBUG_NARG(...) DEBUG_NARG_(__VA_ARGS__, ## __VA_ARGS__, \
 29 |     8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 0)
 30 | #define DEBUG_NARG_(...) DEBUG_ARG_N(__VA_ARGS__)
 31 | #define DEBUG_ARG_N(_1, _11, _2, _22, _3, _33, _4, _44, _5, _55, \
 32 |     _6, _66, _7, _77, _8, _88, N, ...) N
 33 | 
 34 | #define DEBUG_FORMAT(...) DEBUG_FORMAT_(DEBUG_NARG(__VA_ARGS__), ##__VA_ARGS__)
 35 | #define DEBUG_FORMAT_(N, ...) DEBUG_CONCAT(DEBUG_FORMAT_, N)(__VA_ARGS__)
 36 | #define DEBUG_FORMAT_0(...)
 37 | #define DEBUG_FORMAT_1(p, ...) DEBUG_FORM p
 38 | #define DEBUG_FORMAT_2(p, ...) DEBUG_FORM p DEBUG_FORMAT_1(__VA_ARGS__)
 39 | #define DEBUG_FORMAT_3(p, ...) DEBUG_FORM p DEBUG_FORMAT_2(__VA_ARGS__)
 40 | #define DEBUG_FORMAT_4(p, ...) DEBUG_FORM p DEBUG_FORMAT_3(__VA_ARGS__)
 41 | #define DEBUG_FORMAT_5(p, ...) DEBUG_FORM p DEBUG_FORMAT_4(__VA_ARGS__)
 42 | #define DEBUG_FORMAT_6(p, ...) DEBUG_FORM p DEBUG_FORMAT_5(__VA_ARGS__)
 43 | #define DEBUG_FORMAT_7(p, ...) DEBUG_FORM p DEBUG_FORMAT_6(__VA_ARGS__)
 44 | #define DEBUG_FORM(var, spec) " " #var "=" spec
 45 | 
 46 | #define DEBUG_VARS(...) DEBUG_VARS_(DEBUG_NARG(__VA_ARGS__), ##__VA_ARGS__)
 47 | #define DEBUG_VARS_(N, ...) DEBUG_CONCAT(DEBUG_VARS_, N)(__VA_ARGS__)
 48 | #define DEBUG_VARS_0(...)
 49 | #define DEBUG_VARS_1(p, ...) DEBUG_VAR p
 50 | #define DEBUG_VARS_2(p, ...) DEBUG_VAR p DEBUG_VARS_1(__VA_ARGS__)
 51 | #define DEBUG_VARS_3(p, ...) DEBUG_VAR p DEBUG_VARS_2(__VA_ARGS__)
 52 | #define DEBUG_VARS_4(p, ...) DEBUG_VAR p DEBUG_VARS_3(__VA_ARGS__)
 53 | #define DEBUG_VARS_5(p, ...) DEBUG_VAR p DEBUG_VARS_4(__VA_ARGS__)
 54 | #define DEBUG_VARS_6(p, ...) DEBUG_VAR p DEBUG_VARS_5(__VA_ARGS__)
 55 | #define DEBUG_VARS_7(p, ...) DEBUG_VAR p DEBUG_VARS_6(__VA_ARGS__)
 56 | #define DEBUG_VAR(var, spec) , var
 57 | 
 58 | #include <stdio.h>
 59 | #include <assert.h>
 60 | #include <stdlib.h>
 61 | #include <unistd.h>
 62 | 
/**
 * Print a tagged debug line to stderr when `lv` does not exceed
 * DEBUG_LEVEL. The line starts with the thread id, followed by `tag` and
 * one " name=value" entry per (var, spec) pair, and is flushed at once.
 * @param lv  Verbosity level of this message.
 * @param tag String literal placed after the thread id.
 * @param ... Zero or more (variable, "format spec") pairs.
 */
#define DEBUG_DUMP(lv, tag, ...) do { \
  if (lv <= DEBUG_LEVEL) { \
    fprintf(stderr, "[%d]: " tag DEBUG_FORMAT(__VA_ARGS__) "\n", \
        DEBUG_TID DEBUG_VARS(__VA_ARGS__) \
    ); \
    fflush(stderr); \
  } \
} while (0)
 71 | 
/**
 * Stop the program when `T` is true.
 * With DEBUG_WAIT set, print the process id and then spin on the volatile
 * local `wait`, which never changes unless something external (such as an
 * attached debugger) clears it. Without DEBUG_WAIT, abort() immediately.
 */
#define DEBUG_BREAK(T) do { \
  volatile int wait = (T); \
  if (wait) { \
    if (DEBUG_WAIT) { \
      int pid = getpid(); \
      DEBUG_DUMP(0, "waiting for debugger:", (pid, "%d")); \
    } else { \
      abort(); \
    } \
  } \
  while (wait); \
} while (0)
 84 | 
#ifdef ENABLE_DEBUG

/**
 * Debug-only assertion: when `F` is false, print the failed expression and
 * stop through DEBUG_BREAK. Note that `F` is evaluated again by
 * DEBUG_BREAK.
 */
#define DEBUG_ASSERT(F) do { \
  if (!(F)) { \
    DEBUG_DUMP(0, "assertion failed: " # F); \
    DEBUG_BREAK(!(F)); \
  } \
} while (0)

#else /* ENABLE_DEBUG is undefined */

/** Without ENABLE_DEBUG the assertion expands to nothing; F is not evaluated. */
#define DEBUG_ASSERT(...)

#endif /* end of ENABLE_DEBUG */
 99 | #endif /* end of include guard: DEBUG_H */
100 | 


--------------------------------------------------------------------------------
/test/test.h:
--------------------------------------------------------------------------------
  1 | #ifndef TEST_H
  2 | #define TEST_H
  3 | 
  4 | #if HAVE_CONFIG_H
  5 | #include "config.h"
  6 | #endif
  7 | 
  8 | extern void init();
  9 | extern void prep();
 10 | extern void test();
 11 | extern int verify();
 12 | 
 13 | extern int n;
 14 | 
 15 | #include <stdlib.h>
 16 | #include <fibril.h>
 17 | 
 18 | #ifdef BENCHMARK
 19 | 
 20 | #include <stdio.h>
 21 | #include <float.h>
 22 | #include <string.h>
 23 | #include <sys/time.h>
 24 | #include <sys/resource.h>
 25 | 
 26 | static void sort(float * a, int n)
 27 | {
 28 |   int i, sorted = 0;
 29 | 
 30 |   while (!sorted) {
 31 |     sorted = 1;
 32 | 
 33 |     for (i = 1; i < n; ++i) {
 34 |       if (a[i] < a[i - 1]) {
 35 |         float t = a[i];
 36 |         a[i] = a[i - 1];
 37 |         a[i - 1] = t;
 38 |         sorted = 0;
 39 |       }
 40 |     }
 41 |   }
 42 | }
 43 | 
 44 | size_t static inline time_elapsed(size_t val)
 45 | {
 46 |   struct timeval t;
 47 |   gettimeofday(&t, NULL);
 48 |   return t.tv_sec * 1000000 + t.tv_usec - val;
 49 | }
 50 | 
 51 | static void bench(const char * name, int nprocs)
 52 | {
 53 |   static int iter = 10;
 54 |   float times[iter];
 55 | 
 56 |   printf("===========================================\n");
 57 |   printf("  Benchmark: %s\n", strrchr(name, '/') + 1);
 58 |   printf("  Input size: %d\n", n);
 59 |   printf("  Number of iterations: %d\n", iter);
 60 |   printf("  Number of processors: %d\n", nprocs);
 61 | 
 62 |   struct rusage ru;
 63 |   getrusage(RUSAGE_SELF, &ru);
 64 |   long rss = ru.ru_maxrss;
 65 |   long flt = ru.ru_minflt;
 66 | 
 67 |   int i;
 68 |   for (i = 0; i < iter; ++i) {
 69 |     prep();
 70 |     size_t usecs = time_elapsed(0);
 71 |     test();
 72 |     usecs = time_elapsed(usecs);
 73 |     times[i] = usecs / 1000000.0;
 74 |     printf("  #%d execution time: %f s\n", i, times[i]);
 75 |   }
 76 | 
 77 |   sort(times, iter);
 78 | 
 79 |   float p10 = times[1];
 80 |   float p90 = times[8];
 81 |   float med = times[5];
 82 | 
 83 |   getrusage(RUSAGE_SELF, &ru);
 84 |   rss = ru.ru_maxrss - rss;
 85 |   flt = ru.ru_minflt - flt;
 86 | 
 87 |   printf("  Execution time summary:\n");
 88 |   printf("    Median: %f s\n", med);
 89 |   printf("    10th %%: %f s\n", p10);
 90 |   printf("    90th %%: %f s\n", p90);
 91 |   printf("  Resources summary: \n");
 92 |   printf("    Max RSS: %ld (KB)\n", ru.ru_maxrss);
 93 |   printf("    Runtime RSS: %ld (KB)\n", rss);
 94 |   printf("    # of page faults: %ld\n", flt);
 95 | }
 96 | 
 97 | #endif
 98 | 
 99 | #include <stdlib.h>
100 | 
101 | int main(int argc, const char * argv[])
102 | {
103 |   if (argc > 1 && (argc = atoi(argv[1])) > 0) {
104 |     n = argc;
105 |   }
106 | 
107 |   init();
108 | 
109 |   fibril_rt_init(0);
110 |   int nprocs = fibril_rt_nprocs();
111 | 
112 | #ifdef BENCHMARK
113 |   bench(argv[0], nprocs);
114 | #else
115 |   prep();
116 |   test();
117 | #endif
118 | 
119 |   fibril_rt_exit();
120 | 
121 | #ifdef BENCHMARK
122 | #ifdef FIBRIL_STATS
123 |   printf("  Statistics summary:\n");
124 |   printf("    # of steals: %s\n", getenv("FIBRIL_N_STEALS"));
125 |   printf("    # of suspensions: %s\n", getenv("FIBRIL_N_SUSPENSIONS"));
126 |   printf("    # of stacks used: %s\n", getenv("FIBRIL_N_STACKS"));
127 |   printf("    # of pages used: %s\n", getenv("FIBRIL_N_PAGES"));
128 | #endif
129 |   printf("===========================================\n");
130 | #endif
131 | 
132 |   return verify();
133 | }
134 | 
135 | #endif /* end of include guard: TEST_H */
136 | 


--------------------------------------------------------------------------------
/src/fibrili.c:
--------------------------------------------------------------------------------
  1 | #include <stdlib.h>
  2 | #include <pthread.h>
  3 | #include "pool.h"
  4 | #include "sync.h"
  5 | #include "stack.h"
  6 | #include "debug.h"
  7 | #include "deque.h"
  8 | #include "param.h"
  9 | #include "stats.h"
 10 | #include "fibrile.h"
 11 | 
 12 | static __thread fibril_t * _restart;
 13 | static __thread fibril_t * _frptr;
 14 | static deque_t ** _deqs;
 15 | static fibril_t * volatile _stop;
 16 | 
/**
 * Resume a frame: release its lock, install `rsp` as the stack pointer and
 * the frame's stack.btm as rbp, then jump to the frame's saved pc.
 * File-local function; it is unrelated to the C library longjmp().
 * @param frptr Frame to resume.
 * @param rsp   Stack pointer value to resume with.
 */
__attribute__((noreturn)) static
void longjmp(fibril_t * frptr, void * rsp)
{
  DEBUG_DUMP(3, "jump:", (frptr->pc, "%p"), (rsp, "%p"));
  sync_unlock(frptr->lock);
  __asm__ ( "mov\t%1,%%rsp\n\t"
            "mov\t%0,%%rbp\n\t"
            "jmp\t*%2\n\t"
            : : "r" (frptr->stack.btm), "r" (rsp), "r" (frptr->pc) : "memory");
  __builtin_unreachable();
}
 28 | 
 29 | __attribute__((noinline)) static
 30 | void schedule(int id, int nprocs, fibril_t * frptr)
 31 | {
 32 |   struct drand48_data _buffer;
 33 | 
 34 |   if (frptr != _restart && frptr != _stop) {
 35 |     sync_lock(frptr->lock);
 36 | 
 37 |     if (frptr->count-- == 0) {
 38 |       if (frptr->stack.ptr != fibrili_deq.stack) {
 39 |         stack_reinstall(frptr);
 40 |       }
 41 | 
 42 |       longjmp(frptr, frptr->stack.top);
 43 |     } else {
 44 |       if (frptr->stack.ptr == fibrili_deq.stack) {
 45 |         STATS_COUNT(N_SUSPENSIONS, 1);
 46 |         stack_uninstall(frptr);
 47 |       }
 48 | 
 49 |       sync_unlock(frptr->lock);
 50 |     }
 51 |   } else {
 52 |     if (id == 0) return;
 53 |   }
 54 | 
 55 |   while (!_stop) {
 56 |     long victim;
 57 |     lrand48_r(&_buffer, &victim);
 58 |     victim %= nprocs - 1;
 59 |     if (victim >= id) victim += 1;
 60 | 
 61 |     fibril_t * frptr = deque_steal(_deqs[victim]);
 62 | 
 63 |     if (frptr) {
 64 |       if (!fibrili_deq.stack) fibrili_deq.stack = pool_take();
 65 | 
 66 |       DEBUG_DUMP(1, "steal:", (victim, "%d"), (frptr, "%p"));
 67 |       STATS_COUNT(N_STEALS, 1);
 68 |       longjmp(frptr, stack_setup(frptr));
 69 |     }
 70 | 
 71 |     /** Force the worker to yield as a penalty for the failed steal. */
 72 |     sched_yield();
 73 |   }
 74 | 
 75 |   sync_barrier(nprocs);
 76 | 
 77 |   if (id) pthread_exit(NULL);
 78 |   else longjmp(_stop, _stop->stack.top);
 79 | }
 80 | 
/**
 * Worker set-up: initialize the stack support, publish this thread's deque
 * in the shared table and enter the scheduler through the _restart frame.
 * @param id     Index of this worker; worker 0 allocates the deque table.
 * @param nprocs Total number of workers taking part in the barriers.
 */
void fibrili_init(int id, int nprocs)
{
  _tid = id;
  stack_init(id);

  if (id == 0) {
    /** Setup deque pointers. */
    _deqs = malloc(sizeof(deque_t * [nprocs]));
  }

  /** The table must exist before any worker writes its slot. */
  sync_barrier(nprocs);
  _deqs[id] = &fibrili_deq;
  sync_barrier(nprocs);

  DEBUG_DUMP(2, "proc_start:", (id, "%d"), (_deqs[id], "%p"));
  sync_barrier(nprocs);

  /** _restart is the frame fibrili_resume() jumps back to. */
  fibril_t fr;
  fibril_init(&fr);
  _restart = &fr;
  DEBUG_DUMP(2, "restart:", (_restart, "%p"), (_restart->stack.top, "%p"),
      (_restart->stack.btm, "%p"));
  fibrili_membar(fibrili_join(_restart));
  schedule(id, nprocs, _frptr);
}
106 | 
/**
 * Signal all workers to stop by publishing a frame in _stop, then release
 * the deque table.
 * When called on a worker other than 0, the frame is initialized and
 * joined, which hands the caller over to the scheduler. On worker 0 only
 * the pointer is published (the frame itself is not initialized) and the
 * caller waits at the barrier.
 * @param id     Index of the calling worker.
 * @param nprocs Total number of workers.
 */
void fibrili_exit(int id, int nprocs)
{
  fibril_t fr;

  if (id != 0) {
    fibril_init(&fr);
    _stop = &fr;
    DEBUG_DUMP(2, "proc_stop:", (_stop, "%p"), (fibrili_deq.stack, "%p"));
    fibrili_membar(fibrili_join(_stop));
  } else {
    _stop = &fr;
    sync_barrier(nprocs);
  }

  free(_deqs);
}
123 | 
/**
 * Enter the scheduler on behalf of `frptr`: remember the frame in the
 * thread-local _frptr and jump to this thread's _restart frame.
 * Does not return to the caller.
 */
void fibrili_resume(fibril_t * frptr)
{
  _frptr = frptr;
  longjmp(_restart, _restart->stack.top);
}
129 | 
/**
 * Save the caller's return address as the frame's resume point, then enter
 * the scheduler. Kept out of line so that __builtin_return_address(0)
 * refers to the call site of fibrili_join().
 */
__attribute__((noinline))
void fibrili_join(fibril_t * frptr)
{
  frptr->pc = __builtin_return_address(0);
  fibrili_resume(frptr);
}
136 | 
137 | 


--------------------------------------------------------------------------------
/src/fork.h:
--------------------------------------------------------------------------------
 1 | #ifndef FIBRIL_FORK_H
 2 | #define FIBRIL_FORK_H
 3 | 
/**
 * _fibril_defs(args...): expand to a parameter declaration list with one
 * parameter per argument, typed after the argument expression with
 * __typeof__ and named aN down to a1. The list ends with a comma so that
 * further parameters can follow; with no arguments it expands to nothing.
 */
#define _fibril_defs(...) \
  _fibril_defs_(_fibril_nth(__VA_ARGS__), ## __VA_ARGS__)
#define _fibril_defs_(n, ...) \
  _fibril_concat(_fibril_defs_, n)(__VA_ARGS__)
#define _fibril_defs_16(a,...) __typeof__(a) a16,_fibril_defs_15(__VA_ARGS__)
#define _fibril_defs_15(a,...) __typeof__(a) a15,_fibril_defs_14(__VA_ARGS__)
#define _fibril_defs_14(a,...) __typeof__(a) a14,_fibril_defs_13(__VA_ARGS__)
#define _fibril_defs_13(a,...) __typeof__(a) a13,_fibril_defs_12(__VA_ARGS__)
#define _fibril_defs_12(a,...) __typeof__(a) a12,_fibril_defs_11(__VA_ARGS__)
#define _fibril_defs_11(a,...) __typeof__(a) a11,_fibril_defs_10(__VA_ARGS__)
#define _fibril_defs_10(a,...) __typeof__(a) a10,_fibril_defs_9 (__VA_ARGS__)
#define _fibril_defs_9(a, ...) __typeof__(a) a9, _fibril_defs_8 (__VA_ARGS__)
#define _fibril_defs_8(a, ...) __typeof__(a) a8, _fibril_defs_7 (__VA_ARGS__)
#define _fibril_defs_7(a, ...) __typeof__(a) a7, _fibril_defs_6 (__VA_ARGS__)
#define _fibril_defs_6(a, ...) __typeof__(a) a6, _fibril_defs_5 (__VA_ARGS__)
#define _fibril_defs_5(a, ...) __typeof__(a) a5, _fibril_defs_4 (__VA_ARGS__)
#define _fibril_defs_4(a, ...) __typeof__(a) a4, _fibril_defs_3 (__VA_ARGS__)
#define _fibril_defs_3(a, ...) __typeof__(a) a3, _fibril_defs_2 (__VA_ARGS__)
#define _fibril_defs_2(a, ...) __typeof__(a) a2, _fibril_defs_1 (__VA_ARGS__)
#define _fibril_defs_1(a)      __typeof__(a) a1,
#define _fibril_defs_0()
25 | 
/**
 * _fibril_args(args...): expand to the parameter names aN, ..., a1 that
 * _fibril_defs declares for the same argument list, separated by commas
 * and without a trailing comma; with no arguments it expands to nothing.
 */
#define _fibril_args(...) \
  _fibril_args_(_fibril_nth(__VA_ARGS__), ## __VA_ARGS__)
#define _fibril_args_(n, ...) \
  _fibril_concat(_fibril_args_, n)(__VA_ARGS__)
#define _fibril_args_16(a,...) a16,_fibril_args_15(__VA_ARGS__)
#define _fibril_args_15(a,...) a15,_fibril_args_14(__VA_ARGS__)
#define _fibril_args_14(a,...) a14,_fibril_args_13(__VA_ARGS__)
#define _fibril_args_13(a,...) a13,_fibril_args_12(__VA_ARGS__)
#define _fibril_args_12(a,...) a12,_fibril_args_11(__VA_ARGS__)
#define _fibril_args_11(a,...) a11,_fibril_args_10(__VA_ARGS__)
#define _fibril_args_10(a,...) a10,_fibril_args_9 (__VA_ARGS__)
#define _fibril_args_9(a, ...) a9, _fibril_args_8 (__VA_ARGS__)
#define _fibril_args_8(a, ...) a8, _fibril_args_7 (__VA_ARGS__)
#define _fibril_args_7(a, ...) a7, _fibril_args_6 (__VA_ARGS__)
#define _fibril_args_6(a, ...) a6, _fibril_args_5 (__VA_ARGS__)
#define _fibril_args_5(a, ...) a5, _fibril_args_4 (__VA_ARGS__)
#define _fibril_args_4(a, ...) a4, _fibril_args_3 (__VA_ARGS__)
#define _fibril_args_3(a, ...) a3, _fibril_args_2 (__VA_ARGS__)
#define _fibril_args_2(a, ...) a2, _fibril_args_1 (__VA_ARGS__)
#define _fibril_args_1(a)      a1
#define _fibril_args_0()
47 | 
/**
 * _fibril_expand(args...): expand to the arguments themselves followed by
 * a comma, so that further call arguments can be appended; with no
 * arguments it expands to nothing (and therefore adds no comma).
 */
#define _fibril_expand(...) \
  _fibril_expand_(_fibril_nth(__VA_ARGS__), ## __VA_ARGS__)
#define _fibril_expand_(n, ...) \
  _fibril_concat(_fibril_expand_, n)(__VA_ARGS__)
#define _fibril_expand_16(...) __VA_ARGS__,
#define _fibril_expand_15(...) __VA_ARGS__,
#define _fibril_expand_14(...) __VA_ARGS__,
#define _fibril_expand_13(...) __VA_ARGS__,
#define _fibril_expand_12(...) __VA_ARGS__,
#define _fibril_expand_11(...) __VA_ARGS__,
#define _fibril_expand_10(...) __VA_ARGS__,
#define _fibril_expand_9( ...) __VA_ARGS__,
#define _fibril_expand_8( ...) __VA_ARGS__,
#define _fibril_expand_7( ...) __VA_ARGS__,
#define _fibril_expand_6( ...) __VA_ARGS__,
#define _fibril_expand_5( ...) __VA_ARGS__,
#define _fibril_expand_4( ...) __VA_ARGS__,
#define _fibril_expand_3( ...) __VA_ARGS__,
#define _fibril_expand_2( ...) __VA_ARGS__,
#define _fibril_expand_1( ...) __VA_ARGS__,
#define _fibril_expand_0()
69 | 
70 | #endif /* end of include guard: FIBRIL_FORK_H */
71 | 


--------------------------------------------------------------------------------
/test/matmul.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include "test.h"
  4 | 
  5 | int n = 2048;
  6 | 
  7 | static float *  a;
  8 | static float *  b;
  9 | static float ** c;
 10 | 
 11 | fibril static void compute(float *, int, int, float *, int, int,
 12 |     float **, int, int, int);
 13 | 
 14 | static void compute00(float * a, int ai, int aj, float * b, int bi, int bj,
 15 |     float ** c, int ci, int cj, int n)
 16 | {
 17 |   compute(a, ai, aj,     b, bi,     bj, c, ci, cj, n);
 18 |   compute(a, ai, aj + n, b, bi + n, bj, c, ci, cj, n);
 19 | }
 20 | 
 21 | static void compute01(float * a, int ai, int aj, float * b, int bi, int bj,
 22 |     float ** c, int ci, int cj, int n)
 23 | {
 24 |   compute(a, ai, aj,     b, bi,     bj + n, c, ci, cj + n, n);
 25 |   compute(a, ai, aj + n, b, bi + n, bj + n, c, ci, cj + n, n);
 26 | }
 27 | 
 28 | static void compute10(float * a, int ai, int aj, float * b, int bi, int bj,
 29 |     float ** c, int ci, int cj, int n)
 30 | {
 31 |   compute(a, ai + n, aj,     b, bi,     bj, c, ci + n, cj, n);
 32 |   compute(a, ai + n, aj + n, b, bi + n, bj, c, ci + n, cj, n);
 33 | }
 34 | 
 35 | /* c block at (ci + n, cj + n): two size-n compute() calls, run one after the other. */
 36 | static void compute11(float * a, int ai, int aj, float * b, int bi, int bj,
 37 |     float ** c, int ci, int cj, int n) {
 38 |   compute(a, ai + n, aj, b, bi, bj + n, c, ci + n, cj + n, n);
 39 |   compute(a, ai + n, aj + n, b, bi + n, bj + n, c, ci + n, cj + n, n);
 40 | }
 41 | 
 42 | /*
 43 |  * Base case: accumulate a 2x2 product into c[ci..ci+1][cj..cj+1].
 44 |  * Elements of a and b are read at flat offsets built by adding the two
 45 |  * index arguments (e.g. a[ai + aj]); every read precedes the writes to c.
 46 |  */
 47 | static void multiply(float * a, int ai, int aj, float * b, int bi, int bj,
 48 |     float ** c, int ci, int cj)
 49 | {
 50 |   const float a00 = a[ai + aj];
 51 |   const float a01 = a[ai + aj + 1];
 52 |   const float a10 = a[ai + 1 + aj];
 53 |   const float a11 = a[ai + 1 + aj + 1];
 54 |   const float b00 = b[bi + bj];
 55 |   const float b01 = b[bi + bj + 1];
 56 |   const float b10 = b[bi + 1 + bj];
 57 |   const float b11 = b[bi + 1 + bj + 1];
 58 |   /* Each sum starts from 0.0F and adds its two products in a fixed order. */
 59 |   float s00 = 0.0F, s01 = 0.0F, s10 = 0.0F, s11 = 0.0F;
 60 | 
 61 |   s00 += a00 * b00;  s00 += a01 * b10;
 62 |   s01 += a00 * b01;  s01 += a01 * b11;
 63 |   s10 += a10 * b00;  s10 += a11 * b10;
 64 |   s11 += a10 * b01;  s11 += a11 * b11;
 65 | 
 66 |   c[ci][cj]         += s00;
 67 |   c[ci][cj + 1]     += s01;
 68 |   c[ci + 1][cj]     += s10;
 69 |   c[ci + 1][cj + 1] += s11;
 70 | }
 71 | 
 72 | /* Recursive step: n == 2 multiplies directly; otherwise three quadrant
 73 |  * helpers are forked and the fourth runs inline before the join. */
 74 | fibril static void compute(float * a, int ai, int aj, float * b, int bi, int bj,
 75 |     float ** c, int ci, int cj, int n)
 76 | {
 77 |   if (n == 2) {
 78 |     multiply(a, ai, aj, b, bi, bj, c, ci, cj);
 79 |     return;
 80 |   }
 81 | 
 82 |   const int half = n / 2;
 83 |   fibril_t fr;
 84 |   fibril_init(&fr);
 85 |   fibril_fork(&fr, compute00, (a, ai, aj, b, bi, bj, c, ci, cj, half));
 86 |   fibril_fork(&fr, compute10, (a, ai, aj, b, bi, bj, c, ci, cj, half));
 87 |   fibril_fork(&fr, compute01, (a, ai, aj, b, bi, bj, c, ci, cj, half));
 88 |   compute11(a, ai, aj, b, bi, bj, c, ci, cj, half);
 89 |   fibril_join(&fr);
 90 | }
 91 | 
 92 | /*
 93 |  * Allocate a and b (n*n floats each, all set to 1.0F) and c (n rows of
 94 |  * n floats). Prints a message and exits if an allocation fails.
 95 |  */
 96 | void init()
 97 | {
 98 |   int i;
 99 | 
100 |   a = malloc(sizeof(float [n * n]));
101 |   b = malloc(sizeof(float [n * n]));
102 |   c = malloc(sizeof(float * [n]));
103 |   if (a == NULL || b == NULL || c == NULL) { perror("malloc"); exit(1); }
104 | 
105 |   for (i = 0; i < n; ++i) {
106 |     c[i] = malloc(sizeof(float [n]));
107 |     if (c[i] == NULL) { perror("malloc"); exit(1); }
108 |   }
109 |   for (i = 0; i < n * n; ++i) a[i] = b[i] = 1.0F;
110 | }
111 | 
112 | /* Reset every element of the n x n result c to zero. */
113 | void prep()
114 | {
115 |   int row, col;
116 | 
117 |   for (row = 0; row < n; ++row) {
118 |     float * dst = c[row];
119 |     for (col = 0; col < n; ++col) dst[col] = 0;
120 |   }
121 | }
122 | 
123 | /* Run the top-level compute() over the whole of a, b and c. */
124 | void test() {
125 |   compute(a, 0, 0, b, 0, 0, c, 0, 0, n);
126 | }
127 | 
128 | /* Check that every c[i][j] equals n (a and b are all ones): 0 = ok, 1 = mismatch. */
129 | int verify() {
130 |   int i, j;
131 |   for (i = 0; i < n; ++i) {
132 |     for (j = 0; j < n; j++) {
133 |       if (c[i][j] != n) {
134 |         /* %f consumes a double, so the int n must be converted explicitly. */
135 |         printf("c[%d][%d]=%f (expected %f)\n", i, j, c[i][j], (double) n);
136 |         return 1;
137 |       }
138 |     }
139 |   }
140 |   return 0;
141 | }
142 | 


--------------------------------------------------------------------------------
/test/knapsack.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Cilk program to solve the 0-1 knapsack problem using a branch-and-bound
  3 |  * technique.
  4 |  *
  5 |  * Author: Matteo Frigo
  6 |  */
  7 | /*
  8 |  * Copyright (c) 2000 Massachusetts Institute of Technology
  9 |  * Copyright (c) 2000 Matteo Frigo
 10 |  *
 11 |  * This program is free software; you can redistribute it and/or modify
 12 |  * it under the terms of the GNU General Public License as published by
 13 |  * the Free Software Foundation; either version 2 of the License, or
 14 |  * (at your option) any later version.
 15 |  *
 16 |  * This program is distributed in the hope that it will be useful,
 17 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 18 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 19 |  * GNU General Public License for more details.
 20 |  *
 21 |  * You should have received a copy of the GNU General Public License
 22 |  * along with this program; if not, write to the Free Software
 23 |  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 24 |  *
 25 |  */
 26 | 
 27 | #include <stdio.h>
 28 | #include <stdlib.h>
 29 | #include <limits.h>
 30 | #include "test.h"
 31 | 
 32 | struct item {
 33 |   int value;   /* added to the running total v when knapsack() takes the item */
 34 |   int weight;  /* subtracted from the remaining capacity c when it is taken */
 35 | };
 36 | 
 37 | int n = 32;                  /* item count passed to qsort() and knapsack() */
 38 | static int capacity = 900;   /* capacity given to the top-level knapsack() call */
 39 | static int sol;              /* result stored by test(), checked by verify() */
 40 | 
 41 | static struct item items[] = {   /* { value, weight } pairs; sorted in place by init() */
 42 |   { 15, 23 },
 43 |   { 22, 12 },
 44 |   { 17, 42 },
 45 |   { 1, 13 },
 46 |   { 32, 21 },
 47 |   { 65, 43 },
 48 |   { 23, 56 },
 49 |   { 4, 7 },
 50 |   { 4, 8 },
 51 |   { 32, 42 },
 52 |   { 51, 32 },
 53 |   { 22, 12 },
 54 |   { 17, 24 },
 55 |   { 12, 13 },
 56 |   { 23, 21 },
 57 |   { 56, 47 },
 58 |   { 23, 65 },
 59 |   { 6, 7 },
 60 |   { 4, 7 },
 61 |   { 32, 42 },
 62 |   { 22, 42 },
 63 |   { 59, 32 },
 64 |   { 23, 12 },
 65 |   { 12, 24 },
 66 |   { 12, 13 },
 67 |   { 23, 21 },
 68 |   { 39, 48 },
 69 |   { 22, 65 },
 70 |   { 6, 7 },
 71 |   { 4, 7 },
 72 |   { 33, 42 },
 73 |   { 18, 53 }
 74 | };
 75 | 
 76 | static int best_so_far = INT_MIN;   /* shared pruning bound; read and raised by knapsack() */
 77 | 
 78 | /* qsort comparator (real const void * signature): decreasing value/weight ratio. */
 79 | static int compare(const void *pa, const void *pb)
 80 | {
 81 |   const struct item *a = pa;
 82 |   const struct item *b = pb;
 83 |   double c = ((double) a->value / a->weight) -
 84 |     ((double) b->value / b->weight);
 85 | 
 86 |   if (c > 0) return -1;
 87 |   return c < 0 ? 1 : 0;
 88 | }
 89 | 
 90 | /*
 91 |  * return the optimal solution for n items (first is e) and
 92 |  * capacity c. Value so far is v.
 93 |  */
 94 | fibril static int knapsack(struct item *e, int c, int n, int v)
 95 | {
 96 |   int taken, skipped;
 97 | 
 98 |   /* over-full knapsack: report it as infeasible */
 99 |   if (c < 0) {
100 |     return INT_MIN;
101 |   }
102 |   /* no items or no room left: v is a feasible value */
103 |   if (n == 0 || c == 0) {
104 |     return v;
105 |   }
106 | 
107 |   /*
108 |    * Bound: the value so far plus the remaining capacity filled at the
109 |    * first item's value/weight rate (integer division, as written).
110 |    */
111 |   const double bound = (double) v + c * e->value / e->weight;
112 | 
113 |   /* prune: the bound is already below the best value seen so far */
114 |   if (bound < best_so_far) {
115 |     return INT_MIN;
116 |   }
117 | 
118 |   fibril_t fr;
119 |   fibril_init(&fr);
120 | 
121 |   /* forked: best value when the current item is left out */
122 |   fibril_fork(&fr, &skipped, knapsack, (e + 1, c, n - 1, v));
123 | 
124 |   /* inline: best value when the current item is put in */
125 |   taken = knapsack(e + 1, c - e->weight, n - 1, v + e->value);
126 | 
127 |   fibril_join(&fr);
128 |   const int best = taken > skipped ? taken : skipped;
129 | 
130 |   /*
131 |    * Unsynchronised update of the shared bound: a known race. The result
132 |    * is still correct, but the amount of pruning is non-deterministic.
133 |    */
134 |   if (best_so_far < best) {
135 |     best_so_far = best;
136 |   }
137 |   return best;
138 | }
139 | 
140 | /* Sort items by decreasing value/weight ratio before the search starts. */
141 | void init() {
142 |   int (*by_ratio)(const void *, const void *) =
143 |     (int (*)(const void *, const void *)) compare;
144 |   qsort(items, n, sizeof items[0], by_ratio);
145 | }
146 | 
147 | void prep() { best_so_far = INT_MIN; }  /* restart the pruning bound so every run does the same search */
148 | 
149 | /* Solve the whole instance and keep the answer in sol for verify(). */
150 | void test() {
151 |   sol = knapsack(items, capacity, n, 0);
152 | }
153 | 
154 | /* Compare sol with the hard-coded expected value 733: 0 on match, else 1. */
155 | int verify()
156 | {
157 |   const int expected = 733;
158 | 
159 |   if (sol == expected) {
160 |     return 0;
161 |   }
162 |   printf("sol: %d (expected: %d)\n", sol, expected);
163 |   return 1;
164 | }
165 | 
166 | 


--------------------------------------------------------------------------------
/test/heat.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Heat diffusion (Jacobi-type iteration)
  3 |  *
  4 |  * Volker Strumpen, Boston                                 August 1996
  5 |  *
  6 |  * Copyright (c) 1996 Massachusetts Institute of Technology
  7 |  *
  8 |  * This program is free software; you can redistribute it and/or modify
  9 |  * it under the terms of the GNU General Public License as published by
 10 |  * the Free Software Foundation; either version 2 of the License, or
 11 |  * (at your option) any later version.
 12 |  *
 13 |  * This program is distributed in the hope that it will be useful,
 14 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 15 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 16 |  * GNU General Public License for more details.
 17 |  *
 18 |  * You should have received a copy of the GNU General Public License
 19 |  * along with this program; if not, write to the Free Software
 20 |  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 21 |  */
 22 | #include <math.h>
 23 | #include <stdio.h>
 24 | #include <stdlib.h>
 25 | #include "test.h"
 26 | 
 27 | #define f(x,y)     (sin(x)*sin(y))             /* interior start values, used by heat() */
 28 | #define randa(x,t) (0.0)                       /* value for column 0 of interior rows */
 29 | #define randb(x,t) (exp(-2*(t))*sin(x))        /* value for column ny - 1 of interior rows */
 30 | #define randc(y,t) (0.0)                       /* value for every column of row 0 */
 31 | #define randd(y,t) (exp(-2*(t))*sin(y))        /* value for every column of row nx - 1 */
 32 | #define solu(x,y,t) (exp(-2*(t))*sin(x)*sin(y)) /* reference value compared against in verify() */
 33 | 
 34 | int n = 4096;                  /* copied into nx by init() */
 35 | 
 36 | int nx, ny, nt;                /* grid rows, grid columns, number of time steps */
 37 | double xu, xo, yu, yo, tu, to; /* lower/upper ends of the x, y and t ranges */
 38 | 
 39 | double dx, dy, dt;             /* step sizes derived in init() */
 40 | double dtdxsq, dtdysq;         /* dt / dx^2 and dt / dy^2, the weights used in diffuse() */
 41 | 
 42 | double **  odd;                /* the two grids: diffuse() reads one and writes the other */
 43 | double ** even;
 44 | 
 45 | /*
 46 |  * Fill rows [il, iu) of m with the t = 0 state: boundary macros on the
 47 |  * first/last row and first/last column, f() everywhere else. Ranges
 48 |  * wider than one row are halved, with the lower half forked.
 49 |  */
 50 | fibril static void heat(double ** m, int il, int iu)
 51 | {
 52 |   if (iu - il > 1) {
 53 |     const int mid = (il + iu) / 2;
 54 |     fibril_t fr;
 55 |     fibril_init(&fr);
 56 |     fibril_fork(&fr, heat, (m, il, mid));
 57 |     heat(m, mid, iu);
 58 |     fibril_join(&fr);
 59 |     return;
 60 |   }
 61 | 
 62 |   double * const row = m[il];
 63 |   const double x = xu + il * dx;
 64 |   int j;
 65 | 
 66 |   if (il == 0) {
 67 |     for (j = 0; j < ny; ++j) {
 68 |       row[j] = randc(yu + j * dy, 0);
 69 |     }
 70 |   } else if (il == nx - 1) {
 71 |     for (j = 0; j < ny; ++j) {
 72 |       row[j] = randd(yu + j * dy, 0);
 73 |     }
 74 |   } else {
 75 |     row[0] = randa(x, 0);
 76 |     for (j = 1; j < ny - 1; ++j) row[j] = f(x, yu + j * dy);
 77 |     row[ny - 1] = randb(x, 0);
 78 |   }
 79 | }
 80 | 
 81 | /*
 82 |  * Compute rows [il, iu) of out at time t from the previous grid in.
 83 |  * Edge rows and columns come from the boundary macros; interior points
 84 |  * use the five-point update. Wide ranges are halved and forked.
 85 |  */
 86 | fibril void diffuse(double ** out, double ** in, int il, int iu, double t)
 87 | {
 88 |   if (iu - il > 1) {
 89 |     const int mid = (il + iu) / 2;
 90 |     fibril_t fr;
 91 |     fibril_init(&fr);
 92 |     fibril_fork(&fr, diffuse, (out, in, il, mid, t));
 93 |     diffuse(out, in, mid, iu, t);
 94 |     fibril_join(&fr);
 95 |     return;
 96 |   }
 97 | 
 98 |   double * const dst = out[il];
 99 |   int j;
100 | 
101 |   if (il == 0) {
102 |     for (j = 0; j < ny; ++j) dst[j] = randc(yu + j * dy, t);
103 |   } else if (il == nx - 1) {
104 |     for (j = 0; j < ny; ++j) dst[j] = randd(yu + j * dy, t);
105 |   } else {
106 |     const double * cur = in[il];
107 |     const double * above = in[il - 1];
108 |     const double * below = in[il + 1];
109 |     dst[0] = randa(xu + il * dx, t);
110 |     for (j = 1; j < ny - 1; ++j) {
111 |       dst[j] = cur[j]
112 |         + dtdysq * (cur[j + 1] - 2 * cur[j] + cur[j - 1])
113 |         + dtdxsq * (below[j] - 2 * cur[j] + above[j]);
114 |     }
115 |     dst[ny - 1] = randb(xu + il * dx, t);
116 |   }
117 | }
118 | 
119 | /* Set grid size, domain bounds and step sizes, then allocate the two
120 |  * nx-by-ny grids; prints a message and exits if an allocation fails. */
121 | void init()
122 | {
123 |   int i;
124 |   nx = n;
125 |   ny = 1024;
126 |   nt = 100;
127 |   xu = yu = tu = 0.0;
128 |   xo = yo = 1.570796326794896558;   /* pi / 2 */
129 |   to = 0.0000001;
130 | 
131 |   dx = (xo - xu) / (nx - 1);
132 |   dy = (yo - yu) / (ny - 1);
133 |   dt = (to - tu) / nt;
134 |   dtdxsq = dt / (dx * dx);
135 |   dtdysq = dt / (dy * dy);
136 | 
137 |   even = malloc(sizeof(double * [nx]));
138 |   odd  = malloc(sizeof(double * [nx]));
139 |   if (even == NULL || odd == NULL) { perror("malloc"); exit(1); }
140 | 
141 |   for (i = 0; i < nx; ++i) {
142 |     even[i] = malloc(sizeof(double [ny]));
143 |     odd [i] = malloc(sizeof(double [ny]));
144 |     if (even[i] == NULL || odd[i] == NULL) { perror("malloc"); exit(1); }
145 |   }
146 | }
147 | 
148 | /* Load the t = 0 state into every row of even. */
149 | void prep() {
150 |   heat(even, 0, nx);
151 | }
152 | 
153 | /*
154 |  * Run exactly nt time steps, alternating even -> odd and odd -> even, so
155 |  * the final grid is odd when nt is odd and even otherwise (see verify).
156 |  */
157 | void test()
158 | {
159 |   double t = tu;
160 |   int i;
161 |   for (i = 0; i < nt; ++i) {
162 |     t += dt;   /* one step at a time keeps the count right for odd nt too */
163 |     if (i % 2 == 0) diffuse(odd, even, 0, nx, t);
164 |     else            diffuse(even, odd, 0, nx, t);
165 |   }
166 | }
167 | 
168 | /*
169 |  * Compare the final grid against solu() at time to. Prints a message
170 |  * and returns 1 if the largest absolute error, largest relative error or
171 |  * mean absolute error exceeds 1e-12; returns 0 otherwise.
172 |  */
173 | int verify()
174 | {
175 |   double ** const grid = nt % 2 ? odd : even;
176 |   double max_abs = 0.0, max_rel = 0.0, sum_abs = 0.0;
177 |   int r, k;
178 | 
179 |   for (r = 0; r < nx; ++r) {
180 |     for (k = 0; k < ny; ++k) {
181 |       const double got = grid[r][k];
182 |       double err = fabs(got - solu(xu + r * dx, yu + k * dy, to));
183 |       sum_abs += err;
184 |       if (err > max_abs) max_abs = err;
185 |       /* scale to a relative error only where the computed value is non-zero */
186 |       if (got != 0.0) err = err / got;
187 |       if (err > max_rel) max_rel = err;
188 |     }
189 |   }
190 |   const double mean_abs = sum_abs / (nx * ny);
191 |   if (max_abs > 1e-12) {
192 |     printf("Local maximal absolute error %10e\n", max_abs);
193 |     return 1;
194 |   }
195 |   if (max_rel > 1e-12) {
196 |     printf("Local maximal relative error %10e\n", max_rel);
197 |     return 1;
198 |   }
199 |   if (mean_abs > 1e-12) {
200 |     printf("Global Mean absolute error %10e\n", mean_abs);
201 |     return 1;
202 |   }
203 |   return 0;
204 | }
205 | 
206 | 


--------------------------------------------------------------------------------
/test/fft.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2000 Massachusetts Institute of Technology
  3 |  * Copyright (c) 2000 Matteo Frigo
  4 |  *
  5 |  * This program is free software; you can redistribute it and/or modify
  6 |  * it under the terms of the GNU General Public License as published by
  7 |  * the Free Software Foundation; either version 2 of the License, or
  8 |  * (at your option) any later version.
  9 |  *
 10 |  * This program is distributed in the hope that it will be useful,
 11 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 12 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 13 |  * GNU General Public License for more details.
 14 |  *
 15 |  * You should have received a copy of the GNU General Public License
 16 |  * along with this program; if not, write to the Free Software
 17 |  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 18 |  */
 19 | 
 20 | #include <math.h>
 21 | #include <stdio.h>
 22 | #include <stdlib.h>
 23 | #include <string.h>
 24 | #include <sys/mman.h>
 25 | #include "test.h"
 26 | #include "fft.h"
 27 | 
 28 | #ifdef BENCHMARK
 29 | int n = 26;   /* init() sets size = 1 << n */
 30 | #else
 31 | int n = 12;   /* non-benchmark size; verify() checks it against fft_alt() */
 32 | #endif
 33 | 
 34 | static int size;                     /* transform length, set by init() */
 35 | static COMPLEX *in, *out, *cp, *W;   /* input, output, working copy of in, coefficient table */
 36 | static const REAL pi = 3.1415926535897932384626434;
 37 | 
 38 | /*
 39 |  * compute the W coefficients (that is, powers of the root of 1)
 40 |  * and store them into an array.
 41 |  */
 42 | fibril static void compute_w_coefficients(int n, int a, int b, COMPLEX * W)
 43 | {
 44 |   /* Wide ranges: fork the lower half [a, mid], run [mid + 1, b] inline. */
 45 |   if (b - a >= 128) {
 46 |     const int mid = (a + b) / 2;
 47 |     fibril_t fr;
 48 | 
 49 |     fibril_init(&fr);
 50 |     fibril_fork(&fr, compute_w_coefficients, (n, a, mid, W));
 51 |     compute_w_coefficients(n, mid + 1, b, W);
 52 |     fibril_join(&fr);
 53 |     return;
 54 |   }
 55 | 
 56 |   /* Leaf: fill W[k] and its mirror W[n - k] for every k in [a, b]. */
 57 |   const double step = 2.0 * pi / n;
 58 |   int k;
 59 | 
 60 |   for (k = a; k <= b; ++k) {
 61 |     const REAL c = cos(step * k);
 62 |     const REAL s = sin(step * k);
 63 | 
 64 |     c_re(W[k]) = c_re(W[n - k]) = c;
 65 |     c_im(W[k]) = -s;
 66 |     c_im(W[n - k]) = s;
 67 |   }
 68 | }
 69 | 
 70 | /*
 71 |  * Pick the next radix for n (by brute force): 8 for a few hand-picked sizes,
 72 |  * otherwise the largest of 16, 8, 4, 2 dividing n, else its smallest odd factor.
 73 |  */
 74 | static int factor(int n)
 75 | {
 76 |   /* Hand-picked sizes that use radix 8 even though 16 divides them. */
 77 |   static const int radix8_sizes[] = { 64, 128, 256, 1024, 2048, 4096 };
 78 |   /* Power-of-two radices, tried from largest to smallest. */
 79 |   static const int radices[] = { 16, 8, 4, 2 };
 80 |   unsigned i;
 81 |   int r;
 82 | 
 83 |   if (n < 2) {
 84 |     return 1;
 85 |   }
 86 |   for (i = 0; i < sizeof radix8_sizes / sizeof radix8_sizes[0]; ++i) {
 87 |     if (n == radix8_sizes[i]) {
 88 |       return 8;
 89 |     }
 90 |   }
 91 | 
 92 |   for (i = 0; i < sizeof radices / sizeof radices[0]; ++i) {
 93 |     if ((n & (radices[i] - 1)) == 0) {
 94 |       return radices[i];
 95 |     }
 96 |   }
 97 | 
 98 |   /* n is odd here: trial-divide by the odd candidates below n */
 99 |   for (r = 3; r < n; r += 2) {
100 |     if (n % r == 0) {
101 |       return r;
102 |     }
103 |   }
104 | 
105 |   /* no divisor found: n is prime */
106 |   return n;
107 | }
108 | 
109 | /*
110 |  * Radix-r unshuffle of input groups [a, b): element j of group i moves
111 |  * from in[i * r + j] to out[i + j * m]. Ranges of 16 or more groups are
112 |  * halved, with the lower half forked.
113 |  */
114 | fibril static void unshuffle(int a, int b,
115 |     COMPLEX * in, COMPLEX * out, int r, int m)
116 | {
117 |   if (b - a >= 16) {
118 |     const int mid = (a + b) / 2;
119 |     fibril_t fr;
120 | 
121 |     fibril_init(&fr);
122 |     fibril_fork(&fr, unshuffle, (a, mid, in, out, r, m));
123 |     unshuffle(mid, b, in, out, r, m);
124 |     fibril_join(&fr);
125 |     return;
126 |   }
127 | 
128 |   const int r4 = r & (~0x3);   /* r with its two low bits cleared */
129 |   const COMPLEX * src = in + a * r;
130 |   int i, j;
131 | 
132 |   for (i = a; i < b; ++i) {
133 |     COMPLEX * dst = out + i;
134 | 
135 |     /* four elements per pass, then the remaining r - r4 one at a time */
136 |     for (j = 0; j < r4; j += 4, src += 4, dst += 4 * m) {
137 |       dst[0] = src[0];
138 |       dst[m] = src[1];
139 |       dst[2 * m] = src[2];
140 |       dst[3 * m] = src[3];
141 |     }
142 |     for (; j < r; ++j, ++src, dst += m) {
143 |       *dst = *src;
144 |     }
145 |   }
146 | }
147 | 
148 | /*
149 |  * Recursive complex FFT on the n complex components of the array in:
150 |  * basic Cooley-Tukey algorithm, with some improvements for
151 |  * n power of two. The result is placed in the array out. n is arbitrary.
152 |  * The algorithm runs in time O(n*(r1 + ... + rk)) where r1, ..., rk
153 |  * are prime numbers, and r1 * r2 * ... * rk = n.
154 |  *
155 |  * n: size of the input
156 |  * in: pointer to input
157 |  * out: pointer to output
158 |  * factors: list of factors of n, precomputed
159 |  * W: twiddle factors
160 |  * nW: size of W, that is, size of the original transform
161 |  *
162 |  */
163 | fibril static void fft_aux(int n, COMPLEX * in, COMPLEX * out, int *factors,
164 |     COMPLEX * W, int nW)
165 | {
166 |   /* dedicated kernels for the smallest power-of-two sizes */
167 |   switch (n) {
168 |     case 32:
169 |       fft_base_32(in, out);
170 |       return;
171 |     case 16:
172 |       fft_base_16(in, out);
173 |       return;
174 |     case 8:
175 |       fft_base_8(in, out);
176 |       return;
177 |     case 4:
178 |       fft_base_4(in, out);
179 |       return;
180 |     case 2:
181 |       fft_base_2(in, out);
182 |       return;
183 |     default:
184 |       break;
185 |   }
186 |   /* the cases n == 3, n == 5, and maybe 7 should be implemented as well */
187 | 
188 |   const int r = *factors;   /* radix used at this level */
189 |   const int m = n / r;      /* length of each sub-transform */
190 | 
191 |   if (r < n) {
192 |     /* split the DFT of length n into r DFTs of length m: unshuffle first */
193 |     switch (r) {
194 |       case 32: fft_unshuffle_32(0, m, in, out, m); break;
195 |       case 16: fft_unshuffle_16(0, m, in, out, m); break;
196 |       case 8:  fft_unshuffle_8(0, m, in, out, m);  break;
197 |       case 4:  fft_unshuffle_4(0, m, in, out, m);  break;
198 |       case 2:  fft_unshuffle_2(0, m, in, out, m);  break;
199 |       default: unshuffle(0, m, in, out, r, m);     break;
200 |     }
201 | 
202 |     /* then recurse on every length-m slice as its own forked task */
203 |     fibril_t fr;
204 |     int k;
205 | 
206 |     fibril_init(&fr);
207 |     for (k = 0; k < n; k += m) {
208 |       fibril_fork(&fr, fft_aux, (m, out + k, in + k, factors + 1, W, nW));
209 |     }
210 |     fibril_join(&fr);
211 |   }
212 | 
213 |   /* now multiply by the twiddle factors, and perform m FFTs of length r;
214 |    * radices without a dedicated routine use the generic one */
215 |   switch (r) {
216 |     case 2:
217 |       fft_twiddle_2(0, m, in, out, W, nW, nW / n, m);
218 |       break;
219 |     case 4:
220 |       fft_twiddle_4(0, m, in, out, W, nW, nW / n, m);
221 |       break;
222 |     case 8:
223 |       fft_twiddle_8(0, m, in, out, W, nW, nW / n, m);
224 |       break;
225 |     case 16:
226 |       fft_twiddle_16(0, m, in, out, W, nW, nW / n, m);
227 |       break;
228 |     case 32:
229 |       fft_twiddle_32(0, m, in, out, W, nW, nW / n, m);
230 |       break;
231 |     default:
232 |       fft_twiddle_gen(0, m, in, out, W, nW, nW / n, r, m);
233 |       break;
234 |   }
235 | }
236 | 
237 | /*
238 |  * user interface for fft_aux
239 |  */
240 | static void fft(int n, COMPLEX * in, COMPLEX * out)
241 | {
242 |   int factors[40];		/* allows FFTs up to at least 3^40 */
243 |   int count = 0;
244 |   int rest = n;
245 | 
246 |   /* fill the coefficient table W[0..n] for this transform length */
247 |   compute_w_coefficients(n, 0, n / 2, W);
248 | 
249 |   /*
250 |    * Peel factors off n one at a time, in the order factor() returns
251 |    * them, until nothing but 1 is left.
252 |    */
253 |   do {
254 |     const int r = factor(rest);
255 | 
256 |     factors[count++] = r;
257 |     rest /= r;
258 |   } while (rest > 1);
259 | 
260 |   fft_aux(n, in, out, factors, W, n);
261 | }
262 | 
263 | /****************************************************************
264 |  *                     END OF FFT ALGORITHM
265 |  ****************************************************************/
266 | 
267 | /*                            tests                             */
268 | 
269 | /* Reference transform: direct O(n^2) evaluation of the DFT sum; verify()
270 |  * uses it to check the output of fft(). */
271 | static void fft_alt(int n, COMPLEX * in, COMPLEX * out)
272 | {
273 |   int i, j;
274 | 
275 |   for (j = 0; j < n; ++j) {
276 |     COMPLEX sum, w;
277 | 
278 |     c_re(sum) = c_im(sum) = 0.0;
279 | 
280 |     for (i = 0; i < n; ++i) {
281 |       /* w = cos(theta) - i sin(theta), theta = 2 pi (i * j mod n) / n */
282 |       c_re(w) = cos((2.0 * pi * (i * j % n)) / n);
283 |       c_im(w) = -sin((2.0 * pi * (i * j % n)) / n);
284 |       c_re(sum) += c_re(in[i]) * c_re(w) - c_im(in[i]) * c_im(w);
285 |       c_im(sum) += c_im(in[i]) * c_re(w) + c_re(in[i]) * c_im(w);
286 |     }
287 |     out[j] = sum;
288 |   }
289 | }
290 | 
291 | /* Allocate in, out (size entries) and W (size + 1), exiting on failure; fill in via drand48(). */
292 | void init() {
293 |   int i;
294 |   size = (1 << n);
295 |   out = malloc(sizeof(COMPLEX [size]));
296 |   in  = malloc(sizeof(COMPLEX [size]));
297 |   W   = malloc(sizeof(COMPLEX [size + 1]));
298 |   if (out == NULL || in == NULL || W == NULL) { perror("malloc"); exit(1); }
299 |   for (i = 0; i < size; ++i) {
300 |     c_re(in[i]) = drand48();
301 |     c_im(in[i]) = drand48();
302 |   }
303 | }
304 | 
305 | /* Copy in into cp (allocated on first call, exiting on failure); test() transforms cp. */
306 | void prep()
307 | {
308 |   if (cp == NULL) cp = malloc(sizeof(COMPLEX [size]));
309 |   if (cp == NULL) { perror("malloc"); exit(1); }
310 |   memcpy(cp, in, sizeof(COMPLEX [size]));
311 | }
312 | 
313 | /* Transform the working copy cp (size points) into out. */
314 | void test() {
315 |   fft(size, cp, out);
316 | }
317 | 
318 | #ifdef BENCHMARK
319 | int verify(void) { return 0; }
320 | #else
321 | /* Compare out with fft_alt(in): returns 1 (with a message) if the largest
322 |  * error exceeds 1e-3 or the reference buffer cannot be allocated, else 0. */
323 | int verify(void)
324 | {
325 |   COMPLEX * expect = malloc(sizeof(COMPLEX [size]));
326 |   double error = 0.0;
327 |   int i;
328 | 
329 |   if (expect == NULL) { perror("malloc"); return 1; }
330 |   fft_alt(size, in, expect);
331 | 
332 |   for (i = 0; i < size; ++i) {
333 |     double a = sqrt(
334 |         (c_re(out[i]) - c_re(expect[i])) * (c_re(out[i]) - c_re(expect[i])) +
335 |         (c_im(out[i]) - c_im(expect[i])) * (c_im(out[i]) - c_im(expect[i])));
336 |     double d = sqrt(
337 |         c_re(expect[i]) * c_re(expect[i]) + c_im(expect[i]) * c_im(expect[i]));
338 | 
339 |     if (d < -1.0e-10 || d > 1.0e-10) a /= d;   /* relative unless |expect| ~ 0 */
340 |     if (a > error) error = a;
341 |   }
342 |   free(expect);   /* the reference buffer is only needed for the comparison */
343 |   if (error > 1e-3) {
344 |     printf("size=%d error=%e\n", size, error);
345 |     return 1;
346 |   }
347 |   return 0;
348 | }
349 | #endif
350 | 
351 | 


--------------------------------------------------------------------------------
/m4/acx_pthread.m4:
--------------------------------------------------------------------------------
  1 | dnl @synopsis ACX_PTHREAD([ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]])
  2 | dnl
  3 | dnl @summary figure out how to build C programs using POSIX threads
  4 | dnl
  5 | dnl This macro figures out how to build C programs using POSIX threads.
  6 | dnl It sets the PTHREAD_LIBS output variable to the threads library and
  7 | dnl linker flags, and the PTHREAD_CFLAGS output variable to any special
  8 | dnl C compiler flags that are needed. (The user can also force certain
  9 | dnl compiler flags/libs to be tested by setting these environment
 10 | dnl variables.)
 11 | dnl
 12 | dnl Also sets PTHREAD_CC to any special C compiler that is needed for
 13 | dnl multi-threaded programs (defaults to the value of CC otherwise).
 14 | dnl (This is necessary on AIX to use the special cc_r compiler alias.)
 15 | dnl
 16 | dnl NOTE: You are assumed to not only compile your program with these
 17 | dnl flags, but also link it with them as well. e.g. you should link
 18 | dnl with $PTHREAD_CC $CFLAGS $PTHREAD_CFLAGS $LDFLAGS ... $PTHREAD_LIBS
 19 | dnl $LIBS
 20 | dnl
 21 | dnl If you are only building threads programs, you may wish to use
 22 | dnl these variables in your default LIBS, CFLAGS, and CC:
 23 | dnl
 24 | dnl        LIBS="$PTHREAD_LIBS $LIBS"
 25 | dnl        CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
 26 | dnl        CC="$PTHREAD_CC"
 27 | dnl
 28 | dnl In addition, if the PTHREAD_CREATE_JOINABLE thread-attribute
 29 | dnl constant has a nonstandard name, defines PTHREAD_CREATE_JOINABLE to
 30 | dnl that name (e.g. PTHREAD_CREATE_UNDETACHED on AIX).
 31 | dnl
 32 | dnl ACTION-IF-FOUND is a list of shell commands to run if a threads
 33 | dnl library is found, and ACTION-IF-NOT-FOUND is a list of commands to
 34 | dnl run if it is not found. If ACTION-IF-FOUND is not specified, the
 35 | dnl default action will define HAVE_PTHREAD.
 36 | dnl
 37 | dnl Please let the authors know if this macro fails on any platform, or
 38 | dnl if you have any other suggestions or comments. This macro was based
 39 | dnl on work by SGJ on autoconf scripts for FFTW (www.fftw.org) (with
 40 | dnl help from M. Frigo), as well as ac_pthread and hb_pthread macros
 41 | dnl posted by Alejandro Forero Cuervo to the autoconf macro repository.
 42 | dnl We are also grateful for the helpful feedback of numerous users.
 43 | dnl
 44 | dnl @category InstalledPackages
 45 | dnl @author Steven G. Johnson <stevenj@alum.mit.edu>
 46 | dnl @version 2006-05-29
 47 | dnl @license GPLWithACException
 48 | 
 49 | AC_DEFUN([ACX_PTHREAD], [
 50 | AC_REQUIRE([AC_CANONICAL_HOST])
 51 | AC_LANG_SAVE
 52 | AC_LANG_C
 53 | acx_pthread_ok=no
 54 | 
 55 | # We used to check for pthread.h first, but this fails if pthread.h
 56 | # requires special compiler flags (e.g. on Tru64 or Sequent).
 57 | # It gets checked for in the link test anyway.
 58 | 
 59 | # First of all, check if the user has set any of the PTHREAD_LIBS,
 60 | # etcetera environment variables, and if threads linking works using
 61 | # them:
 62 | if test x"$PTHREAD_LIBS$PTHREAD_CFLAGS" != x; then
 63 |         save_CFLAGS="$CFLAGS"
 64 |         CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
 65 |         save_LIBS="$LIBS"
 66 |         LIBS="$PTHREAD_LIBS $LIBS"
 67 |         AC_MSG_CHECKING([for pthread_join in LIBS=$PTHREAD_LIBS with CFLAGS=$PTHREAD_CFLAGS])
 68 |         AC_TRY_LINK_FUNC(pthread_join, acx_pthread_ok=yes)
 69 |         AC_MSG_RESULT($acx_pthread_ok)
 70 |         if test x"$acx_pthread_ok" = xno; then
 71 |                 PTHREAD_LIBS=""
 72 |                 PTHREAD_CFLAGS=""
 73 |         fi
 74 |         LIBS="$save_LIBS"
 75 |         CFLAGS="$save_CFLAGS"
 76 | fi
 77 | 
 78 | # We must check for the threads library under a number of different
 79 | # names; the ordering is very important because some systems
 80 | # (e.g. DEC) have both -lpthread and -lpthreads, where one of the
 81 | # libraries is broken (non-POSIX).
 82 | 
 83 | # Create a list of thread flags to try.  Items starting with a "-" are
 84 | # C compiler flags, and other items are library names, except for "none"
 85 | # which indicates that we try without any flags at all, and "pthread-config"
 86 | # which is a program returning the flags for the Pth emulation library.
 87 | 
 88 | acx_pthread_flags="pthreads none -Kthread -kthread lthread -pthread -pthreads -mthreads pthread --thread-safe -mt pthread-config"
 89 | 
 90 | # The ordering *is* (sometimes) important.  Some notes on the
 91 | # individual items follow:
 92 | 
 93 | # pthreads: AIX (must check this before -lpthread)
 94 | # none: in case threads are in libc; should be tried before -Kthread and
 95 | #       other compiler flags to prevent continual compiler warnings
 96 | # -Kthread: Sequent (threads in libc, but -Kthread needed for pthread.h)
 97 | # -kthread: FreeBSD kernel threads (preferred to -pthread since SMP-able)
 98 | # lthread: LinuxThreads port on FreeBSD (also preferred to -pthread)
 99 | # -pthread: Linux/gcc (kernel threads), BSD/gcc (userland threads)
100 | # -pthreads: Solaris/gcc
101 | # -mthreads: Mingw32/gcc, Lynx/gcc
102 | # -mt: Sun Workshop C (may only link SunOS threads [-lthread], but it
103 | #      doesn't hurt to check since this sometimes defines pthreads too;
104 | #      also defines -D_REENTRANT)
105 | #      ... -mt is also the pthreads flag for HP/aCC
106 | # pthread: Linux, etcetera
107 | # --thread-safe: KAI C++
108 | # pthread-config: use pthread-config program (for GNU Pth library)
109 | 
110 | case "${host_cpu}-${host_os}" in
111 |         *solaris*)
112 | 
113 |         # On Solaris (at least, for some versions), libc contains stubbed
114 |         # (non-functional) versions of the pthreads routines, so link-based
115 |         # tests will erroneously succeed.  (We need to link with -pthreads/-mt/
116 |         # -lpthread.)  (The stubs are missing pthread_cleanup_push, or rather
117 |         # a function called by this macro, so we could check for that, but
118 |         # who knows whether they'll stub that too in a future libc.)  So,
119 |         # we'll just look for -pthreads and -lpthread first:
120 | 
121 |         acx_pthread_flags="-pthreads pthread -mt -pthread $acx_pthread_flags"
122 |         ;;
123 | esac
124 | 
125 | if test x"$acx_pthread_ok" = xno; then
126 | for flag in $acx_pthread_flags; do
127 | 
128 |         case $flag in
129 |                 none)
130 |                 AC_MSG_CHECKING([whether pthreads work without any flags])
131 |                 ;;
132 | 
133 |                 -*)
134 |                 AC_MSG_CHECKING([whether pthreads work with $flag])
135 |                 PTHREAD_CFLAGS="$flag"
136 |                 ;;
137 | 
138 | 		pthread-config)
139 | 		AC_CHECK_PROG(acx_pthread_config, pthread-config, yes, no)
140 | 		if test x"$acx_pthread_config" = xno; then continue; fi
141 | 		PTHREAD_CFLAGS="`pthread-config --cflags`"
142 | 		PTHREAD_LIBS="`pthread-config --ldflags` `pthread-config --libs`"
143 | 		;;
144 | 
145 |                 *)
146 |                 AC_MSG_CHECKING([for the pthreads library -l$flag])
147 |                 PTHREAD_LIBS="-l$flag"
148 |                 ;;
149 |         esac
150 | 
151 |         save_LIBS="$LIBS"
152 |         save_CFLAGS="$CFLAGS"
153 |         LIBS="$PTHREAD_LIBS $LIBS"
154 |         CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
155 | 
156 |         # Check for various functions.  We must include pthread.h,
157 |         # since some functions may be macros.  (On the Sequent, we
158 |         # need a special flag -Kthread to make this header compile.)
159 |         # We check for pthread_join because it is in -lpthread on IRIX
160 |         # while pthread_create is in libc.  We check for pthread_attr_init
161 |         # due to DEC craziness with -lpthreads.  We check for
162 |         # pthread_cleanup_push because it is one of the few pthread
163 |         # functions on Solaris that doesn't have a non-functional libc stub.
164 |         # We try pthread_create on general principles.
165 |         AC_TRY_LINK([#include <pthread.h>],
166 |                     [pthread_t th; pthread_join(th, 0);
167 |                      pthread_attr_init(0); pthread_cleanup_push(0, 0);
168 |                      pthread_create(0,0,0,0); pthread_cleanup_pop(0); ],
169 |                     [acx_pthread_ok=yes])
170 | 
171 |         LIBS="$save_LIBS"
172 |         CFLAGS="$save_CFLAGS"
173 | 
174 |         AC_MSG_RESULT($acx_pthread_ok)
175 |         if test "x$acx_pthread_ok" = xyes; then
176 |                 break;
177 |         fi
178 | 
179 |         PTHREAD_LIBS=""
180 |         PTHREAD_CFLAGS=""
181 | done
182 | fi
183 | 
184 | # Various other checks:
185 | if test "x$acx_pthread_ok" = xyes; then
186 |         save_LIBS="$LIBS"
187 |         LIBS="$PTHREAD_LIBS $LIBS"
188 |         save_CFLAGS="$CFLAGS"
189 |         CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
190 | 
191 |         # Detect AIX lossage: JOINABLE attribute is called UNDETACHED.
192 | 	AC_MSG_CHECKING([for joinable pthread attribute])
193 | 	attr_name=unknown
194 | 	for attr in PTHREAD_CREATE_JOINABLE PTHREAD_CREATE_UNDETACHED; do
195 | 	    AC_TRY_LINK([#include <pthread.h>], [int attr=$attr; return attr;],
196 |                         [attr_name=$attr; break])
197 | 	done
198 |         AC_MSG_RESULT($attr_name)
199 |         if test "$attr_name" != PTHREAD_CREATE_JOINABLE; then
200 |             AC_DEFINE_UNQUOTED(PTHREAD_CREATE_JOINABLE, $attr_name,
201 |                                [Define to necessary symbol if this constant
202 |                                 uses a non-standard name on your system.])
203 |         fi
204 | 
205 |         AC_MSG_CHECKING([if more special flags are required for pthreads])
206 |         flag=no
207 |         case "${host_cpu}-${host_os}" in
208 |             *-aix* | *-freebsd* | *-darwin*) flag="-D_THREAD_SAFE";;
209 |             *solaris* | *-osf* | *-hpux*) flag="-D_REENTRANT";;
210 |         esac
211 |         AC_MSG_RESULT(${flag})
212 |         if test "x$flag" != xno; then
213 |             PTHREAD_CFLAGS="$flag $PTHREAD_CFLAGS"
214 |         fi
215 | 
216 |         LIBS="$save_LIBS"
217 |         CFLAGS="$save_CFLAGS"
218 | 
219 |         # More AIX lossage: must compile with xlc_r or cc_r
220 | 	if test x"$GCC" != xyes; then
221 |           AC_CHECK_PROGS(PTHREAD_CC, xlc_r cc_r, ${CC})
222 |         else
223 |           PTHREAD_CC=$CC
224 | 	fi
225 | else
226 |         PTHREAD_CC="$CC"
227 | fi
228 | 
229 | AC_SUBST(PTHREAD_LIBS)
230 | AC_SUBST(PTHREAD_CFLAGS)
231 | AC_SUBST(PTHREAD_CC)
232 | 
233 | # Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND:
234 | if test x"$acx_pthread_ok" = xyes; then
235 |         ifelse([$1],,AC_DEFINE(HAVE_PTHREAD,1,[Define if you have POSIX threads libraries and header files.]),[$1])
236 |         :
237 | else
238 |         acx_pthread_ok=no
239 |         $2
240 | fi
241 | AC_LANG_RESTORE
242 | ])dnl ACX_PTHREAD
243 | 


--------------------------------------------------------------------------------
/test/rectmul.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Program to multiply two rectangular matrices A(n,m) * B(m,n), where
  3 |  * (n < m) and (n mod 16 = 0) and (m mod n = 0). (Otherwise fill with 0s
  4 |  * to fit the shape.)
  5 |  *
  6 |  * written by Harald Prokop (prokop@mit.edu) Fall 97.
  7 |  */
  8 | /*
  9 |  * Copyright (c) 2003 Massachusetts Institute of Technology
 10 |  *
 11 |  * This program is free software; you can redistribute it and/or modify
 12 |  * it under the terms of the GNU General Public License as published by
 13 |  * the Free Software Foundation; either version 2 of the License, or
 14 |  * (at your option) any later version.
 15 |  *
 16 |  * This program is distributed in the hope that it will be useful,
 17 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 18 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 19 |  * GNU General Public License for more details.
 20 |  *
 21 |  * You should have received a copy of the GNU General Public License
 22 |  * along with this program; if not, write to the Free Software
 23 |  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 24 |  *
 25 |  */
 26 | 
 27 | #include <stdlib.h>
 28 | #include <stdio.h>
 29 | #include "test.h"
 30 | 
 31 | #define BLOCK_EDGE 16	/* elements along one side of a square block */
 32 | #define BLOCK_SIZE (BLOCK_EDGE * BLOCK_EDGE)	/* elements in one block */
 33 | 
 34 | typedef double block[BLOCK_SIZE];	/* one block, stored as a flat array of doubles */
 35 | 
 36 | #ifndef BENCHMARK
 37 | int n = 512;	/* matrix edge length in elements; init() divides it by BLOCK_EDGE */
 38 | #else
 39 | int n = 4096;	/* larger edge length used when BENCHMARK is defined */
 40 | #endif
 41 | 
 42 | static block * A, * B, * R;	/* operands A and B and result R, each an array of blocks */
 43 | static int x, y, z;	/* dimensions in blocks: A is x-by-y, B is y-by-z, R is x-by-z */
 44 | 
 45 | /* Compute R = R + A*B, where R, A and B are BLOCK_EDGE x BLOCK_EDGE matrices
 46 |  * stored row by row with a stride of 16 doubles between rows. */
 47 | static void mult_add_block(block * A, block * B, block * R)
 48 | {
 49 |   int i, j;
 50 | 
 51 |   for (j = 0; j < 16; j += 2) {	/* 2 columns at a time */
 52 |     double *bp = &((double *) B)[j];	/* row 0 of B at columns j and j+1 */
 53 |     for (i = 0; i < 16; i += 2) {		/* 2 rows at a time */
 54 |       double *ap = &((double *) A)[i * 16];	/* row i of A; row i+1 starts at ap + 16 */
 55 |       double *rp = &((double *) R)[j + i * 16];	/* 2x2 tile of R: rows i, i+1 and columns j, j+1 */
 56 |       register double s0_0, s0_1;	/* running sums for row i */
 57 |       register double s1_0, s1_1;	/* running sums for row i+1 */
 58 |       s0_0 = rp[0];	/* start from the current tile so the product is added to R */
 59 |       s0_1 = rp[1];
 60 |       s1_0 = rp[16];
 61 |       s1_1 = rp[17];
 62 |       s0_0 += ap[0] * bp[0];	/* unrolled inner product: four updates per k, k = 0 .. 15 */
 63 |       s0_1 += ap[0] * bp[1];
 64 |       s1_0 += ap[16] * bp[0];
 65 |       s1_1 += ap[16] * bp[1];
 66 |       s0_0 += ap[1] * bp[16];
 67 |       s0_1 += ap[1] * bp[17];
 68 |       s1_0 += ap[17] * bp[16];
 69 |       s1_1 += ap[17] * bp[17];
 70 |       s0_0 += ap[2] * bp[32];
 71 |       s0_1 += ap[2] * bp[33];
 72 |       s1_0 += ap[18] * bp[32];
 73 |       s1_1 += ap[18] * bp[33];
 74 |       s0_0 += ap[3] * bp[48];
 75 |       s0_1 += ap[3] * bp[49];
 76 |       s1_0 += ap[19] * bp[48];
 77 |       s1_1 += ap[19] * bp[49];
 78 |       s0_0 += ap[4] * bp[64];
 79 |       s0_1 += ap[4] * bp[65];
 80 |       s1_0 += ap[20] * bp[64];
 81 |       s1_1 += ap[20] * bp[65];
 82 |       s0_0 += ap[5] * bp[80];
 83 |       s0_1 += ap[5] * bp[81];
 84 |       s1_0 += ap[21] * bp[80];
 85 |       s1_1 += ap[21] * bp[81];
 86 |       s0_0 += ap[6] * bp[96];
 87 |       s0_1 += ap[6] * bp[97];
 88 |       s1_0 += ap[22] * bp[96];
 89 |       s1_1 += ap[22] * bp[97];
 90 |       s0_0 += ap[7] * bp[112];
 91 |       s0_1 += ap[7] * bp[113];
 92 |       s1_0 += ap[23] * bp[112];
 93 |       s1_1 += ap[23] * bp[113];
 94 |       s0_0 += ap[8] * bp[128];
 95 |       s0_1 += ap[8] * bp[129];
 96 |       s1_0 += ap[24] * bp[128];
 97 |       s1_1 += ap[24] * bp[129];
 98 |       s0_0 += ap[9] * bp[144];
 99 |       s0_1 += ap[9] * bp[145];
100 |       s1_0 += ap[25] * bp[144];
101 |       s1_1 += ap[25] * bp[145];
102 |       s0_0 += ap[10] * bp[160];
103 |       s0_1 += ap[10] * bp[161];
104 |       s1_0 += ap[26] * bp[160];
105 |       s1_1 += ap[26] * bp[161];
106 |       s0_0 += ap[11] * bp[176];
107 |       s0_1 += ap[11] * bp[177];
108 |       s1_0 += ap[27] * bp[176];
109 |       s1_1 += ap[27] * bp[177];
110 |       s0_0 += ap[12] * bp[192];
111 |       s0_1 += ap[12] * bp[193];
112 |       s1_0 += ap[28] * bp[192];
113 |       s1_1 += ap[28] * bp[193];
114 |       s0_0 += ap[13] * bp[208];
115 |       s0_1 += ap[13] * bp[209];
116 |       s1_0 += ap[29] * bp[208];
117 |       s1_1 += ap[29] * bp[209];
118 |       s0_0 += ap[14] * bp[224];
119 |       s0_1 += ap[14] * bp[225];
120 |       s1_0 += ap[30] * bp[224];
121 |       s1_1 += ap[30] * bp[225];
122 |       s0_0 += ap[15] * bp[240];
123 |       s0_1 += ap[15] * bp[241];
124 |       s1_0 += ap[31] * bp[240];
125 |       s1_1 += ap[31] * bp[241];
126 |       rp[0] = s0_0;	/* store the finished 2x2 tile */
127 |       rp[1] = s0_1;
128 |       rp[16] = s1_0;
129 |       rp[17] = s1_1;
130 |     }
131 |   }
132 | }
133 | 
134 | 
135 | /* Compute R = A*B, where R, A and B are BLOCK_EDGE x BLOCK_EDGE matrices
136 |  * stored row by row with a stride of 16 doubles; R's old contents are ignored. */
137 | static void multiply_block(block * A, block * B, block * R)
138 | {
139 |   int i, j;
140 | 
141 |   for (j = 0; j < 16; j += 2) {	/* 2 columns at a time */
142 |     double *bp = &((double *) B)[j];	/* row 0 of B at columns j and j+1 */
143 |     for (i = 0; i < 16; i += 2) {		/* 2 rows at a time */
144 |       double *ap = &((double *) A)[i * 16];	/* row i of A; row i+1 starts at ap + 16 */
145 |       double *rp = &((double *) R)[j + i * 16];	/* 2x2 tile of R: rows i, i+1 and columns j, j+1 */
146 |       register double s0_0, s0_1;	/* running sums for row i */
147 |       register double s1_0, s1_1;	/* running sums for row i+1 */
148 |       s0_0 = ap[0] * bp[0];	/* the k = 0 products initialize the sums (no read of R) */
149 |       s0_1 = ap[0] * bp[1];
150 |       s1_0 = ap[16] * bp[0];
151 |       s1_1 = ap[16] * bp[1];
152 |       s0_0 += ap[1] * bp[16];	/* unrolled remainder of the inner product, k = 1 .. 15 */
153 |       s0_1 += ap[1] * bp[17];
154 |       s1_0 += ap[17] * bp[16];
155 |       s1_1 += ap[17] * bp[17];
156 |       s0_0 += ap[2] * bp[32];
157 |       s0_1 += ap[2] * bp[33];
158 |       s1_0 += ap[18] * bp[32];
159 |       s1_1 += ap[18] * bp[33];
160 |       s0_0 += ap[3] * bp[48];
161 |       s0_1 += ap[3] * bp[49];
162 |       s1_0 += ap[19] * bp[48];
163 |       s1_1 += ap[19] * bp[49];
164 |       s0_0 += ap[4] * bp[64];
165 |       s0_1 += ap[4] * bp[65];
166 |       s1_0 += ap[20] * bp[64];
167 |       s1_1 += ap[20] * bp[65];
168 |       s0_0 += ap[5] * bp[80];
169 |       s0_1 += ap[5] * bp[81];
170 |       s1_0 += ap[21] * bp[80];
171 |       s1_1 += ap[21] * bp[81];
172 |       s0_0 += ap[6] * bp[96];
173 |       s0_1 += ap[6] * bp[97];
174 |       s1_0 += ap[22] * bp[96];
175 |       s1_1 += ap[22] * bp[97];
176 |       s0_0 += ap[7] * bp[112];
177 |       s0_1 += ap[7] * bp[113];
178 |       s1_0 += ap[23] * bp[112];
179 |       s1_1 += ap[23] * bp[113];
180 |       s0_0 += ap[8] * bp[128];
181 |       s0_1 += ap[8] * bp[129];
182 |       s1_0 += ap[24] * bp[128];
183 |       s1_1 += ap[24] * bp[129];
184 |       s0_0 += ap[9] * bp[144];
185 |       s0_1 += ap[9] * bp[145];
186 |       s1_0 += ap[25] * bp[144];
187 |       s1_1 += ap[25] * bp[145];
188 |       s0_0 += ap[10] * bp[160];
189 |       s0_1 += ap[10] * bp[161];
190 |       s1_0 += ap[26] * bp[160];
191 |       s1_1 += ap[26] * bp[161];
192 |       s0_0 += ap[11] * bp[176];
193 |       s0_1 += ap[11] * bp[177];
194 |       s1_0 += ap[27] * bp[176];
195 |       s1_1 += ap[27] * bp[177];
196 |       s0_0 += ap[12] * bp[192];
197 |       s0_1 += ap[12] * bp[193];
198 |       s1_0 += ap[28] * bp[192];
199 |       s1_1 += ap[28] * bp[193];
200 |       s0_0 += ap[13] * bp[208];
201 |       s0_1 += ap[13] * bp[209];
202 |       s1_0 += ap[29] * bp[208];
203 |       s1_1 += ap[29] * bp[209];
204 |       s0_0 += ap[14] * bp[224];
205 |       s0_1 += ap[14] * bp[225];
206 |       s1_0 += ap[30] * bp[224];
207 |       s1_1 += ap[30] * bp[225];
208 |       s0_0 += ap[15] * bp[240];
209 |       s0_1 += ap[15] * bp[241];
210 |       s1_0 += ap[31] * bp[240];
211 |       s1_1 += ap[31] * bp[241];
212 |       rp[0] = s0_0;	/* store the finished 2x2 tile */
213 |       rp[1] = s0_1;
214 |       rp[16] = s1_0;
215 |       rp[17] = s1_1;
216 |     }
217 |   }
218 | }
219 | 
220 | 
221 | /*
222 |  * Compare every element of an x-by-y region of blocks in R (row stride o,
223 |  * counted in blocks) against v.  Returns how many blocks hold at least one
224 |  * differing element, so 0 means the whole region matches.
225 |  */
226 | int check_matrix(block * R, long x, long y, long o, double v)
227 | {
228 |   if (x * y != 1) {
229 |     /* Halve the longer dimension and total the results of both halves. */
230 |     if (x > y) {
231 |       long upper = x / 2;
232 |       int first = check_matrix(R, upper, y, o, v);
233 |       return first + check_matrix(R + upper * o, (x + 1) / 2, y, o, v);
234 |     } else {
235 |       long left = y / 2;
236 |       int first = check_matrix(R, x, left, o, v);
237 |       return first + check_matrix(R + left, x, (y + 1) / 2, o, v);
238 |     }
239 |   }
240 |   /* Single block: report 1 as soon as an element differs from v. */
241 |   const double *cur = (const double *) R;
242 |   const double *end = cur + BLOCK_SIZE;
243 |   while (cur != end)
244 |     if (*cur++ != v)
245 |       return 1;
246 |   return 0;
247 | }
248 | 
249 | /*
250 |  * Add the x-by-y region of blocks at T (row stride ot) into the matching
251 |  * region at R (row stride oR), halving the longer dimension and forking one
252 |  * half at each level until a single block remains.
253 |  */
254 | fibril void add_matrix(block * T, long ot, block * R, long oR, long x, long y)
255 | {
256 |   if (x + y == 2) {
257 |     const double *src = (const double *) T;
258 |     double *dst = (double *) R;
259 |     long k;
260 |     for (k = 0; k < BLOCK_SIZE; k++)	/* one block left: R += T */
261 |       dst[k] += src[k];
262 |     return;
263 |   }
264 | 
265 |   fibril_t fr;
266 |   fibril_init(&fr);
267 | 
268 |   if (x <= y) {	/* split the columns */
269 |     fibril_fork(&fr, add_matrix, (T, ot, R, oR, x, y/2));
270 |     add_matrix(T+(y/2), ot, R+(y/2), oR, x, (y+1)/2);
271 |   } else {	/* split the rows */
272 |     fibril_fork(&fr, add_matrix, (T, ot, R, oR, x/2, y));
273 |     add_matrix(T+(x/2)*ot, ot, R+(x/2)*oR, oR, (x+1)/2, y);
274 |   }
275 | 
276 |   fibril_join(&fr);
277 | }
278 | 
279 | /* Set every element in the x-by-y region of blocks at R (row stride o) to v. */
280 | void init_matrix(block * R, long x, long y, long o, double v)
281 | {
282 |   if (x + y != 2) {	/* more than one block: recurse on both halves */
283 |     if (x > y) {
284 |       init_matrix(R, x/2, y, o, v);
285 |       init_matrix(R+(x/2) * o, (x+1)/2, y, o, v);
286 |     } else {
287 |       init_matrix(R, x, y/2, o, v);
288 |       init_matrix(R+(y/2), x, (y+1)/2, o, v);
289 |     }
290 |     return;
291 |   }
292 |   double *cell = (double *) R, *stop = cell + BLOCK_SIZE;
293 |   while (cell != stop)
294 |     *cell++ = v;
295 | }
296 | 
297 | /*
298 |  * Compute R = A*B (add == 0) or R += A*B (add != 0) on matrices of blocks:
299 |  * A is x-by-y with row stride oa, B is y-by-z with row stride ob and R is
300 |  * x-by-z with row stride oR.  The largest dimension is halved recursively,
301 |  * with one half forked, until a single block of each operand remains.
302 |  * Aborts the process if the temporary needed for an inner-dimension split
303 |  * cannot be allocated.
304 |  */
305 | fibril static void multiply_matrix(block * A, long oa, block * B, long ob,
306 |     long x, long y, long z, block * R, long oR, int add)
307 | {
308 |   if (x + y + z == 3) {
309 |     /* ISO C does not allow `return expr;` in a void function, so the
310 |      * kernel is called first and the return follows. */
311 |     if (add)
312 |       mult_add_block(A, B, R);
313 |     else
314 |       multiply_block(A, B, R);
315 |     return;
316 |   }
317 | 
318 |   fibril_t fr;
319 |   fibril_init(&fr);
320 | 
321 |   if (x >= y && x >= z) {
322 |     /* Split the rows of A; each half produces its own rows of R. */
323 |     fibril_fork(&fr, multiply_matrix, (A, oa, B, ob, x/2, y, z, R, oR, add));
324 |     multiply_matrix(A+(x/2)*oa, oa, B, ob, (x+1)/2, y, z, R+(x/2)*oR, oR, add);
325 |     fibril_join(&fr);
326 |   } else if (y > x && y > z) {
327 |     /* Split the inner dimension.  Both halves contribute to every block of
328 |      * R, so the first half goes to a temporary that is added in afterwards. */
329 |     fibril_fork(&fr, multiply_matrix,
330 |         (A+(y/2), oa, B+(y/2)*ob, ob, x, (y+1)/2, z, R, oR, add));
331 | 
332 |     block * tmp = malloc(x * z * sizeof(block));
333 |     if (tmp == NULL) {
334 |       fprintf(stderr, "multiply_matrix: out of memory\n");
335 |       abort();
336 |     }
337 |     multiply_matrix(A, oa, B, ob, x, y/2, z, tmp, z, 0);
338 |     fibril_join(&fr);
339 | 
340 |     add_matrix(tmp, z, R, oR, x, z);
341 |     free(tmp);
342 |   } else {
343 |     /* Split the columns of B; each half produces its own columns of R. */
344 |     fibril_fork(&fr, multiply_matrix, (A, oa, B, ob, x, y, z/2, R, oR, add));
345 |     multiply_matrix(A, oa, B+(z/2), ob, x, y, (z+1)/2, R+(z/2), oR, add);
346 |     fibril_join(&fr);
347 |   }
348 | }
330 | 
331 | /*
332 |  * Derive the block dimensions from n, allocate A, B and R, and fill A and B
333 |  * with 1.0 (R is left uninitialized).  Exits with status 1 if n is smaller
334 |  * than one block edge or an allocation fails.
335 |  */
336 | void init() {
337 |   /* With zero blocks per side the recursive routines, whose base case is a
338 |    * single block, would recurse without terminating. */
339 |   if (n < BLOCK_EDGE) {
340 |     fprintf(stderr, "rectmul: n must be at least %d\n", BLOCK_EDGE);
341 |     exit(1);
342 |   }
343 | 
344 |   x = n / BLOCK_EDGE;
345 |   y = n / BLOCK_EDGE;
346 |   z = n / BLOCK_EDGE;
347 | 
348 |   /* Widen before multiplying so the byte counts are computed in size_t. */
349 |   A = malloc((size_t) x * (size_t) y * sizeof(block));
350 |   B = malloc((size_t) y * (size_t) z * sizeof(block));
351 |   R = malloc((size_t) x * (size_t) z * sizeof(block));
352 |   if (A == NULL || B == NULL || R == NULL) {
353 |     fprintf(stderr, "rectmul: out of memory\n");
354 |     exit(1);
355 |   }
356 | 
357 |   init_matrix(A, x, y, y, 1.0);
358 |   init_matrix(B, y, z, z, 1.0);
359 | }
343 | 
344 | void prep() {	/* set every element of the result matrix R to 0.0 */
345 |   init_matrix(R, x, z, z, 0.0);
346 | }
347 | 
348 | void test() {	/* compute R = A * B; the add flag is 0, so R's old contents are not added in */
349 |   multiply_matrix(A, y, B, z, x, y, z, R, z, 0);
350 | }
351 | 
352 | int verify() {	/* returns 1 if R is not the product of the all-ones A and B, else 0 */
353 |   int failed = 0;
354 | #ifndef BENCHMARK
355 |   if (check_matrix(R, x, z, z, y * BLOCK_EDGE) != 0) {
356 |     printf("WRONG RESULT!\n");
357 |     failed = 1;
358 |   }
359 | #endif
360 |   return failed;
361 | }
362 | 


--------------------------------------------------------------------------------
/test/lu.c:
--------------------------------------------------------------------------------
  1 | /****************************************************************************\
  2 |  * LU decomposition
  3 |  * Robert Blumofe
  4 |  *
  5 |  * Copyright (c) 1996, Robert Blumofe.  All rights reserved.
  6 |  * This program is free software; you can redistribute it and/or modify
  7 |  * it under the terms of the GNU General Public License as published by
  8 |  * the Free Software Foundation; either version 2 of the License, or
  9 |  * (at your option) any later version.
 10 |  *
 11 |  * This program is distributed in the hope that it will be useful,
 12 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 13 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 14 |  * GNU General Public License for more details.
 15 |  *
 16 |  * You should have received a copy of the GNU General Public License
 17 |  * along with this program; if not, write to the Free Software
 18 |  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 19 |  *
 20 | \****************************************************************************/
 21 | 
 22 | #include <math.h>
 23 | #include <stdio.h>
 24 | #include <stdlib.h>
 25 | #include <string.h>
 26 | #include "test.h"
 27 | 
 28 | /* Edge length of a square block, in elements; may be overridden at compile time. */
 29 | #ifndef BLOCK_SIZE
 30 | #define BLOCK_SIZE 16
 31 | #endif
 32 | 
 33 | /* Default matrix edge length in elements: 16 blocks per side, or 4096 when BENCHMARK is defined. */
 34 | #ifndef DEFAULT_SIZE
 35 | #ifndef BENCHMARK
 36 | #define DEFAULT_SIZE (16 * BLOCK_SIZE)
 37 | #else
 38 | #define DEFAULT_SIZE 4096
 39 | #endif
 40 | #endif
 41 | 
 42 | /* A block is a 2D array of doubles. */
 43 | typedef double Block[BLOCK_SIZE][BLOCK_SIZE];
 44 | #define BLOCK(B,I,J) (B[I][J])	/* element (I,J) of block B */
 45 | 
 46 | /* A matrix is a 1D array of blocks. */
 47 | typedef Block * Matrix;
 48 | #define MATRIX(M,I,J) ((M)[(I)*nBlocks+(J)])	/* block (I,J) of M; the row stride is always the global nBlocks */
 49 | 
 50 | /** Matrix size (edge length in elements). */
 51 | int n = DEFAULT_SIZE;
 52 | 
 53 | /** The global matrix and a copy of the matrix. */
 54 | static Matrix M, Msave;
 55 | 
 56 | /* Matrix size in blocks; init() sets it to n / BLOCK_SIZE. */
 57 | static int nBlocks;
 58 | 
 59 | /****************************************************************************\
 60 |  * Utility routines.
 61 |  \****************************************************************************/
 62 | 
 63 | /*
 64 |  * init_matrix - Fill M (nb x nb blocks) with seeded pseudo-random values
 65 |  * in [0, 1], then scale every diagonal element by 10.
 66 |  */
 67 | static void init_matrix(Matrix M, int nb)
 68 | {
 69 |   int bi, bj, r, c;
 70 |   srand(1);	/* fixed seed: the same sequence on every run */
 71 | 
 72 |   for (bi = 0; bi < nb; bi++)
 73 |     for (bj = 0; bj < nb; bj++) {
 74 |       double (*blk)[BLOCK_SIZE] = MATRIX(M, bi, bj);
 75 |       for (r = 0; r < BLOCK_SIZE; r++)
 76 |         for (c = 0; c < BLOCK_SIZE; c++)
 77 |           blk[r][c] = (double) rand() / (double) RAND_MAX;
 78 |     }
 79 | 
 80 |   for (bi = 0; bi < nb; bi++) {
 81 |     double (*diag)[BLOCK_SIZE] = MATRIX(M, bi, bi);
 82 |     for (r = 0; r < BLOCK_SIZE; r++)
 83 |       diag[r][r] *= 10.0;
 84 |   }
 85 | }
 85 | 
 86 | /*
 87 |  * print_matrix - Print M (nb x nb blocks) to stdout, one matrix row per line.
 88 |  */
 89 | static void print_matrix(Matrix M, int nb)
 90 | {
 91 |   const int size = nb * BLOCK_SIZE;
 92 |   int row, col;
 93 | 
 94 |   for (row = 0; row < size; row++) {
 95 |     for (col = 0; col < size; col++) {
 96 |       double (*blk)[BLOCK_SIZE] = MATRIX(M, row / BLOCK_SIZE, col / BLOCK_SIZE);
 97 |       printf(" %6.4f", blk[row % BLOCK_SIZE][col % BLOCK_SIZE]);
 98 |     }
 99 |     printf("\n");
100 |   }
101 | }
102 | 
103 | /*
104 |  * test_result - Check that LU holds an LU decomposition of M, where L is
105 |  * the part of LU below the diagonal with an implied unit diagonal and U is
106 |  * the rest.  Returns 1 if some element of L*U differs from M by more than
107 |  * 0.00001, otherwise 0.
108 |  */
109 | static int test_result(Matrix LU, Matrix M, int nb)
110 | {
111 |   const int size = nb * BLOCK_SIZE;
112 |   double worst = 0.0;
113 |   int row, col, k;
114 | 
115 |   /* Track the largest |M - L*U| over all elements. */
116 |   for (row = 0; row < size; row++) {
117 |     const int brow = row / BLOCK_SIZE, r = row % BLOCK_SIZE;
118 | 
119 |     for (col = 0; col < size; col++) {
120 |       const int bcol = col / BLOCK_SIZE, c = col % BLOCK_SIZE;
121 |       double sum = 0.0;
122 |       double err;
123 | 
124 |       /* Stored entries of L: k below row, limited to k <= col. */
125 |       for (k = 0; k < row && k <= col; k++) {
126 |         const int bk = k / BLOCK_SIZE, kk = k % BLOCK_SIZE;
127 |         sum += BLOCK(MATRIX(LU, brow, bk), r, kk) *
128 |           BLOCK(MATRIX(LU, bk, bcol), kk, c);
129 |       }
130 |       /* The implied L[row][row] == 1 adds U[row][col] when row <= col. */
131 |       if (row <= col)
132 |         sum += BLOCK(MATRIX(LU, brow, bcol), r, c);
133 | 
134 |       err = fabs(BLOCK(MATRIX(M, brow, bcol), r, c) - sum);
135 |       if (err > worst)
136 |         worst = err;
137 |     }
138 |   }
139 |   /* Check maximum difference against threshold. */
140 |   return (worst > 0.00001);
141 | }
142 | 
143 | /****************************************************************************\
144 |  * Element operations.
145 |  \****************************************************************************/
146 | /*
147 |  * elem_daxmy - Update y[i] -= a * x[i] for i in [0, n), walking from the
148 |  * last element down to the first.  Does nothing when n <= 0.
149 |  */
150 | static void elem_daxmy(double a, double *x, double *y, int n)
151 | {
152 |   while (n-- > 0) y[n] -= a * x[n];
153 | }
154 | 
155 | /****************************************************************************\
156 |  * Block operations.
157 |  \****************************************************************************/
158 | 
159 | /*
160 |  * block_lu - Factor block B in place, without row exchanges: multipliers
161 |  * are stored below the diagonal and the reduced rows on and above it.
162 |  */
163 | static void block_lu(Block B)
164 | {
165 |   int pivot, row;
166 | 
167 |   for (pivot = 0; pivot < BLOCK_SIZE; pivot++)
168 |     for (row = pivot + 1; row < BLOCK_SIZE; row++) {
169 |       B[row][pivot] /= B[pivot][pivot];	/* multiplier for this row */
170 |       elem_daxmy(B[row][pivot], &B[pivot][pivot + 1],
171 |           &B[row][pivot + 1], BLOCK_SIZE - pivot - 1);
172 |     }
173 | }
174 | 
175 | /*
176 |  * block_lower_solve - Forward substitution in place: replace B by the B'
177 |  * that satisfies LB' = B, reading only the entries of L below its diagonal.
178 |  */
179 | static void block_lower_solve(Block B, Block L)
180 | {
181 |   int row, prev;
182 | 
183 |   for (row = 1; row < BLOCK_SIZE; row++) {
184 |     double *target = B[row];
185 |     for (prev = 0; prev < row; prev++)
186 |       elem_daxmy(L[row][prev], B[prev], target, BLOCK_SIZE);
187 |   }
188 | }
189 | 
190 | /*
191 |  * block_upper_solve - Substitution in place, row by row: replace B by the
192 |  * B' that satisfies B'U = B, reading the diagonal and upper part of U.
193 |  */
194 | static void block_upper_solve(Block B, Block U)
195 | {
196 |   int row, col;
197 | 
198 |   for (row = 0; row < BLOCK_SIZE; row++) {
199 |     double *b = B[row];
200 |     for (col = 0; col < BLOCK_SIZE; col++) {
201 |       b[col] /= U[col][col];
202 |       elem_daxmy(b[col], &U[col][col + 1], &b[col + 1], BLOCK_SIZE - col - 1);
203 |     }
204 |   }
205 | }
206 | 
207 | /*
208 |  * block_schur - Compute the Schur complement B' = B - AC in place,
209 |  * subtracting one scaled row of C from a row of B at a time.
210 |  */
211 | static void block_schur(Block B, Block A, Block C)
212 | {
213 |   int row, inner;
214 | 
215 |   for (row = 0; row < BLOCK_SIZE; row++)
216 |     for (inner = 0; inner < BLOCK_SIZE; inner++)
217 |       elem_daxmy(A[row][inner], C[inner], B[row], BLOCK_SIZE);
218 | }
220 | 
221 | 
222 | /****************************************************************************\
223 |  * Divide-and-conquer matrix LU decomposition.
224 |  \****************************************************************************/
225 | 
226 | /**
227 |  * schur - Compute M' = M - VW on nb x nb matrices of blocks, recursing on
228 |  * quadrants.  Quadrant addresses use the global row stride nBlocks.
229 |  */
230 | fibril static void schur(Matrix M, Matrix V, Matrix W, int nb)
231 | {
232 |   /* Base case: a single block. */
233 |   if (nb == 1) {
234 |     block_schur(*M, *V, *W);
235 |     return;
236 |   }
237 | 
238 |   /* Quadrant Xrc lies r half-heights down and c half-widths right of X. */
239 |   int hnb = nb / 2;
240 |   int down = hnb * nBlocks;
241 |   Matrix M00 = M, M01 = M + hnb, M10 = M + down, M11 = M + down + hnb;
242 |   Matrix V00 = V, V01 = V + hnb, V10 = V + down, V11 = V + down + hnb;
243 |   Matrix W00 = W, W01 = W + hnb, W10 = W + down, W11 = W + down + hnb;
244 | 
245 |   fibril_t fr;
246 |   fibril_init(&fr);
247 | 
248 |   /* First pass: products of V's left column with W's top row. */
249 |   fibril_fork(&fr, schur, (M00, V00, W00, hnb));
250 |   fibril_fork(&fr, schur, (M01, V00, W01, hnb));
251 |   fibril_fork(&fr, schur, (M10, V10, W00, hnb));
252 |   schur(M11, V10, W01, hnb);
253 |   fibril_join(&fr);
254 | 
255 |   /* Second pass, after the join because it updates the same quadrants of
256 |    * M: products of V's right column with W's bottom row. */
257 |   fibril_fork(&fr, schur, (M00, V01, W10, hnb));
258 |   fibril_fork(&fr, schur, (M01, V01, W11, hnb));
259 |   fibril_fork(&fr, schur, (M10, V11, W10, hnb));
260 |   schur(M11, V11, W11, hnb);
261 |   fibril_join(&fr);
262 | }
275 | 
276 | /*
277 |  * lower_solve - Compute M' where LM' = M.
278 |  */
279 | fibril static void lower_solve(Matrix M, Matrix L, int nb);
280 | 
281 | /*
282 |  * aux_lower_solve - Handle one column of the 2x2 split made by lower_solve:
283 |  * Ma is the upper and Mb the lower quadrant (nb x nb blocks each), and L is
284 |  * the factor covering 2*nb x 2*nb blocks.
285 |  */
286 | static void aux_lower_solve(Matrix Ma, Matrix Mb, Matrix L, int nb)
287 | {
288 |   Matrix L10 = L + nb * nBlocks;	/* lower-left quadrant of L */
289 |   Matrix L11 = L10 + nb;	/* lower-right quadrant of L */
290 | 
291 |   lower_solve(Ma, L, nb);	/* upper-left quadrant of L starts at L itself */
292 |   schur(Mb, L10, Ma, nb);	/* Mb -= L10 * Ma */
293 |   lower_solve(Mb, L11, nb);
294 | }
296 | 
297 | /*
298 |  * lower_solve - Replace the nb x nb block matrix M by the M' that satisfies
299 |  * LM' = M, handling the left and right column halves of M in parallel.
300 |  */
301 | fibril static void lower_solve(Matrix M, Matrix L, int nb)
302 | {
303 |   /* Base case: a single block. */
304 |   if (nb == 1) {
305 |     block_lower_solve(*M, *L);
306 |     return;
307 |   }
308 | 
309 |   /* Quadrants of M, addressed with the global row stride nBlocks. */
310 |   int hnb = nb / 2;
311 |   Matrix M00 = M, M01 = M + hnb;
312 |   Matrix M10 = M + hnb * nBlocks, M11 = M10 + hnb;
313 | 
314 |   fibril_t fr;
315 |   fibril_init(&fr);
316 | 
317 |   /* The two calls write disjoint quadrants of M and only read L. */
318 |   fibril_fork(&fr, aux_lower_solve, (M00, M10, L, hnb));
319 |   aux_lower_solve(M01, M11, L, hnb);
320 | 
321 |   fibril_join(&fr);
322 | }
326 | 
327 | /*
328 |  * upper_solve - Compute M' where M'U = M.
329 |  */
330 | fibril static void upper_solve(Matrix M, Matrix U, int nb);
331 | 
332 | /*
333 |  * aux_upper_solve - Handle one row of the 2x2 split made by upper_solve:
334 |  * Ma is the left and Mb the right quadrant (nb x nb blocks each), and U is
335 |  * the factor covering 2*nb x 2*nb blocks.
336 |  */
337 | static void aux_upper_solve(Matrix Ma, Matrix Mb, Matrix U, int nb)
338 | {
339 |   Matrix U01 = U + nb;	/* upper-right quadrant of U */
340 |   Matrix U11 = U + nb * nBlocks + nb;	/* lower-right quadrant of U */
341 | 
342 |   upper_solve(Ma, U, nb);	/* upper-left quadrant of U starts at U itself */
343 |   schur(Mb, Ma, U01, nb);	/* Mb -= Ma * U01 */
344 |   upper_solve(Mb, U11, nb);
345 | }
349 | 
350 | /*
351 |  * upper_solve - Replace the nb x nb block matrix M by the M' that satisfies
352 |  * M'U = M, handling the upper and lower row halves of M in parallel.
353 |  */
354 | fibril static void upper_solve(Matrix M, Matrix U, int nb)
355 | {
356 |   /* Base case: a single block. */
357 |   if (nb == 1) {
358 |     block_upper_solve(*M, *U);
359 |     return;
360 |   }
361 | 
362 |   /* Quadrants of M, addressed with the global row stride nBlocks. */
363 |   int hnb = nb / 2;
364 |   Matrix M00 = M, M01 = M + hnb;
365 |   Matrix M10 = M + hnb * nBlocks, M11 = M10 + hnb;
366 | 
367 |   fibril_t fr;
368 |   fibril_init(&fr);
369 | 
370 |   /* The two calls write disjoint quadrants of M and only read U. */
371 |   fibril_fork(&fr, aux_upper_solve, (M00, M01, U, hnb));
372 |   aux_upper_solve(M10, M11, U, hnb);
373 | 
374 |   fibril_join(&fr);
375 | }
379 | 
380 | /*
381 |  * lu - Perform LU decomposition of the nb x nb block matrix M in place,
382 |  * recursing on quadrants addressed with the global row stride nBlocks.
383 |  */
384 | fibril void lu(Matrix M, int nb)
385 | {
386 |   /* Base case: a single block. */
387 |   if (nb == 1) {
388 |     block_lu(*M);
389 |     return;
390 |   }
391 | 
392 |   int hnb = nb / 2;
393 |   Matrix M00 = M, M01 = M + hnb;
394 |   Matrix M10 = M + hnb * nBlocks, M11 = M10 + hnb;
395 | 
396 |   /* Decompose upper left. */
397 |   lu(M00, hnb);
398 | 
399 |   /* Solve for upper right and lower left in parallel; both read the
400 |    * factored M00 and write different quadrants. */
401 |   fibril_t fr;
402 |   fibril_init(&fr);
403 | 
404 |   fibril_fork(&fr, lower_solve, (M01, M00, hnb));
405 |   upper_solve(M10, M00, hnb);
406 | 
407 |   fibril_join(&fr);
408 | 
409 |   /* Compute Schur complement of lower right, then decompose it. */
410 |   schur(M11, M10, M01, hnb);
411 |   lu(M11, hnb);
412 | }
421 | 
422 | /*
423 |  * init - Allocate the global matrix M, fill it through init_matrix and,
424 |  * unless BENCHMARK is defined, keep a copy in Msave.  Exits with status 1
425 |  * if n is smaller than BLOCK_SIZE or an allocation fails.
426 |  */
427 | void init()
428 | {
429 |   size_t bytes;
430 | 
431 |   /* With zero blocks per side lu() never reaches its nb == 1 base case. */
432 |   if (n < BLOCK_SIZE) {
433 |     fprintf(stderr, "lu: n must be at least %d\n", (int) BLOCK_SIZE);
434 |     exit(1);
435 |   }
436 | 
437 |   nBlocks = n / BLOCK_SIZE;
438 |   /* Widen before multiplying: n * n can overflow int. */
439 |   bytes = (size_t) n * (size_t) n * sizeof(double);
440 | 
441 |   M = malloc(bytes);
442 |   if (M == NULL) {
443 |     fprintf(stderr, "lu: out of memory\n");
444 |     exit(1);
445 |   }
446 |   init_matrix(M, nBlocks);
447 | #ifndef BENCHMARK
448 |   Msave = malloc(bytes);
449 |   if (Msave == NULL) {
450 |     fprintf(stderr, "lu: out of memory\n");
451 |     exit(1);
452 |   }
453 |   memcpy(Msave, M, bytes);
454 | #endif
455 | }
433 | 
434 | /* prep - Restore M from the copy kept in Msave (no-op in BENCHMARK builds). */
435 | void prep()
436 | {
437 | #ifndef BENCHMARK
438 |   /* Widen before multiplying: n * n can overflow int. */
439 |   memcpy(M, Msave, (size_t) n * (size_t) n * sizeof(double));
440 | #endif
441 | }
440 | 
441 | void test()	/* factor the global matrix M in place */
442 | {
443 |   lu(M, nBlocks);
444 | }
445 | 
446 | int verify()	/* nonzero if the factorization check fails; always 0 in BENCHMARK builds */
447 | {
448 |   int failed = 0;
449 | #ifndef BENCHMARK
450 |   failed = test_result(M, Msave, nBlocks);
451 | #endif
452 |   return failed;
453 | }
454 | 


--------------------------------------------------------------------------------
/test/cholesky.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Sparse Cholesky code with little blocks at the leaves of the Quad tree
  3 |  * Keith Randall -- Aske Plaat
  4 |  *
  5 |  * This code should run with any square sparse real symmetric matrix
  6 |  * from MatrixMarket (http://math.nist.gov/MatrixMarket)
  7 |  *
  8 |  * run with `cholesky -f george-liu.mtx' for a given matrix, or
  9 |  * `cholesky -n 1000 -z 10000' for a 1000x1000 random matrix with 10000
 10 |  * nonzeros (caution: random matrices produce lots of fill).
 11 |  */
 12 | /*
 13 |  * Copyright (c) 2000 Massachusetts Institute of Technology
 14 |  * Copyright (c) 2000 Matteo Frigo
 15 |  *
 16 |  * This program is free software; you can redistribute it and/or modify
 17 |  * it under the terms of the GNU General Public License as published by
 18 |  * the Free Software Foundation; either version 2 of the License, or
 19 |  * (at your option) any later version.
 20 |  *
 21 |  * This program is distributed in the hope that it will be useful,
 22 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 23 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 24 |  * GNU General Public License for more details.
 25 |  *
 26 |  * You should have received a copy of the GNU General Public License
 27 |  * along with this program; if not, write to the Free Software
 28 |  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 29 |  *
 30 |  */
 31 | 
 32 | #include <math.h>
 33 | #include <string.h>
 34 | #include <stdio.h>
 35 | #include <stdlib.h>
 36 | #include "test.h"
 37 | 
 38 | /*************************************************************\
 39 |  * Basic types
 40 |  \*************************************************************/
 41 | 
 42 | typedef double Real;	/* scalar element type */
 43 | 
 44 | #define BLOCK_DEPTH 2		/* logarithm base 2 of BLOCK_SIZE */
 45 | #define BLOCK_SIZE  (1<<BLOCK_DEPTH)	/* 4 seems to be the optimum */
 46 | 
 47 | typedef Real Block[BLOCK_SIZE][BLOCK_SIZE];	/* dense BLOCK_SIZE x BLOCK_SIZE tile */
 48 | 
 49 | #define BLOCK(B,I,J) (B[I][J])	/* element (I,J) of block B */
 50 | 
 51 | #define _00 0	/* quadrant codes 0..3; presumably indices into InternalNode.child -- TODO confirm */
 52 | #define _01 1
 53 | #define _10 2
 54 | #define _11 3
 55 | 
 56 | #define TR_00 _00	/* TR_rc maps to quadrant cr: _01 and _10 swap, _00 and _11 stay */
 57 | #define TR_01 _10
 58 | #define TR_10 _01
 59 | #define TR_11 _11
 60 | 
 61 | typedef struct InternalNode {
 62 |   struct InternalNode *child[4];	/* four child pointers */
 63 | } InternalNode;
 64 | 
 65 | typedef struct {
 66 |   Block block;	/* the dense tile held by a leaf */
 67 | } LeafNode;
 68 | 
 69 | typedef InternalNode *Matrix;	/* a matrix is referenced through a node pointer */
 70 | 
 71 | static Matrix A, R;
 72 | static int depth;
 73 | 
 74 | #ifndef BENCHMARK
 75 | int n = 2000;	/* default matrix size; the file header describes -n as the dimension */
 76 | static int nonzeros = 10000;	/* default nonzero count; the file header describes -z */
 77 | #else
 78 | int n = 4000;
 79 | static int nonzeros = 40000;
 80 | #endif
 81 | 
 82 | /*************************************************************\
 83 |  * Linear algebra on blocks
 84 |  \*************************************************************/
 85 | 
 86 | /*
 87 |  * elem_daxmy - Update y[i] -= a * x[i] for i in [0, n), walking from the
 88 |  * last element down to the first.  Does nothing when n <= 0.
 89 |  */
 90 | static void elem_daxmy(Real a, Real * x, Real * y, int n)
 91 | {
 92 |   while (n-- > 0)
 93 |     y[n] -= a * x[n];
 94 | }
 95 | 
 96 | /*
 97 |  * block_schur_full - Update every element of B in place with
 98 |  * B' = B - A * C^T (C is indexed [column][k], i.e. used transposed).
 99 |  */
100 | static void block_schur_full(Block B, Block A, Block C)
101 | {
102 |   int row, col, t;
103 | 
104 |   for (row = 0; row < BLOCK_SIZE; row++)
105 |     for (col = 0; col < BLOCK_SIZE; col++)
106 |       for (t = 0; t < BLOCK_SIZE; t++)
107 |         B[row][col] -= A[row][t] * C[col][t];
108 | }
110 | 
111 | /*
112 |  * block_schur_half - Same update as block_schur_full, B' = B - A * C^T,
113 |  * but applied only to the lower triangle of B (column <= row).
114 |  */
115 | static void block_schur_half(Block B, Block A, Block C)
116 | {
117 |   int row, col, t;
118 | 
119 |   for (row = 0; row < BLOCK_SIZE; row++)
120 |     for (col = 0; col <= row; col++)
121 |       for (t = 0; t < BLOCK_SIZE; t++)
122 |         B[row][col] -= A[row][t] * C[col][t];
123 | }
130 | 
131 | /*
132 |  * block_upper_solve - Perform substitution to solve for B' in
133 |  * B'U = B.
134 |  */
135 | static void block_backsub(Block B, Block U)
136 | {
137 |   int i, j, k;
138 | 
139 |   /* Perform backward substitution. */
140 |   for (i = 0; i < BLOCK_SIZE; i++) {
141 |     for (j = 0; j < BLOCK_SIZE; j++) {
142 |       for (k = 0; k < i; k++) {
143 |         BLOCK(B, j, i) -= BLOCK(U, i, k) * BLOCK(B, j, k);	/* transpose? */
144 |       }
145 |       BLOCK(B, j, i) /= BLOCK(U, i, i);
146 |     }
147 |   }
148 | }
149 | 
/*
 * xblock_backsub - In-place row elimination of block B against block L.
 *
 * For each row i and each k in 0..i, B[i][k] is first divided by the
 * diagonal entry L[k][k]; elem_daxmy() then subtracts L[i][k] times the
 * first (BLOCK_SIZE - k) entries of row k of B from the same leading
 * entries of row i of B.
 *
 * NOTE(review): the original header called this "block_lower_solve"
 * (forward substitution for LB' = B).  The update touches the leading
 * BLOCK_SIZE - k columns of each row, and when k == i the source and
 * destination rows are the same row.  No caller is visible in this part
 * of the file -- confirm the intended semantics before relying on it.
 */
static void xblock_backsub(Block B, Block L)
{
  int i, k;

  /* Perform forward substitution. */
  for (i = 0; i < BLOCK_SIZE; i++)
    for (k = 0; k <= i; k++) {
      BLOCK(B, i, k) /= BLOCK(L, k, k);
      /* row i of B -= L[i][k] * row k of B, over BLOCK_SIZE - k entries */
      elem_daxmy(BLOCK(L, i, k), &BLOCK(B, k, 0),
          &BLOCK(B, i, 0), BLOCK_SIZE - k);
    }
}
166 | 
167 | /*
168 |  * block_cholesky - Factor block B.
169 |  */
170 | static void block_cholesky(Block B)
171 | {
172 |   int i, j, k;
173 | 
174 |   for (k = 0; k < BLOCK_SIZE; k++) {
175 |     Real x;
176 |     if (BLOCK(B, k, k) < 0.0) {
177 |       printf("sqrt error: %f\n", BLOCK(B, k, k));
178 |       printf("matrix is probably not numerically stable\n");
179 |       exit(9);
180 |     }
181 |     x = sqrt(BLOCK(B, k, k));
182 |     for (i = k; i < BLOCK_SIZE; i++) {
183 |       BLOCK(B, i, k) /= x;
184 |     }
185 |     for (j = k + 1; j < BLOCK_SIZE; j++) {
186 |       for (i = j; i < BLOCK_SIZE; i++) {
187 |         BLOCK(B, i, j) -= BLOCK(B, i, k) * BLOCK(B, j, k);
188 |         if (j > i && BLOCK(B, i, j) != 0.0) {
189 |           printf("Upper not empty\n");
190 |         }
191 |       }
192 |     }
193 |   }
194 | }
195 | 
196 | /*
197 |  * block_zero - zero block B.
198 |  */
199 | static void block_zero(Block B)
200 | {
201 |   int i, k;
202 | 
203 |   for (i = 0; i < BLOCK_SIZE; i++) {
204 |     for (k = 0; k < BLOCK_SIZE; k++) {
205 |       BLOCK(B, i, k) = 0.0;
206 |     }
207 |   }
208 | }
209 | 
210 | /*************************************************************\
211 |  * Allocation and initialization
212 |  \*************************************************************/
213 | 
214 |     /*
215 |      * Create new leaf nodes (BLOCK_SIZE x BLOCK_SIZE submatrices)
216 |      */
217 | static inline InternalNode *new_block_leaf(void)
218 | {
219 |   LeafNode *leaf = malloc(sizeof(LeafNode));
220 |   if (leaf == NULL) {
221 |     printf("out of memory!\n");
222 |     exit(1);
223 |   }
224 |   return (InternalNode *) leaf;
225 | }
226 | 
227 | /*
228 |  * Create internal node in quadtree representation
229 |  */
230 | static inline InternalNode *new_internal(InternalNode * a00, InternalNode * a01,
231 |     InternalNode * a10, InternalNode * a11)
232 | {
233 |   InternalNode *node = malloc(sizeof(InternalNode));
234 |   if (node == NULL) {
235 |     printf("out of memory!\n");
236 |     exit(1);
237 |   }
238 |   node->child[_00] = a00;
239 |   node->child[_01] = a01;
240 |   node->child[_10] = a10;
241 |   node->child[_11] = a11;
242 |   return node;
243 | }
244 | 
/*
 * copy_matrix - Deep-copy the quadtree rooted at a.
 *
 * depth is the height of a above the leaf level: when it equals
 * BLOCK_DEPTH, a is treated as a LeafNode and its block is memcpy'd into a
 * freshly allocated leaf; otherwise the four children are copied
 * recursively and joined under a new interior node.  A NULL (all-zero)
 * subtree is copied as NULL.
 *
 * Every node of the result is newly allocated; per the original author the
 * copy "may be laid out in memory better than the source matrix".
 *
 * Returns the root of the copy (NULL if a is NULL).
 */
fibril static Matrix copy_matrix(int depth, Matrix a)
{
  Matrix r;

  if (!a)
    return a;

  if (depth == BLOCK_DEPTH) {
    /* Note: these locals shadow the file-scope matrices A and R. */
    LeafNode *A = (LeafNode *) a;
    LeafNode *R;
    r = new_block_leaf();
    R = (LeafNode *) r;
    memcpy(R->block, A->block, sizeof(Block));
  } else {
    Matrix r00, r01, r10, r11;

    depth--;

    fibril_t fr;
    fibril_init(&fr);

    /*
     * Three quadrants are handed to fibril_fork, the fourth is copied by a
     * direct call.  r00/r01/r10 are only read after fibril_join.
     * NOTE(review): fibril_fork presumably runs the call as a stealable
     * task and stores its result through the given pointer -- see the
     * fibril headers to confirm.
     */
    fibril_fork(&fr, &r00, copy_matrix, (depth, a->child[_00]));
    fibril_fork(&fr, &r01, copy_matrix, (depth, a->child[_01]));
    fibril_fork(&fr, &r10, copy_matrix, (depth, a->child[_10]));
    r11 = copy_matrix(depth, a->child[_11]);
    fibril_join(&fr);

    r = new_internal(r00, r01, r10, r11);
  }
  return r;
}
280 | 
281 | /*
282 |  * Deallocate matrix.
283 |  */
284 | void free_matrix(int depth, Matrix a)
285 | {
286 |   if (a == NULL)
287 |     return;
288 |   if (depth == BLOCK_DEPTH) {
289 |     free(a);
290 |   } else {
291 |     depth--;
292 |     free_matrix(depth, a->child[_00]);
293 |     free_matrix(depth, a->child[_01]);
294 |     free_matrix(depth, a->child[_10]);
295 |     free_matrix(depth, a->child[_11]);
296 |     free(a);
297 |   }
298 | }
299 | 
300 | /*************************************************************\
301 |  * Simple matrix operations
302 |  \*************************************************************/
303 | 
304 |   /*
305 |    * Get matrix element at row r, column c.
306 |    */
307 | static Real get_matrix(int depth, Matrix a, int r, int c)
308 | {
309 |   if (a == NULL)
310 |     return 0.0;
311 | 
312 |   if (depth == BLOCK_DEPTH) {
313 |     LeafNode *A = (LeafNode *) a;
314 |     return BLOCK(A->block, r, c);
315 |   } else {
316 |     int mid;
317 | 
318 |     depth--;
319 |     mid = 1 << depth;
320 | 
321 |     if (r < mid) {
322 |       if (c < mid)
323 |         return get_matrix(depth, a->child[_00], r, c);
324 |       else
325 |         return get_matrix(depth, a->child[_01], r, c - mid);
326 |     } else {
327 |       if (c < mid)
328 |         return get_matrix(depth, a->child[_10], r - mid, c);
329 |       else
330 |         return get_matrix(depth, a->child[_11], r - mid, c - mid);
331 |     }
332 |   }
333 | }
334 | 
335 | /*
336 |  * Set matrix element at row r, column c to value.
337 |  */
338 | static Matrix set_matrix(int depth, Matrix a, int r, int c, Real value)
339 | {
340 |   if (depth == BLOCK_DEPTH) {
341 |     LeafNode *A;
342 |     if (a == NULL) {
343 |       a = new_block_leaf();
344 |       A = (LeafNode *) a;
345 |       block_zero(A->block);
346 |     } else {
347 |       A = (LeafNode *) a;
348 |     }
349 |     BLOCK(A->block, r, c) = value;
350 |   } else {
351 |     int mid;
352 | 
353 |     if (a == NULL)
354 |       a = new_internal(NULL, NULL, NULL, NULL);
355 | 
356 |     depth--;
357 |     mid = 1 << depth;
358 | 
359 |     if (r < mid) {
360 |       if (c < mid)
361 |         a->child[_00] = set_matrix(depth, a->child[_00],
362 |             r, c, value);
363 |       else
364 |         a->child[_01] = set_matrix(depth, a->child[_01],
365 |             r, c - mid, value);
366 |     } else {
367 |       if (c < mid)
368 |         a->child[_10] = set_matrix(depth, a->child[_10],
369 |             r - mid, c, value);
370 |       else
371 |         a->child[_11] = set_matrix(depth, a->child[_11],
372 |             r - mid, c - mid, value);
373 |     }
374 |   }
375 |   return a;
376 | }
377 | 
378 | /*
379 |  * Compute sum of squares of elements of matrix
380 |  */
381 | static Real mag(int depth, Matrix a)
382 | {
383 |   Real res = 0.0;
384 |   if (!a)
385 |     return res;
386 | 
387 |   if (depth == BLOCK_DEPTH) {
388 |     LeafNode *A = (LeafNode *) a;
389 |     int i, j;
390 |     for (i = 0; i < BLOCK_SIZE; i++)
391 |       for (j = 0; j < BLOCK_SIZE; j++)
392 |         res += BLOCK(A->block, i, j) * BLOCK(A->block, i, j);
393 |   } else {
394 |     depth--;
395 |     res += mag(depth, a->child[_00]);
396 |     res += mag(depth, a->child[_01]);
397 |     res += mag(depth, a->child[_10]);
398 |     res += mag(depth, a->child[_11]);
399 |   }
400 |   return res;
401 | }
402 | 
403 | /*************************************************************\
404 |  * Cholesky algorithm
405 |  \*************************************************************/
406 | 
407 |   /*
408 |    * Perform R -= A * Transpose(B)
409 |    * if lower==1, update only lower-triangular part of R
410 |    */
/*
 * mul_and_subT(depth, lower, a, b, r): quadtree version of
 * r -= a * Transpose(b).
 *
 * a and b must be non-NULL (they are dereferenced unconditionally); r may
 * be NULL, meaning an all-zero destination.  Returns the updated
 * destination: r itself when it was non-NULL, otherwise a newly allocated
 * subtree, or NULL if no quadrant produced anything.
 *
 * When lower is non-zero the top-right quadrant (r01) is skipped, the two
 * diagonal quadrants recurse with lower set, and the bottom-left quadrant
 * is always updated in full.
 */
fibril static
Matrix mul_and_subT(int depth, int lower, Matrix a, Matrix b, Matrix r)
{
  if (depth == BLOCK_DEPTH) {
    /* Leaf level: operate directly on the dense blocks. */
    LeafNode *A = (LeafNode *) a;
    LeafNode *B = (LeafNode *) b;
    LeafNode *R;

    /* A missing destination block is materialised as zeros first. */
    if (r == NULL) {
      r = new_block_leaf();
      R = (LeafNode *) r;
      block_zero(R->block);
    } else
      R = (LeafNode *) r;

    if (lower)
      block_schur_half(R->block, A->block, B->block);
    else
      block_schur_full(R->block, A->block, B->block);
  } else {
    Matrix r00, r01, r10, r11;

    depth--;

    /* Current destination quadrants (all NULL for an absent destination). */
    if (r != NULL) {
      r00 = r->child[_00];
      r01 = r->child[_01];
      r10 = r->child[_10];
      r11 = r->child[_11];
    } else {
      r00 = NULL;
      r01 = NULL;
      r10 = NULL;
      r11 = NULL;
    }

    fibril_t fr;
    fibril_init(&fr);

    /*
     * Phase 1: products that use the left column of a (a00, a10).
     * Each fork targets a different destination quadrant; a product is
     * skipped when either operand quadrant is NULL (all zero).
     */
    if (a->child[_00] && b->child[TR_00])
      fibril_fork(&fr, &r00, mul_and_subT, (depth, lower,
          a->child[_00], b->child[TR_00],
          r00));

    if (!lower && a->child[_00] && b->child[TR_01])
      fibril_fork(&fr, &r01, mul_and_subT, (depth, 0,
          a->child[_00], b->child[TR_01],
          r01));

    if (a->child[_10] && b->child[TR_00])
      fibril_fork(&fr, &r10, mul_and_subT, (depth, 0,
          a->child[_10], b->child[TR_00],
          r10));

    if (a->child[_10] && b->child[TR_01])
      fibril_fork(&fr, &r11, mul_and_subT, (depth, lower,
          a->child[_10], b->child[TR_01],
          r11));

    /* Phase 2 updates the same four quadrants, so phase 1 is joined first. */
    fibril_join(&fr);

    /* Phase 2: products that use the right column of a (a01, a11). */
    if (a->child[_01] && b->child[TR_10])
      fibril_fork(&fr, &r00, mul_and_subT, (depth, lower,
          a->child[_01], b->child[TR_10],
          r00));

    if (!lower && a->child[_01] && b->child[TR_11])
      fibril_fork(&fr, &r01, mul_and_subT, (depth, 0,
          a->child[_01], b->child[TR_11],
          r01));

    if (a->child[_11] && b->child[TR_10])
      fibril_fork(&fr, &r10, mul_and_subT, (depth, 0,
          a->child[_11], b->child[TR_10],
          r10));

    if (a->child[_11] && b->child[TR_11])
      fibril_fork(&fr, &r11, mul_and_subT, (depth, lower,
          a->child[_11], b->child[TR_11],
          r11));

    fibril_join(&fr);

    /*
     * Publish the quadrants.  An absent destination only gets a node if
     * at least one quadrant ended up non-NULL.
     */
    if (r == NULL) {
      if (r00 || r01 || r10 || r11)
        r = new_internal(r00, r01, r10, r11);
    } else {
      r->child[_00] = r00;
      r->child[_01] = r01;
      r->child[_10] = r10;
      r->child[_11] = r11;
    }
  }
  return r;
}
506 | 
/*
 * backsub - Quadtree triangular solve; overwrites a with the solution.
 *
 * With l read as a lower-triangular matrix (its top-right quadrant is
 * never accessed), the steps below solve B * Transpose(L) = A for B:
 *   1. solve the left column quadrants (a00, a10) against l00,
 *   2. subtract their products with Transpose(l10) from the right column
 *      quadrants (a01, a11) via mul_and_subT,
 *   3. solve the right column quadrants against l11.
 *
 * a and l must be non-NULL.  l00 and l11 are passed on without a NULL
 * check, so they must be present whenever the matching quadrant of a is.
 * Returns a.
 */
fibril static Matrix backsub(int depth, Matrix a, Matrix l)
{
  if (depth == BLOCK_DEPTH) {
    /* Leaf level: dense solve on one block. */
    LeafNode *A = (LeafNode *) a;
    LeafNode *L = (LeafNode *) l;
    block_backsub(A->block, L->block);
  } else {
    Matrix a00, a01, a10, a11;
    Matrix l00, l10, l11;

    depth--;

    a00 = a->child[_00];
    a01 = a->child[_01];
    a10 = a->child[_10];
    a11 = a->child[_11];

    l00 = l->child[_00];
    l10 = l->child[_10];
    l11 = l->child[_11];

    fibril_t fr;
    fibril_init(&fr);

    /* Step 1: left column, independent of each other. */
    if (a00)
      fibril_fork(&fr, &a00, backsub, (depth, a00, l00));
    if (a10)
      fibril_fork(&fr, &a10, backsub, (depth, a10, l00));

    fibril_join(&fr);

    /*
     * Step 2: right column -= left column * Transpose(l10).  a01/a11 may
     * be NULL on entry and become non-NULL here (fill-in).
     */
    if (a00 && l10)
      fibril_fork(&fr, &a01, mul_and_subT, (depth, 0, a00, l10, a01));
    if (a10 && l10)
      fibril_fork(&fr, &a11, mul_and_subT, (depth, 0, a10, l10, a11));

    fibril_join(&fr);

    /* Step 3: right column against l11. */
    if (a01)
      fibril_fork(&fr, &a01, backsub, (depth, a01, l11));
    if (a11)
      fibril_fork(&fr, &a11, backsub, (depth, a11, l11));

    fibril_join(&fr);

    /* Store back: quadrants may have been replaced by new subtrees. */
    a->child[_00] = a00;
    a->child[_01] = a01;
    a->child[_10] = a10;
    a->child[_11] = a11;
  }

  return a;
}
564 | 
/*
 * cholesky - Compute the Cholesky factorization of the quadtree a in place.
 *
 * Only the quadrants _00, _10 and _11 are read and written; the top-right
 * quadrant is ignored.  a (and its diagonal quadrants, which are passed
 * down without a NULL check) must be non-NULL.  Returns a.
 */
fibril static Matrix cholesky(int depth, Matrix a)
{
  if (depth == BLOCK_DEPTH) {
    /* Leaf level: dense factorization of one block. */
    LeafNode *A = (LeafNode *) a;
    block_cholesky(A->block);
  } else {
    Matrix a00, a10, a11;

    depth--;

    a00 = a->child[_00];
    a10 = a->child[_10];
    a11 = a->child[_11];

    if (!a10) {
      /*
       * No bottom-left quadrant: the two diagonal quadrants do not
       * interact, so one is forked while the other is factored by a
       * direct call.
       */
      fibril_t fr;
      fibril_init(&fr);
      fibril_fork(&fr, &a00, cholesky, (depth, a00));
      a11 = cholesky(depth, a11);
      fibril_join(&fr);
    } else {
      /*
       * Each step consumes the result of the previous one:
       * factor a00, solve a10 against it, subtract
       * a10 * Transpose(a10) from the lower part of a11, factor a11.
       */
      a00 = cholesky(depth, a00);
      a10 = backsub(depth, a10, a00);
      a11 = mul_and_subT(depth, 1, a10, a10, a11);
      a11 = cholesky(depth, a11);
    }
    a->child[_00] = a00;
    a->child[_10] = a10;
    a->child[_11] = a11;
  }
  return a;
}
600 | 
/*
 * logarithm - Smallest k >= 0 such that 2^k >= size (ceiling of log2).
 *
 * Returns 0 for size <= 1, including zero and negative values.
 *
 * The running power of two is kept in an unsigned variable: with a signed
 * `1 << k` the shift overflows (undefined behaviour) once size exceeds
 * 2^30, whereas the unsigned power can reach 2^(bits-1), which is always
 * >= any positive int, so the loop is guaranteed to stop.
 */
static int logarithm(int size)
{
  unsigned target, power = 1;
  int k = 0;

  if (size <= 1)
    return 0;

  target = (unsigned) size;
  while (power < target) {
    power <<= 1;
    k++;
  }
  return k;
}
609 | 
610 | void init()
611 | {
612 |   /* generate random matrix */
613 |   depth = logarithm(n);
614 | 
615 |   /* diagonal elements */
616 |   int i;
617 |   for (i = 0; i < n; i++)
618 |     A = set_matrix(depth, A, i, i, 1.0);
619 | 
620 |   /* off-diagonal elements */
621 |   for (i = 0; i < nonzeros - n; i++) {
622 |     int r, c;
623 | 
624 |     do {
625 |       r = rand() % n;
626 |       c = rand() % n;
627 |     } while (r <= c || get_matrix(depth, A, r, c) != 0.0);
628 | 
629 |     A = set_matrix(depth, A, r, c, 0.1);
630 |   }
631 | 
632 |   /* extend to power of two n with identity matrix */
633 |   for (i = n; i < (1 << depth); i++) {
634 |     A = set_matrix(depth, A, i, i, 1.0);
635 |   }
636 | }
637 | 
/*
 * prep - Reset the working matrix: release the previous R (a no-op when R
 * is NULL) and replace it with a fresh deep copy of A.
 */
void prep()
{
  free_matrix(depth, R);
  R = copy_matrix(depth, A);
}
643 | 
/*
 * test - The measured kernel: factor the working matrix R in place.
 */
void test()
{
  R = cholesky(depth, R);
}
648 | 
649 | int verify()
650 | {
651 |   int fail = 0;
652 | 
653 | #ifndef BENCHMARK
654 |   /* test - make sure R * Transpose(R) == A */
655 |   /* compute || A - R * Transpose(R) ||    */
656 |   A = mul_and_subT(depth, 1, R, R, A);
657 |   Real error = mag(depth, A);
658 |   fail = (error > 0.00001);
659 | #endif
660 | 
661 |   free_matrix(depth, A);
662 |   free_matrix(depth, R);
663 |   return fail;
664 | }
665 | 


--------------------------------------------------------------------------------
/test/strassen.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 1996 Massachusetts Institute of Technology
  3 |  *
  4 |  * Permission is hereby granted, free of charge, to any person obtaining
  5 |  * a copy of this software and associated documentation files (the
  6 |  * "Software"), to use, copy, modify, and distribute the Software without
  7 |  * restriction, provided the Software, including any modified copies made
  8 |  * under this license, is not distributed for a fee, subject to
  9 |  * the following conditions:
 10 |  *
 11 |  * The above copyright notice and this permission notice shall be
 12 |  * included in all copies or substantial portions of the Software.
 13 |  *
 14 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 15 |  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 16 |  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 17 |  * IN NO EVENT SHALL THE MASSACHUSETTS INSTITUTE OF TECHNOLOGY BE LIABLE
 18 |  * FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
 19 |  * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 20 |  * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 21 |  *
 22 |  * Except as contained in this notice, the name of the Massachusetts
 23 |  * Institute of Technology shall not be used in advertising or otherwise
 24 |  * to promote the sale, use or other dealings in this Software without
 25 |  * prior written authorization from the Massachusetts Institute of
 26 |  * Technology.
 27 |  *
 28 |  */
 29 | 
 30 | #include <math.h>
 31 | #include <stdio.h>
 32 | #include <stdlib.h>
 33 | #include "test.h"
 34 | 
/*
 * Size thresholds used to pick an algorithm: OptimizedStrassenMultiply
 * hands matrices of at most 64 rows to MultiplyByDivideAndConquer, which
 * in turn uses the unrolled naive kernels once a quadrant has at most 16
 * rows.
 */
#define SizeAtWhichDivideAndConquerIsMoreEfficient 64
#define SizeAtWhichNaiveAlgorithmIsMoreEfficient 16
#define CacheBlockSizeInBytes 32

/* The real numbers we are using --- either double or float */
/* NOTE(review): the naive kernels below state "Assumes size of real is 8
 * bytes", so switching REAL to float needs those kernels checked too. */
typedef double REAL;
/* Integer type the kernels use to do byte-granular pointer arithmetic. */
typedef unsigned long PTR;

/* maximum tolerable relative error (for the checking routine) */
#define EPSILON (1.0E-6)

/*
 * Matrices are stored in row-major order; A is a pointer to
 * the first element of the matrix, and an is the number of elements
 * between two rows. This macro produces the element A[i,j]
 * given A, an, i and j
 */
#define ELEM(A, an, i, j) (A[(i) * (an) + (j)])

/* Matrix dimension (n x n); larger when built with BENCHMARK defined. */
#ifndef BENCHMARK
int n = 512;
#else
int n = 4096;
#endif

/* Operand and result matrices; not referenced in this part of the file. */
static REAL * A, * B, * C;
 61 | 
 62 | /*
 63 |  * Naive sequential algorithm, for comparison purposes
 64 |  */
 65 | void matrixmul(int n, REAL * A, int an, REAL * B, int bn, REAL * C, int cn)
 66 | {
 67 |   int i, j, k;
 68 |   REAL s;
 69 | 
 70 |   for (i = 0; i < n; ++i)
 71 |     for (j = 0; j < n; ++j) {
 72 |       s = 0.0;
 73 |       for (k = 0; k < n; ++k)
 74 |         s += ELEM(A, an, i, k) * ELEM(B, bn, k, j);
 75 | 
 76 |       ELEM(C, cn, i, j) = s;
 77 |     }
 78 | }
 79 | 
 80 | /*****************************************************************************
 81 |  **
 82 |  ** FastNaiveMatrixMultiply
 83 |  **
 84 |  ** For small to medium sized matrices A, B, and C of size
 85 |  ** MatrixSize * MatrixSize this function performs the operation
 86 |  ** C = A x B efficiently.
 87 |  **
 88 |  ** Note MatrixSize must be divisible by 8.
 89 |  **
 90 |  ** INPUT:
 91 |  **    C = (*C WRITE) Address of top left element of matrix C.
 92 |  **    A = (*A IS READ ONLY) Address of top left element of matrix A.
 93 |  **    B = (*B IS READ ONLY) Address of top left element of matrix B.
 94 |  **    MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n)
 95 |  **    RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1]
 96 |  **    RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1]
 97 |  **    RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1]
 98 |  **
 99 |  ** OUTPUT:
100 |  **    C = (*C WRITE) Matrix C contains A x B. (Initial value of *C undefined.)
101 |  **
102 |  *****************************************************************************/
103 | static void FastNaiveMatrixMultiply(
104 |     REAL * C, REAL * A, REAL * B, unsigned MatrixSize,
105 |     unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB)
106 | {
107 |   /* Assumes size of real is 8 bytes */
108 |   PTR RowWidthBInBytes = RowWidthB  << 3;
109 |   PTR RowWidthAInBytes = RowWidthA << 3;
110 |   PTR MatrixWidthInBytes = MatrixSize << 3;
111 |   PTR RowIncrementC = ( RowWidthC - MatrixSize) << 3;
112 |   unsigned Horizontal, Vertical;
113 | 
114 |   REAL *ARowStart = A;
115 |   for (Vertical = 0; Vertical < MatrixSize; Vertical++) {
116 |     for (Horizontal = 0; Horizontal < MatrixSize; Horizontal += 8) {
117 |       REAL *BColumnStart = B + Horizontal;
118 |       REAL FirstARowValue = *ARowStart++;
119 | 
120 |       REAL Sum0 = FirstARowValue * (*BColumnStart);
121 |       REAL Sum1 = FirstARowValue * (*(BColumnStart+1));
122 |       REAL Sum2 = FirstARowValue * (*(BColumnStart+2));
123 |       REAL Sum3 = FirstARowValue * (*(BColumnStart+3));
124 |       REAL Sum4 = FirstARowValue * (*(BColumnStart+4));
125 |       REAL Sum5 = FirstARowValue * (*(BColumnStart+5));
126 |       REAL Sum6 = FirstARowValue * (*(BColumnStart+6));
127 |       REAL Sum7 = FirstARowValue * (*(BColumnStart+7));
128 | 
129 |       unsigned Products;
130 |       for (Products = 1; Products < MatrixSize; Products++) {
131 |         REAL ARowValue = *ARowStart++;
132 |         BColumnStart = (REAL*) (((PTR) BColumnStart) + RowWidthBInBytes);
133 | 
134 |         Sum0 += ARowValue * (*BColumnStart);
135 |         Sum1 += ARowValue * (*(BColumnStart+1));
136 |         Sum2 += ARowValue * (*(BColumnStart+2));
137 |         Sum3 += ARowValue * (*(BColumnStart+3));
138 |         Sum4 += ARowValue * (*(BColumnStart+4));
139 |         Sum5 += ARowValue * (*(BColumnStart+5));
140 |         Sum6 += ARowValue * (*(BColumnStart+6));
141 |         Sum7 += ARowValue * (*(BColumnStart+7));
142 |       }
143 |       ARowStart = (REAL*) ( ((PTR) ARowStart) - MatrixWidthInBytes);
144 | 
145 |       *(C) = Sum0;
146 |       *(C+1) = Sum1;
147 |       *(C+2) = Sum2;
148 |       *(C+3) = Sum3;
149 |       *(C+4) = Sum4;
150 |       *(C+5) = Sum5;
151 |       *(C+6) = Sum6;
152 |       *(C+7) = Sum7;
153 |       C+=8;
154 |     }
155 | 
156 |     ARowStart = (REAL*) ( ((PTR) ARowStart) + RowWidthAInBytes );
157 |     C = (REAL*) ( ((PTR) C) + RowIncrementC );
158 |   }
159 | }
160 | 
161 | /*****************************************************************************
162 |  **
163 |  ** FastAdditiveNaiveMatrixMultiply
164 |  **
165 |  ** For small to medium sized matrices A, B, and C of size
166 |  ** MatrixSize * MatrixSize this function performs the operation
167 |  ** C += A x B efficiently.
168 |  **
169 |  ** Note MatrixSize must be divisible by 8.
170 |  **
171 |  ** INPUT:
172 |  **    C = (*C READ/WRITE) Address of top left element of matrix C.
173 |  **    A = (*A IS READ ONLY) Address of top left element of matrix A.
174 |  **    B = (*B IS READ ONLY) Address of top left element of matrix B.
175 |  **    MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n)
176 |  **    RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1]
177 |  **    RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1]
178 |  **    RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1]
179 |  **
180 |  ** OUTPUT:
181 |  **    C = (*C READ/WRITE) Matrix C contains C + A x B.
182 |  **
183 |  *****************************************************************************/
184 | static void FastAdditiveNaiveMatrixMultiply(
185 |     REAL * C, REAL * A, REAL * B, unsigned MatrixSize,
186 |     unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB)
187 | {
188 |   /* Assumes size of real is 8 bytes */
189 |   PTR RowWidthBInBytes = RowWidthB  << 3;
190 |   PTR RowWidthAInBytes = RowWidthA << 3;
191 |   PTR MatrixWidthInBytes = MatrixSize << 3;
192 |   PTR RowIncrementC = ( RowWidthC - MatrixSize) << 3;
193 |   unsigned Horizontal, Vertical;
194 | 
195 |   REAL *ARowStart = A;
196 |   for (Vertical = 0; Vertical < MatrixSize; Vertical++) {
197 |     for (Horizontal = 0; Horizontal < MatrixSize; Horizontal += 8) {
198 |       REAL *BColumnStart = B + Horizontal;
199 | 
200 |       REAL Sum0 = *C;
201 |       REAL Sum1 = *(C+1);
202 |       REAL Sum2 = *(C+2);
203 |       REAL Sum3 = *(C+3);
204 |       REAL Sum4 = *(C+4);
205 |       REAL Sum5 = *(C+5);
206 |       REAL Sum6 = *(C+6);
207 |       REAL Sum7 = *(C+7);
208 | 
209 |       unsigned Products;
210 |       for (Products = 0; Products < MatrixSize; Products++) {
211 |         REAL ARowValue = *ARowStart++;
212 | 
213 |         Sum0 += ARowValue * (*BColumnStart);
214 |         Sum1 += ARowValue * (*(BColumnStart+1));
215 |         Sum2 += ARowValue * (*(BColumnStart+2));
216 |         Sum3 += ARowValue * (*(BColumnStart+3));
217 |         Sum4 += ARowValue * (*(BColumnStart+4));
218 |         Sum5 += ARowValue * (*(BColumnStart+5));
219 |         Sum6 += ARowValue * (*(BColumnStart+6));
220 |         Sum7 += ARowValue * (*(BColumnStart+7));
221 | 
222 |         BColumnStart = (REAL*) (((PTR) BColumnStart) + RowWidthBInBytes);
223 | 
224 |       }
225 |       ARowStart = (REAL*) ( ((PTR) ARowStart) - MatrixWidthInBytes);
226 | 
227 |       *(C) = Sum0;
228 |       *(C+1) = Sum1;
229 |       *(C+2) = Sum2;
230 |       *(C+3) = Sum3;
231 |       *(C+4) = Sum4;
232 |       *(C+5) = Sum5;
233 |       *(C+6) = Sum6;
234 |       *(C+7) = Sum7;
235 |       C+=8;
236 |     }
237 | 
238 |     ARowStart = (REAL*) ( ((PTR) ARowStart) + RowWidthAInBytes );
239 |     C = (REAL*) ( ((PTR) C) + RowIncrementC );
240 |   }
241 | }
242 | 
243 | 
244 | /*****************************************************************************
245 |  **
246 |  ** MultiplyByDivideAndConquer
247 |  **
248 |  ** For medium to medium-large (would you like fries with that) sized
249 |  ** matrices A, B, and C of size MatrixSize * MatrixSize this function
250 |  ** efficiently performs the operation
251 |  **    C  = A x B (if AdditiveMode == 0)
252 |  **    C += A x B (if AdditiveMode != 0)
253 |  **
254 |  ** Note MatrixSize must be divisible by 16.
255 |  **
256 |  ** INPUT:
257 |  **    C = (*C READ/WRITE) Address of top left element of matrix C.
258 |  **    A = (*A IS READ ONLY) Address of top left element of matrix A.
259 |  **    B = (*B IS READ ONLY) Address of top left element of matrix B.
260 |  **    MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n)
 **    RowWidthA = Row stride of A: number of elements in memory between an
 **                element of A and the element directly below it.
 **    RowWidthB = Row stride of B, same convention.
 **    RowWidthC = Row stride of C, same convention.
264 |  **    AdditiveMode = 0 if we want C = A x B, otherwise we'll do C += A x B
265 |  **
266 |  ** OUTPUT:
267 |  **    C (+)= A x B. (+ if AdditiveMode != 0)
268 |  **
269 |  *****************************************************************************/
/*
 * Splits A, B and C into quadrants and forms the eight quadrant products
 * of the block formula C_ij (+)= A_i0 * B_0j + A_i1 * B_1j, either by
 * recursing (quadrants larger than
 * SizeAtWhichNaiveAlgorithmIsMoreEfficient) or with the unrolled naive
 * kernels.  Runs sequentially; returns nothing.
 */
void MultiplyByDivideAndConquer(
    REAL * C, REAL * A, REAL * B, unsigned MatrixSize,
    unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB,
    int AdditiveMode)
{
  /*
   * The top-left quadrants start at the matrix origin.  These macros are
   * not #undef'd in this function, so they stay defined for the rest of
   * the translation unit.
   */
#define A00 A
#define B00 B
#define C00 C

  REAL  *A01, *A10, *A11, *B01, *B10, *B11, *C01, *C10, *C11;
  unsigned QuadrantSize = MatrixSize >> 1;

  /* partition the matrix */
  A01 = A00 + QuadrantSize;
  A10 = A00 + RowWidthA * QuadrantSize;
  A11 = A10 + QuadrantSize;

  B01 = B00 + QuadrantSize;
  B10 = B00 + RowWidthB * QuadrantSize;
  B11 = B10 + QuadrantSize;

  C01 = C00 + QuadrantSize;
  C10 = C00 + RowWidthC * QuadrantSize;
  C11 = C10 + QuadrantSize;

  if (QuadrantSize > SizeAtWhichNaiveAlgorithmIsMoreEfficient) {
    /*
     * First product for each quadrant of C: honours the caller's
     * AdditiveMode (overwrite or accumulate).
     */
    MultiplyByDivideAndConquer(C00, A00, B00, QuadrantSize,
        RowWidthC, RowWidthA, RowWidthB, AdditiveMode);
    MultiplyByDivideAndConquer(C01, A00, B01, QuadrantSize,
        RowWidthC, RowWidthA, RowWidthB, AdditiveMode);
    MultiplyByDivideAndConquer(C11, A10, B01, QuadrantSize,
        RowWidthC, RowWidthA, RowWidthB, AdditiveMode);
    MultiplyByDivideAndConquer(C10, A10, B00, QuadrantSize,
        RowWidthC, RowWidthA, RowWidthB, AdditiveMode);
    /*
     * Second product for each quadrant: always additive, and therefore
     * must run after the first product for the same quadrant.
     */
    MultiplyByDivideAndConquer(C00, A01, B10, QuadrantSize,
        RowWidthC, RowWidthA, RowWidthB, 1);
    MultiplyByDivideAndConquer(C01, A01, B11, QuadrantSize,
        RowWidthC, RowWidthA, RowWidthB, 1);
    MultiplyByDivideAndConquer(C11, A11, B11, QuadrantSize,
        RowWidthC, RowWidthA, RowWidthB, 1);
    MultiplyByDivideAndConquer(C10, A11, B10, QuadrantSize,
        RowWidthC, RowWidthA, RowWidthB, 1);
  } else {
    /* Base case: same eight products, done by the unrolled kernels. */
    if (AdditiveMode) {
      FastAdditiveNaiveMatrixMultiply(C00, A00, B00, QuadrantSize,
          RowWidthC, RowWidthA, RowWidthB);
      FastAdditiveNaiveMatrixMultiply(C01, A00, B01, QuadrantSize,
          RowWidthC, RowWidthA, RowWidthB);
      FastAdditiveNaiveMatrixMultiply(C11, A10, B01, QuadrantSize,
          RowWidthC, RowWidthA, RowWidthB);
      FastAdditiveNaiveMatrixMultiply(C10, A10, B00, QuadrantSize,
          RowWidthC, RowWidthA, RowWidthB);
    } else {
      FastNaiveMatrixMultiply(C00, A00, B00, QuadrantSize,
          RowWidthC, RowWidthA, RowWidthB);

      FastNaiveMatrixMultiply(C01, A00, B01, QuadrantSize,
          RowWidthC, RowWidthA, RowWidthB);

      FastNaiveMatrixMultiply(C11, A10, B01, QuadrantSize,
          RowWidthC, RowWidthA, RowWidthB);

      FastNaiveMatrixMultiply(C10, A10, B00, QuadrantSize,
          RowWidthC, RowWidthA, RowWidthB);
    }

    /* Second product per quadrant, accumulated onto the first. */
    FastAdditiveNaiveMatrixMultiply(C00, A01, B10, QuadrantSize,
        RowWidthC, RowWidthA, RowWidthB);
    FastAdditiveNaiveMatrixMultiply(C01, A01, B11, QuadrantSize,
        RowWidthC, RowWidthA, RowWidthB);
    FastAdditiveNaiveMatrixMultiply(C11, A11, B11, QuadrantSize,
        RowWidthC, RowWidthA, RowWidthB);
    FastAdditiveNaiveMatrixMultiply(C10, A11, B10, QuadrantSize,
        RowWidthC, RowWidthA, RowWidthB);
  }

  return;
}
348 | 
349 | 
350 | /*****************************************************************************
351 |  **
352 |  ** OptimizedStrassenMultiply
353 |  **
354 |  ** For large matrices A, B, and C of size MatrixSize * MatrixSize this
355 |  ** function performs the operation C = A x B efficiently.
356 |  **
357 |  ** INPUT:
358 |  **    C = (*C WRITE) Address of top left element of matrix C.
359 |  **    A = (*A IS READ ONLY) Address of top left element of matrix A.
360 |  **    B = (*B IS READ ONLY) Address of top left element of matrix B.
361 |  **    MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n)
 **    RowWidthA = Row stride of A: number of elements in memory between an
 **                element of A and the element directly below it.
 **    RowWidthB = Row stride of B, same convention.
 **    RowWidthC = Row stride of C, same convention.
365 |  ** OUTPUT:
366 |  **    C = (*C WRITE) Matrix C contains A x B. (Initial value of *C undefined.)
367 |  **
368 |  *****************************************************************************/
369 | fibril static void OptimizedStrassenMultiply(
370 |     REAL * C, REAL * A, REAL * B, unsigned MatrixSize,
371 |     unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB)
372 | {
373 |   unsigned QuadrantSize = MatrixSize >> 1; /* MatrixSize / 2 */
374 |   size_t QuadrantSizeInBytes = sizeof(REAL) * QuadrantSize *
375 |     QuadrantSize + 32; /* size_t, not unsigned: the x11 below must not wrap */
376 |   unsigned Column, Row;
377 | 
378 |   /************************************************************************
379 |    ** For each matrix A, B, and C, we'll want pointers to each quadrant
380 |    ** in the matrix. These quadrants will be addressed as follows:
381 |    **  --        --
382 |    **  | A11  A12 |
383 |    **  |          |
384 |    **  | A21  A22 |
385 |    **  --        --
386 |    ************************************************************************/
387 |   REAL /**A11, *B11, *C11,*/ *A12, *B12, *C12,
388 |        *A21, *B21, *C21, *A22, *B22, *C22;
389 | 
390 |   REAL *S1,*S2,*S3,*S4,*S5,*S6,*S7,*S8,*M2,*M5,*T1sMULT;
391 | #define NumberOfVariables 11 /* S1..S8, M2, M5, T1sMULT */
392 | 
393 |   PTR TempMatrixOffset = 0;
394 |   PTR MatrixOffsetA = 0;
395 |   PTR MatrixOffsetB = 0;
396 | 
397 |   char *Heap;
398 |   void *StartHeap; /* pointer exactly as returned by malloc, kept for free() */
399 | 
400 |   /* Bytes between the end of a quadrant row and the start of the next row */
401 |   PTR RowIncrementA = ( RowWidthA - QuadrantSize ) * sizeof(REAL);
402 |   PTR RowIncrementB = ( RowWidthB - QuadrantSize ) * sizeof(REAL);
403 |   PTR RowIncrementC = ( RowWidthC - QuadrantSize ) * sizeof(REAL);
404 | 
405 |   if (MatrixSize <= SizeAtWhichDivideAndConquerIsMoreEfficient) {
406 |     MultiplyByDivideAndConquer(C, A, B, MatrixSize,
407 |         RowWidthC, RowWidthA, RowWidthB, 0);
408 |     return;
409 |   }
410 | 
411 |   /* Initialize quadrant matrices */
412 | #define A11 A
413 | #define B11 B
414 | #define C11 C /* "C11 += 4" further down therefore advances C itself */
415 |   A12 = A11 + QuadrantSize;
416 |   B12 = B11 + QuadrantSize;
417 |   C12 = C11 + QuadrantSize;
418 |   A21 = A + (RowWidthA * QuadrantSize);
419 |   B21 = B + (RowWidthB * QuadrantSize);
420 |   C21 = C + (RowWidthC * QuadrantSize);
421 |   A22 = A21 + QuadrantSize;
422 |   B22 = B21 + QuadrantSize;
423 |   C22 = C21 + QuadrantSize;
424 | 
425 |   /* Allocate Heap Space Here; abort instead of writing through NULL */
426 |   StartHeap = Heap = malloc(QuadrantSizeInBytes * NumberOfVariables);
427 |   if (Heap == NULL) abort();
428 |   /* ensure that heap is on cache boundary (the +32 slack covers the shift) */
429 |   if ( ((PTR) Heap) & 31)
430 |     Heap = (char*) ( ((PTR) Heap) + 32 - ( ((PTR) Heap) & 31) );
431 |   /* Distribute the heap space over the variables */
432 |   S1 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
433 |   S2 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
434 |   S3 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
435 |   S4 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
436 |   S5 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
437 |   S6 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
438 |   S7 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
439 |   S8 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
440 |   M2 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
441 |   M5 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
442 |   T1sMULT = (REAL*) Heap; Heap += QuadrantSizeInBytes;
443 | 
444 |   /***************************************************************************
445 |    ** Step through all columns row by row (vertically)
446 |    ** (jumps in memory by RowWidth => bad locality)
447 |    ** (but we want the best locality on the innermost loop)
448 |    ***************************************************************************/
449 |   for (Row = 0; Row < QuadrantSize; Row++) {
450 | 
451 |     /*************************************************************************
452 |      ** Step through each row horizontally (addressing elements in each column)
453 |      ** (jumps linearly through memory => good locality)
454 |      *************************************************************************/
455 |     for (Column = 0; Column < QuadrantSize; Column++) {
456 | 
457 |       /***********************************************************
458 |        ** Within this loop, the following holds for MatrixOffset:
459 |        ** MatrixOffset = ((Row * RowWidth) + Column) * sizeof(REAL)
460 |        ** (note: the offsets are byte offsets, added to a PTR cast)
461 |        ***********************************************************/
462 |       /* Element of Global Matrix, such as A, B, C */
463 | #define E(Matrix)   (* (REAL*) ( ((PTR) Matrix) + TempMatrixOffset ) )
464 | #define EA(Matrix)  (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetA ) )
465 | #define EB(Matrix)  (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetB ) )
466 | 
467 |       /* FIXME - may pay to expand these out - got higher speed-ups below */
468 |       /* S4 = A12 - ( S2 = ( S1 = A21 + A22 ) - A11 ) */
469 |       E(S4) = EA(A12) - ( E(S2) = ( E(S1) = EA(A21) + EA(A22) ) - EA(A11) );
470 | 
471 |       /* S8 = (S6 = B22 - ( S5 = B12 - B11 ) ) - B21 */
472 |       E(S8) = ( E(S6) = EB(B22) - ( E(S5) = EB(B12) - EB(B11) ) ) - EB(B21);
473 | 
474 |       /* S3 = A11 - A21 */
475 |       E(S3) = EA(A11) - EA(A21);
476 | 
477 |       /* S7 = B22 - B12 */
478 |       E(S7) = EB(B22) - EB(B12);
479 | 
480 |       TempMatrixOffset += sizeof(REAL);
481 |       MatrixOffsetA += sizeof(REAL);
482 |       MatrixOffsetB += sizeof(REAL);
483 |     } /* end column loop */
484 | 
485 |     MatrixOffsetA += RowIncrementA;
486 |     MatrixOffsetB += RowIncrementB;
487 |   } /* end row loop */
488 | 
489 |   fibril_t fr; /* handle shared by the six forks and the join below */
490 |   fibril_init(&fr);
491 | 
492 |   /* M2 = A11 x B11 */
493 |   fibril_fork(&fr, OptimizedStrassenMultiply,
494 |       (M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB));
495 | 
496 |   /* M5 = S1 * S5 */
497 |   fibril_fork(&fr, OptimizedStrassenMultiply,
498 |       (M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize));
499 | 
500 |   /* Step 1 of T1 = S2 x S6 + M2 */
501 |   fibril_fork(&fr, OptimizedStrassenMultiply,
502 |       (T1sMULT, S2, S6, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize));
503 | 
504 |   /* Step 1 of T2 = T1 + S3 x S7 */
505 |   fibril_fork(&fr, OptimizedStrassenMultiply,
506 |       (C22, S3, S7, QuadrantSize, RowWidthC, QuadrantSize, QuadrantSize));
507 | 
508 |   /* Step 1 of C11 = M2 + A12 * B21 */
509 |   fibril_fork(&fr, OptimizedStrassenMultiply,
510 |       (C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB));
511 | 
512 |   /* Step 1 of C12 = S4 x B22 + T1 + M5 */
513 |   fibril_fork(&fr, OptimizedStrassenMultiply,
514 |     (C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB));
515 | 
516 |   /* Step 1 of C21 = T2 - A22 * S8 (plain call, not forked) */
517 |   OptimizedStrassenMultiply(C21, A22, S8, QuadrantSize, RowWidthC,
518 |       RowWidthA, QuadrantSize);
519 | 
520 |   fibril_join(&fr); /* the sums below read all seven products */
521 |   /* Fold the products into C, 4 elements per pass (QuadrantSize % 4 == 0) */
522 |   for (Row = 0; Row < QuadrantSize; Row++) {
523 |     for (Column = 0; Column < QuadrantSize; Column += 4) {
524 |       REAL LocalM5_0 = *(M5);
525 |       REAL LocalM5_1 = *(M5+1);
526 |       REAL LocalM5_2 = *(M5+2);
527 |       REAL LocalM5_3 = *(M5+3);
528 |       REAL LocalM2_0 = *(M2);
529 |       REAL LocalM2_1 = *(M2+1);
530 |       REAL LocalM2_2 = *(M2+2);
531 |       REAL LocalM2_3 = *(M2+3);
532 |       REAL T1_0 = *(T1sMULT) + LocalM2_0;
533 |       REAL T1_1 = *(T1sMULT+1) + LocalM2_1;
534 |       REAL T1_2 = *(T1sMULT+2) + LocalM2_2;
535 |       REAL T1_3 = *(T1sMULT+3) + LocalM2_3;
536 |       REAL T2_0 = *(C22) + T1_0;
537 |       REAL T2_1 = *(C22+1) + T1_1;
538 |       REAL T2_2 = *(C22+2) + T1_2;
539 |       REAL T2_3 = *(C22+3) + T1_3;
540 |       (*(C11))   += LocalM2_0;
541 |       (*(C11+1)) += LocalM2_1;
542 |       (*(C11+2)) += LocalM2_2;
543 |       (*(C11+3)) += LocalM2_3;
544 |       (*(C12))   += LocalM5_0 + T1_0;
545 |       (*(C12+1)) += LocalM5_1 + T1_1;
546 |       (*(C12+2)) += LocalM5_2 + T1_2;
547 |       (*(C12+3)) += LocalM5_3 + T1_3;
548 |       (*(C22))   = LocalM5_0 + T2_0;
549 |       (*(C22+1)) = LocalM5_1 + T2_1;
550 |       (*(C22+2)) = LocalM5_2 + T2_2;
551 |       (*(C22+3)) = LocalM5_3 + T2_3;
552 |       (*(C21  )) = (- *(C21  )) + T2_0;
553 |       (*(C21+1)) = (- *(C21+1)) + T2_1;
554 |       (*(C21+2)) = (- *(C21+2)) + T2_2;
555 |       (*(C21+3)) = (- *(C21+3)) + T2_3;
556 |       M5 += 4;
557 |       M2 += 4;
558 |       T1sMULT += 4;
559 |       C11 += 4;
560 |       C12 += 4;
561 |       C21 += 4;
562 |       C22 += 4;
563 |     }
564 | 
565 |     C11 = (REAL*) ( ((PTR) C11 ) + RowIncrementC);
566 |     C12 = (REAL*) ( ((PTR) C12 ) + RowIncrementC);
567 |     C21 = (REAL*) ( ((PTR) C21 ) + RowIncrementC);
568 |     C22 = (REAL*) ( ((PTR) C22 ) + RowIncrementC);
569 |   }
570 | 
571 |   free(StartHeap);
572 | }
573 | 
574 | /* C = A x B (n x n); row widths go to the callee in C, A, B order. */
575 | static void strassen(int n, REAL *A, int an, REAL *B, int bn, REAL *C, int cn) {
576 |   OptimizedStrassenMultiply(C, A, B, n, cn, an, bn);
577 | }
578 | 
579 | /*
580 |  * Fill the n by n matrix A with pseudo-random values in [0, 1], calling
581 |  * rand() once per element.  an is the distance between rows of A.
582 |  */
583 | void init_matrix(int n, REAL *A, int an)
584 | {
585 |   for (int row = 0; row < n; ++row) {
586 |     for (int col = 0; col < n; ++col) {
587 |       ELEM(A, an, row, col) = (double) rand() / (double) RAND_MAX;
588 |     }
589 |   }
590 | }
591 | 
592 | /*
593 |  * Compare two n by n matrices, taking A (row distance an) as the reference
594 |  * for B (row distance bn).  Returns 1 at the first element whose difference
595 |  * exceeds EPSILON times the magnitude of A's element (or is NaN), else 0.
596 |  */
597 | int compare_matrix(int n, REAL *A, int an, REAL *B, int bn)
598 | {
599 |   for (int i = 0; i < n; ++i) {
600 |     for (int j = 0; j < n; ++j) {
601 |       REAL ref = ELEM(A, an, i, j);
602 |       REAL err = ref - ELEM(B, bn, i, j);
603 | 
604 |       /* magnitudes only: a negative reference must not flip the test */
605 |       if (err < 0.0) err = -err;
606 |       if (ref < 0.0) ref = -ref;
607 | 
608 |       /* negated "<=" so that a NaN on either side counts as a mismatch */
609 |       if (!(err <= EPSILON * ref))
610 |         return 1;
611 |     }
612 |   }
613 | 
614 |   return 0;
615 | }
616 | 
617 | void init() { /* allocate A, B, C (n x n REALs each); randomize inputs A, B */
618 |   size_t bytes = (size_t) n * n * sizeof(REAL); /* widen before multiplying */
619 | 
620 |   if (!(A = malloc(bytes)) || !(B = malloc(bytes)) || !(C = malloc(bytes)))
621 |     abort(); /* out of memory: stop instead of writing through NULL */
622 |   init_matrix(n, A, n);
623 |   init_matrix(n, B, n);
624 | }
625 | 
626 | void prep() { /* no-op; presumably a hook the shared test harness expects - confirm */
627 | }
628 | 
629 | void test() { /* runs the Strassen multiply on the matrices set up by init() */
630 |   strassen(n, A, n, B, n, C, n); /* C = A x B; all three use row width n */
631 | }
632 | 
633 | int verify() { /* returns compare_matrix's verdict on C; 0 under BENCHMARK */
634 |   int fail = 0;
635 | #ifndef BENCHMARK
636 |   REAL * E = malloc((size_t) n * n * sizeof(REAL)); /* filled by matrixmul */
637 |   if (E == NULL) abort();
638 |   matrixmul(n, A, n, B, n, E, n);
639 |   fail = compare_matrix(n, E, n, C, n);
640 |   if (fail > 0) printf("WRONG RESULT!\n");
641 |   free(E); /* scratch buffer: release it instead of leaking it per call */
642 | #endif
643 |   return fail;
644 | }
645 | 


--------------------------------------------------------------------------------