├── .gitignore ├── LICENSE ├── Makefile ├── Makefile.inc ├── README.md ├── TODO.md ├── configure ├── plat ├── arch │ ├── armv7a │ │ ├── ps_arch.h │ │ └── ps_arch_armv7a_common.h │ ├── x86 │ │ ├── ps_arch.h │ │ └── ps_arch_x86_common.h │ └── x86_64 │ │ ├── ps_arch.h │ │ └── ps_arch_x86_common.h └── os │ ├── cos │ └── ps_os.h │ └── linux │ ├── ps_os.c │ └── ps_os.h ├── ps.h ├── ps_config.h ├── ps_ertrie.h ├── ps_global.h ├── ps_list.h ├── ps_ns.c ├── ps_ns.h ├── ps_pgalloc.h ├── ps_refcnt.h ├── ps_slab.c ├── ps_slab.h ├── ps_smr.c ├── ps_smr.h ├── quiesce_type ├── general │ ├── ps_quiesce.c │ └── ps_quiesce_impl.h ├── real_time │ ├── ps_quiesce.c │ ├── ps_quiesce_impl.h │ ├── ps_quiesce_rt.c │ └── ps_quiesce_rt.h └── temporal │ ├── ps_quiesce.c │ ├── ps_quiesce_impl.h │ ├── ps_quiesce_rt.c │ └── ps_quiesce_rt.h └── tests ├── Makefile ├── ht.c.inprogress ├── list.c ├── ns.c ├── pgalloc.c ├── slab.c └── smr.c /.gitignore: -------------------------------------------------------------------------------- 1 | # Object files 2 | *.o 3 | *.ko 4 | *.obj 5 | *.elf 6 | 7 | # Precompiled Headers 8 | *.gch 9 | *.pch 10 | 11 | # Libraries 12 | *.lib 13 | *.a 14 | *.la 15 | *.lo 16 | 17 | # Shared objects (inc. Windows DLLs) 18 | *.dll 19 | *.so 20 | *.so.* 21 | *.dylib 22 | 23 | # Executables 24 | *.exe 25 | *.out 26 | *.app 27 | *.i*86 28 | *.x86_64 29 | *.hex 30 | 31 | # Debug files 32 | *.dSYM/ 33 | *.su 34 | 35 | # build system oddities 36 | ps_plat.h 37 | /ps_quiesce.h 38 | Makefile.config 39 | *.d 40 | *.test 41 | 42 | # indexing 43 | GPATH 44 | GRTAGS 45 | GSYMS 46 | GTAGS 47 | 48 | *.swp 49 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2011, Gabriel Parmer 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | include Makefile.config 2 | include Makefile.inc 3 | 4 | # library 5 | PLATFILE = ps_plat.h 6 | QUIESCEFILE = ps_quiesce.h 7 | CFILES = $(wildcard *.c) $(wildcard plat/os/$(OSNAME)/*.c) $(wildcard plat/arch/$(ARCHNAME)/*.c) $(wildcard quiesce_type/$(QUIESCETYPE)/*.c) 8 | COBJS = $(patsubst %.c,%.o,$(CFILES)) 9 | CDEPS = $(patsubst %.c,%.d,$(CFILES)) 10 | CDEPRM = $(patsubst %.c,%.d,$(CFILES)) 11 | 12 | .PHONY: config clean all 13 | 14 | all: $(CLIB) 15 | 16 | config: 17 | @arch_res=`cat $(PLATFILE) | grep -w $(ARCHNAME)` || true; \ 18 | osname_res=`cat $(PLATFILE) | grep -w $(OSNAME)` || true; \ 19 | if [ -f $(PLATFILE) -a "$$arch_res" != "" -a "$$osname_res" != "" ]; then \ 20 | exit 0; \ 21 | else \ 22 | rm -f $(PLATFILE); \ 23 | echo '#ifndef PS_PLAT_H' > $(PLATFILE); \ 24 | echo '#define PS_PLAT_H' >> $(PLATFILE); \ 25 | echo '#include "plat/arch/$(ARCHNAME)/ps_arch.h"' >> $(PLATFILE); \ 26 | echo '#include "plat/os/$(OSNAME)/ps_os.h"' >> $(PLATFILE); \ 27 | echo '#endif /* PS_PLAT_H */' >> $(PLATFILE); \ 28 | fi 29 | 30 | @quiesce_res=`cat $(QUIESCEFILE) | grep -w $(QUIESCETYPE)` || true; \ 31 | if [ -f $(QUIESCEFILE) -a "$$quiesce_res" != "" ]; then \ 32 | exit 0; \ 33 | else \ 34 | rm -f $(QUIESCEFILE); \ 35 | echo '#ifndef PS_QUIESCE_H' > $(QUIESCEFILE); \ 36 | echo '#define PS_QUIESCE_H' >> $(QUIESCEFILE); \ 37 | echo '#include "quiesce_type/$(QUIESCETYPE)/ps_quiesce_impl.h"' >> $(QUIESCEFILE); \ 38 | echo '#endif /* PS_QUIESCE_H */' >> $(QUIESCEFILE); \ 39 | fi 40 | 41 | 42 | $(PLATFILE): config 43 | 44 | %.o:%.c 45 | $(CC) $(CFLAGS) -o $@ -c $< 46 | 47 | $(CLIB):$(PLATFILE) $(COBJS) 48 | $(AR) cr $@ $^ 49 | 50 | tests: $(CLIB) 51 | $(MAKE) $(MAKEFLAGS) -C tests/ all 52 | 53 | clean: 54 | rm -f $(PLATFILE) $(COBJS) $(CLIB) $(CDEPRM) 55 | $(MAKE) $(MAKEFLAGS) -C tests/ clean 56 | 57 | -include $(CDEPS) 58 | -------------------------------------------------------------------------------- /Makefile.inc: -------------------------------------------------------------------------------- 1 | LNAME = ps 2 | CLIB = lib$(LNAME).a 3 | 4 | CINC += -I. $(CINC_ENV) 5 | CC = $(CROSS_COMPILE)gcc 6 | LD = $(CC) 7 | MAKE = make 8 | 9 | COPT = -O3 10 | CWARN = -Wno-long-long 11 | CFLAGS = -MD -MP -std=c11 -Wall -Wpedantic -Wextra $(CWARN) $(COPT) $(CINC) -Wno-unused-function -Wno-address-of-packed-member 12 | 13 | ifeq ($(ARCHNAME), x86) 14 | CFLAGS += -m32 -D__x86__ 15 | else ifeq ($(ARCHNAME), x86_64) 16 | CFLAGS += -m64 -D__x86_64__ 17 | else ifeq ($(ARCHNAME), armv7a) 18 | CFLAGS += -march=armv7-a -D__arm__ 19 | endif 20 | 21 | override MAKEFLAGS = --no-print-directory -I$(PWD) 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Building and testing 2 | 3 | ``` 4 | $ ./configure linux x86 general 5 | $ make 6 | ``` 7 | 8 | Additional options include: 9 | 10 | ``` 11 | $ make tests 12 | $ make clean 13 | ``` 14 | 15 | Testing: 16 | 17 | ``` 18 | $ ./configure linux x86 general 19 | $ make tests 20 | $ cd tests/ 21 | $ ./list.test 22 | $ sudo ./slab.test 23 | $ ...repeat with each *.test executable... 24 | ``` 25 | 26 | # Organization 27 | 28 | - `ps_config.h` includes some configuration variables...edit this directly for your system. 
29 | - `Makefile.config` is an auto-generated file (created by `configure`) that includes three variables controlling your OS, architecture, and quiescence type. These variables must exactly match directories in `plat/os/`, `plat/arch/`, and `quiesce_type/`. 30 | - `libps.a` is the output of the library compilation. Including this in your `-L` path, and including this directory in your `-I` path, will enable you to use the library. 31 | - `README.md` and `TODO.md` ...take a wild guess ;-) 32 | - `plat/` is the platform directory, including both architecture-specific and OS-specific functions. 33 | - `quiesce_type/` is a directory containing the different quiescence-calculation implementations (see the RTAS paper for details). 34 | - `general/` is optimized for best-effort environments; 35 | - `real_time/` provides guarantees on response time and memory utilization, and can be used in soft or hard real-time systems; 36 | - `temporal/` can only be used in hard real-time systems, as it requires knowing the maximal response time of parallel sections a priori. 37 | - `tests/` is the set of tests for each parsec abstraction. It generates a set of `*.test` executables that can each be run as `root` (necessary to set thread affinity). 38 | 39 | # FAQ 40 | 41 | We presented the Parsec work at Eurosys, and a number of good questions arose. 42 | I'll answer a few of those here, along with some questions from the program committee's reviews. 43 | 44 | ### Using `rdtsc` Properly 45 | 46 | Q: There are complications with using `rdtsc`. 47 | It is not a serializing instruction, so it can be reordered in the pipeline relative to memory operations. 48 | Put another way, the accesses to the data-structure can be reordered *before* the time stamp counter (TSC) can be made visible to other cores in memory. 49 | 50 | A: This is a great question, and originates from the fact that using `rdtsc` is surprisingly difficult to get right, largely because it is a *non-serializing* instruction. 51 | Using the serializing variant (`rdtscp`) has a significant overhead (~80 vs. 30 cycles). 52 | We use a memory barrier to make sure that the memory value generated by `rdtsc` is visible to other cores *before* accessing the enclosed data-structure. 53 | This has the effect of serializing with surrounding memory accesses. 54 | 55 | It is certainly desirable to get rid of the memory barrier, as flushing the store buffer can have significant overhead. 56 | If we were to do so (using a previous technique that relies on the bounded size of the store buffer), then we'd have to add a conservative offset to the comparison between when memory is freed, and when tasks are accessing the parallel section. 57 | 58 | ### Avoiding `rdtsc` 59 | 60 | Q: The `rdtsc` instruction is not free (roughly 30 cycles on our machine). 61 | Is it possible to remove it in some way? 62 | 63 | A: The benefit of `rdtsc` is that it provides local access to a global relation. 64 | However, it *is* possible to use a global variable that is incremented periodically, and use that as our global time. 65 | Each read-side section will read this global variable, which will cause coherency traffic after it is updated. 66 | However, these updates can be scheduled (i.e. by modifying the period of the time updates) to trade between the coherency overheads and the rapid advancement of time. 67 | The slower that time ticks by, the more difficult it is to distinguish between when memory is freed, and when parallel sections are being accessed.
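To make this alternative concrete, here is a minimal sketch of such a periodically-advanced global time source (this is not part of the library; the names `global_time`, `global_time_tick`, and `global_time_read` are hypothetical):

```
/* Hypothetical sketch of a global-variable time source used in place of rdtsc. */
static volatile unsigned long global_time = 1;

/* Invoked periodically (e.g., by a housekeeping thread or timer).  Each
 * increment invalidates the readers' cached copies of this cache-line, so
 * the tick period directly trades coherency traffic against how finely
 * "free time" and "access time" can be distinguished. */
static inline void
global_time_tick(void)
{ global_time++; }

/* A read-side (parallel) section samples the current "time" on entry;
 * quiescence detection then compares this value against the time at
 * which memory was freed, just as with the TSC-based approach. */
static inline unsigned long
global_time_read(void)
{ return global_time; }
```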
68 | 69 | In this case, the benefit of Parsec SMR stems from the fact that it tries to ascertain quiescence relative to when memory was freed: 70 | has any parallel section been accessing that memory since before it was freed? 71 | When memory is freed, it is queued. 72 | When we attempt to quiesce, we try to quiesce the memory at the *head* of the queue (the memory that was freed furthest in the past). 73 | When we get a quiescence value, we can apply it to as many nodes of memory as possible. 74 | This means that the operation is *wait-free* while still guaranteeing *progress* in deallocating memory. 75 | Even if we can't reclaim memory *now*, because quiescence can't be achieved, we will be able to at a future point in time (assuming that all threads eventually clear their parallel sections). 76 | Thus, even if we use a global variable to track time, there is still some benefit, as we get wait-free memory reclamation that is as scalable for quiescence as the `rdtsc` approach. 77 | 78 | ### Vs. Batch Frees 79 | 80 | Q: Can't we just do a batch quiescence for many memory items, instead of a quiescence per memory free? 81 | This would amortize the cost of the synchronization operation. 82 | 83 | A: Yes. 84 | In that case, we're manually attempting to compensate for the lack of scalability within the quiescence primitives. 85 | The downsides of this approach are 1. that it is still using a primitive that spins while determining quiescence, and 2. that the batch size is a key factor in the system that must be tuned. 86 | In this context, Parsec SMR can be seen as a runtime that determines batch sizes automatically, and avoids inducing the latency spikes of spin-based quiescence (regardless of how infrequent). 87 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | # Tests 2 | 3 | - Test SMR with varying levels of batch size 4 | - Test that the memory utilization in SMR converges on some specific size. 5 | 6 | # Benchmarks 7 | 8 | - namespace lookup 9 | - ns alloc/dealloc 10 | 11 | # Features 12 | 13 | - NUMA awareness in the SMR 14 | - Destructors for SMR memory 15 | - Customizable quiescence functions for SMR memory. 16 | - Lower the amount of memory saved in the SMR lists.
17 | - Atomic operations on all the freelists 18 | - Add a policy for which slab to remove memory from (based on utilization) 19 | - Cache coloring in the slab, where possible 20 | - Linked list that is SMR-interoperable (`rcu_list` equivalent) 21 | - Page (and page extent) manager to avoid mmap/munmap calls everywhere 22 | - When retrieving remotely freed memory, move it to a local list, and 23 | bound the number of items added into slabs (to bound execution time) 24 | - Save header memory for slab (without smr) by making the smr stuff a header _before_ the slab info 25 | struct ps_mheader { union {struct ps_slab *slab; struct ps_mheader *n; } u; }; 26 | struct ps_sheader { ps_free_token_t tsc; struct ps_sheader *n; struct ps_mheader m; }; 27 | - Deallocate non-leaf levels of the lookup table 28 | -------------------------------------------------------------------------------- /configure: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | if [ $# -ne 3 ]; then 4 | echo "Usage: $0 osname archname quiesce_type" 5 | echo "\tosname is in {linux, cos}" 6 | echo "\tarchname is in {x86, x86_64, armv7a}" 7 | echo "\tquiesce_type is in {general, real_time, temporal}" 8 | else 9 | echo "OSNAME = $1" > Makefile.config 10 | echo "ARCHNAME = $2" >> Makefile.config 11 | echo "QUIESCETYPE = $3" >> Makefile.config 12 | fi 13 | -------------------------------------------------------------------------------- /plat/arch/armv7a/ps_arch.h: -------------------------------------------------------------------------------- 1 | /*** 2 | * Copyright 2015 by Gabriel Parmer. All rights reserved. 3 | * Redistribution of this file is permitted under the BSD 2 clause license. 4 | * 5 | * Authors: Gabriel Parmer, gparmer@gwu.edu, 2015 6 | */ 7 | 8 | /* 9 | * TODO: most of this file should simply use the concurrency kit 10 | * versions. 11 | */ 12 | 13 | #ifndef PS_ARCH_H 14 | #define PS_ARCH_H 15 | 16 | #include 17 | 18 | #define PS_PLAT_SHIFTR32(v) 19 | #define PS_ATOMIC_POSTFIX "l" 20 | 21 | #include "ps_arch_armv7a_common.h" 22 | 23 | #endif /* PS_ARCH_H */ 24 | -------------------------------------------------------------------------------- /plat/arch/armv7a/ps_arch_armv7a_common.h: -------------------------------------------------------------------------------- 1 | /*** 2 | * Copyright 2015 by Gabriel Parmer. All rights reserved. 3 | * Redistribution of this file is permitted under the BSD 2 clause license. 4 | * 5 | * Authors: Gabriel Parmer, gparmer@gwu.edu, 2015 6 | */ 7 | 8 | /* 9 | * TODO: most of this file should simply use the concurrency kit 10 | * versions. 
11 | */ 12 | 13 | #ifndef PS_ARCH_ARMV7A_COMMON_H 14 | #define PS_ARCH_ARMV7A_COMMON_H 15 | 16 | typedef unsigned short int u16_t; 17 | typedef unsigned int u32_t; 18 | typedef unsigned long long u64_t; 19 | typedef u64_t ps_tsc_t; /* our time-stamp counter representation */ 20 | typedef u16_t coreid_t; 21 | typedef u16_t localityid_t; 22 | 23 | #ifndef likely 24 | #define likely(x) __builtin_expect(!!(x), 1) 25 | #endif 26 | #ifndef unlikely 27 | #define unlikely(x) __builtin_expect(!!(x), 0) 28 | #endif 29 | 30 | #define PS_CACHE_LINE 64 31 | #define PS_CACHE_PAD (PS_CACHE_LINE*2) 32 | #define PS_CACHE_PAD_SZ(sz) (PS_CACHE_PAD - ((sz) % PS_CACHE_PAD)) 33 | #define PS_WORD sizeof(long) 34 | #define PS_PACKED __attribute__((packed)) 35 | #define PS_ALIGNED __attribute__((aligned(PS_CACHE_LINE))) 36 | #define PS_WORDALIGNED __attribute__((aligned(PS_WORD))) 37 | #define PS_PAGE_SIZE 4096 38 | #define PS_RNDUP(v, a) (-(-(v) & -(a))) /* from blogs.oracle.com/jwadams/entry/macros_and_powers_of_two */ 39 | 40 | #ifndef PS_WORDSIZE 41 | #define PS_WORDSIZE 32 42 | #endif 43 | 44 | #define PS_CAS_INSTRUCTION "cmpxchg" 45 | #define PS_FAA_INSTRUCTION "xadd" 46 | #define PS_CAS_STR PS_CAS_INSTRUCTION PS_ATOMIC_POSTFIX " %2, %0; setz %1" 47 | #define PS_FAA_STR PS_FAA_INSTRUCTION PS_ATOMIC_POSTFIX " %1, %0" 48 | 49 | #ifndef ps_cc_barrier 50 | #define ps_cc_barrier() __asm__ __volatile__ ("" : : : "memory") 51 | #endif 52 | 53 | /* Basic assembly for Cortex-A */ 54 | static inline unsigned long 55 | ps_ldrexw(volatile unsigned long *addr) 56 | { 57 | unsigned long result; 58 | __asm__ __volatile__ ( "ldrex %0, %1" : "=r" (result) : "Q" (*addr) ); 59 | 60 | return(result); 61 | } 62 | 63 | static inline unsigned long 64 | ps_strexw(unsigned long value, volatile unsigned long *addr) 65 | { 66 | unsigned long result; 67 | __asm__ __volatile__ ( "strex %0, %2, %1" : "=&r" (result), "=Q" (*addr) : "r" (value) ); 68 | 69 | return(result); 70 | } 71 | 72 | 73 | static inline void 74 | ps_clrex(void) 75 | { 76 | __asm__ __volatile__ ("clrex" ::: "memory"); 77 | } 78 | 79 | /* 80 | * Return values: 81 | * 0 on failure due to contention (*target != old) 82 | * 1 otherwise (*target == old -> *target = updated) 83 | */ 84 | static inline int 85 | ps_cas(unsigned long *target, unsigned long old, unsigned long updated) 86 | { 87 | unsigned long oldval, res; 88 | 89 | do { 90 | oldval = ps_ldrexw(target); 91 | 92 | if(oldval == old) { 93 | /* 0 = succeeded, 1 = failed */ 94 | res = ps_strexw(updated, target); 95 | } else { 96 | ps_clrex(); 97 | 98 | return 0; 99 | } 100 | } while(res); 101 | 102 | return 1; 103 | } 104 | 105 | /* 106 | * Fetch-and-add implementation on Cortex-A. Returns the original value. 
107 | */ 108 | static inline int 109 | ps_faa(unsigned long *var, long value) 110 | { 111 | unsigned int res; 112 | long oldval; 113 | 114 | do { 115 | oldval = (int) ps_ldrexw((volatile unsigned long *) var); 116 | res = ps_strexw((unsigned long) (oldval + value), (volatile unsigned long *) var); 117 | } while(res); 118 | 119 | return oldval; 120 | } 121 | 122 | static inline void 123 | ps_mem_fence(void) 124 | { __asm__ __volatile__("dsb" ::: "memory"); } 125 | 126 | #define ps_load(addr) (*(volatile __typeof__(*addr) *)(addr)) 127 | #define ps_store(addr, val) ((*(volatile __typeof__(*addr) *)(addr)) = val) 128 | 129 | static inline int 130 | ps_upcas(unsigned long *target, unsigned long old, unsigned long updated) 131 | { return ps_cas(target, old, updated); } 132 | 133 | static inline long 134 | ps_upfaa(unsigned long *var, long value) 135 | { return ps_faa(var, value); } 136 | 137 | /* 138 | * FIXME: this is truly an affront to humanity for now, but it is a 139 | * simple lock for testing -- naive spin *without* backoff, gulp 140 | * 141 | * This is a great example where we should be using CK. 142 | */ 143 | struct ps_lock { 144 | unsigned long o; 145 | }; 146 | 147 | static inline void 148 | ps_lock_take(struct ps_lock *l) 149 | { while (!ps_cas(&l->o, 0, 1)) ; } 150 | 151 | static inline void 152 | ps_lock_release(struct ps_lock *l) 153 | { l->o = 0; } 154 | 155 | static inline void 156 | ps_lock_init(struct ps_lock *l) 157 | { l->o = 0; } 158 | 159 | static inline ps_tsc_t 160 | ps_tsc(void) 161 | { 162 | unsigned int val; 163 | 164 | /* 165 | * NOTE: This only works if the cycle counter access is enabled in the kernel. 166 | * https://blog.regehr.org/archives/794 167 | */ 168 | 169 | /* Read CCNT Register */ 170 | __asm__ __volatile__ ("MRC p15, 0, %0, c9, c13, 0\t\n": "=r"(val)); 171 | 172 | return val; 173 | } 174 | 175 | #endif /* PS_ARCH_ARMV7A_COMMON_H */ 176 | -------------------------------------------------------------------------------- /plat/arch/x86/ps_arch.h: -------------------------------------------------------------------------------- 1 | /*** 2 | * Copyright 2015 by Gabriel Parmer. All rights reserved. 3 | * Redistribution of this file is permitted under the BSD 2 clause license. 4 | * 5 | * Authors: Gabriel Parmer, gparmer@gwu.edu, 2015 6 | */ 7 | 8 | /* 9 | * TODO: most of this file should simply use the concurrency kit 10 | * versions. 11 | */ 12 | 13 | #ifndef PS_ARCH_H 14 | #define PS_ARCH_H 15 | 16 | #include 17 | 18 | #define PS_PLAT_SHIFTR32(v) 19 | #define PS_ATOMIC_POSTFIX "l" 20 | 21 | #include "ps_arch_x86_common.h" 22 | 23 | #endif /* PS_ARCH_H */ 24 | -------------------------------------------------------------------------------- /plat/arch/x86/ps_arch_x86_common.h: -------------------------------------------------------------------------------- 1 | /*** 2 | * Copyright 2015 by Gabriel Parmer. All rights reserved. 3 | * Redistribution of this file is permitted under the BSD 2 clause license. 4 | * 5 | * Authors: Gabriel Parmer, gparmer@gwu.edu, 2015 6 | */ 7 | 8 | /* 9 | * TODO: most of this file should simply use the concurrency kit 10 | * versions. 
11 | */ 12 | 13 | #ifndef PS_ARCH_X86_COMMON_H 14 | #define PS_ARCH_X86_COMMON_H 15 | 16 | typedef unsigned short int u16_t; 17 | typedef unsigned int u32_t; 18 | typedef unsigned long long u64_t; 19 | typedef u64_t ps_tsc_t; /* our time-stamp counter representation */ 20 | typedef u16_t coreid_t; 21 | typedef u16_t localityid_t; 22 | 23 | #ifndef likely 24 | #define likely(x) __builtin_expect(!!(x), 1) 25 | #endif 26 | #ifndef unlikely 27 | #define unlikely(x) __builtin_expect(!!(x), 0) 28 | #endif 29 | 30 | #define PS_CACHE_LINE 64 31 | #define PS_CACHE_PAD (PS_CACHE_LINE*2) 32 | #define PS_CACHE_PAD_SZ(sz) (PS_CACHE_PAD - ((sz) % PS_CACHE_PAD)) 33 | #define PS_WORD sizeof(long) 34 | #define PS_PACKED __attribute__((packed)) 35 | #define PS_ALIGNED __attribute__((aligned(PS_CACHE_LINE))) 36 | #define PS_WORDALIGNED __attribute__((aligned(PS_WORD))) 37 | #define PS_PAGE_SIZE 4096 38 | #define PS_RNDUP(v, a) (-(-(v) & -(a))) /* from blogs.oracle.com/jwadams/entry/macros_and_powers_of_two */ 39 | 40 | #define PS_CAS_INSTRUCTION "cmpxchg" 41 | #define PS_FAA_INSTRUCTION "xadd" 42 | #define PS_CAS_STR PS_CAS_INSTRUCTION PS_ATOMIC_POSTFIX " %2, %0; setz %1" 43 | #define PS_FAA_STR PS_FAA_INSTRUCTION PS_ATOMIC_POSTFIX " %1, %0" 44 | 45 | #ifndef ps_cc_barrier 46 | #define ps_cc_barrier() __asm__ __volatile__ ("" : : : "memory") 47 | #endif 48 | 49 | /* 50 | * Return values: 51 | * 0 on failure due to contention (*target != old) 52 | * 1 otherwise (*target == old -> *target = updated) 53 | */ 54 | static inline int 55 | ps_cas(unsigned long *target, unsigned long old, unsigned long updated) 56 | { 57 | char z; 58 | __asm__ __volatile__("lock " PS_CAS_STR 59 | : "+m" (*target), "=a" (z) 60 | : "q" (updated), "a" (old) 61 | : "memory", "cc"); 62 | return (int)z; 63 | } 64 | 65 | static inline long 66 | ps_faa(unsigned long *target, long inc) 67 | { 68 | #pragma GCC diagnostic push 69 | #pragma GCC diagnostic ignored "-Warray-bounds" 70 | /* GCC has bug of processing a warning(-Warray-bounds) and not get fixed, thus close the warning in this function */ 71 | __asm__ __volatile__("lock " PS_FAA_STR 72 | : "+m" (*target), "+q" (inc) 73 | : : "memory", "cc"); 74 | return inc; 75 | #pragma GCC diagnostic pop 76 | } 77 | 78 | static inline void 79 | ps_mem_fence(void) 80 | { __asm__ __volatile__("mfence" ::: "memory"); } 81 | 82 | #define ps_load(addr) (*(volatile __typeof__(*addr) *)(addr)) 83 | #define ps_store(addr, val) ((*(volatile __typeof__(*addr) *)(addr)) = val) 84 | 85 | /* 86 | * Only atomic on a uni-processor, so not for cross-core coordination. 87 | * Faster on a multiprocessor when used to synchronize between threads 88 | * on a single core by avoiding locking. 89 | */ 90 | static inline int 91 | ps_upcas(unsigned long *target, unsigned long old, unsigned long updated) 92 | { 93 | char z; 94 | __asm__ __volatile__(PS_CAS_STR 95 | : "+m" (*target), "=a" (z) 96 | : "q" (updated), "a" (old) 97 | : "memory", "cc"); 98 | return (int)z; 99 | } 100 | 101 | static inline long 102 | ps_upfaa(unsigned long *target, long inc) 103 | { 104 | __asm__ __volatile__(PS_FAA_STR 105 | : "+m" (*target), "+q" (inc) 106 | : : "memory", "cc"); 107 | return inc; 108 | } 109 | 110 | 111 | /* 112 | * FIXME: this is truly an affront to humanity for now, but it is a 113 | * simple lock for testing -- naive spin *without* backoff, gulp 114 | * 115 | * This is a great example where we should be using CK. 
116 | */ 117 | struct ps_lock { 118 | unsigned long o; 119 | }; 120 | 121 | static inline void 122 | ps_lock_take(struct ps_lock *l) 123 | { while (!ps_cas(&l->o, 0, 1)) ; } 124 | 125 | static inline void 126 | ps_lock_release(struct ps_lock *l) 127 | { l->o = 0; } 128 | 129 | static inline void 130 | ps_lock_init(struct ps_lock *l) 131 | { l->o = 0; } 132 | 133 | static inline ps_tsc_t 134 | ps_tsc(void) 135 | { 136 | unsigned long a, d, c; 137 | 138 | __asm__ __volatile__("rdtsc" : "=a" (a), "=d" (d), "=c" (c) : : ); 139 | 140 | return ((u64_t)d << 32) | (u64_t)a; 141 | } 142 | 143 | #endif /* PS_ARCH_X86_COMMON_H */ 144 | -------------------------------------------------------------------------------- /plat/arch/x86_64/ps_arch.h: -------------------------------------------------------------------------------- 1 | /*** 2 | * Copyright 2015 by Gabriel Parmer. All rights reserved. 3 | * Redistribution of this file is permitted under the BSD 2 clause license. 4 | * 5 | * Authors: Gabriel Parmer, gparmer@gwu.edu, 2015 6 | */ 7 | 8 | /* 9 | * TODO: most of this file should simply use the concurrency kit 10 | * versions. 11 | */ 12 | 13 | #ifndef PS_ARCH_H 14 | #define PS_ARCH_H 15 | 16 | #include 17 | 18 | #define PS_PLAT_SHIFTR32(v) (v |= v >> 32) 19 | #define PS_ATOMIC_POSTFIX "q" 20 | 21 | #include "ps_arch_x86_common.h" 22 | 23 | #endif /* PS_ARCH_H */ 24 | -------------------------------------------------------------------------------- /plat/arch/x86_64/ps_arch_x86_common.h: -------------------------------------------------------------------------------- 1 | ../x86/ps_arch_x86_common.h -------------------------------------------------------------------------------- /plat/os/cos/ps_os.h: -------------------------------------------------------------------------------- 1 | /*** 2 | * Copyright 2015 by Gabriel Parmer. All rights reserved. 3 | * Redistribution of this file is permitted under the BSD 2 clause license. 4 | * 5 | * Authors: Gabriel Parmer, gparmer@gwu.edu, 2015 6 | */ 7 | 8 | #ifndef PS_OS_H 9 | #define PS_OS_H 10 | 11 | #include 12 | #include 13 | 14 | #include 15 | #include 16 | #include 17 | 18 | /* Default allocation and deallocation functions */ 19 | static inline void * 20 | ps_plat_alloc(size_t sz, coreid_t coreid) 21 | { 22 | (void)sz; 23 | (void)coreid; 24 | 25 | return NULL; 26 | } 27 | 28 | static inline void 29 | ps_plat_free(void *s, size_t sz, coreid_t coreid) 30 | { 31 | (void)s; (void)coreid; (void)sz; 32 | } 33 | 34 | /* 35 | * These functions are ambiguously os vs. x86 as the locality API 36 | * requires that the underlying OS set up the locality information 37 | * readable with rdtscp correctly. Arbitrary decision to put it here 38 | * for now. 39 | */ 40 | static inline ps_tsc_t 41 | ps_tsc_locality(coreid_t *coreid, localityid_t *numaid) 42 | { 43 | *coreid = 0; 44 | *numaid = 0; 45 | 46 | return ps_tsc(); 47 | } 48 | 49 | static inline unsigned int 50 | ps_coreid(void) 51 | { 52 | coreid_t coreid, numaid; 53 | 54 | if (PS_NUMCORES == 1) return 0; 55 | ps_tsc_locality(&coreid, &numaid); 56 | 57 | return coreid; 58 | } 59 | 60 | 61 | #endif /* PS_OS_H */ 62 | -------------------------------------------------------------------------------- /plat/os/linux/ps_os.c: -------------------------------------------------------------------------------- 1 | /*** 2 | * Copyright 2015 by Gabriel Parmer. All rights reserved. 3 | * This file is dual licensed both under the GPL v2 license with the 4 | * classpath exception and the BSD 2 clause license. 
5 | * 6 | * Authors: Gabriel Parmer, gparmer@gwu.edu, 2015 7 | */ 8 | 9 | #define _GNU_SOURCE 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include 17 | 18 | struct thd_active { 19 | volatile int barrierval; 20 | } CACHE_ALIGNED; 21 | 22 | struct thd_active thd_active[PS_NUMCORES] PS_ALIGNED; 23 | 24 | /* Only used in Linux tests. */ 25 | const int identity_mapping[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; 26 | const int *cpu_assign = identity_mapping; 27 | /* int cpu_assign[41] = {0, 4, 8, 12, 16, 20, 24, 28, 32, 36, */ 28 | /* 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, */ 29 | /* 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, */ 30 | /* 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, -1}; */ 31 | 32 | static void 33 | call_getrlimit(int id, char *name) 34 | { 35 | struct rlimit rl; 36 | (void)name; 37 | 38 | if (getrlimit(id, &rl)) { 39 | perror("getrlimit: "); 40 | exit(-1); 41 | } 42 | } 43 | 44 | static void 45 | call_setrlimit(int id, rlim_t c, rlim_t m) 46 | { 47 | struct rlimit rl; 48 | 49 | rl.rlim_cur = c; 50 | rl.rlim_max = m; 51 | if (setrlimit(id, &rl)) { 52 | exit(-1); 53 | } 54 | } 55 | 56 | void 57 | set_prio(void) 58 | { 59 | struct sched_param sp; 60 | 61 | call_getrlimit(RLIMIT_CPU, "CPU"); 62 | #ifdef RLIMIT_RTTIME 63 | call_getrlimit(RLIMIT_RTTIME, "RTTIME"); 64 | #endif 65 | call_getrlimit(RLIMIT_RTPRIO, "RTPRIO"); 66 | call_setrlimit(RLIMIT_RTPRIO, RLIM_INFINITY, RLIM_INFINITY); 67 | call_getrlimit(RLIMIT_RTPRIO, "RTPRIO"); 68 | call_getrlimit(RLIMIT_NICE, "NICE"); 69 | 70 | if (sched_getparam(0, &sp) < 0) { 71 | perror("getparam: "); 72 | exit(-1); 73 | } 74 | sp.sched_priority = sched_get_priority_max(SCHED_RR); 75 | if (sched_setscheduler(0, SCHED_RR, &sp) < 0) { 76 | perror("setscheduler: "); 77 | exit(-1); 78 | } 79 | if (sched_getparam(0, &sp) < 0) { 80 | perror("getparam: "); 81 | exit(-1); 82 | } 83 | assert(sp.sched_priority == sched_get_priority_max(SCHED_RR)); 84 | 85 | return; 86 | } 87 | 88 | void 89 | thd_set_affinity(pthread_t tid, int id) 90 | { 91 | cpu_set_t s; 92 | int ret, cpuid; 93 | coreid_t cid, n; 94 | 95 | cpuid = cpu_assign[id]; 96 | CPU_ZERO(&s); 97 | CPU_SET(cpuid, &s); 98 | 99 | ret = pthread_setaffinity_np(tid, sizeof(cpu_set_t), &s); 100 | if (ret) { 101 | perror("setting affinity error\n"); 102 | exit(-1); 103 | } 104 | 105 | /* set_prio(); */ 106 | /* confirm that the library's version of coreid == benchmark's */ 107 | ps_tsc_locality(&cid, &n); 108 | 109 | printf("desired core %d, actual %d, pthreads %d\n", cpuid, cid, sched_getcpu()); 110 | assert(cpuid == cid); 111 | } 112 | 113 | /* 114 | * Trivial barrier 115 | */ 116 | void 117 | meas_barrier(int ncores) 118 | { 119 | int cpu = ps_coreid(); 120 | int initval = thd_active[cpu].barrierval, doneval = !initval; 121 | 122 | if (cpu == 0) { 123 | int k; 124 | for (k = 1 ; k < ncores ; k++) { 125 | while (thd_active[k].barrierval == initval) ; 126 | } 127 | thd_active[0].barrierval = doneval; 128 | } else { 129 | thd_active[cpu].barrierval = doneval; 130 | while (thd_active[0].barrierval == initval) ; 131 | } 132 | /* gogogo! */ 133 | } 134 | -------------------------------------------------------------------------------- /plat/os/linux/ps_os.h: -------------------------------------------------------------------------------- 1 | /*** 2 | * Copyright 2015 by Gabriel Parmer. All rights reserved. 3 | * Redistribution of this file is permitted under the BSD 2 clause license. 
4 | * 5 | * Authors: Gabriel Parmer, gparmer@gwu.edu, 2015 6 | */ 7 | 8 | #ifndef PS_OS_H 9 | #define PS_OS_H 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | /* Linux library */ 19 | #include 20 | 21 | /* useful abstractions on Linux */ 22 | void set_prio(void); 23 | void thd_set_affinity(pthread_t tid, int id); 24 | void meas_barrier(int ncores); 25 | 26 | #include 27 | #include 28 | 29 | /* Default allocation and deallocation functions */ 30 | static inline void * 31 | ps_plat_alloc(size_t sz, coreid_t coreid) 32 | { 33 | void *m; 34 | (void)coreid; 35 | 36 | m = aligned_alloc(PS_PAGE_SIZE, sz); 37 | assert(m); 38 | memset(m, 0, sz); 39 | 40 | return m; 41 | /* mmap(0, sz, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, (size_t)0); */ 42 | } 43 | 44 | static inline void 45 | ps_plat_free(void *s, size_t sz, coreid_t coreid) 46 | { 47 | (void)coreid; (void)sz; 48 | free(s); 49 | /* munmap(s, sz); */ 50 | } 51 | 52 | /* 53 | * These functions are ambiguously os vs. x86 as the locality API 54 | * requires that the underlying OS set up the locality information 55 | * readable with rdtscp correctly. Arbitrary decision to put it here 56 | * for now. 57 | */ 58 | static inline ps_tsc_t 59 | ps_tsc_locality(coreid_t *coreid, localityid_t *numaid) 60 | { 61 | unsigned long a, d, c; 62 | 63 | __asm__ __volatile__("rdtscp" : "=a" (a), "=d" (d), "=c" (c) : : ); 64 | *coreid = c & 0xFFF; /* lower 12 bits in Linux = coreid */ 65 | *numaid = c >> 12; /* next 8 = socket/numa id */ 66 | 67 | return ((u64_t)d << 32) | (u64_t)a; 68 | } 69 | 70 | static inline unsigned int 71 | ps_coreid(void) 72 | { 73 | coreid_t coreid, numaid; 74 | 75 | if (PS_NUMCORES == 1) return 0; 76 | ps_tsc_locality(&coreid, &numaid); 77 | 78 | return coreid; 79 | } 80 | 81 | 82 | #endif /* PS_OS_H */ 83 | -------------------------------------------------------------------------------- /ps.h: -------------------------------------------------------------------------------- 1 | /* 2 | * The public parsec interface. 3 | */ 4 | 5 | #ifndef PS_H 6 | #define PS_H 7 | 8 | /* Some of these are superfluous, but reasonable for documentation */ 9 | #include 10 | #include 11 | #include 12 | 13 | #endif /* PS_H */ 14 | -------------------------------------------------------------------------------- /ps_config.h: -------------------------------------------------------------------------------- 1 | /*** 2 | * Copyright 2017 by Gabriel Parmer. All rights reserved. 3 | * Redistribution of this file is permitted under the BSD 2 clause license. 4 | * 5 | * Authors: Gabriel Parmer, gparmer@gwu.edu, 2015 6 | */ 7 | 8 | #ifndef PS_CONFIG_H 9 | #define PS_CONFIG_H 10 | 11 | /* 12 | * How frequently do we check remote free lists when we make an 13 | * allocation? This is in platform-specific code because it is 14 | * dependent on the hardware costs for cache-line contention on a 15 | * remote numa node. 16 | * 17 | * If that contention has 16x the cost of a normal allocation, for 18 | * example, then choosing to batch checking remote frees once every 19 | * 128 iterations increases allocation cost by a factor of (2^4/2^7 = 20 | * 2^-3) 1/8. 21 | */ 22 | #ifndef PS_REMOTE_BATCH 23 | /* Needs to be a power of 2 */ 24 | #define PS_REMOTE_BATCH 64 25 | #endif 26 | 27 | /* 28 | * Core and socket counts. 
We're concerned with servers and embedded 29 | * systems; both domains where the machine configuration is often 30 | * known a priori, which gives us the opportunity to more efficiently 31 | * lay out data, and gives some flexibility in system design and 32 | * optimization (see the parsec paper on the optimization based on 33 | * different cores iterating through other cores by starting at 34 | * different offsets). 35 | */ 36 | #ifndef PS_NUMCORES 37 | #define PS_NUMCORES 4 38 | #endif 39 | #ifndef PS_NUMLOCALITIES 40 | #define PS_NUMLOCALITIES 1 41 | #endif 42 | 43 | #endif /* PS_CONFIG_H */ 44 | -------------------------------------------------------------------------------- /ps_ertrie.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014 by Gabriel Parmer, gparmer@gwu.edu 3 | * Redistribution of this file is permitted under the BSD 2 clause license. 4 | */ 5 | 6 | #ifndef ERTRIE_H 7 | #define ERTRIE_H 8 | 9 | #include 10 | 11 | #define CFORCEINLINE __attribute__((always_inline)) 12 | 13 | /* 14 | * TODO: 15 | * - change the accum variable to be void *, and be named load_info, 16 | * and memctxt to be named store_info. 17 | */ 18 | 19 | /* Internal node in the trie */ 20 | struct ert_intern { 21 | /* 22 | * This "next" value should be opaque and only interpreted by 23 | * the specialized functions. It might be a pointer, or 24 | * array. 25 | */ 26 | void *next; 27 | }; 28 | struct ert { 29 | struct ert_intern vect; /* in-place data-structure */ 30 | }; 31 | 32 | /* get the next level/value in the internal structure/value */ 33 | typedef struct ert_intern *(*ert_get_fn_t)(struct ert_intern *, void *accum, int isleaf); 34 | /* check if the value in the internal structure is "null" */ 35 | typedef int (*ert_isnull_fn_t)(struct ert_intern *, void *accum, int isleaf); 36 | /* does this final ert_intern in a lookup resolve to a successful lookup?
*/ 37 | typedef int (*ert_resolve_fn_t)(struct ert_intern *a, void *accum, int leaf, u32_t order, u32_t sz); 38 | /* set values to their initial value (often "null") */ 39 | typedef void (*ert_initval_fn_t)(struct ert_intern *, int isleaf); 40 | /* set a value in an internal structure/value */ 41 | typedef int (*ert_set_fn_t)(struct ert_intern *e, void *val, void *accum, int isleaf); 42 | /* allocate an internal or leaf structure */ 43 | typedef void *(*ert_alloc_fn_t)(void *data, int sz, int last_lvl); 44 | /* if we you extending the leaf level, this is called to set the leaf entry */ 45 | typedef int (*ert_setleaf_fn_t)(struct ert_intern *entry, void *data); 46 | typedef void *(*ert_getleaf_fn_t)(struct ert_intern *entry, void *accum); 47 | 48 | #define ERT_CONST_PARAMS \ 49 | u32_t depth, u32_t order, u32_t intern_sz, u32_t last_order, u32_t last_sz, void *initval, \ 50 | ert_initval_fn_t initfn, ert_get_fn_t getfn, ert_isnull_fn_t isnullfn, ert_set_fn_t setfn, \ 51 | ert_alloc_fn_t allocfn, ert_setleaf_fn_t setleaffn, ert_getleaf_fn_t getleaffn, ert_resolve_fn_t resolvefn 52 | #define ERT_CONST_ARGS depth, order, intern_sz, last_order, last_sz, initval, initfn, getfn, isnullfn, setfn, allocfn, setleaffn, getleaffn, resolvefn 53 | #define ERT_CONSTS_DEWARN (void)depth; (void)order; (void)intern_sz; (void)last_order; (void)last_sz; (void)initval; \ 54 | (void)initfn; (void)getfn; (void)isnullfn; (void)setfn; (void)allocfn; (void)setleaffn; (void)getleaffn; (void)resolvefn; 55 | #define ERT_DEWARN ERT_CONSTS_DEWARN 56 | 57 | /* 58 | * Default implementations of the customization functions that assume 59 | * a normal tree with pointers for internal nodes, with the "null 60 | * node" being equal to NULL (i.e. you can't store NULL values in the 61 | * structure), and setting values in internal and leaf nodes being 62 | * done with straightforward stores. 63 | */ 64 | static inline CFORCEINLINE struct ert_intern * 65 | ert_defget(struct ert_intern *a, void *accum, int leaf) 66 | { (void)accum; (void)leaf; return a->next; } 67 | static inline void * 68 | ert_defgetleaf(struct ert_intern *a, void *accum) 69 | { (void)accum; return a->next; } 70 | static inline int 71 | ert_defisnull(struct ert_intern *a, void *accum, int leaf) 72 | { (void)accum; (void)leaf; return a->next == NULL; } 73 | static inline int 74 | ert_defresolve(struct ert_intern *a, void *accum, int leaf, u32_t order, u32_t sz) 75 | { (void)a; (void)accum; (void)leaf; (void)order; (void)sz; return 1; } 76 | static inline int ert_defset(struct ert_intern *a, void *v, void *accum, int leaf) 77 | { (void)leaf; (void)accum; a->next = v; return 0; } 78 | static inline int ert_defsetleaf(struct ert_intern *a, void *data) 79 | { a->next = data; return 0; } 80 | static inline void ert_definit(struct ert_intern *a, int leaf) 81 | { (void)a; (void)leaf; a->next = NULL; } 82 | 83 | /* 84 | * This macro is the key using the compiler to generate fast code. 85 | * This is generating function calls that are often inlined that are 86 | * being passed _constants_. After function inlining, loop unrolling, 87 | * and constant propagation, the code generated should be very 88 | * specific to the set of parameters used. Loops should be 89 | * eliminated, conditionals removed, and straight-line code produced. 90 | * 91 | * The informal goal of this is to ensure that the lookup code 92 | * generated is on the order of 10-20 instructions, depending on 93 | * depth. 
In terms of (hot-cache) performance, we're shooting for 94 | * ~5*depth cycles (if L1 is 5 cycles to access). For cold caches, 95 | * we're looking for ~500*depth cycles (if memory accesses are 500 96 | * cycles). When there is parallel contention (writes), the cost 97 | * should be comparable to the latter case. 98 | * 99 | * Question: How can I replace the long argument lists here with a 100 | * macro? I'm hitting some preprocessor limitation I didn't 101 | * anticipate here. 102 | */ 103 | #define ERT_CREATE(name, structname, depth, order, intern_sz, last_order, last_sz, initval, initfn, getfn, isnullfn, setfn, allocfn, setleaffn, getleaffn, resolvefn) \ 104 | struct structname { struct ert t; }; \ 105 | static struct structname *name##_alloc(void *memctxt) \ 106 | { return (struct structname*)ert_alloc(memctxt, depth, order, intern_sz, last_order, last_sz, initval, initfn, getfn, isnullfn, setfn, allocfn, setleaffn, getleaffn, resolvefn); } \ 107 | static inline void *name##_lkup(struct structname *v, unsigned long id) \ 108 | { unsigned long a; return __ert_lookup((struct ert*)v, id, 0, depth, &a, depth, order, intern_sz, last_order, last_sz, initval, initfn, getfn, isnullfn, setfn, allocfn, setleaffn, getleaffn, resolvefn); } \ 109 | static inline void *name##_lkupa(struct structname *v, unsigned long id, void *accum) \ 110 | { return __ert_lookup((struct ert*)v, id, 0, depth, accum, depth, order, intern_sz, last_order, last_sz, initval, initfn, getfn, isnullfn, setfn, allocfn, setleaffn, getleaffn, resolvefn); } \ 111 | static inline void *name##_lkupan(struct structname *v, unsigned long id, u32_t dlimit, void *accum) \ 112 | { return __ert_lookup((struct ert*)v, id, 0, dlimit, accum, depth, order, intern_sz, last_order, last_sz, initval, initfn, getfn, isnullfn, setfn, allocfn, setleaffn, getleaffn, resolvefn); } \ 113 | static inline void *name##_lkupani(struct structname *v, unsigned long id, u32_t dstart, u32_t dlimit, void *accum) \ 114 | { return __ert_lookup((struct ert*)v, id, dstart, dlimit, accum, depth, order, intern_sz, last_order, last_sz, initval, initfn, getfn, isnullfn, setfn, allocfn, setleaffn, getleaffn, resolvefn); } \ 115 | static inline int name##_expandni(struct structname *v, unsigned long id, u32_t dstart, u32_t dlimit, void *accum, void *memctxt, void *data) \ 116 | { return __ert_expand((struct ert*)v, id, dstart, dlimit, accum, memctxt, data, depth, order, intern_sz, last_order, last_sz, initval, initfn, getfn, isnullfn, setfn, allocfn, setleaffn, getleaffn, resolvefn); } \ 117 | static inline int name##_expandn(struct structname *v, unsigned long id, u32_t dlimit, void *accum, void *memctxt, void *data) \ 118 | { return __ert_expand((struct ert*)v, id, 0, dlimit, accum, memctxt, data, depth, order, intern_sz, last_order, last_sz, initval, initfn, getfn, isnullfn, setfn, allocfn, setleaffn, getleaffn, resolvefn); } \ 119 | static inline int name##_expand(struct structname *v, unsigned long id, void *accum, void *memctxt, void *data) \ 120 | { return __ert_expand((struct ert*)v, id, 0, depth, accum, memctxt, data, depth, order, intern_sz, last_order, last_sz, initval, initfn, getfn, isnullfn, setfn, allocfn, setleaffn, getleaffn, resolvefn); } \ 121 | static inline unsigned long name##_maxid(void) \ 122 | { return __ert_maxid(depth, order, intern_sz, last_order, last_sz, initval, initfn, getfn, isnullfn, setfn, allocfn, setleaffn, getleaffn, resolvefn); } \ 123 | static inline u32_t name##_maxdepth(void) { return (u32_t)depth; } 124 | 125 | 126 | #define 
ERT_CREATE_DEF(name, depth, order, last_order, last_sz, allocfn) \ 127 | ERT_CREATE(name, name##_ert, depth, order, sizeof(int*), last_order, last_sz, NULL, ert_definit, ert_defget, ert_defisnull, ert_defset, allocfn, ert_defsetleaf, ert_defgetleaf, ert_defresolve) 128 | 129 | /* maxid = min(2^wordsize, radix trie max representable) */ 130 | static inline unsigned long 131 | __ert_maxid(ERT_CONST_PARAMS) 132 | { 133 | unsigned long off = (unsigned long)(((order * (depth-1)) + last_order)); 134 | unsigned long maxoff = (unsigned long)(sizeof(int*)*8); /* 8 bits per byte */ 135 | ERT_CONSTS_DEWARN; 136 | 137 | return (off > maxoff) ? ((unsigned long)1)< 1, or a leaf level in the tree in which case embedded leaf 143 | * structs require a different initialization. 144 | */ 145 | static inline void 146 | __ert_init(struct ert_intern *vi, int isleaf, ERT_CONST_PARAMS) 147 | { 148 | int i, base, sz; 149 | ERT_CONSTS_DEWARN; 150 | 151 | assert(vi); 152 | if (!isleaf) { 153 | base = 1<= 1); 175 | if (depth > 1) v = allocfn(memctxt, (1<vect, depth == 1, ERT_CONST_ARGS); 179 | 180 | setfn(&e, v, &accum, depth == 1); 181 | return (struct ert *)e.next; 182 | } 183 | 184 | static inline struct ert_intern * 185 | __ert_walk(struct ert_intern *vi, unsigned long id, void *accum, u32_t lvl, ERT_CONST_PARAMS) 186 | { 187 | u32_t last_off; 188 | #define ERT_M(id, o) ((id) & ((1<<(o))-1)) /* Mask out order number of bits */ 189 | ERT_CONSTS_DEWARN; 190 | 191 | vi = getfn(vi, accum, 0); 192 | if (lvl-1 == 0) { 193 | /* offset into the last level, leaf node */ 194 | last_off = ERT_M(id, last_order) * last_sz; 195 | } else { 196 | /* calculate the offset in an internal node */ 197 | last_off = ERT_M((id >> ((order * (lvl-2)) + last_order)), order) * intern_sz; 198 | } 199 | return (struct ert_intern *)(((char *)vi) + last_off); 200 | } 201 | 202 | /* 203 | * This is the most optimized/most important function. 204 | * 205 | * We rely on compiler optimizations -- including constant 206 | * propagation, loop unrolling, function inlining, dead-code 207 | * elimination, and function inlining from constant function pointers 208 | * -- to turn this into straight-line code. It should be on the order 209 | * of 10-20 instructions without loops, only including error checking 210 | * branches that are not taken by the static branch detection 211 | * algorithms. 212 | * 213 | * dlimit is the depth we should look into the tree. This can be 0 214 | * (return the highest-level of the tree) all the way to depth+1 which 215 | * actually treats the entries in the last level of the trie as a 216 | * pointer and returns its destination. dlimit = depth+1 means that the 217 | * size of the last-level nodes should be the size of an integer. 218 | */ 219 | 220 | static inline CFORCEINLINE void * 221 | __ert_lookup(struct ert *v, unsigned long id, u32_t dstart, u32_t dlimit, void *accum, ERT_CONST_PARAMS) 222 | { 223 | struct ert_intern r, *n; 224 | u32_t i, limit; 225 | 226 | assert(v); 227 | assert(id < __ert_maxid(ERT_CONST_ARGS)); 228 | assert(dlimit <= depth+1); 229 | assert(dstart <= dlimit); 230 | 231 | /* simply gets the address of the vector */ 232 | r.next = &v->vect; 233 | n = &r; 234 | limit = dlimit < depth ? 
dlimit : depth; 235 | for (i = dstart ; i < limit ; i++) { 236 | if (unlikely(isnullfn(n, accum, 0))) return NULL; 237 | n = __ert_walk(n, id, accum, depth-i, ERT_CONST_ARGS); 238 | } 239 | 240 | if (i == depth && 241 | unlikely(!resolvefn(n, accum, 1, last_order, last_sz))) return NULL; 242 | if (i < depth && 243 | unlikely(!resolvefn(n, accum, 0, order, intern_sz))) return NULL; 244 | if (dlimit == depth+1) n = getleaffn(n, accum); 245 | 246 | return n; 247 | } 248 | 249 | /* 250 | * Expand the data-structure starting from level/depth dstart, and up 251 | * to and including some depth limit (dlimit). This will call the 252 | * initialization routines for that level, and hook it into the 253 | * overall trie. If you want to control the costs of memory 254 | * allocation and initialization, then you should use limit to ensure 255 | * that multiple levels of the trie are not expanded here, if desired. 256 | * 257 | * limit == 1 does not make sense (i.e. ert is already allocated), 258 | * and limit = depth+1 means that we're trying to "expand" or set the 259 | * leaf data to something provided in the memctxt. 260 | */ 261 | static inline int 262 | __ert_expand(struct ert *v, unsigned long id, u32_t dstart, u32_t dlimit, void *accum, void *memctxt, void *data, ERT_CONST_PARAMS) 263 | { 264 | struct ert_intern r, *n, *new; 265 | u32_t i, limit; 266 | 267 | assert(v); 268 | assert(id < __ert_maxid(ERT_CONST_ARGS)); 269 | assert(dlimit <= depth+1); /* cannot expand past leaf */ 270 | assert(dstart <= dlimit); 271 | 272 | r.next = &v->vect; 273 | n = &r; 274 | limit = dlimit < depth ? dlimit : depth; 275 | for (i = dstart ; i < limit-1 ; i++) { 276 | n = __ert_walk(n, id, accum, depth-i, ERT_CONST_ARGS); 277 | if (!isnullfn(n, accum, 0)) continue; 278 | 279 | /* expand via memory allocation */ 280 | if (i+2 < depth) new = allocfn(memctxt, (1<= depth, ERT_CONST_ARGS); 285 | setfn(n, new, accum, 0); 286 | } 287 | if (dlimit == depth+1) { 288 | n = __ert_walk(n, id, accum, depth-i, ERT_CONST_ARGS); 289 | /* don't overwrite a value, unless we want to set it to the initval */ 290 | if (data != initval && !isnullfn(n, accum, 0)) return 1; 291 | 292 | if (setleaffn(n, data)) return -EAGAIN; 293 | } 294 | return 0; 295 | } 296 | 297 | #endif /* ERTRIE_H */ 298 | -------------------------------------------------------------------------------- /ps_global.h: -------------------------------------------------------------------------------- 1 | /*** 2 | * Copyright 2011-2015 by Gabriel Parmer. All rights reserved. 3 | * Redistribution of this file is permitted under the BSD 2 clause license. 4 | * 5 | * Author: Gabriel Parmer, gparmer@gwu.edu, 2011 6 | */ 7 | 8 | #ifndef PS_GLOBAL_H 9 | #define PS_GLOBAL_H 10 | 11 | #include 12 | #include 13 | 14 | #define NUM_REMOTE_LIST (PS_CACHE_LINE/sizeof(struct ps_mheader *)) 15 | typedef unsigned long ps_desc_t; 16 | 17 | /* 18 | * Lists of free memory. The slab freelist is all slabs that have at 19 | * least one free object in them. The qsc_list is a quiescence list 20 | * of memory that has been freed, but might still have references to 21 | * it (ala parsec). 
22 | */ 23 | struct ps_slab; 24 | struct ps_slab_freelist { 25 | struct ps_slab *list; 26 | }; 27 | 28 | typedef ps_tsc_t ps_free_token_t; 29 | /* Memory header */ 30 | struct ps_mheader { 31 | ps_free_token_t tsc_free; 32 | struct ps_slab *slab; /* slab header ptr */ 33 | struct ps_mheader *next; /* slab freelist ptr */ 34 | } PS_PACKED; 35 | 36 | static inline struct ps_mheader * 37 | __ps_mhead_get(void *mem) 38 | { return (struct ps_mheader *)((char*)mem - sizeof(struct ps_mheader)); } 39 | 40 | static inline void * 41 | __ps_mhead_mem(struct ps_mheader *h) 42 | { return &h[1]; } 43 | 44 | static inline int 45 | __ps_mhead_isfree(struct ps_mheader *h) 46 | { return h->tsc_free != 0; } 47 | 48 | static inline void 49 | __ps_mhead_reset(struct ps_mheader *h) 50 | { 51 | h->tsc_free = 0; 52 | h->next = NULL; 53 | } 54 | 55 | /* If you don't need memory anymore, set it free! Assumes: token != 0*/ 56 | static inline void 57 | __ps_mhead_setfree(struct ps_mheader *h, ps_free_token_t token) 58 | { 59 | /* TODO: atomic w/ error out */ 60 | h->tsc_free = token; /* Assumption: token must be guaranteed to be non-zero */ 61 | } 62 | 63 | static inline void 64 | __ps_mhead_init(struct ps_mheader *h, struct ps_slab *s) 65 | { 66 | h->slab = s; 67 | __ps_mhead_setfree(h, 1); 68 | } 69 | 70 | struct ps_qsc_list { 71 | struct ps_mheader *head, *tail; 72 | }; 73 | 74 | static inline struct ps_mheader * 75 | __ps_qsc_peek(struct ps_qsc_list *ql) 76 | { return ql->head; } 77 | 78 | static inline void 79 | __ps_qsc_enqueue(struct ps_qsc_list *ql, struct ps_mheader *n) 80 | { 81 | struct ps_mheader *t; 82 | 83 | t = ql->tail; 84 | if (likely(t)) t->next = ql->tail = n; 85 | else ql->head = ql->tail = n; 86 | } 87 | 88 | static inline struct ps_mheader * 89 | __ps_qsc_dequeue(struct ps_qsc_list *ql) 90 | { 91 | struct ps_mheader *a = ql->head; 92 | 93 | if (a) { 94 | ql->head = a->next; 95 | if (unlikely(ql->tail == a)) ql->tail = NULL; 96 | a->next = NULL; 97 | } 98 | return a; 99 | } 100 | 101 | static inline struct ps_mheader * 102 | __ps_qsc_clear(struct ps_qsc_list *l) 103 | { 104 | struct ps_mheader *m = l->head; 105 | 106 | l->head = l->tail = NULL; 107 | 108 | return m; 109 | } 110 | 111 | struct ps_slab_remote_list { 112 | struct ps_mheader *remote_frees[NUM_REMOTE_LIST]; 113 | char padding[PS_CACHE_PAD_SZ(sizeof(struct ps_mheader *) * NUM_REMOTE_LIST)]; 114 | } PS_PACKED PS_ALIGNED; 115 | 116 | static inline void 117 | __ps_rfl_stack_push(struct ps_mheader **h, struct ps_mheader *n) 118 | { 119 | struct ps_mheader *t; 120 | 121 | do { 122 | t = *h; 123 | n->next = t; 124 | } while(!ps_cas((unsigned long *)h, (unsigned long)t, (unsigned long)n)); 125 | } 126 | 127 | static inline struct ps_mheader * 128 | __ps_rfl_stack_remove_all(struct ps_mheader **h) 129 | { 130 | struct ps_mheader *t; 131 | 132 | do { 133 | t = *h; 134 | } while(!ps_cas((unsigned long *)h, (unsigned long)t, (unsigned long)NULL)); 135 | 136 | return t; 137 | } 138 | 139 | struct ps_slab_info { 140 | struct ps_slab_freelist fl; /* freelist of slabs with available objects */ 141 | unsigned long salloccnt; /* # of times we've allocated, used to batch remote dequeues */ 142 | unsigned long remote_token; /* which of the remote NUMA nodes will we dequeue from this time? 
*/ 143 | unsigned long nslabs; /* # of slabs allocated here */ 144 | }; 145 | 146 | struct parsec; 147 | struct ps_smr_info { 148 | struct parsec *ps; /* the parallel section that wraps this memory, or NULL */ 149 | struct ps_qsc_list qsc_list; /* queue of freed, but not quiesced memory */ 150 | struct ps_qsc_account account; 151 | }; 152 | 153 | typedef void *(*ps_lkupan_fn_t)(void *v, unsigned long id, u32_t dlimit, void *accum); 154 | typedef int (*ps_expand_fn_t)(void *v, unsigned long id, u32_t dlimit, void *accum, void *memctxt, void *data); 155 | 156 | struct ps_ns_info { 157 | void *ert; 158 | size_t ert_depth; 159 | ps_lkupan_fn_t lkupfn; 160 | ps_expand_fn_t expandfn; 161 | 162 | size_t desc_range; /* num descriptors per slab */ 163 | ps_desc_t desc_max; 164 | char padding[PS_CACHE_PAD_SZ(3*sizeof(void *) + 2*sizeof(size_t) + sizeof(ps_desc_t))]; 165 | 166 | struct ps_lock lock; 167 | struct ps_slab_freelist fl; 168 | ps_desc_t frontier; 169 | char padding2[PS_CACHE_PAD_SZ(sizeof(struct ps_lock) + sizeof(ps_desc_t) + sizeof(struct ps_slab_freelist))]; 170 | } PS_PACKED PS_ALIGNED; 171 | 172 | /* 173 | * TODO: 174 | * 1. save memory by packing multiple freelists into the same 175 | * cache-line 176 | * 2. have multiple freelists (e.g. 4) for different "fullness" 177 | * values, so that we can in O(1) always allocate from the slab that 178 | * is most full, modulo the granularity of these bins. 179 | * 3. implement the slab to allocate the headers for the other slab. 180 | * 181 | * Note: some of these TODOs are more applicable to the 182 | * ps_slab_freelist. 183 | * 184 | * Note: the padding is for two cache-lines due to the observed 185 | * behavior on Intel chips to aggressively prefetch an additional 186 | * cache-line. 187 | */ 188 | struct ps_mem_percore { 189 | struct ps_slab_info slab_info; 190 | struct ps_smr_info smr_info; 191 | 192 | char padding[PS_CACHE_PAD_SZ(sizeof(struct ps_slab_info) + sizeof(struct ps_smr_info))]; 193 | 194 | /* 195 | * Isolate the contended cache-lines from the common-case 196 | * ones, maintain a set of them per numa node to ensure that 197 | * performing remote frees does not contend across more than 2 198 | * numa nodes (this node, and the destination node). 199 | */ 200 | struct ps_slab_remote_list slab_remote[PS_NUMLOCALITIES] PS_ALIGNED; 201 | } PS_ALIGNED; 202 | 203 | struct ps_locality_info { 204 | localityid_t core_locality[PS_NUMCORES]; 205 | } PS_ALIGNED; 206 | 207 | struct ps_mem { 208 | struct ps_ns_info ns_info; 209 | struct ps_mem_percore percore[PS_NUMCORES]; 210 | } PS_ALIGNED; 211 | 212 | #define __PS_MEM_CREATE_DATA(name) struct ps_mem __ps_mem_##name; 213 | 214 | typedef void (*ps_free_fn_t)(struct ps_mem *m, struct ps_slab *s, size_t sz, coreid_t curr); 215 | typedef struct ps_slab *(*ps_alloc_fn_t)(struct ps_mem *m, size_t sz, coreid_t curr); 216 | 217 | /* 218 | * Round up to the nearest power of 2. 219 | * 220 | * Value "v" must be an unsigned type the size of a word (e.g. unsigned long). 
221 | * 222 | * from http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2Float 223 | */ 224 | static inline unsigned long 225 | ps_rndpow2(unsigned long v) 226 | { 227 | v--; 228 | v |= v >> 1; 229 | v |= v >> 2; 230 | v |= v >> 4; 231 | v |= v >> 8; 232 | v |= v >> 16; 233 | PS_PLAT_SHIFTR32(v); 234 | v++; 235 | 236 | return v; 237 | } 238 | 239 | #ifndef EQUIESCENCE 240 | #define EQUIESCENCE (200) 241 | #endif 242 | 243 | #endif /* PS_GLOBAL_H */ 244 | -------------------------------------------------------------------------------- /ps_list.h: -------------------------------------------------------------------------------- 1 | /*** 2 | * Copyright 2009-2017 by Gabriel Parmer. All rights reserved. 3 | * Redistribution of this file is permitted under the BSD 2 clause license. 4 | * Author: Gabriel Parmer, gparmer@gwu.edu, 2017 5 | * 6 | * History: 7 | * - Initial implementation, ~2009 8 | * - Adapted for parsec and relicensed, 2016 9 | */ 10 | 11 | /* 12 | * API Conventions: 13 | * - obj, new, and tmp are pointers of type T where T is the struct containing the linked list 14 | * - head is a pointer to struct ps_list_head 15 | * - l is the list field name within T 16 | * - type is T without any () 17 | * - all ps_list_head_* functions should be applied to struct ps_list_head pointers 18 | * - all ps_list_* functions should be passed items of type T, not struct ps_list_head 19 | * - as with most macro-based APIs, please avoid passing in functions that cannot be multiply evaluated; 20 | * generally passing only variables is a good move 21 | * 22 | * Example Usage: 23 | * 24 | * struct ps_list_head h; 25 | * struct foo { 26 | * struct ps_list l; 27 | * void *d; 28 | * } node, *i, *tmp; 29 | * 30 | * ps_list_head_init(&h); 31 | * ps_list_init(&node); 32 | * ps_list_head_add(&h, &node, l); 33 | * ... 34 | * for (i = ps_list_head_first(&h, struct foo, l) ; 35 | * i != ps_list_head(&h, struct foo, l) ; 36 | * i = ps_list_next(i, l)) { ... } 37 | * 38 | * for (ps_list_iter_init(&h, i, l) ; !ps_list_iter_term(&h, i, l) ; i = ps_list_next(i, l)) { ... } 39 | * 40 | * ps_list_foreach(&h, i, l) { ... } 41 | * 42 | * ps_list_foreach_del(&h, i, tmp, l) { 43 | * ps_list_rem(i, l); 44 | * ps_free(i); 45 | * } 46 | * 47 | */ 48 | 49 | #ifndef PS_LIST_H 50 | #define PS_LIST_H 51 | 52 | struct ps_list { 53 | struct ps_list *n, *p; 54 | }; 55 | 56 | /* 57 | * This is a separate type to 1) provide guidance on how to use the 58 | * API, and 2) to prevent developers from comparing pointers that 59 | * should not be compared. 
60 | */ 61 | struct ps_list_head { 62 | struct ps_list l; 63 | }; 64 | 65 | #define PS_LIST_DEF_NAME list 66 | 67 | static inline void 68 | ps_list_ll_init(struct ps_list *l) 69 | { l->n = l->p = l; } 70 | 71 | static inline void 72 | ps_list_head_init(struct ps_list_head *lh) 73 | { ps_list_ll_init(&lh->l); } 74 | 75 | static inline int 76 | ps_list_ll_empty(struct ps_list *l) 77 | { return l->n == l; } 78 | 79 | static inline int 80 | ps_list_head_empty(struct ps_list_head *lh) 81 | { return ps_list_ll_empty(&lh->l); } 82 | 83 | static inline void 84 | ps_list_ll_add(struct ps_list *l, struct ps_list *new) 85 | { 86 | new->n = l->n; 87 | new->p = l; 88 | l->n = new; 89 | new->n->p = new; 90 | } 91 | 92 | static inline void 93 | ps_list_ll_rem(struct ps_list *l) 94 | { 95 | l->n->p = l->p; 96 | l->p->n = l->n; 97 | l->p = l->n = l; 98 | } 99 | 100 | #define ps_offsetof(s, field) __builtin_offsetof(s, field) 101 | //#define ps_offsetof(s, field) ((unsigned long)&(((s *)0)->field)) 102 | 103 | #define ps_container(intern, type, field) \ 104 | ((type *)((char *)(intern) - ps_offsetof(type, field))) 105 | 106 | /* 107 | * Get a pointer to the object containing *l, of a type shared with 108 | * *o. Importantly, "o" is not accessed here, and is _only_ used for 109 | * its type. It will typically be the iterator/cursor working through 110 | * a list. Do _not_ use this function. It is a utility used by the 111 | * following functions. 112 | */ 113 | #define ps_list_obj_get(l, o, lname) \ 114 | ps_container(l, __typeof__(*(o)), lname) 115 | 116 | //(typeof (*(o)) *)(((char*)(l)) - ps_offsetof(typeof(*(o)), lname)) 117 | 118 | /*** 119 | * The object API. These functions are called with pointers to your 120 | * own (typed) structures. 121 | */ 122 | 123 | #define ps_list_is_head(lh, o, lname) (ps_list_obj_get((lh), (o), lname) == (o)) 124 | 125 | /* functions for if we don't use the default name for the list field */ 126 | #define ps_list_singleton(o, lname) ps_list_ll_empty(&(o)->lname) 127 | #define ps_list_init(o, lname) ps_list_ll_init(&(o)->lname) 128 | #define ps_list_next(o, lname) ps_list_obj_get((o)->lname.n, (o), lname) 129 | #define ps_list_prev(o, lname) ps_list_obj_get((o)->lname.p, (o), lname) 130 | #define ps_list_add(o, n, lname) ps_list_ll_add(&(o)->lname, &(n)->lname) 131 | #define ps_list_append(o, n, lname) ps_list_add(ps_list_prev((o), lname), n, lname) 132 | #define ps_list_rem(o, lname) ps_list_ll_rem(&(o)->lname) 133 | #define ps_list_head_add(lh, o, lname) ps_list_ll_add((&(lh)->l), &(o)->lname) 134 | #define ps_list_head_append(lh, o, lname) ps_list_ll_add(((&(lh)->l)->p), &(o)->lname) 135 | 136 | /** 137 | * Explicit type API: Pass in the types of the nodes in the list, and 138 | * the name of the ps_list field in that type. 
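 * For example, with the struct foo/field l conventions from the usage
 * example above: struct foo *first = ps_list_head_first(&h, struct foo, l);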
139 | */ 140 | 141 | #define ps_list_head_first(lh, type, lname) \ 142 | ps_container(((lh)->l.n), type, lname) 143 | #define ps_list_head_last(lh, type, lname) \ 144 | ps_container(((lh)->l.p), type, lname) 145 | 146 | /* If your struct named the list field "list" (as defined by PS_LIST_DEF_NAME */ 147 | #define ps_list_is_head_d(lh, o) ps_list_is_head(lh, o, PS_LIST_DEF_NAME) 148 | #define ps_list_singleton_d(o) ps_list_singleton(o, PS_LIST_DEF_NAME) 149 | #define ps_list_init_d(o) ps_list_init(o, PS_LIST_DEF_NAME) 150 | #define ps_list_next_d(o) ps_list_next(o, PS_LIST_DEF_NAME) 151 | #define ps_list_prev_d(o) ps_list_prev(o, PS_LIST_DEF_NAME) 152 | #define ps_list_add_d(o, n) ps_list_add(o, n, PS_LIST_DEF_NAME) 153 | #define ps_list_append_d(o, n) ps_list_append(o, n, PS_LIST_DEF_NAME) 154 | #define ps_list_rem_d(o) ps_list_rem(o, PS_LIST_DEF_NAME) 155 | 156 | #define ps_list_head_last_d(lh, o) ps_list_head_last(lh, o, PS_LIST_DEF_NAME) 157 | #define ps_list_head_first_d(lh, type) ps_list_head_first(lh, type, PS_LIST_DEF_NAME) 158 | #define ps_list_head_add_d(lh, o) ps_list_head_add(lh, o, PS_LIST_DEF_NAME) 159 | #define ps_list_head_append_d(lh, o) ps_list_head_append(lh, o, PS_LIST_DEF_NAME) 160 | 161 | /** 162 | * Iteration API 163 | */ 164 | 165 | /* Iteration without mutating the list */ 166 | #define ps_list_foreach(head, iter, lname) \ 167 | for (iter = ps_list_head_first((head), __typeof__(*iter), lname) ; \ 168 | !ps_list_is_head((head), iter, lname) ; \ 169 | (iter) = ps_list_next(iter, lname)) 170 | 171 | #define ps_list_foreach_d(head, iter) ps_list_foreach(head, iter, PS_LIST_DEF_NAME) 172 | 173 | /* 174 | * Iteration where the current node can be ps_list_rem'ed. 175 | * Notes: 176 | * - typeof(iter) == typeof(tmp) 177 | * - ps_list_add can be used on iter, but the added node will not be iterated over 178 | * 179 | * TODO: Add SMR/parallel version of this macro 180 | */ 181 | #define ps_list_foreach_del(head, iter, tmp, lname) \ 182 | for (iter = ps_list_head_first((head), __typeof__(*iter), lname), \ 183 | (tmp) = ps_list_next((iter), lname) ; \ 184 | !ps_list_is_head((head), iter, lname) ; \ 185 | (iter) = (tmp), (tmp) = ps_list_next((tmp), lname)) 186 | 187 | #define ps_list_foreach_del_d(head, iter, tmp) ps_list_foreach_del(head, iter, tmp, PS_LIST_DEF_NAME) 188 | 189 | #endif /* PS_LIST_H */ 190 | -------------------------------------------------------------------------------- /ps_ns.c: -------------------------------------------------------------------------------- 1 | /*** 2 | * Copyright 2015 by Gabriel Parmer. All rights reserved. 3 | * Redistribution of this file is permitted under the BSD 2 clause license. 4 | * 5 | * Authors: Gabriel Parmer, gparmer@gwu.edu, 2015 6 | */ 7 | 8 | #include 9 | 10 | /* The slab allocator for slab heads that are not internal to the slab itself */ 11 | PS_SLAB_CREATE_DEF(slabhead, sizeof(struct ps_slab)) 12 | 13 | /* 14 | * Namespace allocators make sure that the slab head is allocated 15 | * separately from the memory itself so that all lookups within the 16 | * lookup tree are properly aligned. 17 | * 18 | * FIXME: this is a scalability bottleneck. A single list for 19 | * balancing all namespaces that are freed (i.e. when a slab is 20 | * deallocated). This makes balancing faster, which is a significant 21 | * benefit. 
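 * (Concretely, the shared state is the m->ns_info.fl freelist protected by
 * m->ns_info.lock, taken in ps_slab_nsalloc() and ps_slab_nsfree() below.)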
22 | */ 23 | struct ps_slab * 24 | ps_slab_nsalloc(struct ps_mem *m, size_t sz, coreid_t coreid) 25 | { 26 | ps_desc_t id = 0, range = m->ns_info.desc_range; 27 | struct ps_slab *s; 28 | int newslab = 0; 29 | void *mem; 30 | struct ps_ns_info *nsi; 31 | (void)coreid; 32 | 33 | ps_lock_take(&m->ns_info.lock); 34 | s = m->ns_info.fl.list; 35 | if (s) __slab_freelist_rem(&m->ns_info.fl, s); 36 | ps_lock_release(&m->ns_info.lock); 37 | 38 | if (!s) { 39 | id = ps_faa(&m->ns_info.frontier, range); 40 | if (unlikely(id >= m->ns_info.desc_max)) goto reset_frontier; 41 | s = ps_slab_alloc_slabhead(); 42 | if (unlikely(!s)) goto reset_frontier; 43 | 44 | s->start = id; 45 | s->end = s->start + range; 46 | newslab = 1; 47 | } 48 | 49 | assert(!s->memory); 50 | mem = ps_plat_alloc(sz, coreid); 51 | if (unlikely(!mem)) goto free_slab; 52 | memset(mem, 0, sz); 53 | s->memory = mem; 54 | 55 | /* Add the slab's identities to the lookup table */ 56 | nsi = &m->ns_info; 57 | assert(!nsi->lkupfn(nsi->ert, s->start, nsi->ert_depth, NULL)); 58 | if (nsi->expandfn(nsi->ert, s->start, nsi->ert_depth, NULL, mem, NULL) != 0) goto free_mem; 59 | assert(nsi->lkupfn(nsi->ert, s->start, nsi->ert_depth, NULL) == mem); 60 | 61 | return s; 62 | free_mem: 63 | ps_plat_free(mem, sz, coreid); 64 | free_slab: 65 | ps_slab_free_slabhead(s); 66 | reset_frontier: 67 | /* possible to leak namespace if many threads race between faa and here */ 68 | if (newslab) ps_cas(&m->ns_info.frontier, id+range, id); 69 | 70 | return NULL; 71 | } 72 | 73 | void 74 | ps_slab_nsfree(struct ps_mem *m, struct ps_slab *s, size_t sz, coreid_t coreid) 75 | { 76 | struct ps_ns_info *nsi; 77 | struct ert_intern *intern; 78 | 79 | ps_plat_free(s->memory, sz, coreid); 80 | 81 | /* Remove the reference in the lookup table to the slab */ 82 | nsi = &m->ns_info; 83 | if (nsi->ert_depth > 1) { 84 | intern = nsi->lkupfn(nsi->ert, s->start, nsi->ert_depth-1, NULL); 85 | assert(intern->next == s->memory); 86 | intern->next = NULL; 87 | assert(!nsi->lkupfn(nsi->ert, s->start, nsi->ert_depth, NULL)); 88 | } 89 | s->memory = NULL; 90 | 91 | ps_lock_take(&m->ns_info.lock); 92 | __slab_freelist_add(&m->ns_info.fl, s); 93 | ps_lock_release(&m->ns_info.lock); 94 | } 95 | 96 | void 97 | ps_ns_init(struct ps_mem *m, void *ert, ps_lkupan_fn_t lkup, ps_expand_fn_t expand, size_t depth, ps_desc_t maxid, size_t range) 98 | { 99 | struct ps_ns_info *ni; 100 | static unsigned long executed = 0; 101 | 102 | if (executed == 0 && ps_faa(&executed, 1) == 0) ps_slab_init_slabhead(); 103 | 104 | ni = &m->ns_info; 105 | ni->desc_max = maxid; 106 | ni->desc_range = range; 107 | ni->fl.list = NULL; 108 | ni->frontier = 0; 109 | ni->ert = ert; 110 | ni->ert_depth = depth; 111 | ni->lkupfn = lkup; 112 | ni->expandfn = expand; 113 | ps_lock_init(&ni->lock); 114 | } 115 | -------------------------------------------------------------------------------- /ps_ns.h: -------------------------------------------------------------------------------- 1 | /*** 2 | * Copyright 2015 by Gabriel Parmer. All rights reserved. 3 | * Redistribution of this file is permitted under the BSD 2 clause license. 4 | * 5 | * Authors: Gabriel Parmer, gparmer@gwu.edu, 2015 6 | */ 7 | 8 | #ifndef PS_NS_H 9 | #define PS_NS_H 10 | 11 | /*** 12 | * This file is nearly entirely glue. Glue embedded radix tries with 13 | * slabs and smr. This file represents the glue code at the 14 | * highest-levels. The ps_ns.c file contains the glue code between 15 | * the slab allocator and the underlying system providing memory. 
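 * As a usage sketch of this glue (mirroring tests/ns.c, and assuming a
 * namespace declared with PS_NSSLAB_CREATE(nstest, sizeof(void *), 3, 9, 7)
 * plus an initialized struct parsec ps):
 *
 *   struct ps_ns *ns = ps_nsptr_create_nstest(&ps);
 *   ps_desc_t d;
 *   void *mem = ps_nsptr_alloc_nstest(ns, &d);
 *   ... use mem; ps_nsptr_lkup_nstest(ns, d) maps d back to mem ...
 *   ps_nsptr_free_nstest(ns, mem);
 *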
16 | * This enables sharing and movement of pieces of the namespace 17 | * between cores, and also manages the allocation/deallocation of the 18 | * ertrie. 19 | */ 20 | 21 | #include 22 | #include 23 | #include 24 | 25 | /* Just namespace encapsulation... */ 26 | struct ps_ns { 27 | struct ps_mem m; 28 | }; 29 | 30 | struct ps_slab *ps_slab_nsalloc(struct ps_mem *m, size_t sz, coreid_t coreid); 31 | void ps_slab_nsfree(struct ps_mem *m, struct ps_slab *s, size_t sz, coreid_t coreid); 32 | void ps_ns_init(struct ps_mem *m, void *ert, ps_lkupan_fn_t lkup, ps_expand_fn_t expand, size_t depth, ps_desc_t maxid, size_t range); 33 | 34 | static inline int 35 | __ps_ns_desc_isfree(void *slot) 36 | { return __ps_mhead_isfree(__ps_mhead_get(slot)); } 37 | 38 | #define __PS_NS_TYPE_CREATE(name, type, objsz, nobjord, depth, maxid) \ 39 | __PS_PARSLAB_CREATE_AFNS(name, (ps_rndpow2(__ps_slab_objmemsz((objsz)))-sizeof(struct ps_mheader)), \ 40 | (ps_rndpow2(__ps_slab_objmemsz((objsz))) * (1<slab->start + ps_slab_objoff_##name(slot); } \ 45 | static inline void * \ 46 | ps_nsptr_lkup_##name(struct ps_ns *ns, ps_desc_t desc) \ 47 | { \ 48 | struct ps_mheader *h = name##_lkup(ns->m.ns_info.ert, desc); \ 49 | if (unlikely(!h)) return NULL; \ 50 | return __ps_mhead_mem(h); \ 51 | } \ 52 | static inline void * \ 53 | ps_nsptr_alloc_##name(struct ps_ns *ns, ps_desc_t *d) \ 54 | { \ 55 | void *a = ps_##type##ptr_alloc_##name(&ns->m); \ 56 | if (unlikely(!a)) return NULL; \ 57 | *d = ps_ns_desc_##name(a); \ 58 | \ 59 | return a; \ 60 | } \ 61 | static inline void * \ 62 | ps_ns_alloc_##name(ps_desc_t *d) \ 63 | { return ps_nsptr_alloc_##name((struct ps_ns *)&__ps_mem_##name, d); } \ 64 | static inline void \ 65 | ps_nsptr_free_##name(struct ps_ns *ns, void *slot) \ 66 | { ps_##type##ptr_free_##name(&ns->m, slot); } \ 67 | static inline void \ 68 | ps_nsptr_freedesc_##name(struct ps_ns *ns, ps_desc_t d) \ 69 | { \ 70 | void *m = ps_nsptr_lkup_##name(ns, d); \ 71 | if (m) ps_##type##ptr_free_##name(&ns->m, m); \ 72 | } \ 73 | static inline void \ 74 | ps_ns_free_##name(void *slot) \ 75 | { ps_nsptr_free_##name((struct ps_ns *)&__ps_mem_##name, slot); } \ 76 | static inline void \ 77 | ps_ns_init_##name(struct parsec *ps, void *ert) \ 78 | { \ 79 | ps_mem_init_##name(ps); \ 80 | ps_ns_init(&__ps_mem_##name, ert, (ps_lkupan_fn_t)name##_lkupan, \ 81 | (ps_expand_fn_t)name##_expandn, depth, maxid, 1<m); } \ 93 | static inline struct ps_ns * \ 94 | ps_nsptr_create_##name(struct parsec *ps) \ 95 | { \ 96 | struct ps_mem *m; \ 97 | struct ps_ns_ert_##name *e; \ 98 | if (ps) m = ps_memptr_create_##name(ps); \ 99 | else m = ps_slabptr_create_##name(); \ 100 | if (!m) return NULL; \ 101 | e = name##_alloc(NULL); \ 102 | if (!e) ps_memptr_delete_##name(m); \ 103 | ps_ns_init(m, e, (ps_lkupan_fn_t)name##_lkupan, \ 104 | (ps_expand_fn_t)name##_expandn, depth, maxid, 1< 14 | 15 | struct ps_freelist { 16 | size_t pgsize; 17 | struct ps_freelist *next; 18 | }; 19 | 20 | #define PS_PGALLOC_STATICDEF(page_size) { .pgsize = page_size, .next = NULL } 21 | #define PS_PGALLOC_INITDEF(page_size) (struct ps_freelist)PS_PGALLOC_STATICDEF(page_size) 22 | 23 | static inline void 24 | ps_pgalloc_init(struct ps_freelist *fl, size_t pgsize) 25 | { *fl = PS_PGALLOC_INITDEF(pgsize); } 26 | 27 | /* 28 | * We have to assume that all previously allocated memory is freed... 29 | * Not only is this required for correctness, it is used here to avoid 30 | * the need for atomic instructions. 
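 * A typical lifetime, roughly following tests/pgalloc.c, looks like:
 *
 *   struct ps_freelist pages = PS_PGALLOC_STATICDEF(PS_PAGE_SIZE);
 *   void *p = ps_pgalloc(&pages);
 *   ...
 *   ps_pgfree(&pages, p);
 *   ps_pgalloc_destroy(&pages);  (only once every allocation has been freed)
 *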
31 | */ 32 | static inline void 33 | ps_pgalloc_destroy(struct ps_freelist *fl) 34 | { 35 | struct ps_freelist *l, *n; 36 | 37 | for (l = fl->next ; l ; l = n) { 38 | n = l->next; 39 | ps_plat_free(l, fl->pgsize, ps_coreid()); 40 | } 41 | } 42 | 43 | static inline void * 44 | ps_pgalloc(struct ps_freelist *fl) 45 | { 46 | void *a = NULL; 47 | struct ps_freelist *n; 48 | 49 | retry: 50 | n = ps_load(&fl->next); 51 | if (n) { 52 | /* FIXME: ABA */ 53 | if (!ps_cas((unsigned long *)&fl->next, (unsigned long)n, (unsigned long)ps_load(&n->next))) goto retry; 54 | 55 | a = n; 56 | n->next = NULL; 57 | } else { 58 | a = ps_plat_alloc(fl->pgsize, ps_coreid()); 59 | } 60 | 61 | return a; 62 | } 63 | 64 | static inline void 65 | ps_pgfree(struct ps_freelist *fl, void *p) 66 | { 67 | struct ps_freelist *l, *n; 68 | 69 | l = (struct ps_freelist *)p; 70 | retry: 71 | l->next = n = fl->next; 72 | if (!ps_cas((unsigned long *)&fl->next, (unsigned long)n, (unsigned long)l)) goto retry; 73 | 74 | return; 75 | } 76 | 77 | #endif /* PS_PGALLOC_H */ 78 | -------------------------------------------------------------------------------- /ps_refcnt.h: -------------------------------------------------------------------------------- 1 | #ifndef PS_REFCNT_H 2 | #define PS_REFCNT_H 3 | 4 | #include 5 | 6 | struct ps_refcnt { 7 | unsigned long cnt; 8 | }; 9 | 10 | static inline unsigned long 11 | ps_refcnt_get(struct ps_refcnt *rc) 12 | { 13 | return rc->cnt; 14 | } 15 | 16 | static inline void 17 | ps_refcnt_take(struct ps_refcnt *rc) 18 | { 19 | ps_faa(&rc->cnt, 1); 20 | } 21 | 22 | static inline int 23 | ps_refcnt_release(struct ps_refcnt *rc) 24 | { 25 | ps_faa(&rc->cnt, -1); 26 | 27 | return rc->cnt == 0; 28 | } 29 | 30 | #endif /* PS_REFCNT_H */ 31 | -------------------------------------------------------------------------------- /ps_slab.c: -------------------------------------------------------------------------------- 1 | /*** 2 | * Copyright 2011-2015 by Gabriel Parmer. All rights reserved. 3 | * Redistribution of this file is permitted under the BSD 2 clause license. 
4 | * 5 | * Author: Gabriel Parmer, gparmer@gwu.edu, 2011 6 | * 7 | * History: 8 | * - Initial slab allocator, 2011 9 | * - Adapted for parsec, 2015 10 | */ 11 | 12 | #include 13 | 14 | /* 15 | * Default allocation and deallocation functions: assume header is 16 | * internal to the slab's memory 17 | */ 18 | struct ps_slab * 19 | ps_slab_defalloc(struct ps_mem *m, size_t sz, coreid_t coreid) 20 | { 21 | struct ps_slab *s = ps_plat_alloc(sz, coreid); 22 | (void)coreid; (void)m; 23 | 24 | if (!s) return NULL; 25 | s->memory = s; 26 | return s; 27 | } 28 | 29 | void 30 | ps_slab_deffree(struct ps_mem *m, struct ps_slab *s, size_t sz, coreid_t coreid) 31 | { (void)m; ps_plat_free(s, sz, coreid); } 32 | 33 | void 34 | __ps_slab_init(struct ps_slab *s, struct ps_slab_info *si, PS_SLAB_PARAMS) 35 | { 36 | size_t nfree, i; 37 | size_t objmemsz = __ps_slab_objmemsz(obj_sz); 38 | struct ps_mheader *alloc, *prev; 39 | PS_SLAB_DEWARN; 40 | 41 | s->nfree = nfree = (allocsz - headoff) / objmemsz; 42 | s->memsz = allocsz; 43 | s->coreid = ps_coreid(); 44 | 45 | /* 46 | * Set up the slab's freelist 47 | * 48 | * TODO: cache coloring 49 | */ 50 | alloc = (struct ps_mheader *)((char *)s->memory + headoff); 51 | prev = s->freelist = alloc; 52 | for (i = 0 ; i < nfree ; i++, prev = alloc, alloc = (struct ps_mheader *)((char *)alloc + objmemsz)) { 53 | __ps_mhead_init(alloc, s); 54 | prev->next = alloc; 55 | } 56 | /* better not overrun memory */ 57 | assert((void *)alloc <= (void *)((char*)s->memory + allocsz)); 58 | 59 | ps_list_init(s, list); 60 | __slab_freelist_add(&si->fl, s); 61 | __ps_slab_freelist_check(&si->fl); 62 | } 63 | 64 | /* 65 | * This is not thread-safe. It may lost some concurrent remote freed 66 | * objects. So use it only for approximate accounting or debugging. 67 | */ 68 | int 69 | __ps_remote_free_cnt(struct ps_mheader *h) 70 | { 71 | struct ps_mheader *t; 72 | int ret = 0; 73 | 74 | for (t = h; t; t = t->next, ret++) ; 75 | 76 | return ret; 77 | } 78 | 79 | void 80 | ps_slabptr_init(struct ps_mem *m) 81 | { 82 | /* ns_info, slab_info and smr_info are all inlined into struct ps_mem, */ 83 | /* see ps_global.h. 
So this single memset initializes everything.*/ 84 | memset(m, 0, sizeof(struct ps_mem)); 85 | } 86 | 87 | void 88 | ps_slabptr_stats(struct ps_mem *m, struct ps_slab_stats *stats) 89 | { 90 | int i, j, k; 91 | struct ps_slab *s; 92 | struct ps_mem_percore *pc; 93 | 94 | memset(stats, 0, sizeof(struct ps_slab_stats)); 95 | 96 | for (i = 0 ; i < PS_NUMCORES ; i++) { 97 | pc = &m->percore[i]; 98 | s = pc->slab_info.fl.list; 99 | stats->percore[i].nslabs = pc->slab_info.nslabs; 100 | do { 101 | if (!s) break; 102 | stats->percore[i].npartslabs++; 103 | stats->percore[i].nfree += s->nfree; 104 | s = ps_list_next(s, list); 105 | } while (s != pc->slab_info.fl.list); 106 | 107 | for (j = 0 ; j < PS_NUMLOCALITIES ; j++) { 108 | for (k = 0 ; k < PS_NUMLOCALITIES ; k++) { 109 | stats->percore[i].nremote += __ps_remote_free_cnt(pc->slab_remote[j].remote_frees[k]); 110 | } 111 | } 112 | } 113 | } 114 | 115 | int 116 | ps_slabptr_isempty(struct ps_mem *m) 117 | { 118 | int i, j, k; 119 | struct ps_mem_percore *pc; 120 | 121 | for (i = 0 ; i < PS_NUMCORES ; i++) { 122 | pc = &m->percore[i]; 123 | if (pc->slab_info.nslabs) return 0; 124 | 125 | for (j = 0 ; j < PS_NUMLOCALITIES ; j++) { 126 | for (k = 0 ; k < PS_NUMLOCALITIES ; k++) { 127 | if (pc->slab_remote[j].remote_frees[k]) return 0; 128 | } 129 | } 130 | } 131 | return 1; 132 | } 133 | 134 | void 135 | __ps_slab_mem_remote_free(struct ps_mem *mem, struct ps_mheader *h, coreid_t core_target) 136 | { 137 | struct ps_slab_remote_list *r; 138 | coreid_t tmpcoreid; 139 | localityid_t numaid; 140 | 141 | ps_tsc_locality(&tmpcoreid, &numaid); 142 | r = &mem->percore[core_target].slab_remote[numaid]; 143 | 144 | __ps_rfl_stack_push(&(r->remote_frees[tmpcoreid % NUM_REMOTE_LIST]), h); 145 | } 146 | 147 | static inline int 148 | __ps_slab_mem_remote_clear(struct ps_mem *mem, int locality, PS_SLAB_PARAMS) 149 | { 150 | int ret = 0; 151 | unsigned int i; 152 | struct ps_mheader *h, *n; 153 | struct ps_slab_remote_list *r = &mem->percore[coreid].slab_remote[locality]; 154 | 155 | for (i = 0 ; i < NUM_REMOTE_LIST ; i++) { 156 | h = r->remote_frees[i]; 157 | if (h) h = __ps_rfl_stack_remove_all(&(r->remote_frees[i])); 158 | while (h) { 159 | n = h->next; 160 | h->next = NULL; 161 | __ps_slab_mem_free(__ps_mhead_mem(h), mem, PS_SLAB_ARGS); 162 | h = n; 163 | ret += 1; 164 | } 165 | } 166 | 167 | return ret; 168 | } 169 | 170 | /* 171 | * This function wants to contend cache-lines with another numa chip 172 | * at most once, or else the latency will blow up. It can detect this 173 | * contention fairly well with the fact that there are, or aren't, 174 | * aren't any items in the remote freelist. Thus, this function 175 | * processes the remote free lists for exactly _one_ remote numa node 176 | * each time it is called. 177 | */ 178 | void 179 | __ps_slab_mem_remote_process(struct ps_mem *mem, struct ps_slab_info *si, PS_SLAB_PARAMS) 180 | { 181 | int ret; 182 | unsigned long locality = si->remote_token; 183 | PS_SLAB_DEWARN; 184 | 185 | do { 186 | ret = __ps_slab_mem_remote_clear(mem, locality, PS_SLAB_ARGS); 187 | locality = (locality + 1) % PS_NUMLOCALITIES; 188 | } while (!ret && locality != si->remote_token); 189 | 190 | si->remote_token = locality; 191 | } 192 | -------------------------------------------------------------------------------- /ps_slab.h: -------------------------------------------------------------------------------- 1 | /*** 2 | * Copyright 2011-2015 by Gabriel Parmer. All rights reserved. 
3 | * Redistribution of this file is permitted under the BSD 2 clause license. 4 | * 5 | * Author: Gabriel Parmer, gparmer@gwu.edu, 2011 6 | * 7 | * History: 8 | * - Initial slab allocator, 2011 9 | * - Adapted for parsec, 2015 10 | */ 11 | 12 | #ifndef PS_SLAB_H 13 | #define PS_SLAB_H 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | /* #define PS_SLAB_DEBUG 1 */ 21 | 22 | /* The header for a slab. */ 23 | struct ps_slab { 24 | /* 25 | * Read-only data. coreid is read by _other_ cores, so we 26 | * want it on a separate cache-line from the frequently 27 | * modified stuff. 28 | */ 29 | void *memory; /* != NULL iff slab is separately allocated */ 30 | ps_desc_t start, end; /* A slab used as a namespace: min and max descriptor ids */ 31 | size_t memsz; /* size of backing memory */ 32 | coreid_t coreid; /* which is the home core for this slab? */ 33 | char pad[PS_CACHE_LINE-(sizeof(void *)+sizeof(size_t)+sizeof(u16_t)+sizeof(ps_desc_t)*2)]; 34 | 35 | /* Frequently modified data on the owning core... */ 36 | struct ps_mheader *freelist; /* free objs in this slab */ 37 | struct ps_list list; /* freelist of slabs */ 38 | size_t nfree; /* # allocations in freelist */ 39 | } PS_PACKED; 40 | 41 | 42 | /*** Operations on the freelist of slabs ***/ 43 | 44 | /* 45 | * These functions should really must be statically computed for 46 | * efficiency (see macros below)... 47 | */ 48 | static inline unsigned long 49 | __ps_slab_objmemsz(size_t obj_sz) 50 | { return PS_RNDUP(obj_sz + sizeof(struct ps_mheader), PS_WORD); } 51 | static inline unsigned long 52 | __ps_slab_max_nobjs(size_t obj_sz, size_t allocsz, size_t headoff) 53 | { return (allocsz - headoff) / __ps_slab_objmemsz(obj_sz); } 54 | /* The offset of the given object in its slab */ 55 | static inline unsigned long 56 | __ps_slab_objsoff(struct ps_slab *s, struct ps_mheader *h, size_t obj_sz, size_t headoff) 57 | { return ((unsigned long)h - ((unsigned long)s->memory + headoff)) / __ps_slab_objmemsz(obj_sz); } 58 | 59 | #ifdef PS_SLAB_DEBUG 60 | static inline void 61 | __ps_slab_check_consistency(struct ps_slab *s) 62 | { 63 | struct ps_mheader *h; 64 | unsigned int i; 65 | 66 | assert(s); 67 | h = s->freelist; 68 | for (i = 0 ; h ; i++) { 69 | assert(h->slab == s); 70 | assert(h->tsc_free != 0); 71 | h = h->next; 72 | } 73 | assert(i == s->nfree); 74 | } 75 | 76 | static inline void 77 | __ps_slab_freelist_check(struct ps_slab_freelist *fl) 78 | { 79 | struct ps_slab *s = fl->list; 80 | 81 | if (!s) return; 82 | do { 83 | assert(s->memory && s->freelist); 84 | assert(ps_list_prev(ps_list_next(s, list), list) == s); 85 | assert(ps_list_next(ps_list_prev(s, list), list) == s); 86 | __ps_slab_check_consistency(s); 87 | } while ((s = ps_list_next(s, list)) != fl->list); 88 | } 89 | #else /* PS_SLAB_DEBUG */ 90 | static inline void __ps_slab_check_consistency(struct ps_slab *s) { (void)s; } 91 | static inline void __ps_slab_freelist_check(struct ps_slab_freelist *fl) { (void)fl; } 92 | #endif /* PS_SLAB_DEBUG */ 93 | 94 | static void 95 | __slab_freelist_rem(struct ps_slab_freelist *fl, struct ps_slab *s) 96 | { 97 | assert(s && fl); 98 | if (fl->list == s) { 99 | if (ps_list_singleton(s, list)) fl->list = NULL; 100 | else fl->list = ps_list_next(s, list); 101 | } 102 | ps_list_rem(s, list); 103 | } 104 | 105 | static void 106 | __slab_freelist_add(struct ps_slab_freelist *fl, struct ps_slab *s) 107 | { 108 | assert(s && fl); 109 | assert(ps_list_singleton(s, list)); 110 | assert(s != fl->list); 111 | if (fl->list) 
ps_list_add(fl->list, s, list); 112 | fl->list = s; 113 | /* TODO: sort based on emptiness...just use N bins */ 114 | } 115 | 116 | /*** Alloc and free ***/ 117 | 118 | #define PS_SLAB_PARAMS coreid_t coreid, size_t obj_sz, size_t allocsz, size_t headoff, ps_alloc_fn_t afn, ps_free_fn_t ffn 119 | #define PS_SLAB_ARGS coreid, obj_sz, allocsz, headoff, afn, ffn 120 | #define PS_SLAB_DEWARN (void)coreid; (void)obj_sz; (void)allocsz; (void)headoff; (void)afn; (void)ffn 121 | 122 | /* Create function prototypes for cross-object usage */ 123 | #define PS_SLAB_CREATE_PROTOS(name) \ 124 | inline void *ps_slab_alloc_##name(void); \ 125 | inline void ps_slab_free_##name(void *buf); \ 126 | inline size_t ps_slab_objmem_##name(void); \ 127 | inline size_t ps_slab_nobjs_##name(void); 128 | 129 | void __ps_slab_mem_remote_free(struct ps_mem *mem, struct ps_mheader *h, coreid_t core_target); 130 | void __ps_slab_mem_remote_process(struct ps_mem *mem, struct ps_slab_info *si, PS_SLAB_PARAMS); 131 | void __ps_slab_init(struct ps_slab *s, struct ps_slab_info *si, PS_SLAB_PARAMS); 132 | void ps_slab_deffree(struct ps_mem *m, struct ps_slab *x, size_t sz, coreid_t coreid); 133 | struct ps_slab *ps_slab_defalloc(struct ps_mem *m, size_t sz, coreid_t coreid); 134 | void ps_slabptr_init(struct ps_mem *m); 135 | int ps_slabptr_isempty(struct ps_mem *m); 136 | 137 | struct ps_slab_stats { 138 | struct { 139 | size_t nslabs, npartslabs, nfree, nremote; 140 | } percore[PS_NUMCORES]; 141 | }; 142 | void ps_slabptr_stats(struct ps_mem *m, struct ps_slab_stats *stats); 143 | 144 | static inline void 145 | __ps_slab_mem_free(void *buf, struct ps_mem *mem, PS_SLAB_PARAMS) 146 | { 147 | struct ps_slab *s; 148 | struct ps_mheader *h, *next; 149 | unsigned int max_nobjs = __ps_slab_max_nobjs(obj_sz, allocsz, headoff); 150 | struct ps_slab_freelist *fl; 151 | coreid_t target; 152 | assert(__ps_slab_objmemsz(obj_sz) + headoff <= allocsz); 153 | PS_SLAB_DEWARN; 154 | 155 | h = __ps_mhead_get(buf); 156 | assert(!__ps_mhead_isfree(h)); /* freeing freed memory? 
*/ 157 | s = h->slab; 158 | assert(s); 159 | 160 | target = s->coreid; 161 | if (unlikely(target != coreid)) { 162 | __ps_slab_mem_remote_free(mem, h, target); 163 | return; 164 | } 165 | 166 | __ps_mhead_setfree(h, 1); 167 | next = s->freelist; 168 | s->freelist = h; /* TODO: should be atomic/locked */ 169 | h->next = next; 170 | s->nfree++; /* TODO: ditto */ 171 | 172 | if (s->nfree == max_nobjs) { 173 | struct ps_slab_info *si = &mem->percore[coreid].slab_info; 174 | 175 | /* remove from the freelist */ 176 | fl = &si->fl; 177 | si->nslabs--; 178 | __slab_freelist_rem(fl, s); 179 | ffn(mem, s, s->memsz, coreid); 180 | } else if (s->nfree == 1) { 181 | fl = &mem->percore[coreid].slab_info.fl; 182 | /* add back onto the freelists */ 183 | assert(ps_list_singleton(s, list)); 184 | assert(s->memory && s->freelist); 185 | __slab_freelist_add(fl, s); 186 | } 187 | __ps_slab_freelist_check(&mem->percore[coreid].slab_info.fl); 188 | 189 | return; 190 | } 191 | 192 | static inline void * 193 | __ps_slab_mem_alloc(struct ps_mem *mem, PS_SLAB_PARAMS) 194 | { 195 | struct ps_slab *s; 196 | struct ps_mheader *h; 197 | struct ps_slab_info *si = &mem->percore[coreid].slab_info; 198 | assert(obj_sz + headoff <= allocsz); 199 | PS_SLAB_DEWARN; 200 | 201 | si->salloccnt++; 202 | if (unlikely((si->salloccnt % PS_REMOTE_BATCH) == 0)) { 203 | __ps_slab_mem_remote_process(mem, si, PS_SLAB_ARGS); 204 | } 205 | 206 | s = si->fl.list; 207 | if (unlikely(!s)) { 208 | /* allocation function must initialize s->memory */ 209 | s = afn(mem, allocsz, coreid); 210 | if (unlikely(!s)) return NULL; 211 | 212 | __ps_slab_init(s, si, PS_SLAB_ARGS); 213 | si->nslabs++; 214 | assert(s->memory && s->freelist); 215 | } 216 | 217 | assert(s && s->freelist); 218 | /* TODO: atomic modification to the freelist */ 219 | h = s->freelist; 220 | s->freelist = h->next; 221 | h->next = NULL; 222 | s->nfree--; 223 | __ps_mhead_reset(h); 224 | 225 | /* remove from the freelist */ 226 | if (s->nfree == 0) { 227 | __slab_freelist_rem(&si->fl, s); 228 | assert(ps_list_singleton(s, list)); 229 | } 230 | assert(!__ps_mhead_isfree(h)); 231 | __ps_slab_freelist_check(&si->fl); 232 | 233 | return __ps_mhead_mem(h); 234 | } 235 | 236 | 237 | /*** 238 | * This macro is very important for high-performance. It creates the 239 | * functions for allocation and deallocation passing in the freelist 240 | * directly, and size information for these objects, thus enabling the 241 | * compiler to do partial evaluation. This avoids freelist lookups, 242 | * and relies on the compilers optimizations to generate specialized 243 | * code for the given sizes -- requiring function inlining, constant 244 | * propagation, and dead-code elimination. To me, relying on these 245 | * optimizations is better than putting all of the code for allocation 246 | * and deallocation in the macro due to maintenance and readability. 
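 * For example, ps_ns.c declares PS_SLAB_CREATE_DEF(slabhead, sizeof(struct ps_slab)),
 * which generates ps_slab_alloc_slabhead(), ps_slab_free_slabhead(), and
 * ps_slab_init_slabhead() specialized for that object size with
 * PS_PAGE_SIZE-sized backing allocations.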
247 | */ 248 | #define __PS_SLAB_CREATE_FNS(name, obj_sz, allocsz, headoff, afn, ffn) \ 249 | static inline void * \ 250 | ps_slabptr_alloc_##name(struct ps_mem *m) \ 251 | { return __ps_slab_mem_alloc(m, ps_coreid(), obj_sz, allocsz, headoff, afn, ffn); } \ 252 | static inline void \ 253 | ps_slabptr_free_coreid_##name(struct ps_mem *m, void *buf, coreid_t coreid) \ 254 | { __ps_slab_mem_free(buf, m, coreid, obj_sz, allocsz, headoff, afn, ffn); } \ 255 | static inline void \ 256 | ps_slabptr_free_##name(struct ps_mem *m, void *buf) \ 257 | { ps_slabptr_free_coreid_##name(m, buf, ps_coreid()); } \ 258 | static inline void * \ 259 | ps_slab_alloc_##name(void) \ 260 | { return ps_slabptr_alloc_##name(&__ps_mem_##name); } \ 261 | static inline void \ 262 | ps_slab_free_##name(void *buf) \ 263 | { ps_slabptr_free_##name(&__ps_mem_##name, buf); } \ 264 | static inline void \ 265 | ps_slab_free_coreid_##name(void *buf, coreid_t curr) \ 266 | { ps_slabptr_free_coreid_##name(&__ps_mem_##name, buf, curr); } \ 267 | static inline void \ 268 | ps_slabptr_init_##name(struct ps_mem *m) \ 269 | { ps_slabptr_init(m); } \ 270 | static inline void \ 271 | ps_slab_init_##name(void) \ 272 | { ps_slabptr_init_##name(&__ps_mem_##name); } \ 273 | static inline struct ps_mem * \ 274 | ps_slabptr_create_##name(void) \ 275 | { \ 276 | struct ps_mem *m = ps_plat_alloc(sizeof(struct ps_mem), ps_coreid()); \ 277 | if (m) ps_slabptr_init_##name(m); \ 278 | return m; \ 279 | } \ 280 | static inline void \ 281 | ps_slabptr_delete_##name(struct ps_mem *m) \ 282 | { ps_plat_free(m, sizeof(struct ps_mem), ps_coreid()); } \ 283 | static inline size_t \ 284 | ps_slab_objmem_##name(void) \ 285 | { return __ps_slab_objmemsz(obj_sz); } \ 286 | static inline size_t \ 287 | ps_slab_nobjs_##name(void) \ 288 | { return __ps_slab_max_nobjs(obj_sz, allocsz, headoff); } \ 289 | static inline unsigned int \ 290 | ps_slab_objoff_##name(void *obj) \ 291 | { \ 292 | struct ps_mheader *h = __ps_mhead_get(obj); \ 293 | return __ps_slab_objsoff(h->slab, h, obj_sz, headoff); \ 294 | } 295 | 296 | /* 297 | * allocsz is the size of the backing memory allocation, and 298 | * headintern is 0 or 1, should the ps_slab header be internally 299 | * allocated from that slab of memory, or from elsewhere. 300 | * 301 | * Note: if you use headintern == 1, then you must manually create 302 | * PS_SLAB_CREATE_DEF(meta, sizeof(struct ps_slab)); 303 | */ 304 | #define PS_SLAB_CREATE_AFNS(name, size, allocsz, headoff, allocfn, freefn) \ 305 | __PS_MEM_CREATE_DATA(name) \ 306 | __PS_SLAB_CREATE_FNS(name, size, allocsz, headoff, allocfn, freefn) 307 | 308 | #define PS_SLAB_CREATE(name, size, allocsz) \ 309 | PS_SLAB_CREATE_AFNS(name, size, allocsz, sizeof(struct ps_slab), ps_slab_defalloc, ps_slab_deffree) 310 | 311 | #define PS_SLAB_CREATE_DEF(name, size) \ 312 | PS_SLAB_CREATE(name, size, PS_PAGE_SIZE) 313 | 314 | #endif /* PS_SLAB_H */ 315 | -------------------------------------------------------------------------------- /ps_smr.c: -------------------------------------------------------------------------------- 1 | /*** 2 | * Copyright 2014-2015 by Gabriel Parmer. All rights reserved. 3 | * Redistribution of this file is permitted under the BSD 2 clause license. 4 | * 5 | * Authors: Qi Wang, interwq@gmail.com, Gabriel Parmer, gparmer@gwu.edu, 2015 6 | * 7 | * History: 8 | * - Started as parsec.c and parsec.h by Qi. 
9 | */ 10 | 11 | #include 12 | 13 | 14 | void __ps_timing_info_init(struct parsec *ps, ps_tsc_t now); 15 | void __ps_smr_account_init(struct ps_smr_info *si); 16 | 17 | int 18 | __ps_smr_reclaim_batch(int batch, ps_tsc_t qsc, coreid_t curr, struct ps_qsc_list *ql, 19 | struct ps_mem *m, ps_free_fn_t ffn) 20 | { 21 | int i; 22 | struct ps_mheader *a; 23 | /* Remove a batch worth of items from the qlist */ 24 | for (i = 0 ; i < batch ; i++) { 25 | a = __ps_qsc_peek(ql); 26 | if (!a || a->tsc_free > qsc) break; 27 | 28 | a = __ps_qsc_dequeue(ql); 29 | assert(a && __ps_mhead_isfree(a)); 30 | __ps_mhead_reset(a); 31 | ffn(m, __ps_mhead_mem(a), 0, curr); 32 | } 33 | return i; 34 | } 35 | 36 | size_t 37 | ps_smr_nqueued(struct ps_mem *m) 38 | { return m->percore[ps_coreid()].smr_info.account.qmemcnt; } 39 | 40 | void 41 | ps_init(struct parsec *ps) 42 | { 43 | ps_tsc_t now = ps_tsc(); 44 | int i; 45 | 46 | assert(ps); 47 | memset(ps, 0, sizeof(struct parsec)); 48 | 49 | ps->refcnt = 0; 50 | for (i = 0 ; i < PS_NUMCORES ; i++) { 51 | struct ps_quiescence_timing *t = &ps->timing_info[i].timing; 52 | 53 | t->time_in = now; 54 | t->time_out = t->time_in + 1; 55 | } 56 | __ps_timing_info_init(ps, now); 57 | } 58 | 59 | struct parsec * 60 | ps_alloc(void) 61 | { 62 | struct parsec *ps = ps_plat_alloc(sizeof(struct parsec), ps_coreid()); 63 | 64 | if (!ps) return NULL; 65 | ps_init(ps); 66 | 67 | return ps; 68 | } 69 | 70 | int 71 | ps_free(struct parsec *ps) 72 | { 73 | if (ps->refcnt > 0) return -1; 74 | ps_plat_free(ps, sizeof(struct parsec), ps_coreid()); 75 | 76 | return 0; 77 | } 78 | 79 | void 80 | __ps_memptr_init(struct ps_mem *m, struct parsec *ps) 81 | { 82 | struct ps_mem_percore *pc = &m->percore[0]; 83 | int i; 84 | 85 | assert(m && ps); 86 | for (i = 0 ; i < PS_NUMCORES ; i++) { 87 | __ps_smr_account_init(&pc[i].smr_info); 88 | pc[i].smr_info.ps = ps; 89 | } 90 | ps->refcnt++; 91 | } 92 | 93 | int 94 | __ps_memptr_delete(struct ps_mem *m) 95 | { 96 | struct ps_mem_percore *pc = &m->percore[0]; 97 | struct parsec *ps = pc->smr_info.ps; 98 | int i; 99 | 100 | if (!ps) return 0; 101 | if (!ps_slabptr_isempty(m)) return -1; 102 | for (i = 0 ; i < PS_NUMCORES ; i++) { 103 | if (__ps_qsc_peek(&pc[i].smr_info.qsc_list)) return -1; 104 | } 105 | ps->refcnt--; 106 | /* TODO: actually delete it iff refcnt == 0 */ 107 | 108 | return 0; 109 | } 110 | -------------------------------------------------------------------------------- /ps_smr.h: -------------------------------------------------------------------------------- 1 | /*** 2 | * Copyright 2014-2015 by Gabriel Parmer. All rights reserved. 3 | * Redistribution of this file is permitted under the BSD 2 clause license. 4 | * 5 | * Authors: Qi Wang, interwq@gmail.com, Gabriel Parmer, gparmer@gwu.edu, 2015 6 | * 7 | * History: 8 | * - Started as parsec.c and parsec.h by Qi. 9 | */ 10 | 11 | /*** 12 | * A Scalable Memory Reclamation (SMR) technique built off of the slab 13 | * allocator for parsec (parallel sections). Maintains a freelist per 14 | * slab with memory items ordered in terms of the Time Stamp Counter 15 | * (tsc) taken when the node was freed. Removal from these queues is 16 | * governed by quiescence of parallel threads at the time the memory 17 | * was freed (which might be some time in the past). This code 18 | * specifies the policy for when memory flows between the quiescing 19 | * queues, and the slab memory. 
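 *
 * A usage sketch (following tests/ht.c.inprogress, assuming an allocator
 * declared with something like PS_PARSLAB_CREATE(item300, 300, PS_PAGE_SIZE)
 * and an initialized struct parsec ps):
 *
 *   ps_mem_init_item300(&ps);
 *   ps_enter(&ps);
 *   void *m = ps_mem_alloc_item300();
 *   ... read/write m inside the parallel section ...
 *   ps_mem_free_item300(m);   (queued for quiescence, reclaimed later)
 *   ps_exit(&ps);
 *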
Moving memory back to the slabs is 20 | * important to enable us to reclaim and migrate memory between cores 21 | * (each slab is owned by a core), thus there is some balancing to be 22 | * done here. 23 | */ 24 | 25 | #ifndef PS_SMR_H 26 | #define PS_SMR_H 27 | 28 | #include 29 | #include 30 | #include 31 | #include 32 | 33 | struct parsec { 34 | int refcnt; 35 | struct ps_smr_percore timing_info[PS_NUMCORES] PS_ALIGNED; 36 | } PS_ALIGNED; 37 | 38 | void ps_init(struct parsec *ps); 39 | struct parsec *ps_alloc(void); 40 | void __ps_smr_reclaim(coreid_t curr, struct ps_qsc_list *ql, struct ps_smr_info *si, struct ps_mem *mem, ps_free_fn_t ffn); 41 | void __ps_invoke_smr(coreid_t curr, struct ps_qsc_list *ql, struct ps_smr_info *si, struct ps_mem *mem, ps_free_fn_t ffn); 42 | void __ps_memptr_init(struct ps_mem *m, struct parsec *ps); 43 | int __ps_memptr_delete(struct ps_mem *m); 44 | 45 | 46 | static inline void 47 | __ps_smr_free(void *buf, struct ps_mem *mem, ps_free_fn_t ffn) 48 | { 49 | struct ps_mheader *m = __ps_mhead_get(buf); 50 | struct ps_smr_info *si; 51 | struct ps_qsc_list *ql; 52 | coreid_t curr_core, curr_numa; 53 | ps_tsc_t tsc; 54 | 55 | /* this is 85% of the cost of the function... */ 56 | tsc = ps_tsc_locality(&curr_core, &curr_numa); 57 | 58 | si = &mem->percore[curr_core].smr_info; 59 | ql = &si->qsc_list; 60 | 61 | /* 62 | * Note: we currently enqueue remotely freed memory into the 63 | * qlist of the core the memory is freed on, later to be moved 64 | * to its native core by the remote free logic within the slab 65 | * allocator. This might cause some cache coherency traffic 66 | * that we wouldn't otherwise have due to qlist operations 67 | * (i.e. writing to the ->next field within the header), but 68 | * has the large benefit that we don't have to complicate the 69 | * free-time ordering of memory chunks in the quiescence list. 70 | */ 71 | __ps_mhead_setfree(m, tsc); 72 | __ps_qsc_enqueue(ql, m); 73 | si->account.qmemcnt++; 74 | __ps_invoke_smr(curr_core, ql, si, mem, ffn); 75 | } 76 | 77 | static inline void 78 | __ps_quiesce(struct ps_mem *mem, ps_free_fn_t ffn) 79 | { 80 | struct ps_smr_info *si; 81 | struct ps_qsc_list *ql; 82 | coreid_t curr; 83 | 84 | curr = ps_coreid(); 85 | si = &mem->percore[curr].smr_info; 86 | ql = &si->qsc_list; 87 | __ps_smr_reclaim(curr, ql, si, mem, ffn); 88 | 89 | return; 90 | } 91 | 92 | static inline void 93 | ps_enter(struct parsec *parsec) 94 | { 95 | coreid_t curr_cpu, curr_numa; 96 | ps_tsc_t curr_time; 97 | struct ps_quiescence_timing *timing; 98 | 99 | curr_time = ps_tsc_locality(&curr_cpu, &curr_numa); 100 | 101 | timing = &(parsec->timing_info[curr_cpu].timing); 102 | timing->time_in = curr_time; 103 | /* 104 | * The following is needed when we have coarse granularity 105 | * time-stamps (i.e. non-cycle granularity, which means we 106 | * could have same time-stamp for different events). 107 | */ 108 | /* timing->time_out = curr_time - 1; */ 109 | 110 | ps_mem_fence(); 111 | 112 | return; 113 | } 114 | 115 | static inline int 116 | __ps_in_lib(struct ps_quiescence_timing *timing) 117 | { return timing->time_out <= timing->time_in; } 118 | 119 | static inline void 120 | ps_exit(struct parsec *parsec) 121 | { 122 | int curr_cpu = ps_coreid(); 123 | struct ps_quiescence_timing *timing; 124 | 125 | timing = &(parsec->timing_info[curr_cpu].timing); 126 | /* 127 | * Here we don't require a full memory barrier on x86 -- only 128 | * a compiler barrier is enough. 
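 * (On x86's total-store-order memory model a store is not reordered with
 * earlier loads or stores, so ordering the time_out update after the
 * section's accesses only requires preventing compiler reordering.)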
129 | */ 130 | ps_cc_barrier(); 131 | timing->time_out = timing->time_in + 1; 132 | 133 | return; 134 | } 135 | 136 | #define __PS_PARSLAB_CREATE_AFNS(name, objsz, allocsz, headoff, allocfn, freefn) \ 137 | PS_SLAB_CREATE_AFNS(name, objsz, allocsz, headoff, allocfn, freefn) \ 138 | static inline void \ 139 | __ps_parslab_free_tramp_##name(struct ps_mem *m, struct ps_slab *s, size_t sz, coreid_t c) \ 140 | { (void)sz; ps_slabptr_free_coreid_##name(m, s, c); } \ 141 | static inline void * \ 142 | ps_memptr_alloc_##name(struct ps_mem *m) \ 143 | { return ps_slabptr_alloc_##name(m); } \ 144 | static inline void * \ 145 | ps_mem_alloc_##name(void) \ 146 | { return ps_slab_alloc_##name(); } \ 147 | static inline void \ 148 | ps_memptr_free_##name(struct ps_mem *m, void *buf) \ 149 | { __ps_smr_free(buf, m, __ps_parslab_free_tramp_##name); } \ 150 | static inline void \ 151 | ps_mem_free_##name(void *buf) \ 152 | { ps_memptr_free_##name(&__ps_mem_##name, buf); } \ 153 | static void \ 154 | ps_memptr_init_##name(struct ps_mem *m, struct parsec *ps) \ 155 | { __ps_memptr_init(m, ps); } \ 156 | static inline void \ 157 | ps_mem_init_##name(struct parsec *ps) \ 158 | { \ 159 | ps_slabptr_init_##name(&__ps_mem_##name); \ 160 | ps_memptr_init_##name(&__ps_mem_##name, ps); \ 161 | } \ 162 | static inline struct ps_mem * \ 163 | ps_memptr_create_##name(struct parsec *ps) \ 164 | { \ 165 | struct ps_mem *m = ps_slabptr_create_##name(); \ 166 | if (!m) return NULL; \ 167 | ps_memptr_init_##name(m, ps); \ 168 | return m; \ 169 | } \ 170 | static inline int \ 171 | ps_memptr_delete_##name(struct ps_mem *m) \ 172 | { \ 173 | if (__ps_memptr_delete(m)) return -1; \ 174 | ps_slabptr_delete_##name(m); \ 175 | return 0; \ 176 | } \ 177 | static inline int \ 178 | ps_mem_delete_##name(void) \ 179 | { return ps_memptr_delete_##name(&__ps_mem_##name); } \ 180 | static inline void \ 181 | ps_memptr_quiesce_##name(struct ps_mem *m) \ 182 | { __ps_quiesce(m, __ps_parslab_free_tramp_##name); } \ 183 | static inline void \ 184 | ps_quiesce_##name(void) \ 185 | { __ps_quiesce(&__ps_mem_##name, __ps_parslab_free_tramp_##name); } 186 | 187 | 188 | #define PS_PARSLAB_CREATE_AFNS(name, objsz, allocsz, allocfn, freefn) \ 189 | __PS_PARSLAB_CREATE_AFNS(name, objsz, allocsz, sizeof(struct ps_slab), allocfn, freefn) 190 | 191 | #define PS_PARSLAB_CREATE(name, objsz, allocsz) \ 192 | PS_PARSLAB_CREATE_AFNS(name, objsz, allocsz, ps_slab_defalloc, ps_slab_deffree) 193 | 194 | 195 | #endif /* PS_SMR_H */ 196 | -------------------------------------------------------------------------------- /quiesce_type/general/ps_quiesce.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | int __ps_smr_reclaim_batch(int batch, ps_tsc_t qsc, coreid_t curr, struct ps_qsc_list *ql, struct ps_mem *m, ps_free_fn_t ffn); 4 | 5 | inline void 6 | __ps_invoke_smr(coreid_t curr, struct ps_qsc_list *ql, struct ps_smr_info *si, struct ps_mem *mem, ps_free_fn_t ffn) 7 | { 8 | if (unlikely(si->account.qmemcnt >= si->account.qmemtarget)) __ps_smr_reclaim(curr, ql, si, mem, ffn); 9 | } 10 | 11 | static inline void 12 | __ps_timing_update_remote(struct parsec *parsec, struct ps_smr_percore *curr, int remote_cpu) 13 | { 14 | struct ps_quiescence_timing *cpu_i; 15 | 16 | cpu_i = &(parsec->timing_info[remote_cpu].timing); 17 | 18 | curr->timing_others[remote_cpu].time_in = cpu_i->time_in; 19 | curr->timing_others[remote_cpu].time_out = cpu_i->time_out; 20 | 21 | /* 22 | * We are reading remote cachelines possibly, so 
this time 23 | * stamp reading cost is fine. 24 | */ 25 | curr->timing_others[remote_cpu].time_updated = ps_tsc(); 26 | 27 | /* If remote core has information that can help, use it. */ 28 | if (curr->timing.last_known_quiescence < cpu_i->last_known_quiescence) { 29 | curr->timing.last_known_quiescence = cpu_i->last_known_quiescence; 30 | } 31 | 32 | ps_mem_fence(); 33 | 34 | return; 35 | } 36 | 37 | static int 38 | ps_quiesce(struct parsec *parsec, ps_tsc_t tsc, const int blocking, ps_tsc_t *qsc) 39 | { 40 | int inlib_curr, qsc_cpu, curr_cpu, first_try, i, done_i; 41 | ps_tsc_t min_known_qsc; 42 | ps_tsc_t in, out, update; 43 | struct ps_smr_percore *cpuinfo; 44 | struct ps_quiescence_timing *timing_local; 45 | ps_tsc_t time_check; 46 | assert(parsec); 47 | 48 | time_check = tsc; 49 | curr_cpu = ps_coreid(); 50 | cpuinfo = &(parsec->timing_info[curr_cpu]); 51 | timing_local = &cpuinfo->timing; 52 | inlib_curr = __ps_in_lib(timing_local); 53 | 54 | *qsc = timing_local->last_known_quiescence; 55 | /* 56 | * We cannot attempt quiescence for a time after we entered 57 | * the library. By the definition of quiescence, this is not 58 | * possible. Thus, ensure quiescence on the current core: 59 | * either time_in > time_check, or we are not in the lib right 60 | * now. Either call ps_quiesce when we aren't in the library, 61 | * or for a quiescence period _before_ when we entered. 62 | */ 63 | if (unlikely((time_check > timing_local->time_in) && inlib_curr)) return -EQUIESCENCE; 64 | 65 | min_known_qsc = (unsigned long long)(-1); /* start with the largest value */ 66 | for (i = 1 ; i < PS_NUMCORES ; i++) { 67 | /* Make sure we don't all hammer core 0... */ 68 | qsc_cpu = (curr_cpu + i) % PS_NUMCORES; 69 | assert(qsc_cpu != curr_cpu); 70 | 71 | first_try = 1; 72 | done_i = 0; 73 | re_check: 74 | /* If we can use the quiescence for another core */ 75 | if (time_check < timing_local->last_known_quiescence) break; 76 | 77 | /* Use our cached values of the other core's values */ 78 | in = cpuinfo->timing_others[qsc_cpu].time_in; 79 | out = cpuinfo->timing_others[qsc_cpu].time_out; 80 | update = cpuinfo->timing_others[qsc_cpu].time_updated; 81 | 82 | /* 83 | * If the time is before the last in-tsc, or the other 84 | * cores has entered and exited the parallel section, 85 | * and our updated version of its timing happened 86 | * before the time in question, this core is done. 87 | */ 88 | if ((time_check < in) || ((time_check < update) && (in < out))) done_i = 1; 89 | 90 | if (done_i) { 91 | /* 92 | * We want to update our own version of the 93 | * time furthest into the past that quiescence 94 | * has been observed. 95 | */ 96 | /* assertion: update >= in */ 97 | if (in < out) { 98 | if (min_known_qsc > update) min_known_qsc = update; 99 | } else { 100 | if (min_known_qsc > in) min_known_qsc = in; 101 | } 102 | continue; /* move on to the next core... */ 103 | } 104 | 105 | /* 106 | * If no blocking allowed, then read at most one remote 107 | * cacheline per core. 108 | */ 109 | if (first_try) first_try = 0; 110 | else if (!blocking) return -1; 111 | 112 | /* 113 | * If we couldn't satisfy the quiescence locally, then 114 | * we need to update our cached state for the remote 115 | * core. 116 | */ 117 | __ps_timing_update_remote(parsec, cpuinfo, qsc_cpu); 118 | 119 | goto re_check; 120 | } 121 | 122 | /* 123 | * Update our cached value of the last known quiescence value. 
124 | * This is a little complicated as it can be updated to the 125 | * min_known_qsc if we had to iterate through all cores (thus 126 | * we likely found an improvement to our previous value. 127 | */ 128 | if (PS_NUMCORES > 1 && i == PS_NUMCORES) { 129 | if (inlib_curr && (min_known_qsc > timing_local->time_in)) { 130 | min_known_qsc = timing_local->time_in; 131 | } 132 | 133 | assert(min_known_qsc < (unsigned long long)(-1)); 134 | /* 135 | * This implies we went through all cores. Thus the 136 | * min_known_quie can be used to determine global 137 | * quiescence. 138 | */ 139 | if (timing_local->last_known_quiescence < min_known_qsc) { 140 | *qsc = timing_local->last_known_quiescence = min_known_qsc; 141 | } 142 | ps_mem_fence(); 143 | } 144 | 145 | return 0; 146 | } 147 | 148 | /* 149 | * Blocking and non-blocking versions of quiescence. By default, we 150 | * should only use the non-blocking version (i.e. the system should be 151 | * wait-free), but we might run out of memory if this is the case. 152 | */ 153 | int 154 | ps_quiesce_wait(struct parsec *p, ps_tsc_t tsc, ps_tsc_t *qsc_tsc) 155 | { return ps_quiesce(p, tsc, 1, qsc_tsc); } 156 | 157 | int 158 | ps_try_quiesce(struct parsec *p, ps_tsc_t tsc, ps_tsc_t *qsc_tsc) 159 | { return ps_quiesce(p, tsc, 0, qsc_tsc); } 160 | 161 | /* 162 | * We assume that the quiescence queue has at least PS_QLIST_BATCH items 163 | * in it. 164 | */ 165 | void 166 | __ps_smr_reclaim(coreid_t curr, struct ps_qsc_list *ql, struct ps_smr_info *si, 167 | struct ps_mem *m, ps_free_fn_t ffn) 168 | { 169 | struct parsec *ps = m->percore[curr].smr_info.ps; 170 | struct ps_mheader *a = __ps_qsc_peek(ql); 171 | int increase_backlog = 0, r = 0; 172 | ps_tsc_t qsc, tsc; 173 | assert(ps && ql && si); 174 | 175 | if (!a) return ; 176 | tsc = a->tsc_free; 177 | if (ps_try_quiesce(ps, tsc, &qsc)) increase_backlog = 1; 178 | else r = __ps_smr_reclaim_batch(PS_QLIST_BATCH, qsc, curr, ql, m, ffn); 179 | si->account.qmemcnt -= r; 180 | if (r < PS_QLIST_BATCH) increase_backlog = 1; 181 | if (increase_backlog) si->account.qmemtarget += PS_QLIST_BATCH; /* TODO: shrink target */ 182 | 183 | return; 184 | 185 | } 186 | 187 | void 188 | __ps_timing_info_init(struct parsec *ps, ps_tsc_t now) 189 | { 190 | int i, j; 191 | for (i = 0 ; i < PS_NUMCORES ; i++) { 192 | ps->timing_info[i].timing.last_known_quiescence = now; 193 | for (j = 0 ; j < PS_NUMCORES ; j++) { 194 | struct __ps_other_core *o = &ps->timing_info[i].timing_others[j]; 195 | 196 | o->time_in = o->time_out = o->time_updated = now; 197 | o->time_out++; 198 | } 199 | } 200 | } 201 | 202 | void 203 | __ps_smr_account_init(struct ps_smr_info *si) 204 | { 205 | si->account.qmemtarget = PS_QLIST_BATCH; 206 | si->account.qmemcnt = 0; 207 | } 208 | -------------------------------------------------------------------------------- /quiesce_type/general/ps_quiesce_impl.h: -------------------------------------------------------------------------------- 1 | #ifndef PS_QUIESCE_IMPL_H 2 | #define PS_QUIESCE_IMPL_H 3 | 4 | #ifndef PS_QLIST_BATCH 5 | #define PS_QLIST_BATCH 128 6 | #endif 7 | 8 | struct ps_quiescence_timing { 9 | volatile ps_tsc_t time_in, time_out; 10 | volatile ps_tsc_t last_known_quiescence; 11 | char __padding[PS_CACHE_PAD_SZ(3*sizeof(ps_tsc_t))]; 12 | } PS_ALIGNED PS_PACKED; 13 | 14 | struct __ps_other_core { 15 | ps_tsc_t time_in, time_out, time_updated; 16 | }; 17 | 18 | struct ps_smr_percore { 19 | /* ps_quiescence_timing info of this CPU */ 20 | struct ps_quiescence_timing timing; 21 | /* 
ps_quiescence_timing info of other CPUs known by this CPU */ 22 | struct __ps_other_core timing_others[PS_NUMCORES]; 23 | /* padding an additional cacheline for prefetching */ 24 | char __padding[PS_CACHE_PAD_SZ(sizeof(struct __ps_other_core)*PS_NUMCORES + sizeof(struct ps_quiescence_timing))]; 25 | } PS_ALIGNED PS_PACKED; 26 | 27 | struct ps_qsc_account { 28 | size_t qmemcnt; /* # of items in the qsc_list */ 29 | size_t qmemtarget; /* # of items in qsc_list before we attempt to quiesce */ 30 | }; 31 | 32 | struct parsec; 33 | int ps_quiesce_wait(struct parsec *p, ps_tsc_t tsc, ps_tsc_t *qsc); 34 | int ps_try_quiesce (struct parsec *p, ps_tsc_t tsc, ps_tsc_t *qsc); 35 | 36 | #endif /* PS_QUIESCE_IMPL_H */ 37 | -------------------------------------------------------------------------------- /quiesce_type/real_time/ps_quiesce.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | static inline void 4 | __ps_timing_update_remote(struct parsec *parsec, struct ps_quiescence_timing *curr, int remote_cpu) 5 | { 6 | struct ps_quiescence_timing *cpu_i; 7 | 8 | cpu_i = &(parsec->timing_info[remote_cpu].timing); 9 | curr->time_in = cpu_i->time_in; 10 | curr->time_out = cpu_i->time_out; 11 | 12 | ps_mem_fence(); 13 | return; 14 | } 15 | 16 | void 17 | ps_quiesce(struct parsec *parsec, coreid_t curr, ps_tsc_t *qsc) 18 | { 19 | int i, qsc_cpu; 20 | ps_tsc_t min_known_qsc; 21 | struct ps_quiescence_timing t; 22 | 23 | min_known_qsc = ps_tsc(); 24 | for (i = 1 ; i < PS_NUMCORES; i++) { 25 | /* Make sure we don't all hammer core 0... */ 26 | qsc_cpu = (curr + i) % PS_NUMCORES; 27 | assert(qsc_cpu != curr); 28 | 29 | __ps_timing_update_remote(parsec, &t, qsc_cpu); 30 | if (__ps_in_lib(&t)) { 31 | if (min_known_qsc > t.time_in) min_known_qsc = t.time_in; 32 | } 33 | } 34 | *qsc = min_known_qsc; 35 | ps_mem_fence(); 36 | } 37 | -------------------------------------------------------------------------------- /quiesce_type/real_time/ps_quiesce_impl.h: -------------------------------------------------------------------------------- 1 | #ifndef PS_QUIESCE_IMPL_H 2 | #define PS_QUIESCE_IMPL_H 3 | 4 | #include "ps_quiesce_rt.h" 5 | 6 | #endif /* PS_QUIESCE_IMPL_H */ 7 | -------------------------------------------------------------------------------- /quiesce_type/real_time/ps_quiesce_rt.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | void ps_quiesce(struct parsec *parsec, coreid_t curr, ps_tsc_t *qsc); 5 | int __ps_smr_reclaim_batch(int batch, ps_tsc_t qsc, coreid_t curr, struct ps_qsc_list *ql, struct ps_mem *m, ps_free_fn_t ffn); 6 | 7 | void 8 | __ps_timing_info_init(struct parsec *ps, ps_tsc_t now) 9 | { (void)ps, (void)now; return ; } 10 | 11 | void 12 | __ps_smr_account_init(struct ps_smr_info *si) 13 | { si->account.qmemcnt = 0; } 14 | 15 | void 16 | __ps_invoke_smr(coreid_t curr, struct ps_qsc_list *ql, struct ps_smr_info *si, struct ps_mem *mem, ps_free_fn_t ffn) 17 | { (void)curr, (void)ql, (void)si, (void)mem, (void)ffn; return ; } 18 | 19 | void 20 | __ps_smr_reclaim(coreid_t curr, struct ps_qsc_list *ql, struct ps_smr_info *si, 21 | struct ps_mem *m, ps_free_fn_t ffn) 22 | { 23 | struct parsec *ps = m->percore[curr].smr_info.ps; 24 | struct ps_mheader *a = __ps_qsc_peek(ql); 25 | ps_tsc_t qsc; 26 | assert(ps && ql && si); 27 | 28 | if (!a) return ; 29 | ps_quiesce(ps, curr, &qsc); 30 | __ps_smr_reclaim_batch(INT_MAX, qsc, curr, ql, m, ffn); 31 | 32 | return; 33 | } 34 | 
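/*
 * For example: if cores 1 and 2 are currently inside a parallel section
 * that they entered at tsc 100 and 250 respectively, and every other core
 * is outside, ps_quiesce() above returns qsc = 100 (the earliest entry
 * time still in flight).  __ps_smr_reclaim() then hands that qsc to
 * __ps_smr_reclaim_batch() with an INT_MAX batch, so a single call frees
 * every queued item whose tsc_free <= 100.
 */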
-------------------------------------------------------------------------------- /quiesce_type/real_time/ps_quiesce_rt.h: -------------------------------------------------------------------------------- 1 | #ifndef PS_QUIESCE_RT_H 2 | #define PS_QUIESCE_RT_H 3 | 4 | struct ps_quiescence_timing { 5 | volatile ps_tsc_t time_in, time_out; 6 | char __padding[PS_CACHE_PAD_SZ(2*sizeof(ps_tsc_t))]; 7 | } PS_ALIGNED PS_PACKED; 8 | 9 | struct ps_smr_percore { 10 | /* ps_quiescence_timing info of this CPU */ 11 | struct ps_quiescence_timing timing; 12 | } PS_ALIGNED PS_PACKED; 13 | 14 | struct ps_qsc_account { 15 | size_t qmemcnt; /* # of items in the qsc_list */ 16 | }; 17 | 18 | #endif /* PS_QUIESCE_RT_H */ 19 | -------------------------------------------------------------------------------- /quiesce_type/temporal/ps_quiesce.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void 4 | ps_quiesce(struct parsec *parsec, coreid_t curr, ps_tsc_t *qsc) 5 | { 6 | (void)parsec, (void)curr; 7 | *qsc = ps_tsc() - (ps_tsc_t)MAX_PARSEC_RESPONSE; 8 | } 9 | -------------------------------------------------------------------------------- /quiesce_type/temporal/ps_quiesce_impl.h: -------------------------------------------------------------------------------- 1 | #ifndef PS_QUIESCE_IMPL_H 2 | #define PS_QUIESCE_IMPL_H 3 | 4 | #include "ps_quiesce_rt.h" 5 | 6 | #define MAX_PARSEC_RESPONSE 50000 /* maximal response time in parallel section (in cycles) */ 7 | 8 | #endif /* PS_QUIESCE_IMPL_H */ 9 | -------------------------------------------------------------------------------- /quiesce_type/temporal/ps_quiesce_rt.c: -------------------------------------------------------------------------------- 1 | ../real_time/ps_quiesce_rt.c -------------------------------------------------------------------------------- /quiesce_type/temporal/ps_quiesce_rt.h: -------------------------------------------------------------------------------- 1 | ../real_time/ps_quiesce_rt.h -------------------------------------------------------------------------------- /tests/Makefile: -------------------------------------------------------------------------------- 1 | include ../Makefile.inc 2 | 3 | CINC += -I../ 4 | LDFLAGS = -pthread -L.. 5 | 6 | # tests 7 | TCFILES = $(wildcard *.c) 8 | TCOBJS = $(patsubst %.c,%.o,$(TCFILES)) 9 | TCDEPS = $(patsubst %.c,%.d,$(TCFILES)) 10 | EXECS = $(patsubst %.c,%.test,$(TCFILES)) 11 | 12 | .PHONY: clean all 13 | 14 | all: $(EXECS) 15 | 16 | %.o:%.c 17 | $(CC) $(CINC) $(CFLAGS) -o $@ -c $< 18 | 19 | %.test:%.o 20 | $(LD) $(LDFLAGS) -o $@ $< -l$(LNAME) 21 | 22 | clean: 23 | rm -f $(EXECS) $(TCOBJS) $(TCDEPS) 24 | 25 | -include $(TCDEPS) 26 | -------------------------------------------------------------------------------- /tests/ht.c.inprogress: -------------------------------------------------------------------------------- 1 | /*** 2 | * Copyright 2015 by Gabriel Parmer. All rights reserved. 3 | * This file is dual licensed under the BSD 2 clause license. 
4 | * 5 | * Authors: Gabriel Parmer, gparmer@gwu.edu, 2015 6 | */ 7 | 8 | #include 9 | #include 10 | 11 | #include 12 | 13 | #define MAX_KEY_LEN 255 14 | 15 | struct item { 16 | struct item *next; 17 | char *key, *data; 18 | size_t key_len, data_len; 19 | } PS_ALIGNED; 20 | 21 | #define BINORD (16) 22 | #define NBINS (1<l = locks[i]; 68 | } 69 | 70 | return ht; 71 | } 72 | 73 | static void 74 | slabs_init(void) 75 | { 76 | ps_mem_init_item300(&ps); 77 | } 78 | 79 | static inline struct bin * 80 | __ht_get_bin(struct ps_ns *ht, char *key, size_t key_len) 81 | { 82 | ps_desc_t bin = hash(key, key_len) % NBINS; 83 | struct bin *b = ps_nsptr_lkup_htbl(ht, bin); 84 | assert(b); 85 | 86 | return b; 87 | } 88 | 89 | static inline struct item * 90 | __ht_get_walk(struct bin *b, char *key, size_t key_len, struct item **prev) 91 | { 92 | struct item *i; 93 | 94 | for (i = b->item, *prev = NULL ; i != NULL ; *prev = i, i = i->next) { 95 | if (key_len == i->key_len && !memcmp(i->key, key, key_len)) { 96 | return i; 97 | } 98 | } 99 | 100 | return NULL; 101 | } 102 | 103 | static inline struct item * 104 | __ht_get(struct ps_ns *ht, char *key, size_t key_len, struct item **prev) 105 | { 106 | struct bin *b = __ht_get_bin(ht, key, key_len); 107 | assert(b); 108 | 109 | return __ht_get_walk(b, key, key_len, prev); 110 | } 111 | 112 | static int 113 | ht_get(struct ps_ns *ht, char *key, size_t key_len, char **data, size_t *data_len) 114 | { 115 | struct item *item, *prev; 116 | 117 | item = __ht_get(ht, key, key_len, &prev); 118 | if (!item) return -1; 119 | 120 | *data = item->data; 121 | *data_len = item->data_len; 122 | 123 | return 0; 124 | } 125 | 126 | static int 127 | ht_put(struct ps_ns *ht, char *key, size_t key_len, char *data, size_t data_len) 128 | { 129 | struct item *item, *prev, *new; 130 | struct bin *b; 131 | 132 | assert(sizeof(struct item) + key_len + data_len < 300); 133 | new = ps_mem_alloc_item300(); 134 | assert(new); 135 | new->key = (char *)&new[1]; 136 | memcpy(new->key, key, key_len); 137 | new->data = new->key + key_len; 138 | memcpy(new->data, data, data_len); 139 | 140 | b = __ht_get_bin(ht, key, key_len); 141 | assert(b); 142 | ps_lock_take(&b->lock); 143 | item = __ht_get_walk(b, key, key_len, &prev); 144 | 145 | if (item) { 146 | assert(prev); 147 | prev->next = item->next; 148 | ps_mem_free_item300(item); 149 | } 150 | 151 | new = b->item; 152 | b->item = new; 153 | ps_lock_release(&b->lock); 154 | 155 | return 0; 156 | } 157 | 158 | #define NITEMS (1<<(BINORD+3)) 159 | #define KEYSZ 128 160 | #define DATASZ 128 161 | 162 | void 163 | ht_load(struct ps_ns *ht, unsigned long start, unsigned long end) 164 | { 165 | unsigned long i, j; 166 | 167 | assert(KEYSZ % sizeof(unsigned long) == 0); 168 | assert(end > start); 169 | for (i = start ; i < end ; i++) { 170 | char key[KEYSZ], data[DATASZ], dataval; 171 | 172 | for (j = 0 ; j < KEYSZ ; j += sizeof(unsigned long)) { 173 | memcpy(&key[j], &i, sizeof(unsigned long)); 174 | } 175 | 176 | dataval = (char)i % 256; 177 | for (j = 0 ; j < DATASZ ; j++) data[j] = dataval; 178 | 179 | ps_enter(&ps); 180 | ht_put(ht, key, KEYSZ, data, DATASZ); 181 | ps_exit(&ps); 182 | } 183 | } 184 | 185 | int 186 | main(void) 187 | { 188 | struct ps_ns *ht; 189 | 190 | ps_init(&ps); 191 | ht = ht_create(&ps); 192 | 193 | 194 | return 0; 195 | } 196 | -------------------------------------------------------------------------------- /tests/list.c: -------------------------------------------------------------------------------- 1 | /*** 2 | * Copyright 
2017 by Gabriel Parmer. All rights reserved. 3 | * This file is licensed under the BSD 2 clause license. 4 | * 5 | * Authors: Gabriel Parmer, gparmer@gwu.edu, 2017 6 | */ 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | 14 | #define LIST_LEN 10 15 | 16 | struct n { 17 | int num; 18 | int padding[15]; 19 | struct ps_list list; 20 | }; 21 | 22 | void 23 | foo(struct ps_list_head *h, struct n *m) 24 | { ps_list_head_add_d(h, m); } __attribute__((noinline)) 25 | 26 | int 27 | main(void) 28 | { 29 | struct n *nodes[LIST_LEN], *node, *tmp = NULL; 30 | struct ps_list_head h; 31 | int i; 32 | 33 | ps_list_head_init(&h); 34 | for (i = 0 ; i < LIST_LEN ; i++) { 35 | nodes[i] = malloc(sizeof(struct n)); 36 | assert(nodes[i]); 37 | ps_list_init_d(nodes[i]); 38 | nodes[i]->num = i; 39 | } 40 | 41 | assert(ps_list_head_empty(&h)); 42 | assert(ps_list_singleton_d(nodes[0])); 43 | assert(ps_list_next_d(nodes[0]) == nodes[0]); 44 | assert(ps_list_prev_d(nodes[0]) == nodes[0]); 45 | assert(ps_container(&nodes[0]->list, struct n, list) == nodes[0]); 46 | 47 | for (i = 0 ; i < LIST_LEN ; i++) { 48 | ps_list_head_add_d(&h, nodes[i]); 49 | assert(ps_list_prev_d(nodes[i]) == ps_container(&h, struct n, list)); 50 | assert(nodes[i] == ps_list_head_first(&h, struct n, list)); 51 | if (i == 0) { 52 | assert(ps_list_next_d(nodes[i]) == ps_container(&h, struct n, list)); 53 | assert(ps_list_prev_d(nodes[i]) == ps_container(&h, struct n, list)); 54 | assert(nodes[i] == ps_list_head_first(&h, struct n, list)); 55 | assert(nodes[i] == ps_list_head_last(&h, struct n, list)); 56 | } else { 57 | assert(ps_list_next_d(nodes[i]) != ps_container(&h, struct n, list)); 58 | assert(nodes[i] != ps_list_head_last(&h, struct n, list)); 59 | } 60 | } 61 | 62 | i = LIST_LEN-1; 63 | ps_list_foreach_d(&h, node) { 64 | assert(node->num == i); 65 | i--; 66 | } 67 | assert(i == -1); 68 | 69 | for (i = 0 ; i < LIST_LEN ; i++) { 70 | struct n *f = ps_list_head_first(&h, struct n, list); 71 | 72 | assert(tmp != f); 73 | ps_list_rem_d(f); 74 | assert(ps_list_singleton_d(f)); 75 | ps_list_head_append_d(&h, f); 76 | tmp = f; 77 | } 78 | 79 | i = LIST_LEN - 1; 80 | ps_list_foreach_d(&h, node) { 81 | assert(node->num == i); 82 | i--; 83 | } 84 | assert(i == -1); 85 | 86 | ps_list_foreach_del_d(&h, node, tmp) { 87 | ps_list_rem_d(node); 88 | assert(ps_list_singleton_d(node)); 89 | } 90 | assert(ps_list_head_empty(&h)); 91 | 92 | printf("Linked List Tests SUCCESS.\n"); 93 | 94 | return 0; 95 | } 96 | -------------------------------------------------------------------------------- /tests/ns.c: -------------------------------------------------------------------------------- 1 | /*** 2 | * Copyright 2015 by Gabriel Parmer. All rights reserved. 3 | * This file is licensed under the BSD 2 clause license. 4 | * 5 | * Authors: Gabriel Parmer, gparmer@gwu.edu, 2015 6 | */ 7 | 8 | #include 9 | #include 10 | 11 | #include 12 | #include 13 | 14 | /* 15 | * FIXME: 16 | * - segfault when ns is not allocated (i.e. 
when default ns is used) 17 | */ 18 | 19 | #define LEAF_ORDER 7 20 | PS_NSSLAB_CREATE(nstest, sizeof(void *), 3, 9, LEAF_ORDER) 21 | 22 | #define SMRITER (2737*128) 23 | #define SLABITER SMRITER 24 | void *ds[SMRITER]; 25 | ps_desc_t descs[SMRITER]; 26 | struct parsec ps; 27 | 28 | void 29 | test_slab_alloc_lkup(void) 30 | { 31 | int i; 32 | struct ps_ns *ns; 33 | 34 | ns = ps_nsptr_create_slab_nstest(); 35 | assert(ns); 36 | 37 | printf("--------------------[ NS Slab Tests ]-----------------\n"); 38 | 39 | printf("Testing ps_ns slab allocation: objmem %zu (ns objmemsz %lu), sz = nobj %zu * %d\n", 40 | ps_slab_objmem_nstest(), __ps_slab_objmemsz(sizeof(void*)), 41 | ps_slab_nobjs_nstest(), 1<= _ps_deadline) { 76 | _ps_deadline = tsc + _ps_period; 77 | ps_memptr_quiesce_nstest2(&ns->m); 78 | } 79 | } 80 | #else 81 | static void 82 | ps_period_quiesce(struct ps_ns *ns) 83 | { (void)ns; } 84 | #endif 85 | 86 | void 87 | test_smr_alloc_lkup(void) 88 | { 89 | int i; 90 | struct ps_ns *ns; 91 | 92 | ns = ps_nsptr_create_nstest(&ps); 93 | assert(ns); 94 | 95 | printf("--------------------[ NS Tests ]-----------------\n"); 96 | 97 | printf("Testing ps_ns allocation: objmem %zu (ns objmemsz %lu), sz = nobj %zu * %d\n", 98 | ps_slab_objmem_nstest2(), __ps_slab_objmemsz(sizeof(void*)), 99 | ps_slab_nobjs_nstest2(), 1< 2 | #include 3 | #include 4 | 5 | struct ps_freelist pages = PS_PGALLOC_STATICDEF(PS_PAGE_SIZE); 6 | 7 | #define ITER 1024 8 | 9 | /* memory that has been seen... */ 10 | void *seen[ITER] = {0}; 11 | /* ...and is currently allocated */ 12 | void *allocated[ITER] = {0}; 13 | 14 | void 15 | page_add(void *p) 16 | { 17 | int i; 18 | 19 | /* check for proper page recycling */ 20 | for (i = 0 ; i < ITER ; i++) { 21 | if (seen[i] == p || !seen[i]) { 22 | seen[i] = p; 23 | break; 24 | } 25 | } 26 | assert(i < ITER); 27 | 28 | /* check for double allocations */ 29 | for (i = 0 ; i < ITER ; i++) { 30 | assert(allocated[i] != p); 31 | if (!allocated[i]) { 32 | allocated[i] = p; 33 | break; 34 | } 35 | } 36 | } 37 | 38 | void 39 | page_rem(void *p) 40 | { 41 | int i; 42 | 43 | for (i = 0 ; i < ITER ; i++) { 44 | if (allocated[i] != p) continue; 45 | 46 | allocated[i] = NULL; 47 | break; 48 | } 49 | } 50 | 51 | void *ptrs[ITER]; 52 | 53 | int 54 | main(void) 55 | { 56 | int i, j; 57 | 58 | for (i = ITER-1 ; i >= 0 ; i--) { 59 | for (j = i ; j < ITER ; j++) { 60 | ptrs[j] = ps_pgalloc(&pages); 61 | assert(ptrs[j]); 62 | page_add(ptrs[j]); 63 | } 64 | for (j = i ; j < ITER ; j++) { 65 | page_rem(ptrs[j]); 66 | ps_pgfree(&pages, ptrs[j]); 67 | ptrs[j] = NULL; 68 | } 69 | } 70 | printf("Page allocator unit tests: SUCCESS!\n"); 71 | 72 | return 0; 73 | } 74 | -------------------------------------------------------------------------------- /tests/slab.c: -------------------------------------------------------------------------------- 1 | /*** 2 | * Copyright 2015 by Gabriel Parmer. All rights reserved. 3 | * This file is dual licensed under the BSD 2 clause license. 
4 | * 5 | * Authors: Gabriel Parmer, gparmer@gwu.edu, 2015 6 | */ 7 | 8 | #include 9 | #include 10 | 11 | #include 12 | 13 | #define SMALLSZ 1 14 | #define LARGESZ 8000 15 | 16 | struct small { 17 | char x[SMALLSZ]; 18 | }; 19 | 20 | struct larger { 21 | char x[LARGESZ]; 22 | }; 23 | 24 | PS_SLAB_CREATE_DEF(s, sizeof(struct small)) 25 | PS_SLAB_CREATE(l, sizeof(struct larger), PS_PAGE_SIZE * 128) 26 | PS_SLAB_CREATE(hextern, sizeof(struct larger), PS_PAGE_SIZE * 128) 27 | 28 | #define ITER (1024) 29 | #define SMALLCHUNK 2 30 | #define LARGECHUNK 32 31 | 32 | /* These are meant to be disassembled and inspected, to validate inlining/optimization */ 33 | void * 34 | disassemble_alloc() 35 | { return ps_slab_alloc_l(); } 36 | void 37 | disassemble_free(void *m) 38 | { ps_slab_free_l(m); } 39 | 40 | void 41 | mark(char *c, int sz, char val) 42 | { 43 | int i; 44 | 45 | for (i = 0 ; i < sz ; i++) c[i] = val; 46 | } 47 | 48 | void 49 | chk(char *c, int sz, char val) 50 | { 51 | int i; 52 | 53 | for (i = 0 ; i < sz ; i++) assert(c[i] == val); 54 | } 55 | 56 | struct small *s[ITER]; 57 | struct larger *l[ITER]; 58 | 59 | #define FREE_BATCH 64 60 | #define RB_SZ ((PS_NUMCORES-1)*FREE_BATCH) 61 | #define RB_ITER (RB_SZ * 1024) 62 | 63 | void * volatile ring_buffer[RB_SZ] PS_ALIGNED; 64 | 65 | unsigned long long free_tsc, alloc_tsc; 66 | 67 | void 68 | consumer(void) 69 | { 70 | struct small *s; 71 | unsigned long i; 72 | unsigned long long start, end, tot = 0; 73 | 74 | meas_barrier(2); 75 | 76 | for (i = 0 ; i < RB_ITER ; i++) { 77 | unsigned long off = i % RB_SZ; 78 | 79 | while (!ring_buffer[off]) ; 80 | s = ring_buffer[off]; 81 | ring_buffer[off] = NULL; 82 | 83 | start = ps_tsc(); 84 | ps_slab_free_s(s); 85 | end = ps_tsc(); 86 | tot += end-start; 87 | } 88 | free_tsc = tot / RB_ITER; 89 | 90 | meas_barrier(2); 91 | } 92 | 93 | void 94 | producer(void) 95 | { 96 | struct small *s; 97 | unsigned long i; 98 | unsigned long long start, end, tot = 0; 99 | 100 | meas_barrier(2); 101 | 102 | for (i = 0 ; i < RB_ITER ; i++) { 103 | unsigned long off = i % RB_SZ; 104 | 105 | while (ring_buffer[off]) ; 106 | 107 | start = ps_tsc(); 108 | s = ps_slab_alloc_s(); 109 | end = ps_tsc(); 110 | tot += end-start; 111 | 112 | assert(s); 113 | ring_buffer[off] = s; 114 | } 115 | alloc_tsc = tot / RB_ITER; 116 | 117 | meas_barrier(2); 118 | } 119 | 120 | void * 121 | child_fn(void *d) 122 | { 123 | (void)d; 124 | 125 | thd_set_affinity(pthread_self(), 1); 126 | consumer(); 127 | 128 | return NULL; 129 | } 130 | 131 | void 132 | test_remote_frees(void) 133 | { 134 | pthread_t child; 135 | 136 | printf("Starting test for remote frees\n"); 137 | 138 | if (pthread_create(&child, 0, child_fn, NULL)) { 139 | perror("pthread create of child\n"); 140 | exit(-1); 141 | } 142 | 143 | producer(); 144 | 145 | pthread_join(child, NULL); 146 | printf("Remote allocations take %lld, remote frees %lld (unadjusted for tsc)\n", alloc_tsc, free_tsc); 147 | } 148 | 149 | #define STATS_REPORT_THD 2 150 | #define REMOTE_FREE_ITER (100000) 151 | 152 | unsigned long cost[REMOTE_FREE_ITER] PS_ALIGNED; 153 | unsigned long alloc[REMOTE_FREE_ITER] PS_ALIGNED; 154 | __thread int thd_local_id; 155 | 156 | static inline int 157 | cmpfunc(const void * a, const void * b) 158 | { return (*(unsigned long*)b) - (*(unsigned long*)a); } 159 | 160 | static inline void 161 | out_latency(unsigned long *re, int num, char *label) 162 | { 163 | int i; 164 | unsigned long long sum = 0; 165 | 166 | for (i = 0; i < num; i++) sum += (unsigned long long)re[i]; 167 
| qsort(re, num, sizeof(unsigned long), cmpfunc); 168 | printf("thd %d %s tot %d avg %llu 99.9 %lu 99 %lu min %lu max %lu\n", thd_local_id, 169 | label, num, sum/num, re[num/1000], re[num/100], re[num-1], re[0]); 170 | } 171 | 172 | void 173 | mt_consumer(void) 174 | { 175 | char *s, *h; 176 | int id = thd_local_id, k = 0; 177 | long b, e, i; 178 | unsigned long long start, end; 179 | 180 | b = (id-1)*FREE_BATCH; 181 | e = id*FREE_BATCH; 182 | meas_barrier(PS_NUMCORES); 183 | 184 | c_begin: 185 | for (i = b; i < e; i++) { 186 | while (!ring_buffer[i]) ; 187 | s = (char *)ring_buffer[i]; 188 | if (s == (void *)-1) goto c_end; 189 | 190 | ring_buffer[i] = NULL; 191 | assert(i == ((int *)s)[0]); 192 | h = s-sizeof(struct ps_mheader); 193 | h[0] = 0; 194 | ps_mem_fence(); 195 | 196 | start = ps_tsc(); 197 | ps_slab_free_s(s); 198 | end = ps_tsc(); 199 | if (id == STATS_REPORT_THD && k < REMOTE_FREE_ITER) cost[k++] = end-start; 200 | } 201 | goto c_begin; 202 | 203 | c_end: 204 | if (id == STATS_REPORT_THD) out_latency(cost, k, "remote_free"); 205 | meas_barrier(PS_NUMCORES); 206 | } 207 | 208 | void 209 | mt_producer(void) 210 | { 211 | void *s; 212 | unsigned long i, k = 0, b = 0; 213 | unsigned long long start, end; 214 | 215 | meas_barrier(PS_NUMCORES); 216 | 217 | p_begin: 218 | for (i = b; i < RB_SZ; i += (PS_NUMCORES-1)) { 219 | if (ring_buffer[i]) continue; 220 | start = ps_tsc(); 221 | s = ps_slab_alloc_s(); 222 | end = ps_tsc(); 223 | assert(s); 224 | 225 | ((int *)s)[0] = i; 226 | ps_mem_fence(); 227 | ring_buffer[i] = s; 228 | if (k < REMOTE_FREE_ITER) alloc[k] = end-start; 229 | if ((++k) == (PS_NUMCORES-1)*REMOTE_FREE_ITER) goto p_end; 230 | } 231 | b = (b+1) % FREE_BATCH; 232 | goto p_begin; 233 | 234 | p_end: 235 | for(i=0; ix, sizeof(struct larger), i); 290 | for (j = i+1 ; j < ITER ; j++) { 291 | l[j] = ps_slab_alloc_l(); 292 | mark(l[j]->x, sizeof(struct larger), j); 293 | } 294 | for (j = i+1 ; j < ITER ; j++) { 295 | chk(l[j]->x, sizeof(struct larger), j); 296 | ps_slab_free_l(l[j]); 297 | } 298 | } 299 | for (i = 0 ; i < ITER ; i++) { 300 | assert(l[i]); 301 | chk(l[i]->x, sizeof(struct larger), i); 302 | ps_slab_free_l(l[i]); 303 | } 304 | } 305 | 306 | void 307 | test_perf(void) 308 | { 309 | int i, j; 310 | unsigned long long start, end; 311 | 312 | printf("Slabs:\n" 313 | "\tsmall: objsz %lu, objmem %lu, nobj %lu\n" 314 | "\tlarge: objsz %lu, objmem %lu, nobj %lu\n" 315 | "\tlarge+nohead: objsz %lu, objmem %lu, nobj %lu\n", 316 | (unsigned long)sizeof(struct small), (unsigned long)ps_slab_objmem_s(), (unsigned long)ps_slab_nobjs_s(), 317 | (unsigned long)sizeof(struct larger), (unsigned long)ps_slab_objmem_l(), (unsigned long)ps_slab_nobjs_l(), 318 | (unsigned long)sizeof(struct larger), (unsigned long)ps_slab_objmem_hextern(), (unsigned long)ps_slab_nobjs_hextern()); 319 | 320 | start = ps_tsc(); 321 | for (j = 0 ; j < ITER ; j++) { 322 | for (i = 0 ; i < LARGECHUNK ; i++) s[i] = ps_slab_alloc_l(); 323 | for (i = 0 ; i < LARGECHUNK ; i++) ps_slab_free_l(s[i]); 324 | } 325 | end = ps_tsc(); 326 | end = (end-start)/(ITER*LARGECHUNK); 327 | printf("Average cost of large slab alloc+free: %lld\n", end); 328 | 329 | ps_slab_alloc_s(); 330 | start = ps_tsc(); 331 | for (j = 0 ; j < ITER ; j++) { 332 | for (i = 0 ; i < SMALLCHUNK ; i++) s[i] = ps_slab_alloc_s(); 333 | for (i = 0 ; i < SMALLCHUNK ; i++) ps_slab_free_s(s[i]); 334 | } 335 | end = ps_tsc(); 336 | end = (end-start)/(ITER*SMALLCHUNK); 337 | printf("Average cost of small slab alloc+free: %lld\n", end); 338 | 339 | 
ps_slab_alloc_hextern(); 340 | start = ps_tsc(); 341 | for (j = 0 ; j < ITER ; j++) { 342 | for (i = 0 ; i < LARGECHUNK ; i++) s[i] = ps_slab_alloc_hextern(); 343 | for (i = 0 ; i < LARGECHUNK ; i++) ps_slab_free_hextern(s[i]); 344 | } 345 | end = ps_tsc(); 346 | end = (end-start)/(ITER*LARGECHUNK); 347 | printf("Average cost of extern slab header, large slab alloc+free: %lld\n", end); 348 | } 349 | 350 | void 351 | stats_print(struct ps_mem *m) 352 | { 353 | struct ps_slab_stats s; 354 | int i; 355 | 356 | printf("Stats for slab @ %p\n", (void*)m); 357 | ps_slabptr_stats(m, &s); 358 | for (i = 0 ; i < PS_NUMCORES ; i++) { 359 | printf("\tcore %d, slabs %zd, partial slabs %zd, nfree %zd, nremote %zd\n", 360 | i, s.percore[i].nslabs, s.percore[i].npartslabs, s.percore[i].nfree, s.percore[i].nremote); 361 | } 362 | } 363 | 364 | int 365 | main(void) 366 | { 367 | thd_set_affinity(pthread_self(), 0); 368 | 369 | test_perf(); 370 | 371 | stats_print(&__ps_mem_l); 372 | stats_print(&__ps_mem_s); 373 | test_correctness(); 374 | stats_print(&__ps_mem_l); 375 | test_remote_frees(); 376 | stats_print(&__ps_mem_s); 377 | test_remote_frees_multi_thd(); 378 | stats_print(&__ps_mem_s); 379 | 380 | return 0; 381 | } 382 | -------------------------------------------------------------------------------- /tests/smr.c: -------------------------------------------------------------------------------- 1 | /*** 2 | * Copyright 2015 by Gabriel Parmer. All rights reserved. 3 | * This file is dual licensed both under the GPL v2 license with the 4 | * classpath exception and the BSD 2 clause license. 5 | * 6 | * Authors: Gabriel Parmer, gparmer@gwu.edu, 2015 7 | */ 8 | 9 | #define _GNU_SOURCE 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #include 23 | #include 24 | 25 | struct parsec ps; 26 | PS_PARSLAB_CREATE(tst, 100, PS_PAGE_SIZE * 128) 27 | PS_PARSLAB_CREATE(bench, 1, PS_PAGE_SIZE * 8) 28 | 29 | #define ITER 1024 30 | void *ptrs[ITER]; 31 | 32 | void 33 | test_mem(void) 34 | { 35 | ps_tsc_t start, end; 36 | int i, j; 37 | 38 | start = ps_tsc(); 39 | for (j = 0 ; j < ITER ; j++) ptrs[j] = ps_mem_alloc_tst(); 40 | for (j = 0 ; j < ITER ; j++) ps_mem_free_tst(ptrs[j]); 41 | end = ps_tsc(); 42 | end = (end-start)/ITER; 43 | printf("Average cost of alloc->free: %lld\n", end); 44 | 45 | ps_mem_alloc_tst(); 46 | start = ps_tsc(); 47 | for (j = 0 ; j < ITER ; j++) { 48 | for (i = 0 ; i < ITER ; i++) ptrs[i] = ps_mem_alloc_tst(); 49 | for (i = 0 ; i < ITER ; i++) ps_mem_free_tst(ptrs[i]); 50 | } 51 | end = ps_tsc(); 52 | end = (end-start)/(ITER*ITER); 53 | printf("Average cost of ITER * (alloc->free): %lld\n", end); 54 | 55 | printf("Starting complicated allocation pattern for increasing numbers of allocations.\n"); 56 | for (i = 0 ; i < ITER ; i++) { 57 | ptrs[i] = ps_mem_alloc_tst(); 58 | for (j = i+1 ; j < ITER ; j++) { 59 | ptrs[j] = ps_mem_alloc_tst(); 60 | } 61 | for (j = i+1 ; j < ITER ; j++) { 62 | ps_mem_free_tst(ptrs[j]); 63 | } 64 | } 65 | for (i = 0 ; i < ITER ; i++) { 66 | assert(ptrs[i]); 67 | ps_mem_free_tst(ptrs[i]); 68 | } 69 | } 70 | 71 | void test_smr(void); 72 | void test_remote_frees(void); 73 | 74 | int 75 | main(void) 76 | { 77 | thd_set_affinity(pthread_self(), 0); 78 | 79 | printf("Starting tests on core %d.\n", ps_coreid()); 80 | ps_init(&ps); 81 | ps_mem_init_tst(&ps); 82 | ps_mem_init_bench(&ps); 83 | 84 | printf("Testing memory management functionalities.\n"); 85 | 
test_mem(); 86 | printf("Testing Scalable Memory Reclamation.\n"); 87 | test_smr(); 88 | printf("Testing remote frees\n"); 89 | test_remote_frees(); 90 | 91 | return 0; 92 | } 93 | 94 | /* #define REAL_TIME_PARSEC_TEST */ 95 | #ifdef REAL_TIME_PARSEC_TEST 96 | ps_tsc_t _ps_period = 10000; 97 | __thread ps_tsc_t _ps_deadline = 0; 98 | 99 | static void 100 | ps_period_quiesce(void) 101 | { 102 | ps_tsc_t tsc; 103 | 104 | tsc = ps_tsc(); 105 | if (tsc >= _ps_deadline) { 106 | _ps_deadline = tsc + _ps_period; 107 | ps_quiesce_bench(); 108 | } 109 | } 110 | #else 111 | static void 112 | ps_period_quiesce(void) 113 | { ; } 114 | #endif 115 | 116 | #define N_OPS (50000000) 117 | #define N_LOG (N_OPS / PS_NUMCORES) 118 | static char ops[N_OPS] PS_ALIGNED; 119 | static unsigned long results[PS_NUMCORES][2] PS_ALIGNED; 120 | static unsigned long p99_log[N_LOG] PS_ALIGNED; 121 | 122 | /* for qsort */ 123 | static int 124 | cmpfunc(const void * a, const void * b) 125 | { return ( *(int*)a - *(int*)b ); } 126 | 127 | void 128 | bench(void) 129 | { 130 | int i, id; 131 | unsigned long n_read = 0, n_update = 0, op_jump = PS_NUMCORES; 132 | unsigned long long s, e, s1, e1, tot_cost_r = 0, tot_cost_w = 0, max = 0, cost; 133 | void *last_alloc; 134 | 135 | id = ps_coreid(); 136 | last_alloc = ps_mem_alloc_bench(); 137 | assert(last_alloc); 138 | 139 | s = ps_tsc(); 140 | for (i = 0 ; i < N_OPS/PS_NUMCORES; i++) { 141 | s1 = ps_tsc(); 142 | 143 | if (ops[(unsigned long)id+op_jump*i]) { 144 | ps_mem_free_bench(ps_mem_alloc_bench()); 145 | ps_period_quiesce(); 146 | 147 | e1 = ps_tsc(); 148 | cost = e1-s1; 149 | tot_cost_w += cost; 150 | n_update++; 151 | 152 | if (id == 0) p99_log[N_LOG - n_update] = cost; 153 | } else { 154 | ps_enter(&ps); 155 | ps_exit(&ps); 156 | 157 | e1 = ps_tsc(); 158 | cost = e1-s1; 159 | tot_cost_r += cost; 160 | 161 | if (id == 0) p99_log[n_read] = cost; 162 | n_read++; 163 | } 164 | 165 | if (cost > max) max = cost; 166 | } 167 | assert(n_read + n_update <= N_LOG); 168 | e = ps_tsc(); 169 | 170 | if (n_read) tot_cost_r /= n_read; 171 | if (n_update) tot_cost_w /= n_update; 172 | 173 | results[id][0] = tot_cost_r; 174 | results[id][1] = tot_cost_w; 175 | 176 | if (id == 0) { 177 | unsigned long r_99 = 0, w_99 = 0; 178 | 179 | if (n_read) { 180 | qsort(p99_log, n_read, sizeof(unsigned long), cmpfunc); 181 | r_99 = p99_log[n_read - n_read / 100]; 182 | } 183 | if (n_update) { 184 | qsort(&p99_log[n_read], n_update, sizeof(unsigned long), cmpfunc); 185 | w_99 = p99_log[N_LOG - 1 - n_update / 100]; 186 | } 187 | printf("99p: read %lu write %lu\n", r_99, w_99); 188 | } 189 | 190 | 191 | printf("Thd %d: tot %lu ops (r %lu, u %lu) done, %llu (r %llu, w %llu) cycles per op, max %llu\n", 192 | id, n_read+n_update, n_read, n_update, (unsigned long long)(e-s)/(n_read + n_update), 193 | tot_cost_r, tot_cost_w, max); 194 | 195 | return; 196 | } 197 | 198 | char *TRACE_FILE = "/tmp/trace.dat"; 199 | 200 | void * 201 | worker(void *arg) 202 | { 203 | ps_tsc_t s,e; 204 | int cpuid = (int)(long)arg; 205 | 206 | thd_set_affinity(pthread_self(), cpuid); 207 | assert(!__ps_mem_bench.percore[cpuid].slab_info.fl.list); 208 | assert(ps_mem_alloc_bench()); 209 | 210 | meas_barrier(PS_NUMCORES); 211 | s = ps_tsc(); 212 | bench(); 213 | e = ps_tsc(); 214 | meas_barrier(PS_NUMCORES); 215 | 216 | if (cpuid == 0) { 217 | int i; 218 | unsigned long long tot_r = 0, tot_w = 0; 219 | 220 | for (i = 0; i < PS_NUMCORES; i++) { 221 | tot_r += results[i][0]; 222 | tot_w += results[i][1]; 223 | 224 | results[i][0] = 0; 
225 | results[i][1] = 0; 226 | } 227 | tot_r /= PS_NUMCORES; 228 | tot_w /= PS_NUMCORES; 229 | 230 | printf("Summary: %s, (r %llu, w %llu) cycles per op\n", TRACE_FILE, tot_r, tot_w); 231 | } 232 | 233 | printf("cpu %d done in %llu cycles (%llu to %llu)\n", cpuid, e-s, s, e); 234 | 235 | return 0; 236 | } 237 | 238 | void 239 | trace_gen(int fd, unsigned int nops, unsigned int percent_update) 240 | { 241 | unsigned int i; 242 | 243 | srand(time(NULL)); 244 | for (i = 0 ; i < nops ; i++) { 245 | char value; 246 | if ((unsigned int)rand() % 100 < percent_update) value = 'U'; 247 | else value = 'R'; 248 | if (write(fd, &value, 1) < 1) { 249 | perror("Writing to trace file"); 250 | exit(-1); 251 | } 252 | } 253 | lseek(fd, 0, SEEK_SET); 254 | } 255 | 256 | void 257 | load_trace(void) 258 | { 259 | int fd, ret; 260 | int bytes; 261 | unsigned long i, n_read, n_update; 262 | 263 | ret = mlock(ops, N_OPS); 264 | if (ret) { 265 | printf("Cannot lock memory (%d). Check privilege (i.e. use sudo). Exit.\n", ret); 266 | exit(-1); 267 | } 268 | 269 | printf("loading trace file @ %s.\n", TRACE_FILE); 270 | /* read the entire trace into memory. */ 271 | fd = open(TRACE_FILE, O_RDONLY); 272 | if (fd < 0) { 273 | fd = open(TRACE_FILE, O_CREAT | O_RDWR, S_IRWXU); 274 | assert(fd >= 0); 275 | trace_gen(fd, N_OPS, 50); 276 | } 277 | 278 | bytes = read(fd, &ops[0], N_OPS); 279 | assert(bytes == N_OPS); 280 | n_read = n_update = 0; 281 | 282 | for (i = 0 ; i < N_OPS ; i++) { 283 | if (ops[i] == 'R') { ops[i] = 0; n_read++; } 284 | else if (ops[i] == 'U') { ops[i] = 1; n_update++; } 285 | else assert(0); 286 | } 287 | printf("Trace: read %lu, update %lu, total %lu\n", n_read, n_update, (n_read+n_update)); 288 | assert(n_read+n_update == N_OPS); 289 | 290 | close(fd); 291 | 292 | return; 293 | } 294 | 295 | pthread_t thds[PS_NUMCORES]; 296 | 297 | void 298 | test_smr(void) 299 | { 300 | int i, ret; 301 | 302 | ret = mlockall(MCL_CURRENT | MCL_FUTURE); 303 | if (ret) { 304 | printf("cannot lock memory %d... 
exit.\n", ret); 305 | exit(-1); 306 | } 307 | load_trace(); 308 | 309 | for (i = 1 ; i < PS_NUMCORES ; i++) { 310 | ret = pthread_create(&thds[i], 0, worker, (void *)(long)i); 311 | if (ret) exit(-1); 312 | } 313 | usleep(50000); 314 | 315 | worker((void *)0); 316 | 317 | /* for (i = 1 ; i < PS_NUMCORES ; i++) { */ 318 | for (i = PS_NUMCORES-1 ; i > 0 ; i--) { 319 | pthread_join(thds[i], (void *)&ret); 320 | } 321 | 322 | return; 323 | } 324 | 325 | 326 | 327 | #define RB_SZ (1024 * 32) 328 | #define RB_ITER (RB_SZ * 1024) 329 | 330 | void * volatile ring_buffer[RB_SZ] PS_ALIGNED; 331 | 332 | unsigned long long free_tsc, alloc_tsc; 333 | 334 | void 335 | consumer(void) 336 | { 337 | char *s; 338 | unsigned long i; 339 | unsigned long long start, end, tot = 0; 340 | 341 | meas_barrier(2); 342 | 343 | for (i = 0 ; i < RB_ITER ; i++) { 344 | unsigned long off = i % RB_SZ; 345 | 346 | while (!ring_buffer[off]) ; 347 | s = ring_buffer[off]; 348 | ring_buffer[off] = NULL; 349 | 350 | start = ps_tsc(); 351 | ps_mem_free_bench(s); 352 | end = ps_tsc(); 353 | tot += end-start; 354 | } 355 | free_tsc = tot / RB_ITER; 356 | } 357 | 358 | void 359 | producer(void) 360 | { 361 | char *s; 362 | unsigned long i; 363 | unsigned long long start, end, tot = 0; 364 | 365 | meas_barrier(2); 366 | 367 | for (i = 0 ; i < RB_ITER ; i++) { 368 | unsigned long off = i % RB_SZ; 369 | 370 | while (ring_buffer[off]) ; 371 | 372 | start = ps_tsc(); 373 | s = ps_mem_alloc_bench(); 374 | end = ps_tsc(); 375 | tot += end-start; 376 | 377 | assert(s); 378 | ring_buffer[off] = s; 379 | } 380 | alloc_tsc = tot / RB_ITER; 381 | } 382 | 383 | void * 384 | child_fn(void *d) 385 | { 386 | (void)d; 387 | 388 | thd_set_affinity(pthread_self(), 1); 389 | consumer(); 390 | 391 | return NULL; 392 | } 393 | 394 | void 395 | test_remote_frees(void) 396 | { 397 | pthread_t child; 398 | 399 | printf("Starting test for remote frees\n"); 400 | 401 | if (pthread_create(&child, 0, child_fn, NULL)) { 402 | perror("pthread create of child\n"); 403 | exit(-1); 404 | } 405 | 406 | producer(); 407 | 408 | pthread_join(child, NULL); 409 | printf("Remote allocations take %lld, remote frees %lld (unadjusted for tsc)\n", alloc_tsc, free_tsc); 410 | } 411 | --------------------------------------------------------------------------------