├── .gitignore ├── .idea ├── compiler.xml ├── libraries │ ├── org_openjdk_jmh_jmh_core_1_21.xml │ └── org_openjdk_jmh_jmh_generator_annprocess_1_21.xml ├── misc.xml ├── modules.xml ├── uiDesigner.xml └── vcs.xml ├── c ├── Makefile └── hashmap.c ├── go-concurrency-test.iml ├── go.mod ├── java ├── pom.xml └── src │ └── test │ └── TestJavaCache.java ├── maps.go ├── maps_test.go └── readme.md /.gitignore: -------------------------------------------------------------------------------- 1 | c/hashmap.dSYM 2 | target 3 | c/hashmap -------------------------------------------------------------------------------- /.idea/compiler.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/libraries/org_openjdk_jmh_jmh_core_1_21.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /.idea/libraries/org_openjdk_jmh_jmh_generator_annprocess_1_21.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /.idea/uiDesigner.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /c/Makefile: -------------------------------------------------------------------------------- 1 | # the compiler: gcc for C program, define as g++ for C++ 2 | CC = gcc 3 | 4 | # compiler flags: 5 | # -g adds debugging information to the executable file 6 | # -Wall turns on most, but not all, compiler warnings 7 | CFLAGS = -g -Wall -O3 8 | 9 | # the build target executable: 10 | TARGET = hashmap 11 | 12 | all: $(TARGET) 13 | 14 | $(TARGET): $(TARGET).c 15 | $(CC) $(CFLAGS) -o $(TARGET) $(TARGET).c 16 | 17 | clean: 18 | $(RM) $(TARGET) 19 | -------------------------------------------------------------------------------- /c/hashmap.c: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | struct node{ 6 | int key; 7 | int val; 8 | struct node *next; 9 | }; 10 | struct table{ 11 | int size; 12 | struct node **list; 13 | }; 14 | struct table *createTable(int size){ 15 | struct table *t = (struct table*)malloc(sizeof(struct table)); 16 | t->size = size; 17 | t->list = (struct node**)malloc(sizeof(struct node*)*size); 18 | int i; 19 | for(i=0;ilist[i] = NULL; 21 | return t; 22 | } 23 | int hashCode(struct table *t,int key){ 24 | if(key<0) 25 | return -(key%t->size); 26 | return key%t->size; 27 | } 28 | void insert(struct table *t,int key,int val){ 29 | int pos = hashCode(t,key); 30 | struct node *list = t->list[pos]; 31 | struct node *temp = list; 32 | while(temp){ 33 | if(temp->key==key){ 34 | temp->val = val; 35 | return; 36 | } 37 | temp = temp->next; 38 | } 39 | struct node *newNode = (struct node*)malloc(sizeof(struct node)); 40 | newNode->key = key; 41 | newNode->val = val; 42 | newNode->next = list; 43 | t->list[pos] = newNode; 44 | } 45 | int lookup(struct table *t,int key){ 46 | int pos = hashCode(t,key); 47 | struct node *list = t->list[pos]; 48 | struct node *temp = list; 49 | while(temp){ 50 | if(temp->key==key){ 51 | return temp->val; 52 | } 53 | temp = temp->next; 54 | } 55 | return -1; 56 | } 57 | 58 | // calculate the time diff between start and end 59 | long delay(struct timeval t1, struct timeval t2) 60 | { 61 | long d; 62 | d = (t2.tv_sec - t1.tv_sec) * 1000000; 63 | d += t2.tv_usec - t1.tv_usec; 64 | return(d); 65 | } 66 | 67 | /* The state word must be initialized to non-zero */ 68 | uint32_t myrand(uint32_t r) 69 | { 70 | /* Algorithm "xor" from p. 4 of Marsaglia, "Xorshift RNGs" */ 71 | r ^= r << 13; 72 | r ^= r >> 17; 73 | r ^= r << 5; 74 | return r; 75 | } 76 | 77 | int Sink; 78 | 79 | void test(char *name,struct table *t) { 80 | int mask = (1024*1024)-1; 81 | struct timeval start, end; 82 | for(int i=0;i<=mask;i++){ 83 | insert(t,i,i); 84 | } 85 | gettimeofday(&start, NULL); 86 | uint32_t r = start.tv_usec; 87 | for( int i=0;i<5000000;i++){ 88 | r = myrand(r); 89 | int index = r & mask; 90 | insert(t,index,index); 91 | } 92 | gettimeofday(&end, NULL); 93 | printf("%s put = %lf ns/op\n", name,delay(start, end)/(5000000.0/1000)); 94 | 95 | gettimeofday(&start, NULL); 96 | int count=0; 97 | for( int i=0;i<5000000;i++){ 98 | r = myrand(r); 99 | int index = r & mask; 100 | count += lookup(t,index); 101 | } 102 | Sink=count; 103 | gettimeofday(&end, NULL); 104 | printf("%s get = %lf ns/op\n", name, delay(start, end)/(5000000.0/1000)); 105 | } 106 | 107 | int main(){ 108 | struct table *t = createTable(256000); 109 | 110 | test("intmap",t); 111 | 112 | t = createTable(1000000); 113 | 114 | test("intmap2",t); 115 | return 0; 116 | } 117 | -------------------------------------------------------------------------------- /go-concurrency-test.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/robaho/go-concurrency-test 2 | 3 | go 1.22.5 4 | -------------------------------------------------------------------------------- /java/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | robaho 7 | 
go-concurrency-test 8 | 1.0-SNAPSHOT 9 | 10 | src 11 | src 12 | 13 | 14 | org.apache.maven.plugins 15 | maven-shade-plugin 16 | 3.2.0 17 | 18 | 19 | package 20 | 21 | shade 22 | 23 | 24 | jmh-benchmarks 25 | 26 | 28 | org.openjdk.jmh.Main 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | org.openjdk.jmh 40 | jmh-core 41 | 1.37 42 | 43 | 44 | org.openjdk.jmh 45 | jmh-generator-annprocess 46 | 1.37 47 | 48 | 49 | -------------------------------------------------------------------------------- /java/src/test/TestJavaCache.java: -------------------------------------------------------------------------------- 1 | package test; 2 | 3 | import org.openjdk.jmh.annotations.*; 4 | 5 | import java.util.HashMap; 6 | import java.util.Map; 7 | import java.util.concurrent.*; 8 | import java.util.concurrent.locks.ReadWriteLock; 9 | import java.util.concurrent.locks.ReentrantReadWriteLock; 10 | 11 | interface AnyCache { 12 | int get(int key); 13 | void put(int key,int value); 14 | } 15 | 16 | class MyConcurrentCache implements AnyCache { 17 | 18 | final ConcurrentHashMap m = new ConcurrentHashMap(); 19 | @Override 20 | public int get(int key) { 21 | return m.get(key); 22 | } 23 | 24 | @Override 25 | public void put(int key,int value) { 26 | m.put(key,value); 27 | } 28 | } 29 | 30 | class MyLockCache implements AnyCache { 31 | 32 | final ReadWriteLock rw = new ReentrantReadWriteLock(false); 33 | final HashMap m = new HashMap(); 34 | 35 | @Override 36 | public int get(int key) { 37 | rw.readLock().lock(); 38 | try { 39 | return m.get(key); 40 | } finally { 41 | rw.readLock().unlock(); 42 | } 43 | } 44 | 45 | @Override 46 | public void put(int key,int value) { 47 | rw.writeLock().lock(); 48 | try { 49 | m.put(key, value); 50 | } finally { 51 | rw.writeLock().unlock(); 52 | } 53 | } 54 | } 55 | 56 | /* 57 | note, this would crash in a real "multi" environment, but only works here since 58 | the map is pre-populated so it is never resized. 
There is no easy way in jmh to restrict 59 | certain benchmarks to certain parameters 60 | */ 61 | class MyUnsharedCache implements AnyCache { 62 | final Map m = new HashMap(); 63 | 64 | @Override 65 | public int get(int key) { 66 | return m.get(key); 67 | } 68 | 69 | @Override 70 | public void put(int key,int value) { 71 | m.put(key,value); 72 | } 73 | } 74 | 75 | class IntMap implements AnyCache { 76 | static class node { 77 | int key,value; 78 | node next; 79 | } 80 | 81 | private final node[] table; 82 | private final int mask; 83 | private static int nextPowerOf2(int v) { 84 | v--; 85 | v |= v >> 1; 86 | v |= v >> 2; 87 | v |= v >> 4; 88 | v |= v >> 8; 89 | v |= v >> 16; 90 | v++; 91 | return v; 92 | } 93 | public IntMap(int size) { 94 | size = nextPowerOf2(size); 95 | table = new node[size]; 96 | mask = size-1; 97 | } 98 | @Override 99 | public int get(int key) { 100 | node n = table[key&mask]; 101 | if (n==null) { 102 | return 0; 103 | } 104 | for(;n!=null;n=n.next) { 105 | if(n.key==key){ 106 | return n.value; 107 | } 108 | } 109 | return 0; 110 | } 111 | 112 | @Override 113 | public void put(int key, int value) { 114 | node head = table[key&mask]; 115 | for(node n=head;n!=null;n=n.next) { 116 | if(n.key==key) { 117 | n.value=value; 118 | return; 119 | } 120 | } 121 | node n = new node(); 122 | n.key=key; 123 | n.value=value; 124 | n.next=head; 125 | table[key&mask]=n; 126 | } 127 | } 128 | 129 | @State(Scope.Benchmark) 130 | @Fork(1) 131 | @Warmup(iterations = 1, time = 3) 132 | @Measurement(iterations = 5, time = 3) 133 | @BenchmarkMode(Mode.AverageTime) 134 | @OutputTimeUnit(TimeUnit.NANOSECONDS) 135 | 136 | public class TestJavaCache { 137 | final int Mask = (1024*1024)-1; 138 | final int NTHREADS = 2; 139 | 140 | static int rand(int r) { 141 | /* Algorithm "xor" from p. 
4 of Marsaglia, "Xorshift RNGs" */ 142 | r ^= r << 13; 143 | r ^= r >> 17; 144 | r ^= r << 5; 145 | return r & 0x7fffffff; 146 | } 147 | 148 | @Param({"unshared", "concurrent", "lock","intmap","intmap2"}) 149 | public String arg; 150 | 151 | static AnyCache m; 152 | 153 | static ExecutorService e; 154 | 155 | public int Sink; 156 | 157 | @Setup 158 | public void setup() { 159 | switch(arg){ 160 | case "unshared": 161 | m = new MyUnsharedCache(); break; 162 | case "concurrent": 163 | m = new MyConcurrentCache(); break; 164 | case "lock": 165 | m = new MyLockCache(); break; 166 | case "intmap": 167 | m = new IntMap(256000); break; 168 | case "intmap2": 169 | m = new IntMap(1000000); break; 170 | } 171 | 172 | e = Executors.newFixedThreadPool(NTHREADS); 173 | for(int i=0;i<=Mask;i++){ 174 | m.put(i,i); 175 | } 176 | } 177 | @TearDown 178 | public void tearDown() { 179 | e.shutdown(); 180 | for(int i=0;i<=Mask;i++){ 181 | if ((m.get(i)&Mask) != (i&Mask)) { 182 | throw new IllegalStateException("index "+i+" = "+m.get(i)); 183 | } 184 | } 185 | } 186 | 187 | @Benchmark 188 | @OperationsPerInvocation(1000000) 189 | public void Test0Get() { 190 | int sum=0; 191 | int r = (int)System.nanoTime(); 192 | for(int i=0;i<1000000;i++) { 193 | r = rand(r); 194 | sum+=m.get(r&Mask); 195 | } 196 | Sink = sum; 197 | } 198 | 199 | @Benchmark 200 | @OperationsPerInvocation(1000000) 201 | public void Test2Put() { 202 | int r = (int)System.nanoTime(); 203 | for(int i=0;i<1000000;i++) { 204 | r = rand(r); 205 | m.put(r&Mask,r); 206 | } 207 | } 208 | 209 | @Benchmark 210 | @OperationsPerInvocation(1000000) 211 | public void Test3PutGet() { 212 | int r = (int)System.nanoTime(); 213 | int sum=0; 214 | for(int i=0;i<1000000;i++) { 215 | r = rand(r); 216 | m.put(r&Mask,r); 217 | r = rand(r); 218 | sum+=m.get(r&Mask); 219 | } 220 | Sink = sum; 221 | } 222 | 223 | @Benchmark 224 | @OperationsPerInvocation(1000000) 225 | public void Test4MultiGet() throws InterruptedException { 226 | CountDownLatch latch = new CountDownLatch(NTHREADS); 227 | 228 | Runnable run = () -> { 229 | Test0Get(); 230 | latch.countDown(); 231 | }; 232 | for(int i=0;i { 244 | Test2Put(); 245 | latch.countDown(); 246 | }; 247 | for(int i=0;i { 259 | Test3PutGet(); 260 | latch.countDown(); 261 | }; 262 | for(int i=0;i> 1 12 | v |= v >> 2 13 | v |= v >> 4 14 | v |= v >> 8 15 | v |= v >> 16 16 | v++ 17 | return v 18 | } 19 | 20 | type node struct { 21 | key, value int 22 | next *node 23 | } 24 | 25 | type IntMap struct { 26 | table []*node 27 | mask int 28 | } 29 | 30 | func NewIntMap(size int) *IntMap { 31 | size = nextPowerOf2(size) 32 | m := IntMap{} 33 | m.table = make([]*node, size) 34 | m.mask = size - 1 35 | return &m 36 | } 37 | 38 | func (m *IntMap) Get(key int) int { 39 | node := m.table[key&m.mask] 40 | if node == nil { 41 | return 0 42 | } 43 | for ; node != nil; node = node.next { 44 | if node.key == key { 45 | return node.value 46 | } 47 | } 48 | return 0 49 | } 50 | func (m *IntMap) Put(key int, value int) { 51 | head := m.table[key&m.mask] 52 | for node := head; node != nil; node = node.next { 53 | if node.key == key { 54 | node.value = value 55 | return 56 | } 57 | } 58 | n := &node{key: key, value: value, next: head} 59 | m.table[key&m.mask] = n 60 | } 61 | 62 | type SharedIntMap struct { 63 | table []*node 64 | mask int 65 | } 66 | 67 | func NewSharedIntMap(size int) *SharedIntMap { 68 | size = nextPowerOf2(size) 69 | m := SharedIntMap{} 70 | m.table = make([]*node, size) 71 | m.mask = size - 1 72 | return &m 73 | } 74 | 75 | func (m 
*SharedIntMap) Get(key int) int { 76 | p := (*unsafe.Pointer)(unsafe.Pointer(&m.table[key&m.mask])) 77 | node := (*node)(atomic.LoadPointer(p)) 78 | 79 | for ; node != nil; node = node.next { 80 | if node.key == key { 81 | return node.value 82 | } 83 | } 84 | return 0 85 | } 86 | func (m *SharedIntMap) Put(key int, value int) { 87 | 88 | p := (*unsafe.Pointer)(unsafe.Pointer(&m.table[key&m.mask])) 89 | 90 | for { 91 | head := (*node)(atomic.LoadPointer(p)) 92 | for node := head; node != nil; node = node.next { 93 | if node.key == key { 94 | node.value = value 95 | //if !atomic.CompareAndSwapPointer(p,head,head) { 96 | // continue 97 | //} 98 | return 99 | } 100 | } 101 | n := &node{key: key, value: value, next: head} 102 | if atomic.CompareAndSwapPointer(p, unsafe.Pointer(head), unsafe.Pointer(n)) { 103 | continue 104 | } 105 | } 106 | } 107 | 108 | type Cache interface { 109 | Get(key int) int 110 | Put(key int, value int) 111 | } 112 | 113 | type LockCache struct { 114 | sync.RWMutex 115 | m map[int]int 116 | } 117 | 118 | func NewLockCache() *LockCache { 119 | m := LockCache{m: make(map[int]int)} 120 | return &m 121 | } 122 | 123 | func (m *LockCache) Get(key int) int { 124 | m.RLock() 125 | val, _ := m.m[key] 126 | m.RUnlock() // non-idiomatic go, but avoid defer performance hit 127 | return val 128 | } 129 | func (m *LockCache) Put(key int, value int) { 130 | m.Lock() 131 | m.m[key] = value 132 | m.Unlock() // non-idiomatic go, but avoid defer performance hit 133 | } 134 | 135 | type ShardCache struct { 136 | maps [10]map[int]int 137 | } 138 | 139 | func NewShardCache() *ShardCache { 140 | m := ShardCache{} 141 | for i := 0; i < 10; i++ { 142 | m.maps[i] = make(map[int]int) 143 | } 144 | return &m 145 | } 146 | 147 | func (m *ShardCache) Get(key int) int { 148 | val, _ := m.maps[key%10][key] 149 | return val 150 | } 151 | func (m *ShardCache) Put(key int, value int) { 152 | m.maps[key%10][key] = value 153 | } 154 | 155 | const SharedShardMask = 128 - 1 156 | 157 | type imap map[int]int 158 | 159 | type shard struct { 160 | imap 161 | sync.RWMutex 162 | } 163 | 164 | type SharedShardCache struct { 165 | shards [128]*shard 166 | } 167 | 168 | func NewSharedShardCache() *SharedShardCache { 169 | m := SharedShardCache{} 170 | for i := 0; i < 128; i++ { 171 | s := shard{imap: make(imap)} 172 | m.shards[i] = &s 173 | } 174 | return &m 175 | } 176 | 177 | func (m *SharedShardCache) Get(key int) int { 178 | s := m.shards[key&SharedShardMask] 179 | 180 | s.RLock() 181 | val, ok := s.imap[key] 182 | s.RUnlock() 183 | 184 | if !ok { 185 | return 0 186 | } 187 | return val 188 | } 189 | 190 | func (m *SharedShardCache) Put(key int, value int) { 191 | s := m.shards[key&SharedShardMask] 192 | s.Lock() 193 | s.imap[key] = value 194 | s.Unlock() 195 | } 196 | 197 | type UnsharedCache map[int]int 198 | 199 | func NewUnsharedCache() *UnsharedCache { 200 | m := UnsharedCache{} 201 | return &m 202 | } 203 | 204 | func (m *UnsharedCache) Get(key int) int { 205 | val := (*m)[key] 206 | return val 207 | } 208 | func (m *UnsharedCache) Put(key int, value int) { 209 | (*m)[key] = value 210 | } 211 | 212 | type SyncCache struct { 213 | m sync.Map 214 | } 215 | 216 | func NewSyncCache() *SyncCache { 217 | m := SyncCache{} 218 | return &m 219 | } 220 | 221 | func (m *SyncCache) Get(key int) int { 222 | val, _ := m.m.Load(key) 223 | if val == nil { 224 | return 0 225 | } 226 | return val.(int) 227 | } 228 | func (m *SyncCache) Put(key int, value int) { 229 | m.m.Store(key, value) 230 | } 231 | 232 | type 
PutRequest struct { 233 | key, value int 234 | } 235 | type GetRequest struct { 236 | key int 237 | } 238 | 239 | type ChannelCache struct { 240 | m map[int]int 241 | request chan interface{} 242 | response chan int 243 | } 244 | 245 | func (m *ChannelCache) Get(key int) int { 246 | m.request <- GetRequest{key} 247 | return <-m.response 248 | } 249 | 250 | func (m *ChannelCache) Put(key int, value int) { 251 | m.request <- PutRequest{key, value} 252 | } 253 | 254 | func NewChannelCache() *ChannelCache { 255 | c := &ChannelCache{m: make(map[int]int), request: make(chan interface{}), response: make(chan int)} 256 | go func() { 257 | for { 258 | request := <-c.request 259 | switch request.(type) { 260 | case GetRequest: 261 | val, ok := c.m[request.(GetRequest).key] 262 | if !ok { 263 | val = 0 264 | } 265 | c.response <- val 266 | case PutRequest: 267 | c.m[request.(PutRequest).key] = request.(PutRequest).value 268 | } 269 | } 270 | }() 271 | return c 272 | } 273 | -------------------------------------------------------------------------------- /maps_test.go: -------------------------------------------------------------------------------- 1 | package go_concurrency_test 2 | 3 | import ( 4 | "fmt" 5 | "github.com/robaho/go-concurrency-test" 6 | "sync" 7 | "sync/atomic" 8 | "testing" 9 | "time" 10 | ) 11 | 12 | const NGOS = 2 // number of concurrent go routines for read/load tests 13 | const Mask = (1024 * 1024) - 1 14 | 15 | var um = go_concurrency.NewUnsharedCache() 16 | var lm = go_concurrency.NewLockCache() 17 | var sm = go_concurrency.NewSyncCache() 18 | var cm = go_concurrency.NewChannelCache() 19 | var sc = go_concurrency.NewShardCache() 20 | var ssc = go_concurrency.NewSharedShardCache() 21 | var im = go_concurrency.NewIntMap(256000) // so there are 4x collisions 22 | var im2 = go_concurrency.NewIntMap(1000000) // so there are no collisions 23 | var sim = go_concurrency.NewSharedIntMap(1000000) // so there are no collisions 24 | 25 | var Sink atomic.Value 26 | 27 | func rand(r int) int { 28 | /* Algorithm "xor" from p. 
4 of Marsaglia, "Xorshift RNGs" */ 29 | r ^= r << 13 30 | r ^= r >> 17 31 | r ^= r << 5 32 | return r & 0x7fffffff 33 | } 34 | 35 | func TestNewSharedIntMap(t *testing.T) { 36 | for i := 0; i < 2000000; i++ { 37 | sim.Put(i, i) 38 | } 39 | for i := 0; i < 2000000; i++ { 40 | if sim.Get(i) != i { 41 | t.Fatal("wrong value") 42 | } 43 | } 44 | } 45 | 46 | func BenchmarkRand(m *testing.B) { 47 | r := time.Now().Nanosecond() 48 | for i := 0; i < m.N; i++ { 49 | r = rand(r) 50 | } 51 | Sink.Store(r) 52 | } 53 | 54 | func testget(impl go_concurrency.Cache, b *testing.B) { 55 | r := time.Now().Nanosecond() 56 | 57 | var sum int 58 | for i := 0; i < b.N; i++ { 59 | r = rand(r) 60 | sum += impl.Get(r & Mask) 61 | } 62 | Sink.Store(sum) 63 | } 64 | func testput(impl go_concurrency.Cache, b *testing.B) { 65 | r := time.Now().Nanosecond() 66 | for i := 0; i < b.N; i++ { 67 | r = rand(r) 68 | impl.Put(r&Mask, r) 69 | } 70 | } 71 | func testputget(impl go_concurrency.Cache, b *testing.B) { 72 | r := time.Now().Nanosecond() 73 | var sum int 74 | for i := 0; i < b.N; i++ { 75 | r = rand(r) 76 | impl.Put(r&Mask, r) 77 | r = rand(r) 78 | sum += impl.Get(r & Mask) 79 | } 80 | Sink.Store(sum) 81 | } 82 | func BenchmarkMain(m *testing.B) { 83 | fmt.Println("populating maps...") 84 | for i := 0; i <= Mask; i++ { 85 | um.Put(i, i) 86 | lm.Put(i, i) 87 | sm.Put(i, i) 88 | cm.Put(i, i) 89 | sc.Put(i, i) 90 | ssc.Put(i, i) 91 | im.Put(i, i) 92 | im2.Put(i, i) 93 | sim.Put(i, i) 94 | } 95 | 96 | sm.Get(100) 97 | m.ResetTimer() 98 | 99 | impls := []go_concurrency.Cache{um, lm, sm, cm, sc, ssc, im, im2, sim} 100 | names := []string{"unshared", "lock", "sync", "channel", "shard", "shareshard", "intmap", "intmap2", "sharedint"} 101 | multi := []bool{false, true, true, true, false, true, true, true, true} 102 | 103 | //impls := []go_concurrency.Cache{lm,sm} 104 | //names := []string{"lock","sync"} 105 | //multi := []bool{true,true} 106 | 107 | for i := 0; i < len(impls); i++ { 108 | impl := impls[i] 109 | m.Run(names[i]+".get", func(b *testing.B) { 110 | testget(impl, b) 111 | }) 112 | m.Run(names[i]+".put", func(b *testing.B) { 113 | testput(impl, b) 114 | }) 115 | m.Run(names[i]+".putget", func(b *testing.B) { 116 | testputget(impl, b) 117 | }) 118 | m.Run(names[i]+".multiget", func(b *testing.B) { 119 | wg := sync.WaitGroup{} 120 | for g := 0; g < NGOS; g++ { 121 | wg.Add(1) 122 | go func() { 123 | testget(impl, b) 124 | wg.Done() 125 | }() 126 | } 127 | wg.Wait() 128 | }) 129 | if !multi[i] { // some impl do not support concurrent write 130 | continue 131 | } 132 | m.Run(names[i]+".multiput", func(b *testing.B) { 133 | wg := sync.WaitGroup{} 134 | for g := 0; g < NGOS; g++ { 135 | wg.Add(1) 136 | go func() { 137 | testput(impl, b) 138 | wg.Done() 139 | }() 140 | } 141 | wg.Wait() 142 | }) 143 | m.Run(names[i]+".multiputget", func(b *testing.B) { 144 | wg := sync.WaitGroup{} 145 | for g := 0; g < NGOS; g++ { 146 | wg.Add(1) 147 | go func() { 148 | testputget(impl, b) 149 | wg.Done() 150 | }() 151 | } 152 | wg.Wait() 153 | }) 154 | } 155 | } 156 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 |
2 | Update 10/09/2024 3 | 4 | With Go generics, it is straightforward (it should be, at least - hashing arbitrary key types is the open question) to use the techniques listed below to create a highly performant concurrent map implementation; a sketch follows this section. 5 | 6 | I have updated the timings using OpenJDK 21, Go 1.22.5, and clang version 14.0.0 (clang-1400.0.29.202) on an iMac with an Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz. 7 |
8 |
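As a rough illustration of the generics point above, here is a minimal sketch of a generics-based sharded map built from the same ingredients as the "shared shard" cache in maps.go (an RWMutex and a built-in map per shard). It is not the implementation benchmarked here, and it assumes the caller supplies the hash function - the open question noted above:

```
package cache

import "sync"

const nshards = 128 // power of 2, as in the shared shard cache

// Map is a sharded, lock-based generic map. The caller supplies the hash
// function for the key type.
type Map[K comparable, V any] struct {
	shards [nshards]struct {
		sync.RWMutex
		m map[K]V
	}
	hash func(K) uint64
}

func New[K comparable, V any](hash func(K) uint64) *Map[K, V] {
	c := &Map[K, V]{hash: hash}
	for i := range c.shards {
		c.shards[i].m = make(map[K]V)
	}
	return c
}

func (c *Map[K, V]) Get(key K) (V, bool) {
	s := &c.shards[c.hash(key)&(nshards-1)]
	s.RLock()
	v, ok := s.m[key]
	s.RUnlock() // explicit unlock rather than defer, as in the benchmarked caches
	return v, ok
}

func (c *Map[K, V]) Put(key K, value V) {
	s := &c.shards[c.hash(key)&(nshards-1)]
	s.Lock()
	s.m[key] = value
	s.Unlock()
}
```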
9 | Update 11/26/18 10 | 11 | After discussion in Go issue [28938](https://github.com/golang/go/issues/28938) I am prepared to make some final conclusions. 12 | 13 | Concurrent map data structures in Go are most efficiently implemented using locks and the built-in map implementation. This is especially true 14 | if the structure can be sharded to avoid contention on a single write mutex (see the "shared shard" impl). 15 | Locks are very efficient in Go due to the user-level threading of goroutines. 16 | 17 | The problem with Go's sync.Map is that it is a library type rather than "internal" like the built-in map implementation, so multiple layers of indirection are required 18 | to accomplish some operations (use of interfaces, no atomic CAS at the table-entry level) - a short illustration follows this section. 19 | 20 | In my tests I developed a fully concurrent "shared intmap" which shows excellent performance. For a variety of reasons it is not a general solution, 21 | but it shows the performance potential of a redesigned sync.Map. 22 | 23 | The Go issue above links to several issues #[21031](https://github.com/golang/go/issues/54720), #[21035](https://github.com/golang/go/issues/21035) that, when resolved, 24 | should bring sync.Map performance in line with Java's ConcurrentHashMap. 25 | 26 | To be fair, most of the performance issues with sync.Map only surface with a large map and a large working set of active keys, as the indirection is 27 | especially painful due to CPU cache misses. 28 |
29 |
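To make the indirection concrete, the two read paths look like this (mirroring the SyncCache and LockCache implementations in maps.go; this is just an illustration of the point above, not new benchmark code). With sync.Map the int keys and values travel as interface{} (any) and come back through a type assertion, while the locked built-in map stores them directly:

```
package cache

import "sync"

// sync.Map: keys and values travel as interface{} (any), so an int value is
// boxed on Store and recovered with a type assertion on Load.
func syncGet(m *sync.Map, key int) int {
	v, ok := m.Load(key) // v has static type any
	if !ok {
		return 0
	}
	return v.(int)
}

// RWMutex + built-in map: the ints are stored directly in the map buckets;
// the only extra cost is the lock itself.
func lockGet(mu *sync.RWMutex, m map[int]int, key int) int {
	mu.RLock()
	v := m[key]
	mu.RUnlock()
	return v
}
```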
30 | Update 11/24/18 31 | 32 | I added 'intmap' versions, which are fixed-size maps (no resizing). The 'intmap2' is sized to 33 | avoid any collisions, while 'intmap' has 4x collisions (the arithmetic behind these factors is sketched after this section). 34 | 35 | I added 'C' versions of the 'intmap' for comparison. I also added a 'lock map' in Java similar to the Go one. 36 | 37 | I increased the number of elements to 1000000 to avoid the cache fitting in the L2. 38 | 39 | In all cases, the maps are pre-allocated to avoid any allocation overhead. 40 | 41 | The major change across the board was to use random get/put indexes, as the linear read/put was biased towards Java: Go uses 42 | an alternate hash method so that sequential keys are not sequential in memory. 43 | 44 | I re-ran the tests with the latest versions, Go 1.11.2 and Java 1.8_191, and C was compiled with clang-1000.11.45.5. 45 | 46 | The tests now show Go to be a clear winner when using locks, but sync.Map has significant performance issues. 47 | 48 | Both Go and Java perform very close to the optimized C versions, with Java bettering Go in almost all cases (intmap tests) - this is probably 49 | the reason the Go map implementation uses arrays of structs rather than a linked list of nodes. 50 | 51 | *** A note about the multi timings... They are not divided by the number of goroutines/threads (which doubles the number of operations), but since there 52 | are only 2 and ample cores are available - meaning they should execute concurrently - they measure the contention overhead more directly, since the timings 53 | can be directly compared to the single routine/thread case. 54 |
55 |
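As a quick check of the collision factors quoted above, using the same nextPowerOf2 sizing as NewIntMap in maps.go and the 2^20 distinct keys the benchmarks touch (Mask+1):

```
package main

import "fmt"

// nextPowerOf2 is the same rounding used by NewIntMap in maps.go.
func nextPowerOf2(v int) int {
	v--
	v |= v >> 1
	v |= v >> 2
	v |= v >> 4
	v |= v >> 8
	v |= v >> 16
	v++
	return v
}

func main() {
	const keys = 1024 * 1024                  // the benchmarks use 2^20 distinct keys (Mask+1)
	fmt.Println(keys / nextPowerOf2(256000))  // 4 -> 'intmap' averages 4 keys per bucket (4x collisions)
	fmt.Println(keys / nextPowerOf2(1000000)) // 1 -> 'intmap2' has 1 key per bucket (no collisions)
}
```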
56 | Update 57 | 58 | Based on feedback from Bryan Mills, I've updated the implementation using channels, and re-tested. I do not believe it has 59 | made a significant difference. It does highlight that the 'Get' is worse than the 'Put' because 2 channels are involved. 60 | 61 | He has written [Rethinking Classical Concurrency in Go](https://golang.org/wiki/Go-Community-Slides#rethinking-classical-concurrency-patterns) 62 | 63 | He also noted that sync.Map has known performance issues; see 64 | [this search](https://github.com/golang/go/issues?utf8=✓&q=is%3Aissue+is%3Aopen+%22sync%3A%22+Map+in%3Atitle+label%3APerformance) 65 | 66 | He also noted that RWMutex has scalability issues (though I am not sure that applies here, since only 2 goroutines are tested); 67 | the relevant issue is [here](https://golang.org/issue/17973) 68 | 69 | I removed the use of defer in the lock implementation, as it is a known (or at least widely suspected) performance cost; the difference is sketched after this section. 70 | 71 | I reduced the number of elements cached to 500k, which should allow the cache to fit entirely in the L3 on the testing machine. 72 | It made some improvement. 73 | 74 | I updated the testing methodology to make certain constraints on the test clearer. 75 | 76 | I determined that PutGet not matching the sum of the Put and Get times was because of cache locality, so PutGet was changed to read from 77 | opposite sides, which corrected the problem, and so I removed the comment regarding potential 'go bench' accuracy issues. 78 | 79 | I added an 'unshared' cache to the Java tests for an additional baseline. 80 |
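The defer change amounts to the difference below (a minimal sketch using the LockCache type from maps.go; the method names here are illustrative). Note that defer has been substantially optimized in Go releases after the ones tested here, so the gap is likely smaller today:

```
// With defer: idiomatic, but at the time it added measurable per-call overhead.
func (m *LockCache) getWithDefer(key int) int {
	m.RLock()
	defer m.RUnlock()
	return m.m[key]
}

// Without defer: what the benchmarked LockCache actually does.
func (m *LockCache) getExplicit(key int) int {
	m.RLock()
	val := m.m[key]
	m.RUnlock()
	return val
}
```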
81 | 82 | **Summary** 83 | 84 | The Go sync.Map implementation has significant room for improvement, performing far worse than Java's ConcurrentHashMap, 85 | and locks should be used for low levels of concurrent access, especially for read-heavy use cases. 86 | 87 | The Go synchronization primitives and goroutine scheduling outperform Java with locks by a very wide margin. The user-level context switching 88 | is ideal. 89 | 90 | **Background** 91 | 92 | This is a project designed to test the concurrency mechanisms available in Go. Go states, 93 | 94 | [Do not communicate by sharing memory; instead, share memory by communicating.](https://blog.golang.org/share-memory-by-communicating) 95 | and the basic mechanism to achieve this is channels. 96 | 97 | In this test, I compare the 4 readily available ways to share information in Go: 98 | 1. unshared/basic map 99 | 2. using locks 100 | 3. using sync.Map 101 | 4. using channels 102 | 103 | The above methods are used to build a "memory cache", which is a very common structure in high-performance financial applications, which is my current 104 | area of work. This may not be the standard use case the Go authors envisioned, as the amount of work per operation (store/load) is very small. 105 | 106 | The 'unshared' map is not usable as a shared cache - possibly usable as a write-at-init, read-only cache in a specialized application with cooperative immutability. 107 | 108 | Additionally, I wrote the same tests using Java, but only for the sync.Map case, which is fairly equivalent to Java's ConcurrentHashMap, and this 109 | data structure and its methodologies use the same features as most lock-free structures (although ConcurrentHashMap does use locks on store). 110 | 111 | The tests run under [go bench](https://golang.org/pkg/testing/) for Go, and [jmh](http://openjdk.java.net/projects/code-tools/jmh/) for Java. 112 | 113 | In both cases, identical hardware was used: a 3.4 GHz Core i7, with 4 cores (8 threads), under OSX 10.13.6. 114 | 115 | The software versions are Go 1.11.2, Java 1.8_191, Java OpenJDK 11 with the Shenandoah GC, and LLVM version 10.0.0 (clang-1000.11.45.5). 116 | 117 | **Testing Methodology** 118 | 119 | The cache uses ints for keys and values, and the cache is limited to 1 million entries to avoid possible 'map' degradation. 120 | The map is also pre-populated with the 1 million entries and used for all of the tests, to ensure the tests do not measure map resizing costs and 121 | other start-up penalties. The expected cache size in memory is roughly 1 million * sizeof(int), but larger when pointers are used by the implementation, 122 | especially in the case of Java. No tests were performed to measure the actual memory usage. 123 | 124 | There are 3 operations tested: Get, Put, and PutGet. They are tested in an uncontested scenario using 1 goroutine, and they are also tested 125 | in a contested scenario (multi) using 2 goroutines. The contested case was limited to 2, since the machine only has 4 true cores, and there is 126 | housekeeping work performed by the benchmark harness, OS, etc., so this seemed fair. 127 | 128 | The tests for both Go and Java were executed via IntelliJ, as it has support for both 'go bench' and 'jmh'; roughly equivalent command-line invocations are sketched below. 129 | 130 | The testing harnesses do not operate identically, but I attempted to make the comparisons as fair as possible, mainly to use Java as a baseline 131 | for what would be expected of the Go solution, since in my experience in many cases the Go code seems to outperform the equivalent Java code.
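For reference, the benchmarks can also be run from the command line (paths follow this repository's layout; the jar name comes from the shade plugin's finalName in java/pom.xml, so treat these as a sketch rather than a tested script):

```
# Go benchmarks (from the repository root)
go test -bench=.

# Java benchmarks (JMH uber-jar produced by the maven-shade-plugin)
cd java && mvn package && java -jar target/jmh-benchmarks.jar

# C benchmarks
make -C c && ./c/hashmap
```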
132 | 133 | The caches are essentially 'static' structures. I did this for fairness. In testing there was no appreciable difference when the the cache structure 134 | was allocated within the method under test. 135 | 136 | Any GC related activity is included in the tests. No attempt was made to reduce, or tune the GC activity on any platform. 137 | 138 | **Go Test Results** 139 | 140 | ``` 141 | cpu: Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz 142 | BenchmarkRand-8 669243439 1.781 ns/op 143 | populating maps... 144 | BenchmarkMain/unshared.get-8 18877132 63.53 ns/op 145 | BenchmarkMain/unshared.put-8 17971988 69.09 ns/op 146 | BenchmarkMain/unshared.putget-8 9268077 131.4 ns/op 147 | BenchmarkMain/unshared.multiget-8 17787741 66.86 ns/op 148 | BenchmarkMain/lock.get-8 17399762 70.33 ns/op 149 | BenchmarkMain/lock.put-8 10549318 114.1 ns/op 150 | BenchmarkMain/lock.putget-8 6803950 178.8 ns/op 151 | BenchmarkMain/lock.multiget-8 10305220 116.4 ns/op 152 | BenchmarkMain/lock.multiput-8 4546767 255.9 ns/op 153 | BenchmarkMain/lock.multiputget-8 2609588 448.0 ns/op 154 | BenchmarkMain/sync.get-8 5393236 216.7 ns/op 155 | BenchmarkMain/sync.put-8 3445075 353.9 ns/op 156 | BenchmarkMain/sync.putget-8 1963018 651.0 ns/op 157 | BenchmarkMain/sync.multiget-8 3567596 326.3 ns/op 158 | BenchmarkMain/sync.multiput-8 3339694 377.1 ns/op 159 | BenchmarkMain/sync.multiputget-8 1865768 676.4 ns/op 160 | BenchmarkMain/channel.get-8 2028018 614.3 ns/op 161 | BenchmarkMain/channel.put-8 3150184 384.3 ns/op 162 | BenchmarkMain/channel.putget-8 1226596 984.5 ns/op 163 | BenchmarkMain/channel.multiget-8 1000000 1169 ns/op 164 | BenchmarkMain/channel.multiput-8 1468042 790.1 ns/op 165 | BenchmarkMain/channel.multiputget-8 641374 1873 ns/op 166 | BenchmarkMain/shard.get-8 16428013 71.47 ns/op 167 | BenchmarkMain/shard.put-8 12421777 85.71 ns/op 168 | BenchmarkMain/shard.putget-8 7720970 152.4 ns/op 169 | BenchmarkMain/shard.multiget-8 15689773 75.81 ns/op 170 | BenchmarkMain/shareshard.get-8 16479192 72.35 ns/op 171 | BenchmarkMain/shareshard.put-8 9910390 110.3 ns/op 172 | BenchmarkMain/shareshard.putget-8 6203272 183.1 ns/op 173 | BenchmarkMain/shareshard.multiget-8 14384760 83.03 ns/op 174 | BenchmarkMain/shareshard.multiput-8 8925080 133.5 ns/op 175 | BenchmarkMain/shareshard.multiputget-8 4910743 248.7 ns/op 176 | BenchmarkMain/intmap.get-8 14669283 83.42 ns/op 177 | BenchmarkMain/intmap.put-8 6084549 191.5 ns/op 178 | BenchmarkMain/intmap.putget-8 4504140 250.0 ns/op 179 | BenchmarkMain/intmap.multiget-8 12845756 88.15 ns/op 180 | BenchmarkMain/intmap.multiput-8 5912864 190.2 ns/op 181 | BenchmarkMain/intmap.multiputget-8 4621642 255.4 ns/op 182 | BenchmarkMain/intmap2.get-8 30600810 38.98 ns/op 183 | BenchmarkMain/intmap2.put-8 8988387 125.8 ns/op 184 | BenchmarkMain/intmap2.putget-8 5373553 210.6 ns/op 185 | BenchmarkMain/intmap2.multiget-8 27520656 41.71 ns/op 186 | BenchmarkMain/intmap2.multiput-8 9286503 128.7 ns/op 187 | BenchmarkMain/intmap2.multiputget-8 5525810 211.8 ns/op 188 | BenchmarkMain/sharedint.get-8 20388027 58.56 ns/op 189 | BenchmarkMain/sharedint.put-8 5578724 210.7 ns/op 190 | BenchmarkMain/sharedint.putget-8 3231453 365.0 ns/op 191 | BenchmarkMain/sharedint.multiget-8 17616985 65.32 ns/op 192 | BenchmarkMain/sharedint.multiput-8 5640673 207.1 ns/op 193 | BenchmarkMain/sharedint.multiputget-8 3212062 364.5 ns/op 194 | ``` 195 | 196 | **Go Analysis** 197 | 198 | There are several interesting, and disconcerting aspects. 199 | 200 | 1. 
The sync.Map performs far worse than the map using locks, even for Get - almost 3x slower. 201 | 2. Continuing on #1, the 'multi get' using sync performs worse than get. Again, this should only be a volatile load, with the 202 | expectation of performance equal to the unshared get, or at least the sync.Map singular read. 203 | 3. The channel method is more than 5x slower than the others, and this is using a very simple key/value structure, although a more complex one would 204 | probably necessitate using pointers. 205 | 206 | **Java Test Results** 207 | 208 | ``` 209 | using 1 fork, 1 warm-up iteration, and 5 iterations of 3 sec 210 | 211 | Benchmark (arg) Mode Cnt Score Error Units 212 | TestJavaCache.Test0Get unshared avgt 5 50.360 ± 5.020 ns/op 213 | TestJavaCache.Test0Get concurrent avgt 5 48.672 ± 1.108 ns/op 214 | TestJavaCache.Test0Get lock avgt 5 99.454 ± 13.884 ns/op 215 | TestJavaCache.Test0Get intmap avgt 5 48.244 ± 8.969 ns/op 216 | TestJavaCache.Test0Get intmap2 avgt 5 18.599 ± 0.447 ns/op 217 | TestJavaCache.Test2Put unshared avgt 5 109.816 ± 38.564 ns/op 218 | TestJavaCache.Test2Put concurrent avgt 5 174.497 ± 47.735 ns/op 219 | TestJavaCache.Test2Put lock avgt 5 174.281 ± 31.701 ns/op 220 | TestJavaCache.Test2Put intmap avgt 5 105.264 ± 6.934 ns/op 221 | TestJavaCache.Test2Put intmap2 avgt 5 78.434 ± 1.202 ns/op 222 | TestJavaCache.Test3PutGet unshared avgt 5 247.885 ± 55.511 ns/op 223 | TestJavaCache.Test3PutGet concurrent avgt 5 282.537 ± 22.152 ns/op 224 | TestJavaCache.Test3PutGet lock avgt 5 298.475 ± 29.188 ns/op 225 | TestJavaCache.Test3PutGet intmap avgt 5 149.688 ± 3.338 ns/op 226 | TestJavaCache.Test3PutGet intmap2 avgt 5 119.706 ± 1.820 ns/op 227 | TestJavaCache.Test4MultiGet unshared avgt 5 53.801 ± 0.663 ns/op 228 | TestJavaCache.Test4MultiGet concurrent avgt 5 53.011 ± 1.252 ns/op 229 | TestJavaCache.Test4MultiGet lock avgt 5 329.347 ± 2.941 ns/op 230 | TestJavaCache.Test4MultiGet intmap avgt 5 51.659 ± 9.043 ns/op 231 | TestJavaCache.Test4MultiGet intmap2 avgt 5 22.869 ± 3.840 ns/op 232 | TestJavaCache.Test5MultiPut unshared avgt 5 108.950 ± 28.201 ns/op 233 | TestJavaCache.Test5MultiPut concurrent avgt 5 159.692 ± 23.058 ns/op 234 | TestJavaCache.Test5MultiPut lock avgt 5 470.977 ± 84.851 ns/op 235 | TestJavaCache.Test5MultiPut intmap avgt 5 104.542 ± 1.934 ns/op 236 | TestJavaCache.Test5MultiPut intmap2 avgt 5 78.117 ± 3.363 ns/op 237 | TestJavaCache.Test6MultiPutGet unshared avgt 5 240.432 ± 59.746 ns/op 238 | TestJavaCache.Test6MultiPutGet concurrent avgt 5 285.872 ± 38.001 ns/op 239 | TestJavaCache.Test6MultiPutGet lock avgt 5 1678.923 ± 288.992 ns/op 240 | TestJavaCache.Test6MultiPutGet intmap avgt 5 151.847 ± 5.489 ns/op 241 | TestJavaCache.Test6MultiPutGet intmap2 avgt 5 122.743 ± 3.129 ns/op 242 | 243 | ``` 244 | *** The Java multi-unshared and intmap are not valid, but no easy way to exclude with jmh. It doesn't crash because the maps are 245 | pre-populated and don't resize. 246 | 247 | **Java Analysis** 248 | 249 | 1. The warm-up phase matters little in tests like this, since so many iterations are required. 250 | 2. The Java concurrent 'multi get' is in line with the 'get', since it should be a lock-free volatile read. 251 | 3. The results shows that Java's dynamic inlining can achieve amazing performance. 252 | 4. The Java boxing of primitives into objects (required by CHM), seems very efficient. 253 | 5. When using equal read & write, the Java "lock" methods shows very poor performance compared with the concurrent. 
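Regarding the note above that the Java 'multi' unshared and intmap results are not valid: while jmh has no annotation to restrict a benchmark to particular @Param values, the parameter set can be narrowed from the command line when running the shaded jar, which is one way to keep the invalid combinations out of a run (the invocation below is a sketch; adjust the regex and parameter list as needed):

```
# run only the multi benchmarks against implementations that tolerate concurrent writes
java -jar target/jmh-benchmarks.jar "TestJavaCache.Test[456].*" -p arg=concurrent,lock
```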
254 | 255 | **C Test Results** 256 | 257 | ``` 258 | With -O0 259 | intmap put = 170.991800 ns/op 260 | intmap get = 134.725800 ns/op 261 | intmap2 put = 108.658200 ns/op 262 | intmap2 get = 79.461800 ns/op 263 | 264 | With -O3 265 | intmap put = 116.890400 ns/op 266 | intmap get = 56.260000 ns/op 267 | intmap2 put = 94.516000 ns/op 268 | intmap2 get = 25.283200 ns/op 269 | ``` 270 | 271 | **Overall Findings** 272 | 273 | The Go performance numbers as compared to Java show that there is a lot of room for improvement in the available Go 274 | structures for concurrent applications. The Go "use channels" approach is only suitable when the handler performs a significant amount of work per operation, or when 275 | simplicity of code is the priority (the handler is single-threaded, and the use of locks has other issues). The sync.Map should be nearly identical to the 'unshared map' 276 | for reads (as it is for Java, especially the Get vs. MultiGet), but it is 2x slower - this is an indicator of a poor implementation, poor compiler optimizations, 277 | or that the underlying native operations used (CAS, etc.) need to be examined. 278 | 279 | _My experience with Go is ongoing, and there is a chance I've made some errors in these tests, and I welcome community input to improve them._ 280 | --------------------------------------------------------------------------------