├── .gitignore
├── .idea
│   ├── compiler.xml
│   ├── libraries
│   │   ├── org_openjdk_jmh_jmh_core_1_21.xml
│   │   └── org_openjdk_jmh_jmh_generator_annprocess_1_21.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── uiDesigner.xml
│   └── vcs.xml
├── c
│   ├── Makefile
│   └── hashmap.c
├── go-concurrency-test.iml
├── go.mod
├── java
│   ├── pom.xml
│   └── src
│       └── test
│           └── TestJavaCache.java
├── maps.go
├── maps_test.go
└── readme.md
/.gitignore:
--------------------------------------------------------------------------------
1 | c/hashmap.dSYM
2 | target
3 | c/hashmap
--------------------------------------------------------------------------------
/c/Makefile:
--------------------------------------------------------------------------------
1 | # the compiler: gcc for C program, define as g++ for C++
2 | CC = gcc
3 |
4 | # compiler flags:
5 | # -g adds debugging information to the executable file
 6 | #  -Wall turns on most, but not all, compiler warnings; -O3 enables optimizations
7 | CFLAGS = -g -Wall -O3
8 |
9 | # the build target executable:
10 | TARGET = hashmap
11 |
12 | all: $(TARGET)
13 |
14 | $(TARGET): $(TARGET).c
15 | $(CC) $(CFLAGS) -o $(TARGET) $(TARGET).c
16 |
17 | clean:
18 | $(RM) $(TARGET)
19 |
--------------------------------------------------------------------------------
/c/hashmap.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <stdint.h>
  4 | #include <sys/time.h>
5 | struct node{
6 | int key;
7 | int val;
8 | struct node *next;
9 | };
10 | struct table{
11 | int size;
12 | struct node **list;
13 | };
14 | struct table *createTable(int size){
15 | struct table *t = (struct table*)malloc(sizeof(struct table));
16 | t->size = size;
17 | t->list = (struct node**)malloc(sizeof(struct node*)*size);
18 | int i;
 19 |     for(i=0;i<size;i++)
 20 |         t->list[i] = NULL;
21 | return t;
22 | }
23 | int hashCode(struct table *t,int key){
24 | if(key<0)
25 | return -(key%t->size);
26 | return key%t->size;
27 | }
28 | void insert(struct table *t,int key,int val){
29 | int pos = hashCode(t,key);
30 | struct node *list = t->list[pos];
31 | struct node *temp = list;
32 | while(temp){
33 | if(temp->key==key){
34 | temp->val = val;
35 | return;
36 | }
37 | temp = temp->next;
38 | }
39 | struct node *newNode = (struct node*)malloc(sizeof(struct node));
40 | newNode->key = key;
41 | newNode->val = val;
42 | newNode->next = list;
43 | t->list[pos] = newNode;
44 | }
45 | int lookup(struct table *t,int key){
46 | int pos = hashCode(t,key);
47 | struct node *list = t->list[pos];
48 | struct node *temp = list;
49 | while(temp){
50 | if(temp->key==key){
51 | return temp->val;
52 | }
53 | temp = temp->next;
54 | }
55 | return -1;
56 | }
57 |
58 | // calculate the time diff between start and end
59 | long delay(struct timeval t1, struct timeval t2)
60 | {
61 | long d;
62 | d = (t2.tv_sec - t1.tv_sec) * 1000000;
63 | d += t2.tv_usec - t1.tv_usec;
64 | return(d);
65 | }
66 |
67 | /* The state word must be initialized to non-zero */
68 | uint32_t myrand(uint32_t r)
69 | {
70 | /* Algorithm "xor" from p. 4 of Marsaglia, "Xorshift RNGs" */
71 | r ^= r << 13;
72 | r ^= r >> 17;
73 | r ^= r << 5;
74 | return r;
75 | }
76 |
77 | int Sink;
78 |
79 | void test(char *name,struct table *t) {
80 | int mask = (1024*1024)-1;
81 | struct timeval start, end;
82 | for(int i=0;i<=mask;i++){
83 | insert(t,i,i);
84 | }
85 | gettimeofday(&start, NULL);
86 | uint32_t r = start.tv_usec;
87 | for( int i=0;i<5000000;i++){
88 | r = myrand(r);
89 | int index = r & mask;
90 | insert(t,index,index);
91 | }
92 | gettimeofday(&end, NULL);
93 | printf("%s put = %lf ns/op\n", name,delay(start, end)/(5000000.0/1000));
94 |
95 | gettimeofday(&start, NULL);
96 | int count=0;
97 | for( int i=0;i<5000000;i++){
98 | r = myrand(r);
99 | int index = r & mask;
100 | count += lookup(t,index);
101 | }
102 | Sink=count;
103 | gettimeofday(&end, NULL);
104 | printf("%s get = %lf ns/op\n", name, delay(start, end)/(5000000.0/1000));
105 | }
106 |
107 | int main(){
108 | struct table *t = createTable(256000);
109 |
110 | test("intmap",t);
111 |
112 | t = createTable(1000000);
113 |
114 | test("intmap2",t);
115 | return 0;
116 | }
117 |
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/robaho/go-concurrency-test
2 |
3 | go 1.22.5
4 |
--------------------------------------------------------------------------------
/java/pom.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
  3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  5 |     <modelVersion>4.0.0</modelVersion>
  6 |     <groupId>robaho</groupId>
  7 |     <artifactId>go-concurrency-test</artifactId>
  8 |     <version>1.0-SNAPSHOT</version>
  9 |     <build>
 10 |         <sourceDirectory>src</sourceDirectory>
 11 |         <testSourceDirectory>src</testSourceDirectory>
 12 |         <plugins>
 13 |             <plugin>
 14 |                 <groupId>org.apache.maven.plugins</groupId>
 15 |                 <artifactId>maven-shade-plugin</artifactId>
 16 |                 <version>3.2.0</version>
 17 |                 <executions>
 18 |                     <execution>
 19 |                         <phase>package</phase>
 20 |                         <goals>
 21 |                             <goal>shade</goal>
 22 |                         </goals>
 23 |                         <configuration>
 24 |                             <finalName>jmh-benchmarks</finalName>
 25 |                             <transformers>
 26 |                                 <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
 27 |                                     <mainClass>org.openjdk.jmh.Main</mainClass>
 28 |                                 </transformer>
 29 |                             </transformers>
 30 |                         </configuration>
 31 |                     </execution>
 32 |                 </executions>
 33 |             </plugin>
 34 |         </plugins>
 35 |     </build>
 36 |     <dependencies>
 37 |         <dependency>
 38 |             <groupId>org.openjdk.jmh</groupId>
 39 |             <artifactId>jmh-core</artifactId>
 40 |             <version>1.37</version>
 41 |         </dependency>
 42 |         <dependency>
 43 |             <groupId>org.openjdk.jmh</groupId>
 44 |             <artifactId>jmh-generator-annprocess</artifactId>
 45 |             <version>1.37</version>
 46 |         </dependency>
 47 |     </dependencies>
 48 | </project>
--------------------------------------------------------------------------------
/java/src/test/TestJavaCache.java:
--------------------------------------------------------------------------------
1 | package test;
2 |
3 | import org.openjdk.jmh.annotations.*;
4 |
5 | import java.util.HashMap;
6 | import java.util.Map;
7 | import java.util.concurrent.*;
8 | import java.util.concurrent.locks.ReadWriteLock;
9 | import java.util.concurrent.locks.ReentrantReadWriteLock;
10 |
11 | interface AnyCache {
12 | int get(int key);
13 | void put(int key,int value);
14 | }
15 |
16 | class MyConcurrentCache implements AnyCache {
17 |
 18 |     final ConcurrentHashMap<Integer,Integer> m = new ConcurrentHashMap<>();
19 | @Override
20 | public int get(int key) {
21 | return m.get(key);
22 | }
23 |
24 | @Override
25 | public void put(int key,int value) {
26 | m.put(key,value);
27 | }
28 | }
29 |
30 | class MyLockCache implements AnyCache {
31 |
32 | final ReadWriteLock rw = new ReentrantReadWriteLock(false);
 33 |     final HashMap<Integer,Integer> m = new HashMap<>();
34 |
35 | @Override
36 | public int get(int key) {
37 | rw.readLock().lock();
38 | try {
39 | return m.get(key);
40 | } finally {
41 | rw.readLock().unlock();
42 | }
43 | }
44 |
45 | @Override
46 | public void put(int key,int value) {
47 | rw.writeLock().lock();
48 | try {
49 | m.put(key, value);
50 | } finally {
51 | rw.writeLock().unlock();
52 | }
53 | }
54 | }
55 |
56 | /*
57 | note, this would crash in a real "multi" environment, but only works here since
58 | the map is pre-populated so it is never resized. There is no easy way in jmh to restrict
59 | certain benchmarks to certain parameters
60 | */
61 | class MyUnsharedCache implements AnyCache {
 62 |     final Map<Integer,Integer> m = new HashMap<>();
63 |
64 | @Override
65 | public int get(int key) {
66 | return m.get(key);
67 | }
68 |
69 | @Override
70 | public void put(int key,int value) {
71 | m.put(key,value);
72 | }
73 | }
74 |
75 | class IntMap implements AnyCache {
76 | static class node {
77 | int key,value;
78 | node next;
79 | }
80 |
81 | private final node[] table;
82 | private final int mask;
83 | private static int nextPowerOf2(int v) {
84 | v--;
85 | v |= v >> 1;
86 | v |= v >> 2;
87 | v |= v >> 4;
88 | v |= v >> 8;
89 | v |= v >> 16;
90 | v++;
91 | return v;
92 | }
93 | public IntMap(int size) {
94 | size = nextPowerOf2(size);
95 | table = new node[size];
96 | mask = size-1;
97 | }
98 | @Override
99 | public int get(int key) {
100 | node n = table[key&mask];
101 | if (n==null) {
102 | return 0;
103 | }
104 | for(;n!=null;n=n.next) {
105 | if(n.key==key){
106 | return n.value;
107 | }
108 | }
109 | return 0;
110 | }
111 |
112 | @Override
113 | public void put(int key, int value) {
114 | node head = table[key&mask];
115 | for(node n=head;n!=null;n=n.next) {
116 | if(n.key==key) {
117 | n.value=value;
118 | return;
119 | }
120 | }
121 | node n = new node();
122 | n.key=key;
123 | n.value=value;
124 | n.next=head;
125 | table[key&mask]=n;
126 | }
127 | }
128 |
129 | @State(Scope.Benchmark)
130 | @Fork(1)
131 | @Warmup(iterations = 1, time = 3)
132 | @Measurement(iterations = 5, time = 3)
133 | @BenchmarkMode(Mode.AverageTime)
134 | @OutputTimeUnit(TimeUnit.NANOSECONDS)
135 |
136 | public class TestJavaCache {
137 | final int Mask = (1024*1024)-1;
138 | final int NTHREADS = 2;
139 |
140 | static int rand(int r) {
141 | /* Algorithm "xor" from p. 4 of Marsaglia, "Xorshift RNGs" */
142 | r ^= r << 13;
143 | r ^= r >> 17;
144 | r ^= r << 5;
145 | return r & 0x7fffffff;
146 | }
147 |
148 | @Param({"unshared", "concurrent", "lock","intmap","intmap2"})
149 | public String arg;
150 |
151 | static AnyCache m;
152 |
153 | static ExecutorService e;
154 |
155 | public int Sink;
156 |
157 | @Setup
158 | public void setup() {
159 | switch(arg){
160 | case "unshared":
161 | m = new MyUnsharedCache(); break;
162 | case "concurrent":
163 | m = new MyConcurrentCache(); break;
164 | case "lock":
165 | m = new MyLockCache(); break;
166 | case "intmap":
167 | m = new IntMap(256000); break;
168 | case "intmap2":
169 | m = new IntMap(1000000); break;
170 | }
171 |
172 | e = Executors.newFixedThreadPool(NTHREADS);
173 | for(int i=0;i<=Mask;i++){
174 | m.put(i,i);
175 | }
176 | }
177 | @TearDown
178 | public void tearDown() {
179 | e.shutdown();
180 | for(int i=0;i<=Mask;i++){
181 | if ((m.get(i)&Mask) != (i&Mask)) {
182 | throw new IllegalStateException("index "+i+" = "+m.get(i));
183 | }
184 | }
185 | }
186 |
187 | @Benchmark
188 | @OperationsPerInvocation(1000000)
189 | public void Test0Get() {
190 | int sum=0;
191 | int r = (int)System.nanoTime();
192 | for(int i=0;i<1000000;i++) {
193 | r = rand(r);
194 | sum+=m.get(r&Mask);
195 | }
196 | Sink = sum;
197 | }
198 |
199 | @Benchmark
200 | @OperationsPerInvocation(1000000)
201 | public void Test2Put() {
202 | int r = (int)System.nanoTime();
203 | for(int i=0;i<1000000;i++) {
204 | r = rand(r);
205 | m.put(r&Mask,r);
206 | }
207 | }
208 |
209 | @Benchmark
210 | @OperationsPerInvocation(1000000)
211 | public void Test3PutGet() {
212 | int r = (int)System.nanoTime();
213 | int sum=0;
214 | for(int i=0;i<1000000;i++) {
215 | r = rand(r);
216 | m.put(r&Mask,r);
217 | r = rand(r);
218 | sum+=m.get(r&Mask);
219 | }
220 | Sink = sum;
221 | }
222 |
223 | @Benchmark
224 | @OperationsPerInvocation(1000000)
225 | public void Test4MultiGet() throws InterruptedException {
226 | CountDownLatch latch = new CountDownLatch(NTHREADS);
227 |
228 | Runnable run = () -> {
229 | Test0Get();
230 | latch.countDown();
231 | };
232 |         for(int i=0;i<NTHREADS;i++) {
233 |             e.submit(run);
234 |         }
235 |         latch.await();
236 |     }
237 | 
238 |     @Benchmark
239 |     @OperationsPerInvocation(1000000)
240 |     public void Test5MultiPut() throws InterruptedException {
241 |         CountDownLatch latch = new CountDownLatch(NTHREADS);
242 | 
243 |         Runnable run = () -> {
244 | Test2Put();
245 | latch.countDown();
246 | };
247 |         for(int i=0;i<NTHREADS;i++) {
248 |             e.submit(run);
249 |         }
250 |         latch.await();
251 |     }
252 | 
253 |     @Benchmark
254 |     @OperationsPerInvocation(1000000)
255 |     public void Test6MultiPutGet() throws InterruptedException {
256 |         CountDownLatch latch = new CountDownLatch(NTHREADS);
257 | 
258 |         Runnable run = () -> {
259 | Test3PutGet();
260 | latch.countDown();
261 | };
262 |         for(int i=0;i<NTHREADS;i++) {
263 |             e.submit(run);
264 |         }
265 |         latch.await();
266 |     }
267 | }
--------------------------------------------------------------------------------
/maps.go:
--------------------------------------------------------------------------------
  1 | package go_concurrency
  2 | 
  3 | import (
  4 | 	"sync"
  5 | 	"sync/atomic"
  6 | 	"unsafe"
  7 | )
  8 | 
  9 | func nextPowerOf2(v int) int {
 10 | 	v--
 11 | 	v |= v >> 1
12 | v |= v >> 2
13 | v |= v >> 4
14 | v |= v >> 8
15 | v |= v >> 16
16 | v++
17 | return v
18 | }
19 |
20 | type node struct {
21 | key, value int
22 | next *node
23 | }
24 |
25 | type IntMap struct {
26 | table []*node
27 | mask int
28 | }
29 |
30 | func NewIntMap(size int) *IntMap {
31 | size = nextPowerOf2(size)
32 | m := IntMap{}
33 | m.table = make([]*node, size)
34 | m.mask = size - 1
35 | return &m
36 | }
37 |
38 | func (m *IntMap) Get(key int) int {
39 | node := m.table[key&m.mask]
40 | if node == nil {
41 | return 0
42 | }
43 | for ; node != nil; node = node.next {
44 | if node.key == key {
45 | return node.value
46 | }
47 | }
48 | return 0
49 | }
50 | func (m *IntMap) Put(key int, value int) {
51 | head := m.table[key&m.mask]
52 | for node := head; node != nil; node = node.next {
53 | if node.key == key {
54 | node.value = value
55 | return
56 | }
57 | }
58 | n := &node{key: key, value: value, next: head}
59 | m.table[key&m.mask] = n
60 | }
61 |
62 | type SharedIntMap struct {
63 | table []*node
64 | mask int
65 | }
66 |
67 | func NewSharedIntMap(size int) *SharedIntMap {
68 | size = nextPowerOf2(size)
69 | m := SharedIntMap{}
70 | m.table = make([]*node, size)
71 | m.mask = size - 1
72 | return &m
73 | }
74 |
75 | func (m *SharedIntMap) Get(key int) int {
76 | p := (*unsafe.Pointer)(unsafe.Pointer(&m.table[key&m.mask]))
77 | node := (*node)(atomic.LoadPointer(p))
78 |
79 | for ; node != nil; node = node.next {
80 | if node.key == key {
81 | return node.value
82 | }
83 | }
84 | return 0
85 | }
86 | func (m *SharedIntMap) Put(key int, value int) {
87 |
88 | p := (*unsafe.Pointer)(unsafe.Pointer(&m.table[key&m.mask]))
89 |
90 | for {
91 | head := (*node)(atomic.LoadPointer(p))
92 | for node := head; node != nil; node = node.next {
93 | if node.key == key {
94 | node.value = value
95 | //if !atomic.CompareAndSwapPointer(p,head,head) {
96 | // continue
97 | //}
98 | return
99 | }
100 | }
101 | n := &node{key: key, value: value, next: head}
102 | 		if atomic.CompareAndSwapPointer(p, unsafe.Pointer(head), unsafe.Pointer(n)) {
103 | 			return // inserted at the head; on CAS failure another writer changed the list, so retry
104 | 		}
105 | }
106 | }
107 |
108 | type Cache interface {
109 | Get(key int) int
110 | Put(key int, value int)
111 | }
112 |
113 | type LockCache struct {
114 | sync.RWMutex
115 | m map[int]int
116 | }
117 |
118 | func NewLockCache() *LockCache {
119 | m := LockCache{m: make(map[int]int)}
120 | return &m
121 | }
122 |
123 | func (m *LockCache) Get(key int) int {
124 | m.RLock()
125 | val, _ := m.m[key]
126 | m.RUnlock() // non-idiomatic go, but avoid defer performance hit
127 | return val
128 | }
129 | func (m *LockCache) Put(key int, value int) {
130 | m.Lock()
131 | m.m[key] = value
132 | m.Unlock() // non-idiomatic go, but avoid defer performance hit
133 | }
134 |
135 | type ShardCache struct {
136 | maps [10]map[int]int
137 | }
138 |
139 | func NewShardCache() *ShardCache {
140 | m := ShardCache{}
141 | for i := 0; i < 10; i++ {
142 | m.maps[i] = make(map[int]int)
143 | }
144 | return &m
145 | }
146 |
147 | func (m *ShardCache) Get(key int) int {
148 | val, _ := m.maps[key%10][key]
149 | return val
150 | }
151 | func (m *ShardCache) Put(key int, value int) {
152 | m.maps[key%10][key] = value
153 | }
154 |
155 | const SharedShardMask = 128 - 1
156 |
157 | type imap map[int]int
158 |
159 | type shard struct {
160 | imap
161 | sync.RWMutex
162 | }
163 |
164 | type SharedShardCache struct {
165 | shards [128]*shard
166 | }
167 |
168 | func NewSharedShardCache() *SharedShardCache {
169 | m := SharedShardCache{}
170 | for i := 0; i < 128; i++ {
171 | s := shard{imap: make(imap)}
172 | m.shards[i] = &s
173 | }
174 | return &m
175 | }
176 |
177 | func (m *SharedShardCache) Get(key int) int {
178 | s := m.shards[key&SharedShardMask]
179 |
180 | s.RLock()
181 | val, ok := s.imap[key]
182 | s.RUnlock()
183 |
184 | if !ok {
185 | return 0
186 | }
187 | return val
188 | }
189 |
190 | func (m *SharedShardCache) Put(key int, value int) {
191 | s := m.shards[key&SharedShardMask]
192 | s.Lock()
193 | s.imap[key] = value
194 | s.Unlock()
195 | }
196 |
197 | type UnsharedCache map[int]int
198 |
199 | func NewUnsharedCache() *UnsharedCache {
200 | m := UnsharedCache{}
201 | return &m
202 | }
203 |
204 | func (m *UnsharedCache) Get(key int) int {
205 | val := (*m)[key]
206 | return val
207 | }
208 | func (m *UnsharedCache) Put(key int, value int) {
209 | (*m)[key] = value
210 | }
211 |
212 | type SyncCache struct {
213 | m sync.Map
214 | }
215 |
216 | func NewSyncCache() *SyncCache {
217 | m := SyncCache{}
218 | return &m
219 | }
220 |
221 | func (m *SyncCache) Get(key int) int {
222 | val, _ := m.m.Load(key)
223 | if val == nil {
224 | return 0
225 | }
226 | return val.(int)
227 | }
228 | func (m *SyncCache) Put(key int, value int) {
229 | m.m.Store(key, value)
230 | }
231 |
232 | type PutRequest struct {
233 | key, value int
234 | }
235 | type GetRequest struct {
236 | key int
237 | }
238 |
239 | type ChannelCache struct {
240 | m map[int]int
241 | request chan interface{}
242 | response chan int
243 | }
244 |
245 | func (m *ChannelCache) Get(key int) int {
246 | m.request <- GetRequest{key}
247 | return <-m.response
248 | }
249 |
250 | func (m *ChannelCache) Put(key int, value int) {
251 | m.request <- PutRequest{key, value}
252 | }
253 |
254 | func NewChannelCache() *ChannelCache {
255 | c := &ChannelCache{m: make(map[int]int), request: make(chan interface{}), response: make(chan int)}
256 | go func() {
257 | for {
258 | request := <-c.request
259 | switch request.(type) {
260 | case GetRequest:
261 | val, ok := c.m[request.(GetRequest).key]
262 | if !ok {
263 | val = 0
264 | }
265 | c.response <- val
266 | case PutRequest:
267 | c.m[request.(PutRequest).key] = request.(PutRequest).value
268 | }
269 | }
270 | }()
271 | return c
272 | }
273 |
--------------------------------------------------------------------------------
/maps_test.go:
--------------------------------------------------------------------------------
1 | package go_concurrency_test
2 |
3 | import (
4 | "fmt"
5 | "github.com/robaho/go-concurrency-test"
6 | "sync"
7 | "sync/atomic"
8 | "testing"
9 | "time"
10 | )
11 |
12 | const NGOS = 2 // number of concurrent go routines for read/load tests
13 | const Mask = (1024 * 1024) - 1
14 |
15 | var um = go_concurrency.NewUnsharedCache()
16 | var lm = go_concurrency.NewLockCache()
17 | var sm = go_concurrency.NewSyncCache()
18 | var cm = go_concurrency.NewChannelCache()
19 | var sc = go_concurrency.NewShardCache()
20 | var ssc = go_concurrency.NewSharedShardCache()
21 | var im = go_concurrency.NewIntMap(256000) // so there are 4x collisions
22 | var im2 = go_concurrency.NewIntMap(1000000) // so there are no collisions
23 | var sim = go_concurrency.NewSharedIntMap(1000000) // so there are no collisions
24 |
25 | var Sink atomic.Value
26 |
27 | func rand(r int) int {
28 | /* Algorithm "xor" from p. 4 of Marsaglia, "Xorshift RNGs" */
29 | r ^= r << 13
30 | r ^= r >> 17
31 | r ^= r << 5
32 | return r & 0x7fffffff
33 | }
34 |
35 | func TestNewSharedIntMap(t *testing.T) {
36 | for i := 0; i < 2000000; i++ {
37 | sim.Put(i, i)
38 | }
39 | for i := 0; i < 2000000; i++ {
40 | if sim.Get(i) != i {
41 | t.Fatal("wrong value")
42 | }
43 | }
44 | }
45 |
46 | func BenchmarkRand(m *testing.B) {
47 | r := time.Now().Nanosecond()
48 | for i := 0; i < m.N; i++ {
49 | r = rand(r)
50 | }
51 | Sink.Store(r)
52 | }
53 |
54 | func testget(impl go_concurrency.Cache, b *testing.B) {
55 | r := time.Now().Nanosecond()
56 |
57 | var sum int
58 | for i := 0; i < b.N; i++ {
59 | r = rand(r)
60 | sum += impl.Get(r & Mask)
61 | }
62 | Sink.Store(sum)
63 | }
64 | func testput(impl go_concurrency.Cache, b *testing.B) {
65 | r := time.Now().Nanosecond()
66 | for i := 0; i < b.N; i++ {
67 | r = rand(r)
68 | impl.Put(r&Mask, r)
69 | }
70 | }
71 | func testputget(impl go_concurrency.Cache, b *testing.B) {
72 | r := time.Now().Nanosecond()
73 | var sum int
74 | for i := 0; i < b.N; i++ {
75 | r = rand(r)
76 | impl.Put(r&Mask, r)
77 | r = rand(r)
78 | sum += impl.Get(r & Mask)
79 | }
80 | Sink.Store(sum)
81 | }
82 | func BenchmarkMain(m *testing.B) {
83 | fmt.Println("populating maps...")
84 | for i := 0; i <= Mask; i++ {
85 | um.Put(i, i)
86 | lm.Put(i, i)
87 | sm.Put(i, i)
88 | cm.Put(i, i)
89 | sc.Put(i, i)
90 | ssc.Put(i, i)
91 | im.Put(i, i)
92 | im2.Put(i, i)
93 | sim.Put(i, i)
94 | }
95 |
96 | sm.Get(100)
97 | m.ResetTimer()
98 |
99 | impls := []go_concurrency.Cache{um, lm, sm, cm, sc, ssc, im, im2, sim}
100 | names := []string{"unshared", "lock", "sync", "channel", "shard", "shareshard", "intmap", "intmap2", "sharedint"}
101 | multi := []bool{false, true, true, true, false, true, true, true, true}
102 |
103 | //impls := []go_concurrency.Cache{lm,sm}
104 | //names := []string{"lock","sync"}
105 | //multi := []bool{true,true}
106 |
107 | for i := 0; i < len(impls); i++ {
108 | impl := impls[i]
109 | m.Run(names[i]+".get", func(b *testing.B) {
110 | testget(impl, b)
111 | })
112 | m.Run(names[i]+".put", func(b *testing.B) {
113 | testput(impl, b)
114 | })
115 | m.Run(names[i]+".putget", func(b *testing.B) {
116 | testputget(impl, b)
117 | })
118 | m.Run(names[i]+".multiget", func(b *testing.B) {
119 | wg := sync.WaitGroup{}
120 | for g := 0; g < NGOS; g++ {
121 | wg.Add(1)
122 | go func() {
123 | testget(impl, b)
124 | wg.Done()
125 | }()
126 | }
127 | wg.Wait()
128 | })
129 | if !multi[i] { // some impl do not support concurrent write
130 | continue
131 | }
132 | m.Run(names[i]+".multiput", func(b *testing.B) {
133 | wg := sync.WaitGroup{}
134 | for g := 0; g < NGOS; g++ {
135 | wg.Add(1)
136 | go func() {
137 | testput(impl, b)
138 | wg.Done()
139 | }()
140 | }
141 | wg.Wait()
142 | })
143 | m.Run(names[i]+".multiputget", func(b *testing.B) {
144 | wg := sync.WaitGroup{}
145 | for g := 0; g < NGOS; g++ {
146 | wg.Add(1)
147 | go func() {
148 | testputget(impl, b)
149 | wg.Done()
150 | }()
151 | }
152 | wg.Wait()
153 | })
154 | }
155 | }
156 |
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 |
2 | Update 10/09/2024
3 |
4 | With Go generics, it should be straightforward (the hash function being the main open question) to use the techniques listed below to create a highly performant concurrent map implementation.
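
A minimal sketch of what that could look like (this is not code from this repository, and it assumes the caller supplies a hash function, since Go does not expose the built-in map's hasher):

```go
package cache

import "sync"

const nShards = 128 // power of two so the shard index can be a mask

// ShardedMap is a sharded, lock-based generic map using the same approach as
// the "shared shard" cache described below. The hash function is caller-supplied.
type ShardedMap[K comparable, V any] struct {
	shards [nShards]shard[K, V]
	hash   func(K) uint64
}

type shard[K comparable, V any] struct {
	sync.RWMutex
	m map[K]V
}

func NewShardedMap[K comparable, V any](hash func(K) uint64) *ShardedMap[K, V] {
	sm := &ShardedMap[K, V]{hash: hash}
	for i := range sm.shards {
		sm.shards[i].m = make(map[K]V)
	}
	return sm
}

func (sm *ShardedMap[K, V]) Get(key K) (V, bool) {
	s := &sm.shards[sm.hash(key)&(nShards-1)]
	s.RLock()
	v, ok := s.m[key]
	s.RUnlock()
	return v, ok
}

func (sm *ShardedMap[K, V]) Put(key K, value V) {
	s := &sm.shards[sm.hash(key)&(nShards-1)]
	s.Lock()
	s.m[key] = value
	s.Unlock()
}
```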
5 |
6 | I have updated the timings using OpenJDK 21, Go 1.22.5, and clang version 14.0.0 (clang-1400.0.29.202) on an iMac with an Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz.
7 |
8 |
9 | Update 11/26/18
10 |
11 | After discussion in Go issue [28938](https://github.com/golang/go/issues/28938), I am prepared to make some final conclusions.
12 |
13 | Concurrent map data structures in Go are most efficiently implemented using locks and the built-in map implementation. This is especially true
14 | if the structure can be sharded to avoid contention on a single write mutex (see the "shared shard" impl, sketched below).
15 | Locks are very efficient in Go due to the user-level scheduling of Go routines.
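
The shape of that approach, taken from the SharedShardCache in maps.go (package and imports as in that file; Put mirrors Get with the write lock):

```go
const SharedShardMask = 128 - 1

type imap map[int]int

type shard struct {
	imap
	sync.RWMutex
}

type SharedShardCache struct {
	shards [128]*shard
}

func (m *SharedShardCache) Get(key int) int {
	s := m.shards[key&SharedShardMask]

	s.RLock()
	val, ok := s.imap[key]
	s.RUnlock()

	if !ok {
		return 0
	}
	return val
}
```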
16 |
17 | The problem with Go's sync.Map is that it is a library type, not a language built-in like the map implementation, so multiple layers of indirection are required
18 | to accomplish some operations (values are boxed in interfaces, and there is no atomic CAS at the table-entry level).
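
The indirection is visible in the SyncCache wrapper in maps.go: every value comes back from sync.Map as an interface{} and must be nil-checked and type-asserted:

```go
func (m *SyncCache) Get(key int) int {
	val, _ := m.m.Load(key) // boxed as interface{} on Store
	if val == nil {
		return 0
	}
	return val.(int)
}
```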
19 |
20 | In my tests I developed a fully concurrent "shared intmap" which shows excellent performance. For a variety of reasons it is not a general solution,
21 | but it shows the performance potential of a redesigned sync.Map.
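
Its read path (SharedIntMap.Get in maps.go) is a single atomic load of the bucket head followed by an unlocked walk of the chain; writers publish new nodes with CompareAndSwapPointer:

```go
func (m *SharedIntMap) Get(key int) int {
	p := (*unsafe.Pointer)(unsafe.Pointer(&m.table[key&m.mask]))
	node := (*node)(atomic.LoadPointer(p))

	for ; node != nil; node = node.next {
		if node.key == key {
			return node.value
		}
	}
	return 0
}
```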
22 |
23 | The Go issue above links to several others, #[21031](https://github.com/golang/go/issues/54720) and #[21035](https://github.com/golang/go/issues/21035), that when resolved
24 | should bring sync.Map performance in line with Java's ConcurrentHashMap.
25 |
26 | To be fair, most of the performance issues with sync.Map only surface with a large map and a large working set of active keys, where the indirection is
27 | especially painful due to CPU cache misses.
28 |
29 |
30 | Update 11/24/18
31 |
32 | I added 'intmap' versions - fixed-size maps that never resize. The 'intmap2' is sized to
33 | avoid any collisions, while 'intmap' has roughly 4x collisions.
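
The collision ratio follows from the constructor arguments and the key range: the tests touch 2^20 (1,048,576) distinct keys; NewIntMap(256000) rounds up to 262,144 buckets, so about 4 keys chain per bucket, while NewIntMap(1000000) rounds up to 1,048,576 buckets, roughly one key per bucket.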
34 |
35 | I added 'C' versions of the 'intmap' for comparison. I also added a 'lock map' in Java similar to Go.
36 |
37 | I increased the number of elements to 1000000 to avoid the cache fitting in the L2.
38 |
39 | In all cases, the maps are pre-allocated to avoid any allocation overhead.
40 |
41 | The major change across the board was to use random get/put indexes, as the linear read/put was biased towards Java: Go uses
42 | an alternate hash method, so sequential keys are not sequential in memory.
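
The random indexes come from the small xorshift generator used in maps_test.go (with equivalents in the Java and C tests), masked into the key range:

```go
// Algorithm "xor" from p. 4 of Marsaglia, "Xorshift RNGs"
func rand(r int) int {
	r ^= r << 13
	r ^= r >> 17
	r ^= r << 5
	return r & 0x7fffffff
}

// in the benchmark loops: r = rand(r); sum += impl.Get(r & Mask)
```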
43 |
44 | I re-ran the tests with the latest versions at the time, Go 1.11.2 and Java 1.8_191; the C was compiled with clang-1000.11.45.5.
45 |
46 | The tests now show Go to be a clear winner when using Locks, but sync.Map has significant performance issues.
47 |
48 | Both Go and Java perform very close to the optimized C versions, with Java beating Go in almost all of the intmap tests - this is probably
49 | the reason the built-in Go map implementation uses arrays of structs rather than a linked list of nodes.
50 |
51 | *** A note about the multi timings... They are not divided by the number of Go routines/threads (so they cover double the number of operations), but since there
52 | are only 2 routines and ample cores available - meaning they should execute concurrently - the overhead is measured more directly, because the timings
53 | can be compared directly to the single routine/thread case.
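
As an example, the lock map measures roughly 70 ns/op for get and 116 ns/op for multiget in the results below; because the multi numbers are not divided by the routine count, the difference of roughly 46 ns/op is (approximately) the cost of contention rather than of doing twice the work.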
54 |
55 |
56 | Update
57 |
58 | Based on feedback from Bryan Mills, I've updated the implementation using channels, and re-tested. I do not believe it has
59 | made a significant difference. It does highlight that the 'Get' is worse than the 'Put' because 2 channels are involved.
60 |
61 | He has written [Rethinking Classical Concurrency in Go](https://golang.org/wiki/Go-Community-Slides#rethinking-classical-concurrency-patterns)
62 |
63 | He also pointed out that sync.Map has known performance issues; see
64 | [this search](https://github.com/golang/go/issues?utf8=✓&q=is%3Aissue+is%3Aopen+%22sync%3A%22+Map+in%3Atitle+label%3APerformance)
65 |
66 | He also pointed out that RWMutex has scalability issues (though I am not sure that applies here, since only 2 routines are tested);
67 | here is the [issue](https://golang.org/issue/17973)
68 |
69 | I removed the use of defer in the lock implementation, as it is a known (if debated) performance issue.
70 |
71 | I reduced the number of elements cached to 500k, which should allow the cache to fit entirely in the L3 on the testing machine.
72 | It made some improvement.
73 |
74 | I updated the testing methodology to make certain constraints on the test more clear.
75 |
76 | I determined that PutGet not matching the sum of the Put and Get times was caused by cache locality, so PutGet was changed to read from
77 | the opposite side, which corrected the problem, and so I removed the comment regarding potential 'go bench' accuracy issues.
78 |
79 | I added an 'unshared' cache to the Java tests for an additional baseline.
80 |
81 |
82 | **Summary**
83 |
84 | The Go sync.Map implementation has significant room for improvement; it performs far worse than Java's ConcurrentHashMap.
85 | Locks should be used at low levels of concurrency, especially for read-heavy use cases.
86 |
87 | The Go synchronization primitives and Go routine scheduling outperform Java with locks by a very wide margin. The user-level context switching
88 | is ideal for this workload.
89 |
90 | **Background**
91 |
92 | This is a project designed to test the concurrency mechanisms available in Go. Go states,
93 |
94 | [Do not communicate by sharing memory; instead, share memory by communicating.](https://blog.golang.org/share-memory-by-communicating)
95 | and the basic mechanism to achieve this is channels.
96 |
97 | In this test, I compare the 4 readily available ways to share information in Go (a construction sketch for each follows the list):
98 | 1. unshared/basic map
99 | 2. using locks
100 | 3. using sync.Map
101 | 4. using channels
102 |
103 | The above methods are used to build a "memory cache", a very common structure in high-performance financial applications (my current
104 | area of work). This may not be the standard use case the Go authors envisioned, as the amount of work per operation (store/load) is very small.
105 |
106 | The 'unshared' map is not usable as a shared cache - except possibly as a write-at-init, read-only cache in a specialized application with cooperative immutability.
107 |
108 | Additionally, I wrote the same tests in Java, but only for the sync.Map case, which is fairly equivalent to Java's ConcurrentHashMap; this
109 | data structure uses the same techniques as most lock-free structures (although ConcurrentHashMap does use locks on store).
110 |
111 | The tests run under [go bench](https://golang.org/pkg/testing/) for Go, and [jmh](http://openjdk.java.net/projects/code-tools/jmh/) for Java.
112 |
113 | In both cases, identical hardware was used: a 3.4 GHz Core i7 with 4 cores (8 threads), under OSX 10.13.6.
114 |
115 | The software versions are Go 1.11.2, Java 1.8_191 (plus OpenJDK 11 with the Shenandoah GC), and LLVM 10.0.0 (clang-1000.11.45.5).
116 |
117 | **Testing Methodology**
118 |
119 | The cache uses ints for keys and values, and is limited to 1 million entries to avoid possible 'map' degradation.
120 | The map is pre-populated with the 1 million entries and reused for all of the tests, so the tests do not measure map resizing costs or
121 | other start-up penalties. The expected cache size in memory is roughly 1 million * sizeof(int), but larger when the implementation uses pointers,
122 | especially in the case of Java. No tests were performed to measure the actual memory usage.
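
Pre-population happens once at the start of BenchmarkMain in maps_test.go, before the timer is reset, so resizing and other warm-up costs never land inside a measurement:

```go
for i := 0; i <= Mask; i++ { // Mask = (1024 * 1024) - 1
	um.Put(i, i)
	lm.Put(i, i)
	sm.Put(i, i)
	// ... and so on for each implementation ...
}
m.ResetTimer()
```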
123 |
124 | There are 3 operations tested: Get, Put, and PutGet. They are tested in an uncontested scenario using 1 go routine, and in a contested
125 | scenario (multi) using 2 go routines. The contested case was limited to 2 routines, since the machine only has 4 true cores, and there is
126 | housekeeping work performed by the benchmark harness, OS, etc., so this seemed fair.
127 |
128 | The tests for both Go and Java were executed via IntelliJ, as it has support for both 'go bench' and 'jmh'.
129 |
130 | The testing harnesses do not operate identically, but I attempted to make the comparisons as fair as possible, mainly using Java as a baseline
131 | for what should be expected of the Go solution, since in my experience the Go code often outperforms the equivalent Java code.
132 |
133 | The caches are essentially 'static' structures. I did this for fairness. In testing there was no appreciable difference when the cache structure
134 | was allocated within the method under test.
135 |
136 | Any GC related activity is included in the tests. No attempt was made to reduce, or tune the GC activity on any platform.
137 |
138 | **Go Test Results**
139 |
140 | ```
141 | cpu: Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz
142 | BenchmarkRand-8 669243439 1.781 ns/op
143 | populating maps...
144 | BenchmarkMain/unshared.get-8 18877132 63.53 ns/op
145 | BenchmarkMain/unshared.put-8 17971988 69.09 ns/op
146 | BenchmarkMain/unshared.putget-8 9268077 131.4 ns/op
147 | BenchmarkMain/unshared.multiget-8 17787741 66.86 ns/op
148 | BenchmarkMain/lock.get-8 17399762 70.33 ns/op
149 | BenchmarkMain/lock.put-8 10549318 114.1 ns/op
150 | BenchmarkMain/lock.putget-8 6803950 178.8 ns/op
151 | BenchmarkMain/lock.multiget-8 10305220 116.4 ns/op
152 | BenchmarkMain/lock.multiput-8 4546767 255.9 ns/op
153 | BenchmarkMain/lock.multiputget-8 2609588 448.0 ns/op
154 | BenchmarkMain/sync.get-8 5393236 216.7 ns/op
155 | BenchmarkMain/sync.put-8 3445075 353.9 ns/op
156 | BenchmarkMain/sync.putget-8 1963018 651.0 ns/op
157 | BenchmarkMain/sync.multiget-8 3567596 326.3 ns/op
158 | BenchmarkMain/sync.multiput-8 3339694 377.1 ns/op
159 | BenchmarkMain/sync.multiputget-8 1865768 676.4 ns/op
160 | BenchmarkMain/channel.get-8 2028018 614.3 ns/op
161 | BenchmarkMain/channel.put-8 3150184 384.3 ns/op
162 | BenchmarkMain/channel.putget-8 1226596 984.5 ns/op
163 | BenchmarkMain/channel.multiget-8 1000000 1169 ns/op
164 | BenchmarkMain/channel.multiput-8 1468042 790.1 ns/op
165 | BenchmarkMain/channel.multiputget-8 641374 1873 ns/op
166 | BenchmarkMain/shard.get-8 16428013 71.47 ns/op
167 | BenchmarkMain/shard.put-8 12421777 85.71 ns/op
168 | BenchmarkMain/shard.putget-8 7720970 152.4 ns/op
169 | BenchmarkMain/shard.multiget-8 15689773 75.81 ns/op
170 | BenchmarkMain/shareshard.get-8 16479192 72.35 ns/op
171 | BenchmarkMain/shareshard.put-8 9910390 110.3 ns/op
172 | BenchmarkMain/shareshard.putget-8 6203272 183.1 ns/op
173 | BenchmarkMain/shareshard.multiget-8 14384760 83.03 ns/op
174 | BenchmarkMain/shareshard.multiput-8 8925080 133.5 ns/op
175 | BenchmarkMain/shareshard.multiputget-8 4910743 248.7 ns/op
176 | BenchmarkMain/intmap.get-8 14669283 83.42 ns/op
177 | BenchmarkMain/intmap.put-8 6084549 191.5 ns/op
178 | BenchmarkMain/intmap.putget-8 4504140 250.0 ns/op
179 | BenchmarkMain/intmap.multiget-8 12845756 88.15 ns/op
180 | BenchmarkMain/intmap.multiput-8 5912864 190.2 ns/op
181 | BenchmarkMain/intmap.multiputget-8 4621642 255.4 ns/op
182 | BenchmarkMain/intmap2.get-8 30600810 38.98 ns/op
183 | BenchmarkMain/intmap2.put-8 8988387 125.8 ns/op
184 | BenchmarkMain/intmap2.putget-8 5373553 210.6 ns/op
185 | BenchmarkMain/intmap2.multiget-8 27520656 41.71 ns/op
186 | BenchmarkMain/intmap2.multiput-8 9286503 128.7 ns/op
187 | BenchmarkMain/intmap2.multiputget-8 5525810 211.8 ns/op
188 | BenchmarkMain/sharedint.get-8 20388027 58.56 ns/op
189 | BenchmarkMain/sharedint.put-8 5578724 210.7 ns/op
190 | BenchmarkMain/sharedint.putget-8 3231453 365.0 ns/op
191 | BenchmarkMain/sharedint.multiget-8 17616985 65.32 ns/op
192 | BenchmarkMain/sharedint.multiput-8 5640673 207.1 ns/op
193 | BenchmarkMain/sharedint.multiputget-8 3212062 364.5 ns/op
194 | ```
195 |
196 | **Go Analysis**
197 |
198 | There are several interesting, and disconcerting aspects.
199 |
200 | 1. The sync.Map performs far worse than the map using locks, even for Get - almost 3x slower.
201 | 2. Continuing from #1, the 'multi get' using sync performs worse than the single-routine get. Again, this should only be a volatile load, with the
202 | expectation of performance equal to the unshared get, or at least to the sync.Map single-routine read.
203 | 3. The channel method is more than 5x slower than the others, and this is using a very simple key/value structure, although a more complex one would
204 | probably necessitate using pointers.
205 |
206 | **Java Test Results**
207 |
208 | ```
209 | using 1 fork, 1 warm-up iteration, and 5 iterations of 3 sec
210 |
211 | Benchmark (arg) Mode Cnt Score Error Units
212 | TestJavaCache.Test0Get unshared avgt 5 50.360 ± 5.020 ns/op
213 | TestJavaCache.Test0Get concurrent avgt 5 48.672 ± 1.108 ns/op
214 | TestJavaCache.Test0Get lock avgt 5 99.454 ± 13.884 ns/op
215 | TestJavaCache.Test0Get intmap avgt 5 48.244 ± 8.969 ns/op
216 | TestJavaCache.Test0Get intmap2 avgt 5 18.599 ± 0.447 ns/op
217 | TestJavaCache.Test2Put unshared avgt 5 109.816 ± 38.564 ns/op
218 | TestJavaCache.Test2Put concurrent avgt 5 174.497 ± 47.735 ns/op
219 | TestJavaCache.Test2Put lock avgt 5 174.281 ± 31.701 ns/op
220 | TestJavaCache.Test2Put intmap avgt 5 105.264 ± 6.934 ns/op
221 | TestJavaCache.Test2Put intmap2 avgt 5 78.434 ± 1.202 ns/op
222 | TestJavaCache.Test3PutGet unshared avgt 5 247.885 ± 55.511 ns/op
223 | TestJavaCache.Test3PutGet concurrent avgt 5 282.537 ± 22.152 ns/op
224 | TestJavaCache.Test3PutGet lock avgt 5 298.475 ± 29.188 ns/op
225 | TestJavaCache.Test3PutGet intmap avgt 5 149.688 ± 3.338 ns/op
226 | TestJavaCache.Test3PutGet intmap2 avgt 5 119.706 ± 1.820 ns/op
227 | TestJavaCache.Test4MultiGet unshared avgt 5 53.801 ± 0.663 ns/op
228 | TestJavaCache.Test4MultiGet concurrent avgt 5 53.011 ± 1.252 ns/op
229 | TestJavaCache.Test4MultiGet lock avgt 5 329.347 ± 2.941 ns/op
230 | TestJavaCache.Test4MultiGet intmap avgt 5 51.659 ± 9.043 ns/op
231 | TestJavaCache.Test4MultiGet intmap2 avgt 5 22.869 ± 3.840 ns/op
232 | TestJavaCache.Test5MultiPut unshared avgt 5 108.950 ± 28.201 ns/op
233 | TestJavaCache.Test5MultiPut concurrent avgt 5 159.692 ± 23.058 ns/op
234 | TestJavaCache.Test5MultiPut lock avgt 5 470.977 ± 84.851 ns/op
235 | TestJavaCache.Test5MultiPut intmap avgt 5 104.542 ± 1.934 ns/op
236 | TestJavaCache.Test5MultiPut intmap2 avgt 5 78.117 ± 3.363 ns/op
237 | TestJavaCache.Test6MultiPutGet unshared avgt 5 240.432 ± 59.746 ns/op
238 | TestJavaCache.Test6MultiPutGet concurrent avgt 5 285.872 ± 38.001 ns/op
239 | TestJavaCache.Test6MultiPutGet lock avgt 5 1678.923 ± 288.992 ns/op
240 | TestJavaCache.Test6MultiPutGet intmap avgt 5 151.847 ± 5.489 ns/op
241 | TestJavaCache.Test6MultiPutGet intmap2 avgt 5 122.743 ± 3.129 ns/op
242 |
243 | ```
244 | *** The Java multi-threaded unshared and intmap results are not valid, but there is no easy way to exclude them with jmh. They don't crash because the maps are
245 | pre-populated and never resize.
246 |
247 | **Java Analysis**
248 |
249 | 1. The warm-up phase matters little in tests like this, since so many iterations are required.
250 | 2. The Java concurrent 'multi get' is in line with the 'get', since it should be a lock-free volatile read.
251 | 3. The results show that Java's dynamic inlining can achieve amazing performance.
252 | 4. The Java boxing of primitives into objects (required by ConcurrentHashMap) seems very efficient.
253 | 5. When using an equal read and write mix, the Java "lock" methods show very poor performance compared with the concurrent version.
254 |
255 | **C test Results**
256 |
257 | ```
258 | With -O0
259 | intmap put = 170.991800 ns/op
260 | intmap get = 134.725800 ns/op
261 | intmap2 put = 108.658200 ns/op
262 | intmap2 get = 79.461800 ns/op
263 |
264 | With -O3
265 | intmap put = 116.890400 ns/op
266 | intmap get = 56.260000 ns/op
267 | intmap2 put = 94.516000 ns/op
268 | intmap2 get = 25.283200 ns/op
269 | ```
270 |
271 | **Overall Findings**
272 |
273 | The Go performance numbers, as compared to Java, show that there is a lot of room for improvement in the available Go
274 | structures for concurrent applications. The Go "use channels" approach is only suitable when the handler performs a significant amount of work, or when
275 | simplicity of code matters (as it is single-threaded, and the use of locks has other issues). The sync.Map should be nearly identical to the 'unshared map'
276 | for reads (as it is for Java, especially Get vs. MultiGet), but it is 2x slower - this is an indicator of a poor implementation, poor compiler optimizations,
277 | or that the underlying native operations used (CAS, etc.) need to be examined.
278 |
279 | _My experience with Go is ongoing, and there is a chance I've made some errors in these tests; I welcome community input to improve them._
280 |
--------------------------------------------------------------------------------