├── test ├── pandokia_top └── shmht │ └── trivial.py ├── MANIFEST.in ├── ext_shmht ├── __init__.py ├── raw_performance_test.py ├── HashTable.py └── Cacher.py ├── .gitignore ├── shmht.notes.txt ├── hashtable.h ├── README.md ├── LICENSE ├── setup.py ├── shmht.c └── hashtable.c /test/pandokia_top: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.h 2 | include setup.py 3 | include README.md 4 | include LICENSE 5 | -------------------------------------------------------------------------------- /ext_shmht/__init__.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | 3 | from HashTable import HashTable 4 | from Cacher import Cacher, MemCacher 5 | 6 | -------------------------------------------------------------------------------- /ext_shmht/raw_performance_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #coding: utf-8 3 | 4 | import shmht 5 | import time 6 | 7 | capacity = 300000 8 | 9 | fd = shmht.open('/dev/shm/test.performance', capacity, 1) 10 | 11 | begin_time = time.time() 12 | for i in range(capacity): 13 | s = '%064d' % i 14 | shmht.setval(fd, s, s) 15 | end_time = time.time() 16 | print capacity / (end_time - begin_time), 'iops @ set' 17 | 18 | begin_timend_time = time.time() 19 | for i in range(capacity): 20 | s = '%064d' % i 21 | if s != shmht.getval(fd, s): 22 | raise Exception(s) 23 | end_time = time.time() 24 | print capacity / (end_time - begin_time), 'iops @ get' 25 | 26 | shmht.close(fd) 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 
| __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | bin/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # Installer logs 26 | pip-log.txt 27 | pip-delete-this-directory.txt 28 | 29 | # Unit test / coverage reports 30 | htmlcov/ 31 | .tox/ 32 | .coverage 33 | .cache 34 | nosetests.xml 35 | coverage.xml 36 | 37 | # Translations 38 | *.mo 39 | 40 | # Mr Developer 41 | .mr.developer.cfg 42 | .project 43 | .pydevproject 44 | 45 | # Rope 46 | .ropeproject 47 | 48 | # Django stuff: 49 | *.log 50 | *.pot 51 | 52 | # Sphinx documentation 53 | docs/_build/ 54 | 55 | -------------------------------------------------------------------------------- /shmht.notes.txt: -------------------------------------------------------------------------------- 1 | hash tables 2 | 3 | max key size = 256 4 | max value size = 1024 5 | 6 | shmht.open( 7 | s|ii 8 | name 9 | file name 10 | capacity = 0 11 | min number of slots in hash table 12 | force_init = 0 13 | initialize even if initialized 14 | 15 | creates a file with a hash table in it 16 | 17 | returns an integer "ident" - hash table number 18 | 19 | shmht.close 20 | i 21 | idx 22 | number of the hash table to close 23 | 24 | shmht.getval 25 | is 26 | idx 27 | number of the hash table 28 | key 29 | string index of hash table element 30 | 31 | shmht.setval 32 | iss 33 | idx 34 | number of the hash table 35 | key 36 | string index of hash table element 37 | value 38 | string value of hash table element 39 | 40 | shmht.remove 41 | is 42 | idx 43 | number of the hash table 44 | key 45 | string index of hash table element 46 | 47 | 48 | shmht.foreach 49 | iO 50 | idx 51 | number of the hash table 52 | O 53 | callable to be called for each element 54 | called with key, value 55 | 
-------------------------------------------------------------------------------- /hashtable.h: -------------------------------------------------------------------------------- 1 | #ifndef __HASH_TABLE__ 2 | #define __HASH_TABLE__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #define ALLOC(type, n) ((type *)malloc(sizeof(type) * n)) 12 | 13 | typedef struct __hashtable { 14 | unsigned magic; 15 | size_t ref_cnt, orig_capacity, capacity, size, flag_offset, bucket_offset; 16 | } hashtable; 17 | 18 | typedef unsigned u_int32; 19 | 20 | typedef struct _ht_str { 21 | u_int32 size; 22 | char str[1]; 23 | } ht_str; 24 | 25 | typedef struct _ht_iter { 26 | hashtable *ht; 27 | size_t pos; 28 | ht_str *key, *value; 29 | } ht_iter; 30 | 31 | typedef int BOOL; 32 | #define True 1 33 | #define False 0 34 | 35 | ht_iter* ht_get_iterator(hashtable *ht); 36 | int ht_iter_next(ht_iter* iter); 37 | 38 | size_t ht_memory_size(size_t capacity); 39 | hashtable* ht_init(void *base_addr, size_t capacity, int force_init); 40 | ht_str* ht_get(hashtable *ht, const char *key, u_int32 key_size); 41 | int ht_set(hashtable *ht, const char *key, u_int32 key_size, const char *value, u_int32 value_size); 42 | int ht_remove(hashtable *ht, const char *key, u_int32 key_size); 43 | int ht_destroy(hashtable *ht); 44 | 45 | int ht_is_valid(hashtable *ht); 46 | 47 | #endif 48 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | pyshmht 2 | ======= 3 | 4 | **Sharing memory based** Hash Table extension for Python 5 | 6 | For examples, see test cases in python files (pyshmht/Cacher.py, pyshmht/HashTable.py), where you can find performance tests as well. 
7 | 8 | Performance 9 | =========== 10 | 11 | capacity=200M, 64 bytes key/value tests, tested on (Xeon E5-2670 0 @ 2.60GHz, 128GB ram) 12 | 13 | * hashtable.c (raw hash table in c, tested on `malloc`ed memory) 14 | > set: 0.93 Million iops; 15 | > get: 2.35 Million iops; 16 | 17 | * performance\_test.py (raw python binding) 18 | > set: 451k iops; 19 | > get: 272k iops; 20 | 21 | * HashTable.py (simple wrapper, no serialization) 22 | > set: 354k iops; 23 | > get: 202k iops; 24 | 25 | * Cacher.py (cached wrapper, with serialization) 26 | > set: 501k iops (cached), 228k iops (after write\_back); 27 | > get: 560k iops (cached), 238k iops (no cache); 28 | 29 | * python native dict 30 | > set: 741k iops; 31 | > get: 390k iops; 32 | 33 | Notice 34 | ====== 35 | 36 | In hashtable.c, default max key length is `256 - 4`, max value length is `1024 - 4`; you can change `bucket_size` and `max_key_size` manually, but bear in mind that increasing these two arguments will result in larger memory consumption. 37 | 38 | If you find any bugs, please submit an issue or send me a pull request, I'll see to it ASAP :) 39 | 40 | p.s. `hashtable.c` is independent (i.e. has nothing to do with python), you can use it in other projects if needed. :P 41 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014, Felix021 2 | All rights reserved. 3 | Copyright (c) 2015, Association of Universities for Research in Astronomy 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 
10 | 11 | * Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | * Neither the name of the {organization} nor the names of its 16 | contributors may be used to endorse or promote products derived from 17 | this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
29 | -------------------------------------------------------------------------------- /test/shmht/trivial.py: -------------------------------------------------------------------------------- 1 | # using Pandokia - http://ssb.stsci.edu/testing/pandokia 2 | # 3 | # using this feature: 4 | # http://ssb.stsci.edu/testing/pandokia/docs_new/runner_minipyt.html#linear-execution-in-sequential-code-with-statement 5 | # 6 | import pandokia.helpers.pycode as pycode 7 | from pandokia.helpers.filecomp import safe_rm 8 | 9 | import shmht 10 | 11 | testfile = 'test_shmht.dat' 12 | 13 | safe_rm(testfile) 14 | 15 | with pycode.test('open-error') : 16 | 17 | try : 18 | ident = shmht.open( testfile ) 19 | 20 | except shmht.error as e : 21 | pass 22 | 23 | else : 24 | assert False, 'should have raised an exception' 25 | 26 | with pycode.test('open-init') : 27 | 28 | ident = shmht.open( testfile, 10 ) 29 | 30 | with pycode.test('insert-lookup') : 31 | 32 | shmht.setval( ident, 'arf', 'data for arf' ) 33 | assert shmht.getval( ident, 'arf' ) == 'data for arf' 34 | 35 | shmht.setval( ident, 'narf', 'data for narf' ) 36 | assert shmht.getval( ident, 'narf' ) == 'data for narf' 37 | 38 | assert shmht.getval( ident, 'arf' ) == 'data for arf' 39 | assert shmht.getval( ident, 'narf' ) == 'data for narf' 40 | 41 | with pycode.test('iter-small') : 42 | d = { } 43 | def collect( key, value ): 44 | d[key] = value 45 | 46 | shmht.foreach( ident, collect ) 47 | 48 | print d 49 | assert d == { 'arf' : 'data for arf', 'narf' : 'data for narf' } 50 | 51 | with pycode.test('remove') : 52 | shmht.remove( ident, 'arf' ) 53 | assert shmht.getval( ident, 'arf' ) == None 54 | assert shmht.getval( ident, 'narf' ) == 'data for narf' 55 | shmht.remove( ident, 'narf' ) 56 | assert shmht.getval( ident, 'arf' ) == None 57 | assert shmht.getval( ident, 'narf' ) == None 58 | 59 | def collect( key, value ): 60 | assert 0, 'somehow found %s : %s'% (key, value) 61 | 62 | shmht.foreach( ident, collect ) 63 | 64 | with 
pycode.test('fill') : 65 | for x in range(57): 66 | shmht.setval( ident, str(x), str(x)+' data' ) 67 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import os 3 | from distutils.core import setup, Extension 4 | 5 | #os.putenv("CFLAGS", "-g") 6 | 7 | shmht = Extension('ext_shmht/_shmht', 8 | sources = ['shmht.c', 'hashtable.c'] 9 | ) 10 | 11 | setup( 12 | name = 'ext_shmht', 13 | # minimal changing of the version 14 | version = '0.1', 15 | # not to claim credit for another's work, nor to unfairly attribute my errors 16 | author = '', 17 | author_email = '', 18 | description = 'shared memory hash table with locking', 19 | license = "BSD", 20 | keywords = "shared memory hash table shmem mmap", 21 | url = "http://github.com/stsci-sienkiew/pyshmht", 22 | ext_modules = [shmht], 23 | packages = ["ext_shmht"], 24 | long_description = """ 25 | An extended pyshmht - a simple hash table stored in an mmapped file 26 | 27 | The basic access is vaguely dict like with the core capability being: 28 | 29 | h = ext_shmht.HashTable( filename, max_entries ) 30 | h['key'] = 'value' 31 | v = h['key'] 32 | 33 | The table only uses strings for keys and values, but there is an 34 | interface that uses an object serializer, such as json or some other 35 | serializer that you provide. 36 | 37 | There is a max length of key and value that are specified by defines 38 | in the C code. 39 | 40 | extensions include: 41 | 42 | - file locking for multi-threaded or multi-process access 43 | n.b. do not use the same object in multiple threads - open the 44 | file again in each thread. 45 | 46 | - a little bit of documentation 47 | 48 | - a few test cases that run in Pandokia. See http://ssb.stsci.edu/testing/pandokia/ or 'pip install pandokia'. 49 | 50 | 51 | This is extended from pyshmht by felix021@gmail.com. 
My intent is 52 | to enhance the original for my needs, in a way that the changes may 53 | someday make a reasonable pull request into the original. It is a 54 | fork with a new name because I don't have time for the coordination 55 | with someone on the other side of the world right now. (No kidding! 56 | felix is in Shanghai and I am in Baltimore, separated by 160 degrees 57 | longitude, or 10 to 11 time zones.) 58 | """, 59 | 60 | ) 61 | -------------------------------------------------------------------------------- /ext_shmht/HashTable.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #coding: utf-8 3 | 4 | import os 5 | from . import _shmht 6 | import marshal 7 | 8 | #basic wrapper: open, close, get, set, remove, foreach 9 | #extended wrapper: getobj, setobj, [], to_dict, update 10 | 11 | class HashTable(object): 12 | """ 13 | Simple hash table stored in shared memory. 14 | max open tables = 256 15 | (bug: fix this someday) 16 | string keys, max len = 256 17 | string data, max len = 1024 18 | (bug: make this settable per table someday) 19 | 20 | import pyshmht 21 | h = pyshmht.HashTable( filename, max_entries ) 22 | 23 | ## for string keys and data values only: 24 | 25 | h.put( 'key', 'data' ) 26 | h['key'] = 'data' 27 | # put string data 28 | 29 | s = h.get('key') 30 | s = h['key'] 31 | # returns string, or None if key not present 32 | 33 | d = h.to_dict() 34 | # returns dict copied from hash table 35 | 36 | h.remove('key') 37 | # removes key from hash table 38 | 39 | h.update(dict) 40 | # insert each element of dict 41 | 42 | print 'key' in h 43 | # 44 | 45 | h.close() 46 | 47 | ## for string key and non-string python objects 48 | h.getobj() and h.setobj() use a serializer to convert the object 49 | to a string for storage. 
50 | 51 | """ 52 | def __init__(self, name, capacity=0, force_init=False, serializer=marshal, mkdirs=False): 53 | if mkdirs: 54 | try: 55 | d = os.path.dirname(name) 56 | os.makedirs(d) 57 | except OSError : 58 | pass 59 | force_init = 1 if force_init else 0 60 | self.fd = _shmht.open(name, capacity, force_init) 61 | self.loads = serializer.loads 62 | self.dumps = serializer.dumps 63 | 64 | def close(self): 65 | _shmht.close(self.fd) 66 | 67 | def get(self, key, default=None): 68 | val = _shmht.getval(self.fd, key) 69 | if val == None: 70 | return default 71 | return val 72 | 73 | def set(self, key, value): 74 | return _shmht.setval(self.fd, key, value) 75 | 76 | # "set" is a python data type, so use put() 77 | 78 | def put(self, key, value): 79 | return _shmht.setval(self.fd, key, value) 80 | 81 | def remove(self, key): 82 | return _shmht.remove(self.fd, key) 83 | 84 | def foreach(self, callback, unserialize=False): 85 | if not unserialize: 86 | cb = callback 87 | else: 88 | loads = self.loads 89 | def mcb(key, value): 90 | return callback(key, loads(value)) 91 | cb = mcb 92 | return _shmht.foreach(self.fd, cb) 93 | 94 | def getobj(self, key, default=None): 95 | val = self.get(key, default) 96 | if val == default: 97 | return default 98 | return self.loads(val) 99 | 100 | def setobj(self, key, val): 101 | val = self.dumps(val) 102 | return self.set(key, val) 103 | 104 | def __getitem__(self, key): 105 | val = _shmht.getval(self.fd, key) 106 | if val == None: 107 | raise KeyError(key) 108 | return val 109 | 110 | def __setitem__(self, key, value): 111 | return _shmht.setval(self.fd, key, value) 112 | 113 | def __delitem__(self, key): 114 | if False == _shmht.remove(self.fd, key): 115 | raise KeyError(key) 116 | 117 | def __contains__(self, key): 118 | return _shmht.getval(self.fd, key) != None 119 | 120 | def to_dict(self, unserialize=False): 121 | d = {} 122 | def insert(k,v): 123 | d[k] = v 124 | self.foreach(insert, unserialize) 125 | return d 126 | 127 | def 
update(self, d, serialize=False): 128 | dumps = self.dumps 129 | if serialize: 130 | for k in d: 131 | self[k] = dumps(d[k]) 132 | else: 133 | for k in d: 134 | self[k] = d[k] 135 | 136 | if __name__ == "__main__": 137 | loads = marshal.loads 138 | dumps = marshal.dumps 139 | #test cases 140 | ht = HashTable('/dev/shm/test.HashTable', 1024, 1) 141 | 142 | #set 143 | ht['a'] = '1' 144 | ht.set('b', '2') 145 | c = {'hello': 'world'} 146 | ht.setobj('c', c) 147 | 148 | #get 149 | print ht['b'] == '2' 150 | print ht['c'] == marshal.dumps(c) 151 | print ht.getobj('c') == c 152 | print ht.get('d') == None 153 | try: 154 | ht['d'] 155 | print False 156 | except: 157 | print True 158 | 159 | #contains 160 | print ('c' in ht) == True 161 | print ('d' in ht) == False 162 | 163 | #del 164 | del ht['c'] 165 | print ht.get('c') == None 166 | try: 167 | del ht['d'] 168 | print 'del:', False 169 | except: 170 | print True 171 | 172 | #update & to_dict & foreach 173 | ht.setobj('c', c) 174 | print ht.to_dict() == {'a': '1', 'b': '2', 'c': dumps(c)} 175 | 176 | s = '' 177 | def cb(key, value): 178 | global s 179 | s += key + str(value) 180 | ht.foreach(cb) 181 | print s == 'a1b2c' + dumps(c) 182 | 183 | ht.update({'a': 1, 'b': 2}, serialize=True) 184 | 185 | s = '' 186 | ht.foreach(cb, unserialize=True) 187 | print s == 'a1b2c' + str(c) 188 | 189 | print ht.to_dict() == {'a':dumps(1), 'b':dumps(2), 'c':dumps(c)} 190 | print ht.to_dict(unserialize=True) == {'a': 1, 'b': 2, 'c': c} 191 | 192 | #close 193 | ht.close() 194 | try: 195 | ht['a'] 196 | print False 197 | except: 198 | print True 199 | 200 | #simple performance test 201 | import time 202 | 203 | capacity = 300000 204 | 205 | #write_through 206 | ht = HashTable('/dev/shm/test.HashTable', capacity, True) 207 | 208 | begin_time = time.time() 209 | for i in range(capacity): 210 | s = '%064d' % i 211 | ht[s] = s 212 | end_time = time.time() 213 | print capacity / (end_time - begin_time), 'iops @ set' 214 | 215 | 
begin_timend_time = time.time() 216 | for i in range(capacity): 217 | s = '%064d' % i 218 | if s != ht[s]: 219 | raise Exception(s) 220 | end_time = time.time() 221 | print capacity / (end_time - begin_time), 'iops @ get' 222 | 223 | ht.close() 224 | 225 | -------------------------------------------------------------------------------- /ext_shmht/Cacher.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #coding: utf-8 3 | 4 | import marshal 5 | import HashTable 6 | 7 | _debug = False 8 | 9 | class Cacher(object): 10 | """ 11 | Cacher: wrap HashTable with serializer and write_back mechanism 12 | if you intend to modify the cache, call write_back() before the program exits 13 | 14 | notice: 15 | Cacher tries to simulate dict in most cases, mainly except for: 16 | (a) no __iter__, please use foreach instead 17 | (b) key should always be a str, where dict allows all hashable objects 18 | (c) no comparation with other 'dict's 19 | When necessary, you can use .to_dict() to get a real dict object. 20 | """ 21 | def __init__(self, name, capacity=0, force_init=False, serializer=marshal): 22 | """ 23 | 'name' the path of the file to be 'mmap'ed 24 | use MemCacher(name, ...) to add prefix '/dev/shm' automatically 25 | 'capacity' optional, if you want to connect to an existing shmht 26 | 'serializer' should contain loads/dumps (marshal, json, pickle, etc.) 
27 | """ 28 | 29 | self.ht = HashTable.HashTable(name, capacity, force_init, serializer) 30 | self.d = {} 31 | self.loads = serializer.loads 32 | self.dumps = serializer.dumps 33 | 34 | def __getitem__(self, key): 35 | d = self.d 36 | if key in d: 37 | val = d[key] 38 | else: 39 | val = self.loads(self.ht[key]) 40 | d[key] = val 41 | return val 42 | 43 | def __setitem__(self, key, val): 44 | self.d[key] = val 45 | 46 | def __delitem__(self, key): 47 | if key in self.d: 48 | del self.d[key] 49 | try: 50 | del self.ht[key] 51 | except: 52 | pass 53 | else: 54 | del self.d[key] 55 | 56 | def __contains__(self, key): #notice: key will be cached here 57 | return self.get(key) != None 58 | 59 | def get(self, key, default=None): 60 | try: 61 | return self.__getitem__(key) 62 | except: 63 | return default 64 | 65 | def update(self, dic): 66 | self.d.update(dic) 67 | 68 | def foreach(self, callback): 69 | self.write_back() 70 | return self.ht.foreach(callback, unserialize=True) 71 | 72 | def to_dict(self): 73 | self.write_back() 74 | return self.ht.to_dict(unserialize=True) 75 | 76 | def write_back(self): 77 | self.ht.update(self.d, serialize=True) 78 | 79 | def close(self): 80 | if self.d: 81 | global _debug 82 | if not _debug: 83 | self.write_back() #commented out for testing 84 | del self.d 85 | self.d = None 86 | if self.ht: 87 | self.ht.close() 88 | self.ht = None 89 | 90 | def __del__(self): 91 | """ 92 | don't rely on this, please call write_back() manually if necessary. 
93 | """ 94 | self.close() 95 | 96 | def MemCacher(name, capacity=0, force_init=False, serializer=marshal): 97 | """ 98 | Add an prefix '/dev/shm/' to `name`, so that the file is saved only in memory 99 | For more information, see `help(Cacher)` 100 | """ 101 | name = '/dev/shm/' + name 102 | return Cacher(name, capacity, force_init, serializer) 103 | 104 | if __name__ == "__main__": 105 | #test cases 106 | ht = MemCacher('test.Cacher', 1024, True) 107 | print 'fd:', ht.ht.fd 108 | 109 | #set 110 | ht['a'] = '1' 111 | ht['b'] = 2 112 | c = {'hello': 'world'} 113 | ht['c'] = c 114 | 115 | #get 116 | print ht['b'] == 2 117 | print ht['c'] == c 118 | print ht.get('c') == c 119 | print ht.get('d') == None 120 | try: 121 | ht['d'] 122 | print False 123 | except: 124 | print True 125 | 126 | #contains 127 | print ('c' in ht) == True 128 | print ('d' in ht) == False 129 | 130 | #del 131 | del ht['c'] 132 | print ht.get('c') == None 133 | try: 134 | del ht['d'] 135 | print 'del:', False 136 | except: 137 | print True 138 | 139 | #update & to_dict & foreach 140 | dumps = marshal.dumps 141 | ht['c'] = c 142 | print ht.to_dict() == {'a': '1', 'b': 2, 'c': c} 143 | 144 | def cb(key, value): 145 | global s 146 | s += key + str(value) 147 | 148 | s = '' 149 | ht.foreach(cb) 150 | print s == 'a1b2c' + str(c) 151 | 152 | ht.update({'a': 'x', 'b': 1000}) 153 | 154 | s = '' 155 | ht.foreach(cb) 156 | print s == 'axb1000c' + str(c) 157 | 158 | print ht.to_dict() == {'a': 'x', 'b': 1000, 'c': c} 159 | 160 | #close 161 | ht.close() 162 | try: 163 | ht['a'] 164 | print False 165 | except: 166 | print True 167 | 168 | #write_back 169 | ht = MemCacher('test.Cacher', 1024, True) 170 | print 'fd:', ht.ht.fd 171 | ht['a'] = 1 172 | ht.write_back() 173 | ht['b'] = 2 174 | 175 | _debug = True 176 | ht.close() #write_back() is called in close() when not debugging 177 | 178 | ht = MemCacher('test.Cacher', 1024, False) 179 | print 'fd:', ht.ht.fd 180 | print ht['a'] == 1 181 | try: 182 | print 
ht['b'] 183 | print False 184 | except: 185 | print True 186 | ht.close() 187 | 188 | #simple performance test 189 | import time 190 | 191 | capacity = 300000 192 | 193 | ht = MemCacher('test.Cacher', capacity, force_init=True) 194 | begin_time = time.time() 195 | for i in range(capacity): 196 | s = '%064d' % i 197 | ht[s] = s 198 | end_time = time.time() 199 | print capacity / (end_time - begin_time), 'iops @ set / no write_back ' 200 | 201 | ht.write_back() 202 | end_time = time.time() 203 | print capacity / (end_time - begin_time), 'iops @ set / after write_back ' 204 | 205 | ht.d = {} 206 | begin_time = time.time() 207 | for i in range(capacity): 208 | s = '%064d' % i 209 | if s != ht[s]: 210 | raise Exception(s) 211 | end_time = time.time() 212 | print capacity / (end_time - begin_time), 'iops @ get / no cache ' 213 | 214 | begin_time = time.time() 215 | for i in range(capacity): 216 | s = '%064d' % i 217 | if s != ht[s]: 218 | raise Exception(s) 219 | end_time = time.time() 220 | print capacity / (end_time - begin_time), 'iops @ get / all cached ' 221 | 222 | ht.close() 223 | -------------------------------------------------------------------------------- /shmht.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | 12 | #include "hashtable.h" 13 | 14 | struct mapnode { 15 | int fd; 16 | size_t mem_size; 17 | hashtable *ht; 18 | }; 19 | 20 | #define max_ht_map_entries 2048 21 | static struct mapnode ht_map[max_ht_map_entries]; 22 | static int ht_idx = -1; 23 | 24 | static PyObject * shmht_open(PyObject *self, PyObject *args); 25 | static PyObject * shmht_close(PyObject *self, PyObject *args); 26 | static PyObject * shmht_getval(PyObject *self, PyObject *args); 27 | static PyObject * shmht_setval(PyObject *self, PyObject *args); 28 | static PyObject * shmht_remove(PyObject *self, PyObject *args); 29 | 
static PyObject * shmht_foreach(PyObject *self, PyObject *args); 30 | 31 | static PyObject *shmht_error; 32 | PyMODINIT_FUNC init_shmht(void); 33 | 34 | static PyMethodDef shmht_methods[] = { 35 | {"open", shmht_open, METH_VARARGS, "create a shared memory hash table"}, 36 | {"close", shmht_close, METH_VARARGS, ""}, 37 | {"getval", shmht_getval, METH_VARARGS, ""}, 38 | {"setval", shmht_setval, METH_VARARGS, ""}, 39 | {"remove", shmht_remove, METH_VARARGS, ""}, 40 | {"foreach", shmht_foreach, METH_VARARGS, ""}, 41 | {NULL, NULL, 0, NULL} 42 | }; 43 | 44 | // bug: half-assed file locking; I'm in a hurry at the moment. It 45 | // might make sense to separate read/write locks or even use file 46 | // regions, but there is no substitute for simplicity. 47 | static void mylock(fd) { 48 | flock(fd, LOCK_EX); 49 | // bug: not handling error condition 50 | } 51 | 52 | static void myunlock(fd) { 53 | flock(fd, LOCK_UN); 54 | // bug: not handling error condition 55 | } 56 | 57 | 58 | PyMODINIT_FUNC init_shmht(void) 59 | { 60 | PyObject *m = Py_InitModule("ext_shmht._shmht", shmht_methods); 61 | if (m == NULL) 62 | return; 63 | 64 | shmht_error = PyErr_NewException("ext_shmht._shmht.error", NULL, NULL); 65 | Py_INCREF(shmht_error); 66 | PyModule_AddObject(m, "error", shmht_error); 67 | 68 | bzero(ht_map, sizeof(ht_map)); 69 | } 70 | 71 | static PyObject * shmht_open(PyObject *self, PyObject *args) 72 | { 73 | int fd = 0; 74 | size_t mem_size = 0; 75 | hashtable *ht = NULL; 76 | 77 | const char *name; 78 | size_t i_capacity = 0; 79 | int force_init = 0; 80 | if (!PyArg_ParseTuple(args, "s|ii:shmht.create", &name, &i_capacity, &force_init)) 81 | return NULL; 82 | 83 | size_t capacity = i_capacity; 84 | 85 | fd = open(name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); 86 | if (fd < 0) { 87 | PyErr_Format(shmht_error, "open file(%s) failed: [%d] %s", name, errno, strerror(errno)); 88 | return NULL; 89 | } 90 | 91 | mylock(fd); 92 | 93 | struct stat buf; 94 | fstat(fd, &buf); 95 | 96 | if 
(force_init == 0) { //try to load from existing shmht 97 | mem_size = sizeof(hashtable); 98 | if (buf.st_size >= sizeof(hashtable)) { //may be valid 99 | ht = mmap(NULL, sizeof(hashtable), PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); 100 | if (ht == MAP_FAILED) { 101 | PyErr_Format(shmht_error, "mmap failed, map_size=sizeof(hashtable)=%lu: [%d] %s", 102 | mem_size, errno, strerror(errno)); 103 | goto create_failed; 104 | } 105 | 106 | if (ht_is_valid(ht)) { 107 | // may not ask for larger capacity than is already in file 108 | if (capacity != 0 && capacity > ht->orig_capacity) { 109 | PyErr_Format(shmht_error, "file has smaller capacity than requested (req %d, have %d); specify force_init=1 to overwrite an existing shmht", (int)capacity, (int)ht->orig_capacity); 110 | goto create_failed; 111 | } 112 | capacity = ht->orig_capacity; //loaded capacity 113 | } 114 | munmap(ht, sizeof(hashtable)); 115 | ht = NULL; 116 | } 117 | } 118 | 119 | if (capacity == 0) { 120 | PyErr_Format(shmht_error, "please specify 'capacity' when you try to create a shmht"); 121 | goto create_failed; 122 | } 123 | 124 | mem_size = ht_memory_size(capacity); 125 | 126 | if (buf.st_size < mem_size) { 127 | if (lseek(fd, mem_size - 1, SEEK_SET) == -1) { 128 | PyErr_Format(shmht_error, "lseek failed: [%d] %s", errno, strerror(errno)); 129 | goto create_failed; 130 | } 131 | char t = 0; 132 | if (write(fd, &t, 1) == -1) { 133 | PyErr_Format(shmht_error, "write failed: [%d] %s", errno, strerror(errno)); 134 | goto create_failed; 135 | } 136 | } 137 | 138 | ht = mmap(NULL, mem_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); 139 | if (ht == MAP_FAILED) { 140 | PyErr_Format(shmht_error, "mmap failed, mem_size=%lu: [%d] %s", 141 | mem_size, errno, strerror(errno)); 142 | goto create_failed; 143 | } 144 | 145 | ht_init(ht, capacity, force_init); 146 | int count; 147 | for (count = 0; count < max_ht_map_entries; count++) 148 | { 149 | ht_idx = (ht_idx + 1) % max_ht_map_entries; 150 | /* note: count is advanced by the for-loop header; an extra increment here would skip half the slots */ 151 | if 
(ht_map[ht_idx].ht == NULL) 152 | break; 153 | } 154 | if (count >= max_ht_map_entries) { 155 | PyErr_Format(shmht_error, "exceeded max_ht_map_entries(%d) in one process", max_ht_map_entries); 156 | goto create_failed; 157 | } 158 | ht_map[ht_idx].fd = fd; 159 | ht_map[ht_idx].mem_size = mem_size; 160 | ht_map[ht_idx].ht = ht; 161 | 162 | myunlock(fd); 163 | return PyInt_FromLong(ht_idx); 164 | 165 | create_failed: 166 | if (fd >= 0) { 167 | myunlock(fd); 168 | close(fd); 169 | } 170 | if (ht != NULL) 171 | munmap(ht, mem_size); 172 | return NULL; 173 | } 174 | 175 | static PyObject * shmht_close(PyObject *self, PyObject *args) 176 | { 177 | int idx; 178 | if (!PyArg_ParseTuple(args, "i:shmht.create", &idx)) 179 | return NULL; 180 | 181 | if (idx < 0 || idx >= max_ht_map_entries || ht_map[idx].ht == NULL) { 182 | PyErr_Format(shmht_error, "invalid ht id: (%d)", idx); 183 | return NULL; 184 | } 185 | 186 | hashtable *ht = ht_map[idx].ht; 187 | 188 | size_t ref_cnt = ht_destroy(ht); 189 | 190 | if (munmap(ht, ht_map[idx].mem_size) != 0) { 191 | PyErr_Format(shmht_error, "munmap failed: [%d] %s", errno, strerror(errno)); 192 | //return NULL; 193 | } 194 | 195 | // Do not delete the mapping file - somebody else might still 196 | // want it. If the application knows that the shared memory 197 | // should not persist, it can delete the file. 
198 | 199 | close(ht_map[idx].fd); 200 | 201 | memset(&ht_map[idx], 0, sizeof(struct mapnode)); 202 | 203 | Py_RETURN_TRUE; 204 | } 205 | 206 | static PyObject * shmht_getval(PyObject *self, PyObject *args) 207 | { 208 | int idx, key_size; 209 | const char *key; 210 | PyObject * return_value; 211 | 212 | if (!PyArg_ParseTuple(args, "is#:shmht.getval", &idx, &key, &key_size)) 213 | return NULL; 214 | 215 | if (idx < 0 || idx >= max_ht_map_entries || ht_map[idx].ht == NULL) { 216 | PyErr_Format(shmht_error, "invalid ht id: (%d)", idx); 217 | return NULL; 218 | } 219 | 220 | mylock(ht_map[idx].fd); 221 | 222 | hashtable *ht = ht_map[idx].ht; 223 | 224 | ht_str* value = ht_get(ht, key, key_size); 225 | if (value == NULL) { 226 | myunlock(ht_map[idx].fd); 227 | Py_RETURN_NONE; 228 | } 229 | 230 | myunlock(ht_map[idx].fd); 231 | return PyString_FromStringAndSize(value->str, value->size); 232 | } 233 | 234 | static PyObject * shmht_setval(PyObject *self, PyObject *args) 235 | { 236 | int idx, key_size, value_size; 237 | const char *key, *value; 238 | if (!PyArg_ParseTuple(args, "is#s#:shmht.setval", &idx, &key, &key_size, &value, &value_size)) { 239 | return NULL; 240 | } 241 | 242 | if (idx < 0 || idx >= max_ht_map_entries || ht_map[idx].ht == NULL) { 243 | PyErr_Format(shmht_error, "invalid ht id: (%d)", idx); 244 | return NULL; 245 | } 246 | 247 | hashtable *ht = ht_map[idx].ht; 248 | 249 | mylock(ht_map[idx].fd); 250 | 251 | int result = ht_set(ht, key, key_size, value, value_size); 252 | 253 | myunlock(ht_map[idx].fd); 254 | 255 | if (result == False ) { 256 | PyErr_Format(shmht_error, "insert failed for key(%s)", key); 257 | return NULL; 258 | } 259 | 260 | Py_RETURN_TRUE; 261 | } 262 | 263 | static PyObject * shmht_remove(PyObject *self, PyObject *args) 264 | { 265 | int idx, key_size; 266 | const char *key; 267 | if (!PyArg_ParseTuple(args, "is#:shmht.remove", &idx, &key, &key_size)) 268 | return NULL; 269 | 270 | if (idx < 0 || idx >= max_ht_map_entries || 
ht_map[idx].ht == NULL) { 271 | PyErr_Format(shmht_error, "invalid ht id: (%d)", idx); 272 | return NULL; 273 | } 274 | 275 | hashtable *ht = ht_map[idx].ht; 276 | mylock(ht_map[idx].fd); 277 | 278 | int result = ht_remove(ht, key, key_size); 279 | 280 | myunlock(ht_map[idx].fd); 281 | 282 | if ( result == False) 283 | Py_RETURN_FALSE; 284 | else 285 | Py_RETURN_TRUE; 286 | } 287 | 288 | static PyObject * shmht_foreach(PyObject *self, PyObject *args) 289 | { 290 | int idx; 291 | static PyObject *cb = NULL; 292 | 293 | if (!PyArg_ParseTuple(args, "iO:shmht.foreach", &idx, &cb)) 294 | return NULL; 295 | 296 | if (idx < 0 || idx >= max_ht_map_entries || ht_map[idx].ht == NULL) { 297 | PyErr_Format(shmht_error, "invalid ht id: (%d)", idx); 298 | return NULL; 299 | } 300 | 301 | if (!PyCallable_Check(cb)) { 302 | PyErr_SetString(PyExc_TypeError, "parameter must be callable"); 303 | return NULL; 304 | } 305 | 306 | 307 | hashtable *ht = ht_map[idx].ht; 308 | ht_iter *iter = ht_get_iterator(ht); 309 | 310 | mylock(ht_map[idx].fd); 311 | while (ht_iter_next(iter)) { 312 | ht_str *key = iter->key, *value = iter->value; 313 | PyObject *arglist = Py_BuildValue("(s#s#)", key->str, key->size, value->str, value->size); 314 | PyEval_CallObject(cb, arglist); 315 | Py_DECREF(arglist); 316 | } 317 | myunlock(ht_map[idx].fd); 318 | 319 | free(iter); 320 | 321 | Py_RETURN_NONE; 322 | } 323 | 324 | 325 | // TODO: add an msync() operation. 
see https://docs.python.org/2/c-api/init.html#thread-state-and-the-global-interpreter-lock for releasing the GIL during blocking I/O 326 | // TODO: add a find_slot() / put_slot_data() operation, so you don't need to hash the key again when you use the same key repeatedly 327 | -------------------------------------------------------------------------------- /hashtable.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #ifdef __cplusplus 11 | extern "C" { 12 | #endif 13 | 14 | #include "hashtable.h" 15 | 16 | #ifdef __cplusplus 17 | } 18 | #endif 19 | 20 | #define ht_flag_base(ht) ((char *)(ht) + (ht)->flag_offset) 21 | #define ht_bucket_base(ht) ((char *)(ht) + (ht)->bucket_offset) 22 | 23 | static const unsigned ht_magic = 0xBFBF; 24 | 25 | enum bucket_flag { 26 | empty = 0, used = 1, removed = 2 27 | }; 28 | 29 | size_t header_size = 1024; 30 | 31 | #define bucket_size 1280 32 | #define max_key_size 256 33 | #define max_value_size (bucket_size - max_key_size) 34 | 35 | const float max_load_factor = 0.65; 36 | 37 | static const unsigned int primes[] = { 38 | 53, 97, 193, 389, 39 | 769, 1543, 3079, 6151, 40 | 12289, 24593, 49157, 98317, 41 | 196613, 393241, 786433, 1572869, 42 | 3145739, 6291469, 12582917, 25165843, 43 | 50331653, 100663319, 201326611, 402653189, 44 | 805306457, 1610612741 45 | }; 46 | static const unsigned int prime_table_length = sizeof (primes) / sizeof (primes[0]); 47 | 48 | static inline void fill_ht_str(ht_str *s, const char *str, const u_int32 size) { 49 | s->size = size; 50 | memcpy(s->str, str, size); 51 | } 52 | 53 | static unsigned int ht_get_prime_by(size_t capacity) { 54 | unsigned i = 0; 55 | capacity *= 2; 56 | for (i = 0; i < prime_table_length; i++) { 57 | if (primes[i] > capacity) 58 | return primes[i]; 59 | } 60 | return 0; 61 | } 62 | 63 | size_t ht_memory_size(size_t capacity) { 64 | 
const int flag_size = 1; //char 65 | size_t aligned_capacity = (ht_get_prime_by(capacity) / 4 + 1) * 4; //round up to 4-byte alignment 66 | return header_size //header 67 | + flag_size * aligned_capacity //flag 68 | + bucket_size * aligned_capacity; //bucket 69 | } 70 | 71 | /*dbj2_hash function (copied from libshmht)*/ 72 | static unsigned int dbj2_hash (const char *str, size_t size) { 73 | unsigned long hash = 5381; 74 | while (size--) { 75 | char c = *str++; 76 | hash = ((hash << 5) + hash) + c; /* hash * 33 + c */ 77 | } 78 | return (unsigned int) hash; 79 | } 80 | 81 | BOOL is_equal(const char *a, size_t asize, const char *b, size_t bsize) { 82 | if (asize != bsize) 83 | return False; 84 | return strncmp(a, b, asize) ? False : True; 85 | } 86 | 87 | int ht_is_valid(hashtable *ht) { 88 | return (ht->magic == ht_magic); 89 | } 90 | 91 | /* 92 | * The caller is responsible for the 4-byte alignment of base_addr 93 | * and the size of base_addr should be no less than ht_get_prime_by(capacity) 94 | */ 95 | hashtable* ht_init(void *base_addr, size_t capacity, int force_init) { 96 | hashtable* ht = (hashtable *)base_addr; 97 | if (force_init || !ht_is_valid(ht)) { 98 | ht->magic = ht_magic; 99 | ht->ref_cnt = 0; 100 | 101 | ht->orig_capacity = capacity; 102 | ht->capacity = ht_get_prime_by(capacity); 103 | ht->size = 0; 104 | 105 | ht->flag_offset = header_size; 106 | ht->bucket_offset = ht->flag_offset + (ht->capacity / 4 + 1) * 4; //alignment 107 | 108 | bzero(ht_flag_base(ht), ht->capacity); 109 | } 110 | ht->ref_cnt += 1; 111 | return ht; 112 | } 113 | 114 | static size_t ht_position(hashtable *ht, const char *key, u_int32 key_size, BOOL treat_removed_as_empty) { 115 | char *flag_base = ht_flag_base(ht); 116 | char *bucket_base = ht_bucket_base(ht); 117 | size_t capacity = ht->capacity; 118 | unsigned long hval = dbj2_hash(key, key_size) % capacity; 119 | 120 | size_t i = hval, di = 1; 121 | while (True) { 122 | if (flag_base[i] == empty) 123 | break; 124 | if 
(flag_base[i] == removed && treat_removed_as_empty) 125 | break; 126 | if (flag_base[i] == used) 127 | { 128 | char *bucket = bucket_base + i * bucket_size; 129 | ht_str* bucket_key = (ht_str *)bucket; 130 | if (is_equal(key, key_size, bucket_key->str, bucket_key->size)) { 131 | break; 132 | } 133 | } 134 | i = (i + di) % capacity; 135 | di++; 136 | if (i == hval) { 137 | //extreme condition: when all flags are 'removed' 138 | bzero(flag_base, capacity); 139 | break; 140 | } 141 | } 142 | return i; 143 | } 144 | 145 | ht_str* ht_get(hashtable *ht, const char *key, u_int32 key_size) { 146 | size_t i = ht_position(ht, key, key_size, False); //'removed' bucket is not 'empty' when searching a chain. 147 | if (ht_flag_base(ht)[i] != used) { 148 | return NULL; 149 | } 150 | char *bucket = ht_bucket_base(ht) + i * bucket_size; 151 | return (ht_str*)(bucket + max_key_size); 152 | } 153 | 154 | int ht_set(hashtable *ht, const char *key, u_int32 key_size, const char *value, u_int32 value_size) { 155 | if (sizeof(u_int32) + key_size >= max_key_size || sizeof(u_int32) + value_size >= max_value_size) { 156 | //the item is too large 157 | fprintf(stderr, "the item is too large: key_size(%u), value(%u)\n", key_size, value_size); 158 | return False; 159 | } 160 | 161 | char *flag_base = ht_flag_base(ht); 162 | char *bucket_base = ht_bucket_base(ht); 163 | 164 | ht_str *bucket_key = NULL, *bucket_value = NULL; 165 | 166 | //if it exists: just find and modify it's value 167 | bucket_value = ht_get(ht, key, key_size); 168 | if (bucket_value) { 169 | fill_ht_str(bucket_value, value, value_size); 170 | return True; 171 | } 172 | 173 | //else: find an available bucket, which can be both 'empty' or 'removed' 174 | size_t i = ht_position(ht, key, key_size, True); 175 | 176 | if (ht->capacity * max_load_factor < ht->size) { 177 | //hash table is over loaded 178 | fprintf(stderr, "hash table is over loaded, capacity=%lu, size=%lu\n", ht->capacity, ht->size); 179 | return False; 180 | } 181 
| 182 | ht->size += 1; 183 | flag_base[i] = used; 184 | 185 | char *bucket = bucket_base + i * bucket_size; 186 | bucket_key = (ht_str*)bucket; 187 | bucket_value = (ht_str*)(bucket + max_key_size); 188 | fill_ht_str(bucket_key, key, key_size); 189 | fill_ht_str(bucket_value, value, value_size); 190 | return True; 191 | } 192 | 193 | int ht_remove(hashtable *ht, const char *key, u_int32 key_size) { 194 | size_t i = ht_position(ht, key, key_size, False); //'removed' bucket is not 'empty' when searching a chain. 195 | if (ht_flag_base(ht)[i] != used) { 196 | return False; 197 | } 198 | ht_flag_base(ht)[i] = removed; 199 | ht->size -= 1; 200 | return True; 201 | } 202 | 203 | //don't forget to free(ht_iter) 204 | ht_iter* ht_get_iterator(hashtable *ht) { 205 | ht_iter* iter = ALLOC(ht_iter, 1); 206 | assert(iter != NULL); 207 | iter->ht = ht; 208 | iter->pos = -1; 209 | return iter; 210 | } 211 | 212 | int ht_iter_next(ht_iter* iter) { 213 | size_t i = 0; 214 | hashtable *ht = iter->ht; 215 | char *flag_base = ht_flag_base(ht); 216 | char *bucket_base = ht_bucket_base(ht); 217 | 218 | for (i = iter->pos + 1; i < ht->capacity; i++) { 219 | if (flag_base[i] == used) { 220 | char *bucket = bucket_base + i * bucket_size; 221 | iter->key = (ht_str*)bucket, iter->value = (ht_str*)(bucket + max_key_size); 222 | iter->pos = i; 223 | return True; 224 | } 225 | } 226 | return False; 227 | } 228 | 229 | int ht_destroy(hashtable *ht) { 230 | ht->ref_cnt -= 1; 231 | return ht->ref_cnt == 0 ? 
True : False; 232 | } 233 | 234 | /* 235 | 236 | //commented out together with 'main' to eliminate compiler's complaint 237 | static void dump_ht_str(ht_str *s) { 238 | if (s) { 239 | printf("%u: %*s\n", s->size, (int)s->size, s->str); 240 | } 241 | else { 242 | printf("(nil)\n"); 243 | } 244 | } 245 | 246 | int main() { 247 | size_t capacity = 500000; 248 | printf("%u\n", ht_get_prime_by(capacity)); 249 | printf("%lu\n", ht_memory_size(capacity)); 250 | void *mem = malloc(ht_memory_size(capacity) + 1); 251 | hashtable *ht = ht_init(mem, capacity, 0); 252 | 253 | ht_set(ht, "hello", 5, "-----", 5); 254 | ht_set(ht, "hello1", 6, "hello1", 6); 255 | ht_set(ht, "hello", 5, "hello", 5); 256 | ht_remove(ht, "hello", 5); 257 | 258 | ht_str* s = NULL; 259 | 260 | s = ht_get(ht, "hello", 5); 261 | dump_ht_str(s); 262 | 263 | s = ht_get(ht, "hello1", 6); 264 | dump_ht_str(s); 265 | 266 | ht_set(ht, "a", 1, "a", 1); 267 | ht_set(ht, "b", 1, "b", 1); 268 | ht_set(ht, "c", 1, "c", 1); 269 | ht_set(ht, "d", 1, "d", 1); 270 | printf("ht->size: %lu\n", ht->size); 271 | 272 | ht_remove(ht, "c", 1); 273 | 274 | hashtable* ht1 = ht_init(mem, capacity, 0); 275 | 276 | ht_iter* iter = ht_get_iterator(ht1); 277 | while (ht_iter_next(iter)) { 278 | ht_str *key = iter->key, *value = iter->value; 279 | printf("%*s => %*s\n", (int)key->size, key->str, (int)value->size, value->str); 280 | } 281 | free(iter); 282 | printf("ht_get_iterator test ok\n"); 283 | 284 | char x[128]; 285 | int i, len; 286 | struct timeval begin, end; 287 | #define ts(tv) (tv.tv_sec + tv.tv_usec / 1000000.0) 288 | 289 | gettimeofday(&begin, NULL); 290 | for (i = 0; i < (int)capacity; i++) { 291 | len = sprintf(x, "%064d", i); 292 | if (ht_set(ht, x, len, x, len) == 0) { 293 | printf("set wrong @ %d\n", i); 294 | return 1; 295 | } 296 | } 297 | gettimeofday(&end, NULL); 298 | printf("set test: %.0lf iops\n", capacity / (ts(end) - ts(begin))); 299 | 300 | gettimeofday(&begin, NULL); 301 | for (i = 0; i < (int)capacity; 
i++) { 302 | len = sprintf(x, "%064d", i); 303 | ht_str* val = ht_get(ht, x, len); 304 | if (val == NULL || !is_equal(x, len, val->str, val->size)) { 305 | printf("(after set)get wrong @ %d\n", i); 306 | return 1; 307 | } 308 | } 309 | gettimeofday(&end, NULL); 310 | printf("get test: %.0lf iops\n", capacity / (ts(end) - ts(begin))); 311 | 312 | for (i = 0; i < (int)capacity; i += 2) { 313 | len = sprintf(x, "%064d", i); 314 | if (ht_remove(ht, x, len) == 0) { 315 | printf("remove wrong @ %d\n", i); 316 | return 1; 317 | } 318 | len = sprintf(x, "%064d", i + 1); 319 | ht_str* val = ht_get(ht, x, len); 320 | if (val == NULL || !is_equal(x, len, val->str, val->size)) { 321 | printf("(after remove)get wrong @ %d\n", i); 322 | return 1; 323 | } 324 | } 325 | printf("remove/get test ok\n"); 326 | 327 | //while(1) sleep(1000); 328 | return 0; 329 | } 330 | // */ 331 | --------------------------------------------------------------------------------