├── .gitignore ├── .travis.yml ├── COPYING ├── MANIFEST.in ├── README.rst ├── docs ├── Makefile ├── api_semidbm.rst ├── benchmarks.rst ├── changelog.rst ├── conf.py ├── details.rst ├── fileformat.rst ├── img │ ├── delete_sequential.png │ ├── fill_sequential.png │ ├── large_delete_sequential.png │ ├── large_fill_sequential.png │ ├── large_read_hot.png │ ├── large_read_random.png │ ├── large_read_sequential.png │ ├── read_hot.png │ ├── read_random.png │ └── read_sequential.png ├── index.rst └── overview.rst ├── requirements.txt ├── scripts ├── adapters │ ├── bdb_btopen.py │ ├── bdb_hashopen.py │ ├── bdb_minimal.py │ └── builtindict.py ├── benchmark ├── loadtime ├── makedb ├── makegraphs └── tps ├── semidbm ├── __init__.py ├── compat.py ├── db.py ├── exceptions.py ├── loaders │ ├── __init__.py │ ├── mmapload.py │ └── simpleload.py └── win32.py ├── setup.cfg ├── setup.py └── test_semidbm.py /.gitignore: -------------------------------------------------------------------------------- 1 | build/* 2 | dist/* 3 | semidbm.egg-info/* 4 | .coverage 5 | _templates/ 6 | _static/ 7 | _build/ 8 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.6" 4 | - "2.7" 5 | - "3.3" 6 | - "3.4" 7 | - "pypy" 8 | - "pypy3" 9 | install: 10 | - if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install unittest2; fi 11 | - pip install coverage coveralls 12 | script: 13 | - coverage erase 14 | - coverage run --source semidbm test_semidbm.py 15 | notifications: 16 | email: 17 | - js@jamesls.com 18 | after_success: coveralls 19 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | Copyright (c) 2011 James Saryerwinnie 2 | All rights reserved. 
3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions 6 | are met: 7 | 1. Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright 10 | notice, this list of conditions and the following disclaimer in the 11 | documentation and/or other materials provided with the distribution. 12 | 3. The name of the author may not be used to endorse or promote products 13 | derived from this software without specific prior written permission. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 16 | IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 17 | OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 18 | IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 19 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 20 | NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 21 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 22 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 24 | THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include COPYING 2 | include README.rst 3 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Overview 3 | ======== 4 | 5 | .. 
image:: https://secure.travis-ci.org/jamesls/semidbm.png?branch=master 6 | :target: http://travis-ci.org/jamesls/semidbm 7 | 8 | .. image:: https://coveralls.io/repos/jamesls/semidbm/badge.png?branch=master 9 | :target: https://coveralls.io/r/jamesls/semidbm?branch=master 10 | 11 | .. image:: https://img.shields.io/pypi/v/semidbm.svg 12 | :target: https://pypi.python.org/pypi/semidbm/ 13 | :alt: Latest Version 14 | 15 | .. image:: https://img.shields.io/pypi/pyversions/semidbm.svg 16 | :target: https://pypi.python.org/pypi/semidbm/ 17 | :alt: Supported Python versions 18 | 19 | .. image:: https://img.shields.io/pypi/implementation/semidbm.svg 20 | :target: https://pypi.python.org/pypi/semidbm/ 21 | :alt: Supported Python implementations 22 | 23 | .. image:: https://img.shields.io/pypi/l/semidbm.svg 24 | :target: https://pypi.python.org/pypi/semidbm/ 25 | :alt: License 26 | 27 | .. image:: https://img.shields.io/pypi/wheel/semidbm.svg 28 | :target: https://pypi.python.org/pypi/semidbm/ 29 | :alt: Wheel Status 30 | 31 | 32 | Semidbm is a fast, pure python implementation of a dbm, which is a 33 | persistent key value store. It allows you to get and set keys through 34 | a dict interface:: 35 | 36 | import semidbm 37 | db = semidbm.open('testdb', 'c') 38 | db['foo'] = 'bar' 39 | print db['foo'] 40 | db.close() 41 | 42 | These values are persisted to disk, and you can later retrieve 43 | these key/value pairs:: 44 | 45 | # Then at a later time: 46 | db = semidbm.open('testdb', 'r') 47 | # prints "bar" 48 | print db['foo'] 49 | 50 | 51 | It was written with these things in mind: 52 | 53 | * Pure python, supporting python 2.6, 2.7, 3.3, and 3.4. 54 | * Cross platform, works on Windows, Linux, Mac OS X. 55 | * Supports CPython, pypy, and jython (versions 2.7-b3 and higher). 56 | * Simple and Fast (See `Benchmarking Semidbm `__). 57 | 58 | 59 | Supported Python Versions 60 | ========================= 61 | 62 | Semidbm supports python 2.6, 2.7, 3.3, and 3.4. 
63 | 64 | ============= 65 | Official Docs 66 | ============= 67 | 68 | Read the `semidbm docs `_ for more information 69 | and how to use semidbm. 70 | 71 | 72 | ======== 73 | Features 74 | ======== 75 | 76 | Semidbm originally started off as an improvement over the 77 | `dumbdbm `__ 78 | library in the python standard library. Below are a list of some of the 79 | improvements over dumbdbm. 80 | 81 | 82 | Single Data File 83 | ================ 84 | 85 | Instead of an index file and a data file, the index and data have been 86 | consolidated into a single file. This single data file is always appended to, 87 | data written to the file is never modified. 88 | 89 | 90 | Data File Compaction 91 | ==================== 92 | 93 | Semidbm uses an append only file format. This has the potential to grow to 94 | large sizes as space is never reclaimed. Semidbm addresses this by adding a 95 | ``compact()`` method that will rewrite the data file to a minimal size. 96 | 97 | 98 | Performance 99 | =========== 100 | 101 | Semidbm is significantly faster than dumbdbm (keep in mind both are pure python 102 | libraries) in just about every way. The documentation shows the 103 | `results `_ 104 | of semidbm vs. other dbms, along with how to run the benchmarking 105 | script yourself. 106 | 107 | 108 | =========== 109 | Limitations 110 | =========== 111 | 112 | * Not thread safe; can't be accessed by multiple processes. 113 | * The entire index must fit in memory. This essentially means that all of the 114 | keys must fit in memory. 115 | 116 | 117 | Post feedback and issues on `github issues`_, or check out the 118 | latest changes at the github `repo`_. 119 | 120 | 121 | .. _github issues: https://github.com/jamesls/semidbm/issues 122 | .. 
_repo: https://github.com/jamesls/semidbm 123 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 14 | # the i18n builder cannot share the environment and doctrees with the others 15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 16 | 17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 18 | 19 | help: 20 | @echo "Please use \`make ' where is one of" 21 | @echo " html to make standalone HTML files" 22 | @echo " dirhtml to make HTML files named index.html in directories" 23 | @echo " singlehtml to make a single large HTML file" 24 | @echo " pickle to make pickle files" 25 | @echo " json to make JSON files" 26 | @echo " htmlhelp to make HTML files and a HTML help project" 27 | @echo " qthelp to make HTML files and a qthelp project" 28 | @echo " devhelp to make HTML files and a Devhelp project" 29 | @echo " epub to make an epub" 30 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 31 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 32 | @echo " text to make text files" 33 | @echo " man to make manual pages" 34 | @echo " texinfo to make Texinfo files" 35 | @echo " info to make Texinfo files and run them through makeinfo" 36 | @echo " gettext to make PO message catalogs" 37 | @echo " changes to make an overview of all changed/added/deprecated items" 38 | @echo " linkcheck to check all external links for 
integrity" 39 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 40 | 41 | clean: 42 | -rm -rf $(BUILDDIR)/* 43 | 44 | html: 45 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 46 | @echo 47 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 48 | 49 | dirhtml: 50 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 51 | @echo 52 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 53 | 54 | singlehtml: 55 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 56 | @echo 57 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 58 | 59 | pickle: 60 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 61 | @echo 62 | @echo "Build finished; now you can process the pickle files." 63 | 64 | json: 65 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 66 | @echo 67 | @echo "Build finished; now you can process the JSON files." 68 | 69 | htmlhelp: 70 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 71 | @echo 72 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 73 | ".hhp project file in $(BUILDDIR)/htmlhelp." 74 | 75 | qthelp: 76 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 77 | @echo 78 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 79 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 80 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/semidbm.qhcp" 81 | @echo "To view the help file:" 82 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/semidbm.qhc" 83 | 84 | devhelp: 85 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 86 | @echo 87 | @echo "Build finished." 
88 | @echo "To view the help file:" 89 | @echo "# mkdir -p $$HOME/.local/share/devhelp/semidbm" 90 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/semidbm" 91 | @echo "# devhelp" 92 | 93 | epub: 94 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 95 | @echo 96 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 97 | 98 | latex: 99 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 100 | @echo 101 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 102 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 103 | "(use \`make latexpdf' here to do that automatically)." 104 | 105 | latexpdf: 106 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 107 | @echo "Running LaTeX files through pdflatex..." 108 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 109 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 110 | 111 | text: 112 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 113 | @echo 114 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 115 | 116 | man: 117 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 118 | @echo 119 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 120 | 121 | texinfo: 122 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 123 | @echo 124 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 125 | @echo "Run \`make' in that directory to run these through makeinfo" \ 126 | "(use \`make info' here to do that automatically)." 127 | 128 | info: 129 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 130 | @echo "Running Texinfo files through makeinfo..." 131 | make -C $(BUILDDIR)/texinfo info 132 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 133 | 134 | gettext: 135 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 136 | @echo 137 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 
138 | 139 | changes: 140 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 141 | @echo 142 | @echo "The overview file is in $(BUILDDIR)/changes." 143 | 144 | linkcheck: 145 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 146 | @echo 147 | @echo "Link check complete; look for any errors in the above output " \ 148 | "or in $(BUILDDIR)/linkcheck/output.txt." 149 | 150 | doctest: 151 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 152 | @echo "Testing of doctests in the sources finished, look at the " \ 153 | "results in $(BUILDDIR)/doctest/output.txt." 154 | -------------------------------------------------------------------------------- /docs/api_semidbm.rst: -------------------------------------------------------------------------------- 1 | =============== 2 | API for semidbm 3 | =============== 4 | 5 | .. autofunction:: semidbm.db.open 6 | 7 | 8 | .. autoclass:: semidbm.db._SemiDBM 9 | :members: 10 | -------------------------------------------------------------------------------- /docs/benchmarks.rst: -------------------------------------------------------------------------------- 1 | ==================== 2 | Benchmarking Semidbm 3 | ==================== 4 | 5 | Semidbm was not written to be the fastest dbm available, but its performance is 6 | surprisingly good for a pure python dbm. Before showing the benchmark results, 7 | it's important to note that these benchmark results can vary across machines 8 | and should in no way be considered definitive or comprehensive. And yes, 9 | there are other things besides performance that are important when considering 10 | a dbm. 11 | 12 | 13 | Benchmarking Approach 14 | ===================== 15 | 16 | The benchmarks used for semidbm are based off the benchmark scripts for 17 | `leveldb `_. You can run the benchmark 18 | scripts yourself using the `scripts/benchmark` script in the repo.
By default, 19 | the benchmark uses a db of one million keys with 16 byte keys and 100 byte 20 | values (these are the values used for leveldb's benchmarks). All of these 21 | parameters can be changed via command line arguments ( `-n`, `-k`, `-s` 22 | respectively). 23 | 24 | The benchmark script is written in a way to be compatible with any module 25 | supporting the dbm interface. Given the dbm interface isn't entirely 26 | standardized, this is what is required: 27 | 28 | * An `open()` function in the module (that behaves like 29 | `dumbdbm.open `_, 30 | `gdbm.open `_, etc). 31 | * The returned object from `open()` is a "dbm" like object. All the object 32 | needs to support is `__getitem__`, `__setitem__`, `__delitem__`, and 33 | `close()`. 34 | 35 | To specify what dbm module to use, use the `-d` argument. The value of this 36 | argument should be the module name of the dbm, for example, to run the benchmarks 37 | against semidbm:: 38 | 39 | scripts/benchmark -d semidbm 40 | 41 | The `-d` argument can be specified multiple times. 42 | 43 | If a dbm does not support a dbm interface, an adapter module can be written for 44 | the dbm. The directory `scripts/adapters` is added to `sys.path` before the 45 | benchmarks are run, so benchmarking a 3rd party dbm is straightforward.
For 46 | example, in order to benchmark Berkeley DB using the bsddb3 module, a 47 | `scripts/adapters/bdb_minimal.py` file was created:: 48 | 49 | 50 | import bsddb3.db 51 | def open(filename, mode): 52 | db = bsddb3.db.DB(None) 53 | if mode == 'r': 54 | flags = bsddb3.db.DB_RDONLY 55 | elif mode == 'rw': 56 | flags = 0 57 | elif mode == 'w': 58 | flags = bsddb3.db.DB_CREATE 59 | elif mode == 'c': 60 | flags = bsddb3.db.DB_CREATE 61 | elif mode == 'n': 62 | flags = bsddb3.db.DB_TRUNCATE | bsddb3.db.DB_CREATE 63 | else: 64 | raise bsddb3.db.DBError( 65 | "flags should be one of 'r', 'w', 'c' or 'n' or use the " 66 | "bsddb.db.DB_* flags") 67 | db.open(filename, None, bsddb3.db.DB_HASH, flags) 68 | return db 69 | 70 | The `bsddb3.db.DB `_ 71 | object can now be benchmarked using:: 72 | 73 | scripts/benchmark -d bdb_minimal 74 | 75 | 76 | Benchmark Results 77 | ================= 78 | 79 | Below are the results of benchmarking various dbms. 80 | Although `scripts/benchmark` shows the results in various forms of measurement, 81 | the measurement chosen here is the average number of operations per second over 82 | the total number of keys. For this measurement, **higher is better**. 83 | 84 | The dbms chosen for this benchmark are: 85 | 86 | * semidbm 87 | * gdbm (GNU dbm) 88 | * bdb (minimal Berkeley DB interface, `scripts/adapters/bdb_minimal.py`) 89 | * dumbdbm 90 | 91 | The `dbm` module was not included because it was not able to add 1000000 keys to its 92 | db; it raises an exception around 420000 keys with an "Out of overflow pages" 93 | error. 94 | 95 | 96 | This first benchmark shows the ops/sec for adding one million keys to the db. 97 | 98 | 99 | .. image:: img/fill_sequential.png 100 | 101 | The second benchmark shows the ops/sec for repeatedly accessing 1% of the keys 102 | (randomly selected). 103 | 104 | 105 | ..
image:: img/read_hot.png 106 | 107 | 108 | The next benchmark shows the ops/sec for reading all one million keys in the 109 | same order that they were added. 110 | 111 | .. image:: img/read_sequential.png 112 | 113 | The next benchmark shows the ops/sec for reading all one million keys in a 114 | randomly selected order. 115 | 116 | .. image:: img/read_random.png 117 | 118 | And the last benchmark shows the ops/sec for deleting all one million keys in 119 | the same order that they were added. 120 | 121 | .. image:: img/delete_sequential.png 122 | 123 | 124 | Note that dumbdbm is not shown in the chart above. This is because deletion of 125 | keys in dumbdbm is extremely slow. It also appears to have O(n) behavior (it 126 | writes out its data file on every delete). To give you an idea of the 127 | performance, running this benchmark against dumbdbm with 1000 keys gave an 128 | average ops/sec for the delete_sequential benchmark of **800**. For 10000 129 | keys dumbdbm resulted in **104** ops/sec. 130 | 131 | 132 | The table below shows the actual numbers for the charts above. 
133 | 134 | +-------------------+-------------+------------+--------+---------+ 135 | | | semidbm | gdbm | bdb | dumbdbm | 136 | +===================+=============+============+========+=========+ 137 | | fill_sequential | **73810** | 63177 | 73614 | 5460 | 138 | +-------------------+-------------+------------+--------+---------+ 139 | | read_hot | **218651** | 202432 | 200111 | 59569 | 140 | +-------------------+-------------+------------+--------+---------+ 141 | | read_sequential | 257668 | **417320** | 209696 | 62605 | 142 | +-------------------+-------------+------------+--------+---------+ 143 | | read_random | 219962 | **406594** | 197690 | 59258 | 144 | +-------------------+-------------+------------+--------+---------+ 145 | | delete_sequential | **144265** | 119167 | 135137 | 0 | 146 | +-------------------+-------------+------------+--------+---------+ 147 | 148 | 149 | Benchmarking With Large Values 150 | ------------------------------ 151 | 152 | One area where semidbm benchmarks really well is when dealing with large 153 | values. The same 5 benchmarks were repeated, but with only 1000 total keys, 154 | 16 byte keys, and 100000 byte values. 155 | 156 | 157 | The first benchmark shows the ops/sec for 1000 sequential writes. 158 | 159 | 160 | .. image:: img/large_fill_sequential.png 161 | 162 | 163 | The second benchmark shows the ops/sec for repeatedly accessing 1% of the keys 164 | (randomly selected). 165 | 166 | 167 | .. image:: img/large_read_hot.png 168 | 169 | 170 | The third benchmark shows the ops/sec for sequentially reading all 1000 keys. 171 | 172 | .. image:: img/large_read_sequential.png 173 | 174 | The fourth benchmark shows the ops/sec for reading all 1000 keys in a 175 | randomly selected order. 176 | 177 | .. image:: img/large_read_random.png 178 | 179 | And the last benchmark shows the ops/sec for deleting all 1000 keys in 180 | the same order that they were added. 181 | 182 | .. 
image:: img/large_delete_sequential.png 183 | 184 | Below is the raw data used to generate the above charts. 185 | 186 | +----------------------+------------+-----------+-----------+-------------+-----------+ 187 | | n=1000,k=16,v=100000 | semidbm | dbm | gdbm | bdb_minimal | dumbdbm | 188 | +======================+============+===========+===========+=============+===========+ 189 | | fill_sequential | 2653 | 2591 | **5525** | 4677 | 1330 | 190 | +----------------------+------------+-----------+-----------+-------------+-----------+ 191 | | read_hot | **61016** | 8363 | 23104 | 11782 | 31624 | 192 | +----------------------+------------+-----------+-----------+-------------+-----------+ 193 | | read_sequential | **42421** | 8822 | 1508 | 11519 | 26757 | 194 | +----------------------+------------+-----------+-----------+-------------+-----------+ 195 | | read_random | **42133** | 8720 | 16442 | 11162 | 23778 | 196 | +----------------------+------------+-----------+-----------+-------------+-----------+ 197 | | delete_sequential | **141379** | 21167 | 17695 | 7267 | 780 | 198 | +----------------------+------------+-----------+-----------+-------------+-----------+ 199 | 200 | You can see that with the exception of fill_sequential (in which the fastest 201 | module, gdbm, was roughly twice as fast as semidbm), semidbm completely 202 | outperforms all the other dbms. In the case of read_sequential, semidbm is **28 203 | times faster than gdbm.** 204 | 205 | 206 | Overall, semidbm's performance is comparable to the performance of other dbms 207 | with small keys and values, but is surprisingly faster than other dbms when 208 | reading large values. It's also clear that semidbm is faster than dumbdbm in all 209 | of the benchmarks shown here.
210 | 211 | 212 | Running the Benchmarks 213 | ---------------------- 214 | 215 | You are encouraged to run the benchmarks yourself, to recreate the benchmark 216 | above, you can run:: 217 | 218 | scripts/benchmark -d semidbm -d gdbm -d bdb_minimal -d dumbdbm 219 | 220 | Though keep in mind that you will probably want to stop the benchmark 221 | once dumbdbm reaches the delete_sequential benchmark. Either that or you can 222 | leave off dumbdbm and run it with a smaller number of keys:: 223 | 224 | scripts/benchmark -d dumbdbm -n 10000 225 | -------------------------------------------------------------------------------- /docs/changelog.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | Changelog 3 | ========= 4 | 5 | 0.5.1 6 | ===== 7 | 8 | * Remove unused code. 9 | * Add support for wheels. 10 | 11 | 12 | 0.5.0 13 | ===== 14 | 15 | * Remove mmap read only dbm subclass. This functionality 16 | has not been available in a public interface since 17 | b265e60c5f4c0b1e8e9e4343f5f2300b5e017bf0 (1.5 years ago) 18 | so it's now removed. 19 | * Added non mmap based dbm loader for platforms that do not 20 | support mmap (jython). 21 | * Atomic renames on windows during db compaction. 22 | 23 | 24 | 0.4.0 25 | ===== 26 | 27 | 0.4.0 is a backwards incompatible release with 0.3.1. 28 | Data files created with 0.3.1 will not work with 0.4.0. 29 | The reasons for switching to 0.4.0 include: 30 | 31 | * Data format switched from ASCII to binary file format, 32 | this resulted in a nice performance boost. 33 | * Index and data file consolidated to a single file, resulting 34 | in improved write performance. 35 | * Checksums are written for all entries. Checksums can 36 | be verified for every __getitem__ call (off by default). 37 | * Python 3 support (officially python 3.3.x). 38 | 39 | 40 | 0.3.1 41 | ===== 42 | 43 | * Windows support. 
44 | 45 | 46 | 0.3.0 47 | ===== 48 | 49 | * The data file and the index file are kept in a separate directory. To load 50 | the db you specify the directory name instead of the data filename. 51 | * Non-mmapped read only version is used when the db is opened with ``r``. 52 | * Write performance improvements. 53 | 54 | 55 | 0.2.1 56 | ===== 57 | 58 | * DB can be opened with ``r``, ``c``, ``w``, and ``n``. 59 | * Add a memory mapped read only implementation for reading 60 | from the DB (if your entire data file can be mmapped this 61 | provides a huge performance boost for reads). 62 | * Benchmark scripts rewritten to provide more useful information. 63 | 64 | 65 | 0.2.0 66 | ===== 67 | 68 | * New ``sync()`` method to ensure data is written to disk. 69 | 70 | * ``sync()`` is called during compaction and on ``close()``. 71 | 72 | * Add a ``DBMLoadError`` exception for catching semidbm loading errors. 73 | 74 | 75 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # semidbm documentation build configuration file, created by 4 | # sphinx-quickstart on Sun Feb 12 21:08:54 2012. 5 | # 6 | # This file is execfile()d with the current directory set to its containing dir. 7 | # 8 | # Note that not all possible configuration values are present in this 9 | # autogenerated file. 10 | # 11 | # All configuration values have a default; values that are commented out 12 | # serve to show the default. 13 | 14 | import sys, os 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here.
19 | sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) 20 | 21 | # -- General configuration ----------------------------------------------------- 22 | 23 | # If your documentation needs a minimal Sphinx version, state it here. 24 | #needs_sphinx = '1.0' 25 | 26 | on_rtd = os.environ.get('READTHEDOCS') == 'True' 27 | 28 | # Add any Sphinx extension module names here, as strings. They can be extensions 29 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 30 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode'] 31 | 32 | # Add any paths that contain templates here, relative to this directory. 33 | templates_path = ['_templates'] 34 | 35 | # The suffix of source filenames. 36 | source_suffix = '.rst' 37 | 38 | # The encoding of source files. 39 | #source_encoding = 'utf-8-sig' 40 | 41 | # The master toctree document. 42 | master_doc = 'index' 43 | 44 | # General information about the project. 45 | project = u'semidbm' 46 | copyright = u'2012, James Saryerwinnie Jr' 47 | 48 | # The version info for the project you're documenting, acts as replacement for 49 | # |version| and |release|, also used in various other places throughout the 50 | # built documents. 51 | # 52 | # The short X.Y version. 53 | version = '0.5' 54 | # The full version, including alpha/beta/rc tags. 55 | release = '0.5.1' 56 | 57 | # The language for content autogenerated by Sphinx. Refer to documentation 58 | # for a list of supported languages. 59 | #language = None 60 | 61 | # There are two options for replacing |today|: either, you set today to some 62 | # non-false value, then it is used: 63 | #today = '' 64 | # Else, today_fmt is used as the format for a strftime call. 65 | #today_fmt = '%B %d, %Y' 66 | 67 | # List of patterns, relative to source directory, that match files and 68 | # directories to ignore when looking for source files. 69 | exclude_patterns = ['_build'] 70 | 71 | # The reST default role (used for this markup: `text`) to use for all documents. 
72 | #default_role = None 73 | 74 | # If true, '()' will be appended to :func: etc. cross-reference text. 75 | #add_function_parentheses = True 76 | 77 | # If true, the current module name will be prepended to all description 78 | # unit titles (such as .. function::). 79 | #add_module_names = True 80 | 81 | # If true, sectionauthor and moduleauthor directives will be shown in the 82 | # output. They are ignored by default. 83 | #show_authors = False 84 | 85 | 86 | 87 | # A list of ignored prefixes for module index sorting. 88 | #modindex_common_prefix = [] 89 | 90 | 91 | # -- Options for HTML output --------------------------------------------------- 92 | # The name of the Pygments (syntax highlighting) style to use. 93 | if not on_rtd: 94 | try: 95 | import sphinx_rtd_theme 96 | except ImportError: 97 | sys.stderr.write("Could not import sphinx_rtd_theme, you can " 98 | "run 'pip install sphinx_rtd_theme' to install " 99 | "this package.\n") 100 | raise 101 | html_theme = 'sphinx_rtd_theme' 102 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 103 | html_theme_options = { 104 | } 105 | 106 | 107 | # Add any paths that contain custom static files (such as style sheets) here, 108 | # relative to this directory. They are copied after the builtin static files, 109 | # so a file named "default.css" will overwrite the builtin "default.css". 110 | html_static_path = ['_static'] 111 | 112 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 113 | # using the given strftime format. 114 | #html_last_updated_fmt = '%b %d, %Y' 115 | 116 | # If true, SmartyPants will be used to convert quotes and dashes to 117 | # typographically correct entities. 118 | #html_use_smartypants = True 119 | 120 | # Custom sidebar templates, maps document names to template names. 121 | #html_sidebars = {} 122 | 123 | # Additional templates that should be rendered to pages, maps page names to 124 | # template names. 
125 | #html_additional_pages = {} 126 | 127 | # If false, no module index is generated. 128 | #html_domain_indices = True 129 | 130 | # If false, no index is generated. 131 | #html_use_index = True 132 | 133 | # If true, the index is split into individual pages for each letter. 134 | #html_split_index = False 135 | 136 | # If true, links to the reST sources are added to the pages. 137 | #html_show_sourcelink = True 138 | 139 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 140 | #html_show_sphinx = True 141 | 142 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 143 | #html_show_copyright = True 144 | 145 | # If true, an OpenSearch description file will be output, and all pages will 146 | # contain a tag referring to it. The value of this option must be the 147 | # base URL from which the finished HTML is served. 148 | #html_use_opensearch = '' 149 | 150 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 151 | #html_file_suffix = None 152 | 153 | # Output file base name for HTML help builder. 154 | htmlhelp_basename = 'semidbmdoc' 155 | 156 | 157 | # -- Options for LaTeX output -------------------------------------------------- 158 | 159 | latex_elements = { 160 | # The paper size ('letterpaper' or 'a4paper'). 161 | #'papersize': 'letterpaper', 162 | 163 | # The font size ('10pt', '11pt' or '12pt'). 164 | #'pointsize': '10pt', 165 | 166 | # Additional stuff for the LaTeX preamble. 167 | #'preamble': '', 168 | } 169 | 170 | # Grouping the document tree into LaTeX files. List of tuples 171 | # (source start file, target name, title, author, documentclass [howto/manual]). 172 | latex_documents = [ 173 | ('index', 'semidbm.tex', u'semidbm Documentation', 174 | u'James Saryerwinnie Jr', 'manual'), 175 | ] 176 | 177 | # The name of an image file (relative to this directory) to place at the top of 178 | # the title page. 
179 | #latex_logo = None 180 | 181 | # For "manual" documents, if this is true, then toplevel headings are parts, 182 | # not chapters. 183 | #latex_use_parts = False 184 | 185 | # If true, show page references after internal links. 186 | #latex_show_pagerefs = False 187 | 188 | # If true, show URL addresses after external links. 189 | #latex_show_urls = False 190 | 191 | # Documents to append as an appendix to all manuals. 192 | #latex_appendices = [] 193 | 194 | # If false, no module index is generated. 195 | #latex_domain_indices = True 196 | 197 | 198 | # -- Options for manual page output -------------------------------------------- 199 | 200 | # One entry per manual page. List of tuples 201 | # (source start file, name, description, authors, manual section). 202 | man_pages = [ 203 | ('index', 'semidbm', u'semidbm Documentation', 204 | [u'James Saryerwinnie Jr'], 1) 205 | ] 206 | 207 | # If true, show URL addresses after external links. 208 | #man_show_urls = False 209 | 210 | 211 | # -- Options for Texinfo output ------------------------------------------------ 212 | 213 | # Grouping the document tree into Texinfo files. List of tuples 214 | # (source start file, target name, title, author, 215 | # dir menu entry, description, category) 216 | texinfo_documents = [ 217 | ('index', 'semidbm', u'semidbm Documentation', 218 | u'James Saryerwinnie Jr', 'semidbm', 'One line description of project.', 219 | 'Miscellaneous'), 220 | ] 221 | 222 | # Documents to append as an appendix to all manuals. 223 | #texinfo_appendices = [] 224 | 225 | # If false, no module index is generated. 226 | #texinfo_domain_indices = True 227 | 228 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 
229 | #texinfo_show_urls = 'footnote' 230 | -------------------------------------------------------------------------------- /docs/details.rst: -------------------------------------------------------------------------------- 1 | =============== 2 | SemiDBM Details 3 | =============== 4 | 5 | This guide goes into the details of how semidbm works. 6 | 7 | Writing a Value 8 | =============== 9 | 10 | One of the key characteristics of semidbm is that it only writes to the end of 11 | a file. **Once data has been written to a file, it is never changed.** This 12 | makes it easy to guarantee that once the data is written to disk, you can be 13 | certain that semidbm will not corrupt the data. This also makes semidbm 14 | simpler because we don't have to worry about how to modify data in a way that 15 | prevents corruption in the event of a crash. 16 | 17 | Even updates to existing values are written as new values at the end of 18 | a file. When the data file is loaded, these transactions are "replayed" 19 | so that the last change will "win". For example, given these operations:: 20 | 21 | add key "foo" with value "bar" 22 | add key "foo2" with value "bar2" 23 | delete key "foo2" 24 | add key "foo" with value "new value" 25 | 26 | this would represent a dictionary that looked like this:: 27 | 28 | {"foo": "new value"} 29 | 30 | .. note:: 31 | 32 | This is just the conceptual view of the transactions. The actual 33 | format is a binary format specified in :doc:`fileformat`. 34 | 35 | You can imagine that a db with a large number of updates can cause 36 | the file to grow to a much larger size than is needed. To reclaim 37 | free space, you can use the ``compact()`` method. This will 38 | rewrite the data file in the shortest amount of transactions 39 | needed. The above example can be compacted to:: 40 | 41 | add key "foo" with value "new value" 42 | 43 | When a compaction occurs, a new data file is written out (the original 44 | data file is left untouched).
Once all the compacted data has been 45 | written out to the new data file (and fsync'd!), the new data file 46 | is renamed over the original data file, completing the compaction. 47 | This way, if a crash occurs during compaction, the original data file 48 | is not corrupted. 49 | 50 | 51 | Reading Values 52 | ============== 53 | 54 | The index that is stored in memory does not contain the actual 55 | data associated with the key. Instead, it contains the location 56 | within the file where the value is located, conceptually:: 57 | 58 | db = {'foo': DiskLocation(offset=40, size=10)} 59 | 60 | When the value for a key is requested, the offset and size are looked 61 | up. A disk seek is performed and a read is performed for the 62 | specified size associated with the value. This translates to 63 | 2 syscalls:: 64 | 65 | lseek(fd, offset, os.SEEKSET) 66 | data = read(fs, value_size) 67 | 68 | Data Verification 69 | ================= 70 | 71 | Every write to a semidbm db file also includes a crc32 checksum. 72 | When a value is read from disk, semidbm can verify this crc32 checksum. 73 | By default, this verification step is turned off, but can be enabled using the 74 | ``verify_checksums`` argument:: 75 | 76 | >>> db = semidbm.open('dbname', 'c', verify_checksums=True) 77 | 78 | If a checksum error is detected a ``DBMChecksumError`` is raised:: 79 | 80 | >>> db[b'foo'] 81 | Traceback (most recent call last): 82 | File "", line 1, in 83 | File "./semidbm/db.py", line 192, in __getitem__ 84 | return self._verify_checksum_data(key, data) 85 | File "./semidbm/db.py", line 203, in _verify_checksum_data 86 | "Corrupt data detected: invalid checksum for key %s" % key) 87 | semidbm.db.DBMChecksumError: Corrupt data detected: invalid checksum for key b'foo' 88 | 89 | 90 | Read Only Mode 91 | ============== 92 | 93 | SemiDBM includes an optimized read only mode. 
If you know you only 94 | want to read values from the database without writing new values you 95 | can take advantage of this optimized read only mode. To open a db 96 | file as read only, use the ``'r'`` option:: 97 | 98 | db = semidbm.open('dbname', 'r') 99 | -------------------------------------------------------------------------------- /docs/fileformat.rst: -------------------------------------------------------------------------------- 1 | ====================== 2 | File Format of DB file 3 | ====================== 4 | 5 | :author: James Saryerwinnie 6 | :status: Draft 7 | :target-version: 0.4.0 8 | :date: April 15, 2013 9 | 10 | Abstract 11 | ======== 12 | 13 | This document proposes a new file format for semidbm. This is a backwards 14 | incompatible change. 15 | 16 | Motivation 17 | ========== 18 | 19 | When python3 support was added, ``semidbm`` received a significant performance 20 | degradation. This was mainly due to the str vs. bytes differentiation, and 21 | the fact that semidbm was a text based format. All of the integer sizes and 22 | checksum information was written as ASCII strings, and as a result, encoding 23 | the string to a byte sequence added additional overhead. 24 | 25 | In order to improve performance, ``semidbm`` should adopt a binary format, 26 | specifically the sizes of the keys and values as well as the checksums should 27 | be written as binary values. This will avoid the need to use string formatting 28 | when writing values. It will also improve the load time of a db file. 29 | 30 | 31 | Specification 32 | ============= 33 | 34 | A semidbm file will consist of a header and a sequence of entries. 35 | All multibyte sequences are written in network byte order. 36 | 37 | 38 | Header 39 | ====== 40 | 41 | The semidbm header format consists of: 42 | 43 | * 4 byte magic number (``53 45 4d 49``) 44 | * 4 byte version number consisting of 2 byte major version and 2 byte 45 | minor version (currently (1, 1)).
46 | 47 | 48 | Entries 49 | ======= 50 | 51 | After the header, the file contains a sequence of 52 | entries. Each entry has this format: 53 | 54 | * 4 byte key size 55 | * 4 byte value size 56 | * Key contents 57 | * Value content 58 | * 4 byte CRC32 checksum of Key + Value 59 | 60 | If a key is deleted it will have a value size of -1 and no value content. 61 | -------------------------------------------------------------------------------- /docs/img/delete_sequential.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jamesls/semidbm/adcd3b2ad4aa24093402f4d9d8c41b71334cda5f/docs/img/delete_sequential.png -------------------------------------------------------------------------------- /docs/img/fill_sequential.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jamesls/semidbm/adcd3b2ad4aa24093402f4d9d8c41b71334cda5f/docs/img/fill_sequential.png -------------------------------------------------------------------------------- /docs/img/large_delete_sequential.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jamesls/semidbm/adcd3b2ad4aa24093402f4d9d8c41b71334cda5f/docs/img/large_delete_sequential.png -------------------------------------------------------------------------------- /docs/img/large_fill_sequential.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jamesls/semidbm/adcd3b2ad4aa24093402f4d9d8c41b71334cda5f/docs/img/large_fill_sequential.png -------------------------------------------------------------------------------- /docs/img/large_read_hot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jamesls/semidbm/adcd3b2ad4aa24093402f4d9d8c41b71334cda5f/docs/img/large_read_hot.png 
-------------------------------------------------------------------------------- /docs/img/large_read_random.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jamesls/semidbm/adcd3b2ad4aa24093402f4d9d8c41b71334cda5f/docs/img/large_read_random.png -------------------------------------------------------------------------------- /docs/img/large_read_sequential.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jamesls/semidbm/adcd3b2ad4aa24093402f4d9d8c41b71334cda5f/docs/img/large_read_sequential.png -------------------------------------------------------------------------------- /docs/img/read_hot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jamesls/semidbm/adcd3b2ad4aa24093402f4d9d8c41b71334cda5f/docs/img/read_hot.png -------------------------------------------------------------------------------- /docs/img/read_random.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jamesls/semidbm/adcd3b2ad4aa24093402f4d9d8c41b71334cda5f/docs/img/read_random.png -------------------------------------------------------------------------------- /docs/img/read_sequential.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jamesls/semidbm/adcd3b2ad4aa24093402f4d9d8c41b71334cda5f/docs/img/read_sequential.png -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Semidbm 2 | ======= 3 | 4 | Semidbm is a fast, pure python implementation of a dbm, which is a 5 | persistent key value store. 
It allows you to get and set keys through 6 | a dict interface:: 7 | 8 | import semidbm 9 | db = semidbm.open('testdb', 'c') 10 | db['foo'] = 'bar' 11 | print db['foo'] 12 | db.close() 13 | 14 | These values are persisted to disk, and you can later retrieve 15 | these key/value pairs:: 16 | 17 | # Then at a later time: 18 | db = semidbm.open('testdb', 'r') 19 | # prints "bar" 20 | print db['foo'] 21 | 22 | 23 | It was written with these things in mind: 24 | 25 | * Pure python, supporting python 2.6, 2.7, 3.3, and 3.4. 26 | * Cross platform, works on Windows, Linux, Mac OS X. 27 | * Supports CPython, pypy, and jython (versions 2.7-b3 and higher). 28 | * Simple and Fast (See :doc:`benchmarks`). 29 | 30 | 31 | Post feedback and issues on 32 | `github issues `_, or check out the 33 | latest changes at the `github repo `_. 34 | 35 | 36 | Topics 37 | ------ 38 | 39 | .. toctree:: 40 | :maxdepth: 2 41 | 42 | overview 43 | details 44 | benchmarks 45 | changelog 46 | 47 | 48 | Developer Documentation 49 | ----------------------- 50 | 51 | .. toctree:: 52 | :maxdepth: 2 53 | 54 | api_semidbm 55 | fileformat 56 | 57 | 58 | Indices and tables 59 | ================== 60 | 61 | * :ref:`genindex` 62 | * :ref:`modindex` 63 | * :ref:`search` 64 | 65 | -------------------------------------------------------------------------------- /docs/overview.rst: -------------------------------------------------------------------------------- 1 | ====================== 2 | An Overview of Semidbm 3 | ====================== 4 | 5 | The easiest way to think of semidbm is as an improvement over python's 6 | `dumbdbm `_ module. 7 | 8 | While the standard library has faster dbms based on well established C 9 | libraries (GNU dbm, Berkeley DB, ndbm), dumbdbm is the only pure python 10 | portable dbm in the standard library. 11 | 12 | Semidbm offers a few improvements over dumbdbm including: 13 | 14 | * Better overall performance (more on this later). 
15 | * Only a single file is used (no separate index and data file). 16 | * Data file compaction. Free space can be reclaimed (though this 17 | only happens whenever explicitly asked to do so 18 | using the `compact()` method). 19 | * Get/set/delete require O(1) IO. 20 | 21 | Like dumbdbm, semidbm is cross platform. It has been tested on: 22 | 23 | * Linux (Ubuntu 11.10, debian) 24 | * Mac OS X (Lion/Mountain Lion) 25 | * Windows 7/8. 26 | 27 | There are also a few caveats to consider when using semidbm: 28 | 29 | * The entire index must fit in memory, this means all keys must 30 | fit in memory. 31 | * Not thread safe; can only be accessed by a single process. 32 | * While the performance is reasonable, it still will not beat one of the 33 | standard dbms (GNU dbm, Berkeley DB, etc). 34 | 35 | 36 | Using Semidbm 37 | ============= 38 | 39 | To create a new db, specify the name of the directory:: 40 | 41 | import semidbm 42 | db = semidbm.open('mydb', 'c') 43 | 44 | This will create a *mydb* directory. This directory is where semidbm will 45 | place all the files it needs to keep track of the keys and values stored in the 46 | db. If the directory does not exist, it will be created. 47 | 48 | 49 | Once the db has been created, you can get and set values:: 50 | 51 | db['key1'] = 'value1' 52 | print db['key1'] 53 | 54 | **Keys and values can be either str or bytes.** 55 | 56 | ``str`` types will be encoded to utf-8 before writing to disk. 57 | You can avoid this encoding step by providing a byte string 58 | directly:: 59 | 60 | db[b'key1'] = b'value1' 61 | 62 | Otherwise, semidbm will do the equivalent of:: 63 | 64 | db['key1'.encode('utf-8')] = 'value1'.encode('utf-8') 65 | 66 | It is recommended that you handle the encoding of your strings 67 | in your application, and only use ``bytes`` when working with 68 | semidbm.
The reason for this is that when a value 69 | is retrieved, it is returned as a bytestring (semidbm can't 70 | know the encoding of the bytes it retrieved). For example (this 71 | is with python 3.3):: 72 | 73 | >>> db['foo'] = 'value' 74 | >>> db['foo'] 75 | b'value' 76 | >>> db['kēy'] = 'valueē' 77 | >>> db['kēy'] 78 | b'value\xc4\x93' 79 | 80 | To avoid this confusion, encode your strings before storing with 81 | with semidbm. 82 | 83 | The reason this automatic conversion is supported is that this is 84 | what is done with the DBMs in the python standard library (including 85 | ``dumbdbm`` which this module was intended to be a drop in replacement 86 | for). In order to be able to be a drop in replacement, this 87 | automatic encoding process needs to be supported (but not recommended). 88 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Sphinx==1.2b1 2 | -------------------------------------------------------------------------------- /scripts/adapters/bdb_btopen.py: -------------------------------------------------------------------------------- 1 | import bsddb3 2 | open = bsddb3.btopen 3 | -------------------------------------------------------------------------------- /scripts/adapters/bdb_hashopen.py: -------------------------------------------------------------------------------- 1 | import bsddb3 2 | open = bsddb3.hashopen 3 | -------------------------------------------------------------------------------- /scripts/adapters/bdb_minimal.py: -------------------------------------------------------------------------------- 1 | """A minimal version of bsddb3.DB.""" 2 | import bsddb3.db 3 | 4 | # This might be somewhere in the bsddb3 module, but I wanted to compare the 5 | # performance of bsddb3 with semidbm and I could not have a bare bones dict 6 | # interface exposed with a shelve like interface. 
7 | 8 | def open(filename, mode): 9 | db = bsddb3.db.DB(None) 10 | if mode == 'r': 11 | flags = bsddb3.db.DB_RDONLY 12 | elif mode == 'rw': 13 | flags = 0 14 | elif mode == 'w': 15 | flags = bsddb3.db.DB_CREATE 16 | elif mode == 'c': 17 | flags = bsddb3.db.DB_CREATE 18 | elif mode == 'n': 19 | flags = bsddb3.db.DB_TRUNCATE | bsddb3.db.DB_CREATE 20 | else: 21 | raise bsddb3.db.DBError( 22 | "flags should be one of 'r', 'w', 'c' or 'n' or use the " 23 | "bsddb.db.DB_* flags") 24 | db.open(filename, None, bsddb3.db.DB_HASH, flags) 25 | return db 26 | -------------------------------------------------------------------------------- /scripts/adapters/builtindict.py: -------------------------------------------------------------------------------- 1 | _DB = {} 2 | 3 | def open(*args, **kwargs): 4 | return _DB 5 | -------------------------------------------------------------------------------- /scripts/benchmark: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Very simple script for profiling various dbms. 3 | 4 | The point of this script is to give a rough 5 | estimate for how semidbm does compared to other 6 | dbms. You can run this script with no args or 7 | specify the dbms you want to benchmark using 8 | the --dbm arg. 
9 | 10 | """ 11 | import os 12 | import sys 13 | import stat 14 | import json 15 | import shutil 16 | import optparse 17 | import time 18 | import string 19 | import tempfile 20 | import random 21 | import traceback 22 | 23 | try: 24 | _range = xrange 25 | except NameError: 26 | _range = range 27 | 28 | random.seed(100) 29 | 30 | 31 | _potential_dbms = ['dbhash', 'dbm', 'gdbm', 'dumbdbm', 'semidbm'] 32 | 33 | ADAPTER_DIR = os.path.join(os.path.dirname(__file__), 'adapters') 34 | sys.path.append(ADAPTER_DIR) 35 | out = sys.stdout.write 36 | 37 | 38 | def _rand_key(key_length, chars=string.printable): 39 | return bytes(bytearray(''.join(random.choice(chars) for i in 40 | _range(key_length)))) 41 | 42 | 43 | def set_dbms(dbms): 44 | dbms_found = [] 45 | for potential in dbms: 46 | try: 47 | d = __import__(potential, fromlist=[potential]) 48 | dbms_found.append(d) 49 | except ImportError as e: 50 | sys.stderr.write("Could not import %s: %s\n" % (potential, e)) 51 | continue 52 | return dbms_found 53 | 54 | 55 | class Options(object): 56 | num_keys = 1000000 57 | key_size_bytes = 16 58 | value_size_bytes = 100 59 | 60 | def __init__(self, **kwargs): 61 | self.__dict__.update(kwargs) 62 | 63 | def print_options(self): 64 | stats = (" num_keys : %(num_keys)s\n" 65 | " key_size : %(key_size_bytes)s\n" 66 | " value_size: %(value_size_bytes)s" % self.__dict__) 67 | return stats 68 | 69 | @property 70 | def key_format(self): 71 | return '%0' + str(self.key_size_bytes) + 'd' 72 | 73 | 74 | class StatsReporter(object): 75 | def __init__(self, name, total_time, total_bytes, total_ops): 76 | self._name = name 77 | self._total_time = total_time 78 | self._total_bytes = total_bytes 79 | self._total_ops = total_ops 80 | 81 | def micros_per_op(self): 82 | # Leveldb uses this, so it's useful to compare. 
83 | total_micros = self._total_time * 1e6 84 | return total_micros / self._total_ops 85 | 86 | def ops_per_second(self): 87 | return self._total_ops / float(self._total_time) 88 | 89 | def megabytes_per_second(self): 90 | return self._total_bytes / (1024.0 * 1024) / self._total_time 91 | 92 | def print_report(self): 93 | out("%-20s:" % self._name) 94 | out(" time: %9.3f, micros/ops: %9.3f, ops/s: %10.3f, " 95 | "MB/s: %10.3f\n" % (self._total_time, self.micros_per_op(), 96 | self.ops_per_second(), 97 | self.megabytes_per_second())) 98 | 99 | @property 100 | def name(self): 101 | return self._name 102 | 103 | 104 | class Benchmarks(object): 105 | def __init__(self, options, tmpdir): 106 | self.options = options 107 | self.tmpdir = tmpdir 108 | self.random_values = self._generate_random_string(1024 * 1024) 109 | 110 | def _generate_random_string(self, string_size): 111 | out("Generating random data.\n") 112 | c = chr 113 | rand = random.randint 114 | r = bytes(bytearray([rand(0, 255) for i in _range(string_size)])) 115 | return r 116 | 117 | def run(self, dbm): 118 | print("Benchmarking:", dbm) 119 | print(self.options.print_options()) 120 | all_reports = [] 121 | try: 122 | for name in ['fill_random', 'fill_sequential', 'read_cold', 123 | 'read_sequential', 'read_hot', 'read_random', 124 | 'delete_sequential']: 125 | method = getattr(self, name) 126 | report = method(dbm) 127 | report.print_report() 128 | all_reports.append(report) 129 | finally: 130 | self.delete_dbm() 131 | print 132 | return all_reports 133 | 134 | def fill_random(self, dbm): 135 | db = self._load_dbm(dbm) 136 | random_values = self.random_values 137 | maxlen = len(random_values) 138 | position = 0 139 | value_size = self.options.value_size_bytes 140 | key_size = self.options.key_size_bytes 141 | num_keys = self.options.num_keys 142 | indices = [_rand_key(key_size) for i in _range(num_keys)] 143 | 144 | t = time.time 145 | out = sys.stdout.write 146 | flush = sys.stdout.flush 147 | start = t() 
148 | for i in _range(num_keys): 149 | db[indices[i]] = random_values[position:position+value_size] 150 | position += value_size 151 | if position + value_size > maxlen: 152 | position = 0 153 | out("(%s/%s)\r" % (i, num_keys)) 154 | flush() 155 | total = t() - start 156 | self._close_db(db) 157 | self.delete_dbm() 158 | return StatsReporter( 159 | 'fill_random', total, 160 | (value_size * num_keys) + (self.options.key_size_bytes * num_keys), 161 | num_keys) 162 | 163 | def fill_sequential(self, dbm): 164 | db = self._load_dbm(dbm) 165 | key_format = self.options.key_format 166 | random_values = self.random_values 167 | maxlen = len(random_values) 168 | position = 0 169 | value_size = self.options.value_size_bytes 170 | num_keys = self.options.num_keys 171 | indices = [(key_format % i).encode('utf-8') for i in _range(num_keys)] 172 | 173 | t = time.time 174 | out = sys.stdout.write 175 | flush = sys.stdout.flush 176 | start = t() 177 | for i in _range(num_keys): 178 | db[indices[i]] = random_values[position:position+value_size] 179 | position += value_size 180 | if position + value_size > maxlen: 181 | position = 0 182 | out("(%s/%s)\r" % (i, num_keys)) 183 | flush() 184 | total = t() - start 185 | self._close_db(db) 186 | return StatsReporter( 187 | 'fill_sequential', total, 188 | (value_size * num_keys) + (self.options.key_size_bytes * num_keys), 189 | num_keys) 190 | 191 | def read_sequential(self, dbm, name='read_sequential'): 192 | # Assumes fill_sequential has been called. 
193 | db = self._load_dbm(dbm, 'r') 194 | key_format = self.options.key_format 195 | num_keys = self.options.num_keys 196 | 197 | indices = [(key_format % i).encode('utf-8') for i in _range(num_keys)] 198 | t = time.time 199 | start = t() 200 | for i in _range(num_keys): 201 | db[indices[i]] 202 | total = t() - start 203 | self._close_db(db) 204 | total_bytes = (self.options.key_size_bytes * num_keys + 205 | self.options.value_size_bytes * num_keys) 206 | return StatsReporter(name, total, total_bytes, num_keys) 207 | 208 | def read_cold(self, dbm): 209 | # read_cold is intended to be called before read_sequential or any 210 | # other reads to test the performance of a "cold" read. 211 | return self.read_sequential(dbm, name='read_cold') 212 | 213 | def read_hot(self, dbm): 214 | # Assumes fill_sequential has been called. 215 | # Read from 1% of the database self.options.num_keys times. 216 | # This should test the effectiveness of any caching being used. 217 | num_keys = self.options.num_keys 218 | unique_keys = int(num_keys * 0.01) 219 | indices = [(self.options.key_format % i).encode('utf-8') 220 | for i in random.sample(_range(num_keys), unique_keys)] 221 | indices = indices * (int(num_keys / unique_keys)) 222 | db = self._load_dbm(dbm, 'r') 223 | t = time.time 224 | start = t() 225 | for i in _range(num_keys): 226 | db[indices[i]] 227 | total = t() - start 228 | self._close_db(db) 229 | total_bytes = (self.options.key_size_bytes * num_keys + 230 | self.options.value_size_bytes * num_keys) 231 | return StatsReporter('read_hot', total, total_bytes, 232 | num_keys) 233 | 234 | def read_random(self, dbm): 235 | # This doesn't matter to semidbm because the keys 236 | # aren't ordered, but other dbms might be impacted. 
237 | num_keys = self.options.num_keys 238 | key_format = self.options.key_format 239 | indices = [(key_format % i).encode('utf-8') for i in range(num_keys)] 240 | random.shuffle(indices) 241 | db = self._load_dbm(dbm, 'r') 242 | t = time.time 243 | start = t() 244 | for i in _range(num_keys): 245 | db[indices[i]] 246 | total = t() - start 247 | self._close_db(db) 248 | total_bytes = (self.options.key_size_bytes * num_keys + 249 | self.options.value_size_bytes * num_keys) 250 | return StatsReporter('read_random', total, total_bytes, 251 | num_keys) 252 | 253 | def delete_sequential(self, dbm): 254 | # Assumes fill_sequential has been called. 255 | db = self._load_dbm(dbm, 'c') 256 | key_format = self.options.key_format 257 | num_keys = self.options.num_keys 258 | 259 | indices = [(key_format % i).encode('utf-8') for i in _range(num_keys)] 260 | t = time.time 261 | start = t() 262 | for i in _range(num_keys): 263 | del db[indices[i]] 264 | total = t() - start 265 | self._close_db(db) 266 | total_bytes = (self.options.key_size_bytes * num_keys + 267 | self.options.value_size_bytes * num_keys) 268 | return StatsReporter('delete_sequential', total, total_bytes, num_keys) 269 | 270 | def delete_dbm(self): 271 | # Just wipe out everything under tmpdir. 272 | self._rmtree(self.tmpdir) 273 | 274 | def _rmtree(self, tmpdir): 275 | # Delete everything under tmpdir but don't actually 276 | # delete tmpdir itself. 277 | for path in os.listdir(tmpdir): 278 | full_path = os.path.join(tmpdir, path) 279 | mode = os.lstat(full_path).st_mode 280 | if stat.S_ISDIR(mode): 281 | shutil.rmtree(full_path) 282 | else: 283 | os.remove(full_path) 284 | 285 | def _load_dbm(self, dbm, flags='c'): 286 | db = dbm.open(os.path.join(self.tmpdir, 'db'), flags) 287 | return db 288 | 289 | def _close_db(self, db): 290 | # If the db has a close() method call it. Basically a hack 291 | # so we can benchmark a normal python dict. 
292 | if hasattr(db, 'close'): 293 | db.close() 294 | 295 | 296 | def generate_report(filename, options, reports): 297 | """Create a json report grouped by benchmarks rather than by dbm. 298 | 299 | Since this is going to be used to autogenerate the 300 | charts/tables, a comparison across dbms for a given benchmark 301 | is more useful. The output should look like:: 302 | 303 | {num_keys: 100, key_size_bytes: 16, value_size_bytes: 1000, 304 | dbms: ['semidbm', 'gdbm'], 305 | benchmarks: 306 | [['fill_sequential', [ 307 | {total_time: 100, micros_per_op: 1, 308 | ops_per_second: 123, mb_per_second: 100}]], 309 | ... 310 | ] 311 | } 312 | 313 | 314 | """ 315 | # Generating a report requires python >= 2.7. 316 | from collections import OrderedDict 317 | output = { 318 | 'num_keys': options.num_keys, 319 | 'key_size_bytes': options.key_size_bytes, 320 | 'value_size_bytes': options.value_size_bytes 321 | } 322 | by_benchmarks = OrderedDict() 323 | dbms = [] 324 | for dbm, benchmarks in reports: 325 | dbms.append(dbm) 326 | for benchmark in benchmarks: 327 | by_benchmarks.setdefault(benchmark.name, []).append({ 328 | 'total_time': benchmark.total_time(), 329 | 'micros_per_op': benchmark.micros_per_op(), 330 | 'ops_per_second': benchmark.ops_per_second(), 331 | 'megabytes_per_second': benchmark.megabytes_per_second(), 332 | }) 333 | output['dbms'] = dbms 334 | output['benchmarks'] = by_benchmarks 335 | json.dump(output, open(filename, 'w'), indent=4) 336 | 337 | 338 | def main(): 339 | parser = optparse.OptionParser() 340 | parser.add_option('-d', '--dbm', dest='dbms', action='append') 341 | # These are the same defaults as the leveldb benchmark, 342 | # which this scripts is based off of. 
343 | parser.add_option('-n', '--num-keys', default=1000000, type=int) 344 | parser.add_option('-k', '--key-size-bytes', default=16, type=int) 345 | parser.add_option('-s', '--value-size-bytes', default=100, type=int) 346 | parser.add_option('-r', '--report', help="Generate a summary report " 347 | "in json to specified location.") 348 | opts, args = parser.parse_args() 349 | 350 | 351 | dbm_names = opts.__dict__.pop('dbms') or _potential_dbms 352 | dbms = set_dbms(dbm_names) 353 | if not dbms: 354 | sys.stderr.write("List of dbms is empty.\n") 355 | sys.exit(1) 356 | options = Options(**opts.__dict__) 357 | tmpdir = tempfile.mkdtemp(prefix='dbmprofile') 358 | benchmarks = Benchmarks(options, tmpdir) 359 | all_reports = [] 360 | try: 361 | for dbm in dbms: 362 | try: 363 | all_reports.append((dbm.__name__, benchmarks.run(dbm))) 364 | except Exception as e: 365 | traceback.print_exc() 366 | sys.stderr.write( 367 | "ERROR: exception caught when benchmarking %s: %s\n" % 368 | (dbm, e)) 369 | finally: 370 | shutil.rmtree(tmpdir) 371 | if opts.report: 372 | generate_report(opts.report, options, all_reports) 373 | 374 | 375 | if __name__ == '__main__': 376 | main() 377 | -------------------------------------------------------------------------------- /scripts/loadtime: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import time 3 | import argparse 4 | 5 | import semidbm 6 | 7 | 8 | def measure_laod_time(db_path): 9 | num_loads = 10 10 | times = [] 11 | o = semidbm.open 12 | for i in range(num_loads): 13 | start = time.time() 14 | db = o(db_path, 'c') 15 | times.append(time.time() - start) 16 | db.close() 17 | print "%.5f milliseconds average load time" % ( 18 | (sum(times) / float(num_loads)) * 1000) 19 | 20 | 21 | def main(): 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('db_path') 24 | args = parser.parse_args() 25 | measure_laod_time(args.db_path) 26 | 27 | 28 | if __name__ == 
'__main__': 29 | main() 30 | -------------------------------------------------------------------------------- /scripts/makedb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Time the time to load a semidbm db. 4 | import sys 5 | import random 6 | import string 7 | from argparse import ArgumentParser 8 | import tempfile 9 | import os 10 | 11 | import semidbm 12 | 13 | 14 | try: 15 | _range = xrange 16 | except NameError: 17 | _range = range 18 | 19 | 20 | def _rand_bytes(key_length, chars=string.printable): 21 | return bytes(bytearray(''.join(random.choice(chars) for i in 22 | _range(key_length)))) 23 | 24 | 25 | def populate_db(args): 26 | path = args.output_dir 27 | db = semidbm.open(path, 'c') 28 | sys.stdout.write("Populating the DB...\n") 29 | sys.stdout.write(" - num_keys: %s\n" % args.num_keys) 30 | sys.stdout.write(" - key_size_bytes: %s\n" % args.key_size_bytes) 31 | sys.stdout.write(" - val_size_bytes: %s\n" % args.value_size_bytes) 32 | sys.stdout.flush() 33 | key_size_bytes = args.key_size_bytes 34 | value_size_bytes = args.value_size_bytes 35 | for i in range(args.num_keys): 36 | db[_rand_bytes(key_size_bytes)] = _rand_bytes(value_size_bytes) 37 | sys.stdout.write("\nDone") 38 | db.close() 39 | 40 | 41 | def main(): 42 | parser = ArgumentParser() 43 | parser.add_argument('-n', '--num-keys', default=1000000, type=int) 44 | parser.add_argument('-k', '--key-size-bytes', default=16, type=int) 45 | parser.add_argument('-s', '--value-size-bytes', default=100, type=int) 46 | parser.add_argument('output_dir', help="Location of db to create.") 47 | 48 | args = parser.parse_args() 49 | populate_db(args) 50 | 51 | 52 | if __name__ == '__main__': 53 | main() 54 | -------------------------------------------------------------------------------- /scripts/makegraphs: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # This script is used to 
def generate_charts(results, benchmark_to_use, results_filename):
    """Write one bar chart per benchmark into docs/img/.

    :param results: Parsed benchmark report (contents of the .json file
        produced by scripts/benchmark); must contain 'dbms' and
        'benchmarks' keys plus the num_keys/key_size_bytes/
        value_size_bytes parameters used in the title.
    :param benchmark_to_use: Which per-run metric to plot
        (e.g. "ops_per_second").
    :param results_filename: Name of the report file; its basename is
        used to build the output image filenames.
    """
    labels = results['dbms']
    font = {'size': 7}
    matplotlib.rc('font', **font)

    for name, benchmark in results['benchmarks'].iteritems():
        data = [b[benchmark_to_use] for b in benchmark]
        title = name
        title += ('(numkeys=%(num_keys)s,keysize=%(key_size_bytes)s,'
                  'valsize=%(value_size_bytes)s)' % results)
        xlocations = na.array(range(len(data))) + 0.5
        width = 0.5
        p.gcf().set_size_inches(5, 5)
        p.bar(xlocations, data, width=width)
        ymax = int(max(data)) + 1
        # Clamp the tick step to at least 1: ymax // 10 truncates to 0
        # for ymax < 10, and range() raises ValueError on a zero step.
        p.yticks(range(0, ymax, max(1, ymax // 10)))
        p.ylim(0, ymax)
        p.xticks(xlocations + width / 2, labels)
        p.xlim(0, xlocations[-1] + width * 2)
        p.ylabel(benchmark_to_use)
        p.title(title)
        p.gca().get_xaxis().tick_bottom()
        p.gca().get_yaxis().tick_left()
        p.savefig('docs/img/' + '%s_%s' %
                  (os.path.splitext(results_filename)[0], name))
        p.close('all')
def main():
    """Benchmark write/read transactions-per-second for a dbm module.

    Python 2 only (print statements, xrange).  Creates a throwaway db
    under a tempdir, times sequential writes, optional sequential
    reads, and an optional "chunked" hot-read pass, then cleans up.
    """
    parser = optparse.OptionParser()
    parser.add_option('-n', '--num-transactions', default=1000000, type=int)
    parser.add_option('-c', '--chunk-size', default=10000, type=int,
                      help="For the read chunked tests, this will "
                      "set how many elements to iterate over at a time")
    parser.add_option('-r', '--repeat', default=10, type=int,
                      help="For the read chunked tests, this will "
                      "specify how many times to iterate over the chunks "
                      "before moving on to the next chunk.")
    parser.add_option('-t', '--skip-read-test', action="store_true",
                      default=False, help="Skip the sequential read test "
                      "(useful if you just want to benchmark writes)")
    parser.add_option('-s', '--skip-read-chunk', action="store_true",
                      default=False, help="Skip the read chunk test "
                      "(it can take a while to run).")
    parser.add_option('-d', '--dbm', default='semidbm')
    opts, args = parser.parse_args()

    tempdir = tempfile.mkdtemp(prefix='tps')
    dbname = os.path.join(tempdir, 'tps.db')
    try:
        # Any module exposing the dbm-style open() works here
        # (semidbm by default, see the -d option).
        dbm_module = __import__(opts.dbm, fromlist=[opts.dbm])
    except ImportError:
        sys.stderr.write("Can't import dbm: %s\n" % opts.dbm)
        sys.exit(1)
    db = dbm_module.open(dbname, 'c')

    num_transactions = opts.num_transactions
    groups_of = opts.chunk_size
    repeat = opts.repeat

    # Sequential write benchmark: keys and values are both str(i).
    start = time.time()
    for i in xrange(num_transactions):
        db[str(i)] = str(i)
    end = time.time()
    print "Write ",
    print "Total: %.5f, tps: %.2f" % (end - start,
                                      float(num_transactions) / (end - start))
    if not opts.skip_read_test:
        # Reopen read-only so the read path being measured matches
        # normal read usage.
        db.close()
        db = dbm_module.open(dbname, 'r')
        start = time.time()
        for i in xrange(num_transactions):
            db[str(i)]
        end = time.time()

        print "Read ",
        print "Total: %.5f, tps: %.2f" % (end - start,
                                          float(num_transactions) /
                                          (end - start))

    if not opts.skip_read_chunk:
        # "Hot" reads: re-read each chunk of groups_of keys repeat
        # times before moving on to the next chunk.
        count = 0
        start = time.time()
        for i in xrange(0, num_transactions, groups_of):
            for j in xrange(groups_of):
                # NOTE(review): if num_transactions is not a multiple of
                # groups_of, i + j can index past the last written key --
                # confirm -n/-c are always passed as compatible values.
                for k in xrange(repeat):
                    count += 1
                    db[str(i + j)]
        end = time.time()
        print "Read (grouped)",
        print "count:", count
        print "Total: %.5f, tps: %.2f" % (end - start,
                                          float(count) / (end - start))
    db.close()
    shutil.rmtree(tempdir)
class _SemiDBM(object):
    """An append-only dbm with the full key index kept in memory.

    :param dbdir: The directory containing the dbm files.  If the directory
        does not exist it will be created.

    :param renamer: Callable taking (from_file, to_file), used to rename
        the compacted data file over the live one (platform specific).

    :param data_loader: Object exposing ``iter_keys(filename)``; used to
        scan the data file when (re)building the in memory index.

    :param verify_checksums: If true, the crc32 checksum of every value
        is verified on each ``__getitem__`` call.

    """
    def __init__(self, dbdir, renamer, data_loader=None,
                 verify_checksums=False):
        self._renamer = renamer
        self._data_loader = data_loader
        self._dbdir = dbdir
        self._data_filename = os.path.join(dbdir, 'data')
        # The in memory index, mapping of key to (offset, size).
        self._index = None
        self._data_fd = None
        self._verify_checksums = verify_checksums
        self._current_offset = 0
        self._load_db()

    def _create_db_dir(self):
        # The db "filename" is really a directory; create it on demand.
        if not os.path.exists(self._dbdir):
            os.makedirs(self._dbdir)

    def _load_db(self):
        self._create_db_dir()
        self._index = self._load_index(self._data_filename)
        self._data_fd = os.open(self._data_filename, compat.DATA_OPEN_FLAGS)
        # All writes are appends, so track the end-of-file offset.
        self._current_offset = os.lseek(self._data_fd, 0, os.SEEK_END)

    def _load_index(self, filename):
        # This method is only used upon instantiation to populate
        # the in memory index.
        if not os.path.exists(filename):
            self._write_headers(filename)
            return {}
        try:
            return self._load_index_from_fileobj(filename)
        except ValueError as e:
            raise DBMLoadError("Bad index file %s: %s" % (filename, e))

    def _write_headers(self, filename):
        # Write the 8 byte header of a brand new data file.
        with _open(filename, 'wb') as f:
            # Magic number identifier.
            f.write(FILE_IDENTIFIER)
            # File version format.
            f.write(struct.pack('!HH', *FILE_FORMAT_VERSION))

    def _load_index_from_fileobj(self, filename):
        """Rebuild the in memory index by replaying the data file.

        Entries are replayed in file order, so the last entry written
        for a key determines what the index records for it.

        :return: dict mapping key (bytes) to an (offset, size) tuple.
        """
        index = {}
        for key_name, offset, size in self._data_loader.iter_keys(filename):
            size = int(size)
            offset = int(offset)
            if size == _DELETED:
                # This is a deleted item so we need to make sure that this
                # value is not in the index.  We know that the key is
                # already in the index, because a delete is only written
                # to the db if the key already exists.
                del index[key_name]
            else:
                # New key or update of an existing key: either way the
                # latest (offset, size) wins.  (The original code had an
                # if/else here whose two branches were identical.)
                index[key_name] = (offset, size)
        return index

    def __getitem__(self, key, read=os.read, lseek=os.lseek,
                    seek_set=os.SEEK_SET, str_type=compat.str_type,
                    isinstance=isinstance):
        # The os/compat names are bound as keyword defaults as a CPython
        # micro-optimization: local lookups are faster than globals.
        if isinstance(key, str_type):
            key = key.encode('utf-8')
        offset, size = self._index[key]
        lseek(self._data_fd, offset, seek_set)
        if not self._verify_checksums:
            return read(self._data_fd, size)
        else:
            # Checksum is at the end of the value.
            data = read(self._data_fd, size + 4)
            return self._verify_checksum_data(key, data)

    def _verify_checksum_data(self, key, data):
        # key is the bytes of the key,
        # data is the bytes of the value + 4 byte checksum at the end.
        value = data[:-4]
        expected = struct.unpack('!I', data[-4:])[0]
        # The stored checksum covers key + value, so seed the crc
        # with the key before folding in the value.
        actual = crc32(key)
        actual = crc32(value, actual)
        if actual & 0xffffffff != expected:
            raise DBMChecksumError(
                "Corrupt data detected: invalid checksum for key %s" % key)
        return value

    def __setitem__(self, key, value, len=len, crc32=crc32, write=os.write,
                    str_type=compat.str_type, pack=struct.pack,
                    isinstance=isinstance):
        if isinstance(key, str_type):
            key = key.encode('utf-8')
        if isinstance(value, str_type):
            value = value.encode('utf-8')
        # Append the new entry at the end of the file.  On-disk format:
        #   4 bytes    4 bytes    keysize  valsize  4 bytes
        #   key size | val size | key    | value  | crc32(key + value)
        key_size = len(key)
        val_size = len(value)
        keyval_size = pack('!ii', key_size, val_size)
        keyval = key + value
        checksum = pack('!I', crc32(keyval) & 0xffffffff)
        blob = keyval_size + keyval + checksum

        write(self._data_fd, blob)
        # Update the in memory index: the value starts after the
        # 8 byte size header and the key bytes.
        self._index[key] = (self._current_offset + 8 + key_size,
                            val_size)
        self._current_offset += len(blob)

    def __contains__(self, key):
        return key in self._index

    def __delitem__(self, key, len=len, write=os.write, deleted=_DELETED,
                    str_type=compat.str_type, isinstance=isinstance,
                    crc32=crc32, pack=struct.pack):
        if isinstance(key, str_type):
            key = key.encode('utf-8')
        # A delete is written as a normal entry whose value size is the
        # deleted sentinel (and which carries no value bytes).  Use the
        # prebound `deleted` local; the original bound it as a default
        # but then read the module-level _DELETED anyway.
        key_size = pack('!ii', len(key), deleted)
        crc = pack('!I', crc32(key) & 0xffffffff)
        blob = key_size + key + crc

        write(self._data_fd, blob)
        del self._index[key]
        self._current_offset += len(blob)

    def __iter__(self):
        for key in self._index:
            yield key

    def keys(self):
        """Return all the keys in the db.

        The keys are returned in an arbitrary order.

        """
        return self._index.keys()

    def values(self):
        """Return all the values in the db (arbitrary order)."""
        return [self[key] for key in self._index]

    def close(self, compact=False):
        """Close the db.

        The data is synced to disk and the db is closed.
        Once the db has been closed, no further reads or writes
        are allowed.

        :param compact: Indicate whether or not to compact the db
            before closing the db.

        """
        if compact:
            self.compact()
        self.sync()
        os.close(self._data_fd)

    def sync(self):
        """Sync the db to disk.

        This will flush any of the existing buffers and
        fsync the data to disk.

        You should call this method to guarantee that the data
        is written to disk.  This method is also called whenever
        the dbm is `close()`'d.

        """
        # The files are opened unbuffered so we don't technically
        # need to flush the file objects.
        os.fsync(self._data_fd)

    def compact(self):
        """Compact the db to reduce space.

        This method will compact the data file and the index file.
        This is needed because of the append only nature of the index
        and data files.  This method will read the index and data file
        and write out smaller but equivalent versions of these files.

        As a general rule of thumb, the more non read updates you do,
        the more space you'll save when you compact.

        """
        # Basically, compaction works by opening a new db, writing
        # all the keys from this db to the new db, renaming the
        # new db to the filenames associated with this db, and
        # reopening the files associated with this db.  This
        # implementation can certainly be more efficient, but compaction
        # is really slow anyways.
        new_db = self.__class__(os.path.join(self._dbdir, 'compact'),
                                data_loader=self._data_loader,
                                renamer=self._renamer)
        for key in self._index:
            new_db[key] = self[key]
        new_db.sync()
        new_db.close()
        os.close(self._data_fd)
        self._renamer(new_db._data_filename, self._data_filename)
        os.rmdir(new_db._dbdir)
        # The index is already compacted so we don't need to compact it.
        self._load_db()
class _SemiDBMNew(_SemiDBM):
    """Always start from an empty db (the ``'n'`` open flag).

    Any data file left over from a previous db in the same directory is
    removed before the normal load takes place.

    """
    def _load_db(self):
        self._create_db_dir()
        self._remove_files_in_dbdir()
        super(_SemiDBMNew, self)._load_db()

    def _remove_files_in_dbdir(self):
        # A brand new db must not see data from an earlier one, so
        # clear out any existing data file first.
        if not os.path.exists(self._data_filename):
            return
        os.remove(self._data_filename)
def open(filename, flag='r', mode=0o666, verify_checksums=False):
    """Open a semidbm database.

    :param filename: The name of the db.  Note that for semidbm,
        this is actually a directory name.  The argument is named
        `filename` to be compatible with the dbm interface.

    :param flag: Specifies how the db should be opened.
        `flag` can be any of these values:

        +---------+-------------------------------------------+
        | Value   | Meaning                                   |
        +=========+===========================================+
        | ``'r'`` | Open existing database for reading only   |
        |         | (default)                                 |
        +---------+-------------------------------------------+
        | ``'w'`` | Open existing database for reading and    |
        |         | writing                                   |
        +---------+-------------------------------------------+
        | ``'c'`` | Open database for reading and writing,    |
        |         | creating it if it doesn't exist           |
        +---------+-------------------------------------------+
        | ``'n'`` | Always create a new, empty database, open |
        |         | for reading and writing                   |
        +---------+-------------------------------------------+

    :param mode: Not currently used (provided to be compatible with
        the dbm interface).

    :param verify_checksums: Verify the checksums for each value
        are correct on every __getitem__ call (defaults to False).

    """
    kwargs = _create_default_params(verify_checksums=verify_checksums)
    # Map each dbm flag to the class implementing that open mode.
    flag_to_class = {
        'r': _SemiDBMReadOnly,
        'c': _SemiDBM,
        'w': _SemiDBMReadWrite,
        'n': _SemiDBMNew,
    }
    db_class = flag_to_class.get(flag)
    if db_class is None:
        raise ValueError("flag argument must be 'r', 'c', 'w', or 'n'")
    return db_class(filename, **kwargs)
class MMapLoader(DBMLoader):
    """Load the db index by mmap'ing the data file.

    The file is scanned through a sliding window of
    ``_MAPPED_LOAD_PAGES`` allocation granules that is remapped as the
    scan crosses a window boundary, so very large files never need to
    be mapped in one piece.

    """
    def __init__(self):
        pass

    def iter_keys(self, filename):
        # yields keyname, offset, size
        f = compat.file_open(filename, 'rb')
        header = f.read(8)
        self._verify_header(header)
        contents = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
        remap_size = mmap.ALLOCATIONGRANULARITY * _MAPPED_LOAD_PAGES
        # We need to track the max_index to use as the upper bound
        # in the .find() calls to be compatible with python 2.6.
        # There's a bug in python 2.6 where if an offset is specified
        # along with a size of 0, then the size for mmap() is the size
        # of the file instead of the size of the file - offset.  To
        # fix this, we track this ourself and make sure we never go passed
        # max_index.  If we don't do this, python2.6 will crash with
        # a bus error (python2.7 works fine without this workaround).
        # See http://bugs.python.org/issue10916 for more info.
        max_index = os.path.getsize(filename)
        file_size_bytes = max_index
        num_resizes = 0
        # Start just past the 8 byte file header.
        current = 8
        try:
            while current != max_index:
                try:
                    # Each entry starts with two 4-byte sizes:
                    # key size and value size.
                    key_size, val_size = struct.unpack(
                        '!ii', contents[current:current+8])
                except struct.error:
                    raise DBMLoadError()
                key = contents[current+8:current+8+key_size]
                if len(key) != key_size:
                    raise DBMLoadError()
                # Absolute file offset of the value: account for the
                # windows already scanned past (current is relative to
                # the currently mapped window).
                offset = (remap_size * num_resizes) + current + 8 + key_size
                if offset + val_size > file_size_bytes:
                    # If this happens then the index is telling us
                    # to read past the end of the file.  What we need
                    # to do is stop reading from the index.
                    return
                yield (key, offset, val_size)
                if val_size == _DELETED:
                    # Tombstone entries carry no value bytes to skip.
                    val_size = 0
                # Also need to skip past the 4 byte checksum, hence
                # the '+ 4' at the end
                current = current + 8 + key_size + val_size + 4
                if current >= remap_size:
                    # Crossed a window boundary: remap the next window.
                    contents.close()
                    num_resizes += 1
                    offset = num_resizes * remap_size
                    # Windows python2.6 bug.  You can't specify a length of
                    # 0 with an offset, otherwise you get a WindowsError, not
                    # enough storage is available to process this command.
                    # Couldn't find an issue for this, but the workaround
                    # is to specify the actual length of the mmap'd region
                    # which is the total size minus the offset we want.
                    contents = mmap.mmap(f.fileno(), file_size_bytes - offset,
                                         access=mmap.ACCESS_READ,
                                         offset=offset)
                    # current/max_index are window-relative from here on.
                    current -= remap_size
                    max_index -= remap_size
        finally:
            contents.close()
            f.close()
def rename(src, dst):
    """Atomically rename ``src`` onto ``dst`` on Windows.

    Equivalent to os.rename() in POSIX; uses ReplaceFile because
    os.rename() on windows fails when the destination already exists.

    :raises OSError: if ReplaceFile reports failure.
    """
    # Atomic renames in windows!
    # Equivalent to os.rename() in POSIX.
    # Yes the args here seem backwards but this is in fact
    # the awesomeness of windows just being different.
    # (ReplaceFile takes the replaced file first, then the replacement.)
    rc = kernel32.ReplaceFile(LPCTSTR(dst), LPCTSTR(src), None, 0, None, None)
    if rc == 0:
        # ReplaceFile returns zero on failure.
        # While some sort of error is better than nothing,
        # I think there's a way to get a better error message
        # from another win32 call.
        # NOTE(review): GetLastError via a plain windll call can be
        # clobbered by ctypes' own API use; ctypes.get_last_error()
        # (with use_last_error=True on the DLL) is the documented
        # reliable route -- confirm before trusting this error code.
        raise OSError("can't rename file, error: %s" % kernel32.GetLastError())
class SemiDBMTest(unittest.TestCase):
    """Shared fixture: a scratch db directory plus raw data-file helpers."""

    def setUp(self):
        # Every test runs against its own throwaway directory.
        self.tempdir = tempfile.mkdtemp(prefix='semidbm_ut')
        self.dbdir = os.path.join(self.tempdir, 'myfile.db')

    def tearDown(self):
        shutil.rmtree(self.tempdir)

    def open_db_file(self, **kwargs):
        # Open (creating on first use) the db under test.
        return semidbm.open(self.dbdir, 'c', **kwargs)

    def open_data_file(self, dbdir=None, mode='r'):
        # Return a plain file object over the raw 'data' file, creating
        # the containing directory on demand.
        target_dir = self.dbdir if dbdir is None else dbdir
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        return open(os.path.join(target_dir, 'data'), mode=mode)

    def truncate_data_file(self, bytes_from_end):
        # Chop bytes_from_end bytes off the data file to simulate a
        # partial (truncated) write.
        with self.open_data_file(mode='rb') as data_file:
            remaining = data_file.read()[:-bytes_from_end]
        with self.open_data_file(mode='wb') as data_file:
            data_file.write(remaining)
58 | db = self.open_db_file() 59 | db['foo'] = 'bar' 60 | db.close() 61 | 62 | db2 = self.open_db_file() 63 | self.assertEqual(db2['foo'], b'bar') 64 | db2.close() 65 | 66 | def test_insert_multiple(self): 67 | db = self.open_db_file() 68 | db['one'] = '1' 69 | db['two'] = '2' 70 | db['three'] = '3' 71 | self.assertEqual(db['one'], b'1') 72 | self.assertEqual(db['two'], b'2') 73 | self.assertEqual(db['three'], b'3') 74 | db.close() 75 | 76 | def test_intermixed_inserts_and_retrievals(self): 77 | db = self.open_db_file() 78 | db['one'] = '1' 79 | db['two'] = '2' 80 | self.assertEqual(db['one'], b'1') 81 | db['three'] = '3' 82 | self.assertEqual(db['two'], b'2') 83 | self.assertEqual(db['three'], b'3') 84 | db.close() 85 | 86 | def test_keyerror_raised_when_key_does_not_exist(self): 87 | db = self.open_db_file() 88 | self.assertRaises(KeyError, db.__getitem__, 'one') 89 | db.close() 90 | 91 | def test_updates(self): 92 | db = self.open_db_file() 93 | db['one'] = 'foo' 94 | db['one'] = 'bar' 95 | self.assertEqual(db['one'], b'bar') 96 | db['one'] = 'baz' 97 | self.assertEqual(db['one'], b'baz') 98 | db.close() 99 | 100 | def test_updates_persist(self): 101 | db = self.open_db_file() 102 | db['one'] = 'foo' 103 | db['one'] = 'bar' 104 | db['one'] = 'baz' 105 | db.close() 106 | 107 | db2 = self.open_db_file() 108 | self.assertEqual(db2['one'], b'baz') 109 | db2.close() 110 | 111 | def test_contains(self): 112 | db = self.open_db_file() 113 | db[b'one'] = 'foo' 114 | self.assertTrue(b'one' in db) 115 | db.close() 116 | 117 | def test_deletes(self): 118 | db = self.open_db_file() 119 | db['foo'] = 'bar' 120 | del db['foo'] 121 | self.assertTrue('foo' not in db) 122 | db.close() 123 | 124 | def test_delete_key_not_there_when_reopened(self): 125 | db = self.open_db_file() 126 | db['foo'] = 'foo' 127 | db['bar'] = 'bar' 128 | del db['foo'] 129 | db.close() 130 | 131 | db2 = self.open_db_file() 132 | self.assertTrue('foo' not in db2) 133 | self.assertEqual(db2['bar'], b'bar') 
134 | db2.close() 135 | 136 | def test_multiple_deletes(self): 137 | db = self.open_db_file() 138 | db['foo'] = 'foo' 139 | del db['foo'] 140 | db['foo'] = 'foo' 141 | del db['foo'] 142 | db['foo'] = 'foo' 143 | del db['foo'] 144 | db['bar'] = 'bar' 145 | db.close() 146 | db2 = self.open_db_file() 147 | self.assertTrue('foo' not in db2) 148 | self.assertEqual(db2['bar'], b'bar') 149 | db2.close() 150 | 151 | def test_keys_method(self): 152 | db = self.open_db_file() 153 | db['one'] = 'bar' 154 | db['two'] = 'bar' 155 | db['three'] = 'bar' 156 | self.assertEqual(set(db.keys()), set([b'one', b'two', b'three'])) 157 | db.close() 158 | 159 | def test_values_method(self): 160 | db = self.open_db_file() 161 | db['one'] = 'one_value' 162 | db['two'] = 'two_value' 163 | db['three'] = 'three_value' 164 | self.assertEqual(set(db.values()), set([b'one_value', b'two_value', 165 | b'three_value'])) 166 | db.close() 167 | 168 | def test_iterate(self): 169 | db = self.open_db_file() 170 | db['one'] = 'foo' 171 | db['two'] = 'bar' 172 | db['three'] = 'baz' 173 | self.assertEqual(set(db), set([b'one', b'two', b'three'])) 174 | db.close() 175 | 176 | def test_sync_contents(self): 177 | # So there's not really a good way to test this, so 178 | # I'm just making sure you can call it, and you can see the data. 
179 | db = self.open_db_file() 180 | db['foo'] = 'bar' 181 | db.sync() 182 | db.close() 183 | db2 = self.open_db_file() 184 | self.assertEqual(db2['foo'], b'bar') 185 | db2.close() 186 | 187 | def test_compaction_does_not_leave_behind_files(self): 188 | db = self.open_db_file() 189 | before = len(os.listdir(self.dbdir)) 190 | for i in range(10): 191 | db[str(i)] = str(i) 192 | for i in range(10): 193 | del db[str(i)] 194 | db.close() 195 | db2 = self.open_db_file() 196 | db2.compact() 197 | db2.close() 198 | after = len(os.listdir(self.dbdir)) 199 | self.assertEqual(before, after, os.listdir(self.dbdir)) 200 | 201 | def test_inserts_after_deletes(self): 202 | db = self.open_db_file() 203 | db['one'] = b'one' 204 | del db['one'] 205 | db['two'] = b'two' 206 | 207 | self.assertEqual(db['two'], b'two') 208 | db.close() 209 | 210 | def test_mixed_updates_and_deletes(self): 211 | db = self.open_db_file() 212 | db['one'] = 'one' 213 | db['CHECK'] = 'original' 214 | db['two'] = 'two' 215 | db['CHECK'] = 'updated' 216 | del db['CHECK'] 217 | db['three'] = 'three' 218 | 219 | self.assertEqual(db['one'], b'one') 220 | self.assertEqual(db['two'], b'two') 221 | self.assertEqual(db['three'], b'three') 222 | db.close() 223 | 224 | def test_compact_and_retrieve_data(self): 225 | db = self.open_db_file() 226 | db['one'] = 'foo' 227 | db['key'] = 'original' 228 | db['two'] = 'bar' 229 | db['key'] = 'updated' 230 | del db['key'] 231 | db['three'] = 'baz' 232 | db.compact() 233 | self.assertEqual(db['one'], b'foo') 234 | self.assertEqual(db['two'], b'bar') 235 | self.assertEqual(db['three'], b'baz') 236 | db.close() 237 | 238 | def test_compact_on_close(self): 239 | db = self.open_db_file() 240 | db['key'] = 'original' 241 | del db['key'] 242 | db.close(compact=True) 243 | # Header is 8 bytes. 
244 | self.assertEqual(len(open(db._data_filename).read()), 8) 245 | 246 | def test_compact_then_write_data(self): 247 | db = self.open_db_file() 248 | db['before'] = 'before' 249 | del db['before'] 250 | db.compact() 251 | db['after'] = 'after' 252 | db.close() 253 | 254 | db2 = self.open_db_file() 255 | self.assertEqual(db2['after'], b'after') 256 | db2.close() 257 | 258 | def test_bad_magic_number(self): 259 | db = self.open_db_file() 260 | db['foo'] = 'bar' 261 | db.close() 262 | with self.open_data_file(mode='rb+') as f: 263 | f.seek(0) 264 | f.write(b'Z') 265 | # Opening the db file should now fail. 266 | self.assertRaises(semidbm.DBMLoadError, self.open_db_file) 267 | 268 | def test_incompatible_version_number(self): 269 | db = self.open_db_file() 270 | db['foo'] = 'bar' 271 | db.close() 272 | with self.open_data_file(mode='rb+') as f: 273 | f.seek(4) 274 | f.write(struct.pack('!H', 2)) 275 | # Opening the db file should now fail. 276 | self.assertRaises(semidbm.DBMLoadError, self.open_db_file) 277 | 278 | def test_recover_from_last_failed_write(self): 279 | # Testing this scenario: 280 | # - we're writing a large object, we write the entry 281 | # header properly but we crash so we don't write out the 282 | # full value. The next time the db is loaded we should 283 | # be able to recover from this situation. 284 | db = self.open_db_file() 285 | # First write a few good keys. 286 | db['foobar'] = 'foobar' 287 | db['key'] = 'value' 288 | db['key2'] = 'value2' 289 | # Now simulate a failing write. 290 | db['largevalue'] = 'foobarbaz' * 1024 291 | db.close() 292 | # This is implementation specific, but we're going to read the raw data 293 | # file and truncate it. 
294 | with self.open_data_file(mode='rb') as f: 295 | filename = f.name 296 | original_size = os.path.getsize(filename) 297 | self.truncate_data_file(bytes_from_end=100) 298 | db2 = self.open_db_file() 299 | self.assertEquals(db2['foobar'], b'foobar') 300 | self.assertEquals(db2['key'], b'value') 301 | self.assertEquals(db2['key2'], b'value2') 302 | # But largevalue is not there, we recovered and just removed it. 303 | self.assertNotIn('largevalue', db2) 304 | # And when we compact the data file, the junk data 305 | # is ignored and not written to the new file. 306 | db2.compact() 307 | db2.close() 308 | new_size = os.path.getsize(filename) 309 | self.assertTrue(new_size < original_size) 310 | 311 | def test_file_thats_truncated(self): 312 | # Let's say that the file header is fine, but part 313 | # of the header for an individual record has been 314 | # trunated. 315 | db = self.open_db_file() 316 | db['foo'] = 'bar' 317 | db.close() 318 | # Now let's truncate the file to only 10 bytes which 319 | # will include the file header and part of an entry 320 | # header. 321 | with self.open_data_file(mode='rb') as f: 322 | contents = f.read() 323 | with self.open_data_file(mode='wb') as f2: 324 | f2.write(contents[:10]) 325 | self.assertRaises(semidbm.DBMLoadError, self.open_db_file) 326 | 327 | def test_key_size_says_to_read_past_end_of_file(self): 328 | # We can create this situation by creating an entry 329 | # and truncating the key/value part. 330 | db = self.open_db_file() 331 | db['foo'] = 'bar' 332 | db.close() 333 | # From the end we have a 4 byte checksum + 3 bytes for 334 | # the key and 3 bytes for the value, or a total of 335 | # 10 bytes. We'll chop off 8 which means we're missing 336 | # the checksum, the value, and one byte of the key. 
337 | self.truncate_data_file(bytes_from_end=8) 338 | self.assertRaises(semidbm.DBMLoadError, self.open_db_file) 339 | 340 | 341 | @unittest.skipIf(mmap is None, 'mmap required') 342 | class TestRemapping(SemiDBMTest): 343 | def setUp(self): 344 | import semidbm.loaders.mmapload 345 | super(TestRemapping, self).setUp() 346 | self.original = semidbm.loaders.mmapload._MAPPED_LOAD_PAGES 347 | # Change the number of mapped pages to 1 so that we don't have to write 348 | # as much data. The logic in the code uses this constant, so changing 349 | # the value of the constant won't affect the code logic, it'll just 350 | # make the test run faster. 351 | semidbm.loaders.mmapload._MAPPED_LOAD_PAGES = 1 352 | 353 | def tearDown(self): 354 | super(TestRemapping, self).tearDown() 355 | semidbm.loaders.mmapload._MAPPED_LOAD_PAGES = self.original 356 | 357 | def test_remap_required(self): 358 | # Verify the loading buffer logic works. This is 359 | # really slow. 360 | size = ( 361 | semidbm.loaders.mmapload._MAPPED_LOAD_PAGES * 362 | mmap.ALLOCATIONGRANULARITY * 4) 363 | db = self.open_db_file() 364 | # 100 byte values. 
365 | values = b'abcd' * 25 366 | for i in range(int(size / 100)): 367 | db[str(i)] = values 368 | db.close() 369 | 370 | db2 = self.open_db_file() 371 | for k in db2: 372 | self.assertEqual(db2[k], values) 373 | db2.close() 374 | 375 | 376 | class TestReadOnlyMode(SemiDBMTest): 377 | def open_db_file(self, **kwargs): 378 | return semidbm.open(self.dbdir, 'r', **kwargs) 379 | 380 | def test_cant_setitem(self): 381 | db = self.open_db_file() 382 | self.assertRaises(semidbm.DBMError, db.__setitem__, 'foo', 'bar') 383 | db.close() 384 | 385 | def test_cant_sync(self): 386 | db = self.open_db_file() 387 | self.assertRaises(semidbm.DBMError, db.sync) 388 | db.close() 389 | 390 | def test_cant_compact(self): 391 | db = self.open_db_file() 392 | self.assertRaises(semidbm.DBMError, db.compact) 393 | db.close() 394 | 395 | def test_cant_delitem(self): 396 | db = self.open_db_file() 397 | self.assertRaises(semidbm.DBMError, db.__delitem__, 'foo') 398 | db.close() 399 | 400 | def test_close_never_compacts_index(self): 401 | db = self.open_db_file() 402 | db.calls = [] 403 | db.compact = lambda: db.calls.append('compact') 404 | db.sync = lambda: db.calls.append('sync') 405 | 406 | db.close(compact=True) 407 | 408 | self.assertEqual(db.calls, []) 409 | 410 | def test_open_read_multiple_times(self): 411 | db = semidbm.open(self.dbdir, 'c') 412 | db['foo'] = 'bar' 413 | db.close() 414 | # Open then close db immediately. 
415 | db2 = self.open_db_file() 416 | db2.close() 417 | read_only = self.open_db_file() 418 | self.assertEqual(read_only['foo'], b'bar') 419 | read_only.close() 420 | 421 | def test_can_read_items(self): 422 | db = semidbm.open(self.dbdir, 'c') 423 | db['foo'] = 'bar' 424 | db['bar'] = 'baz' 425 | db['baz'] = 'foo' 426 | db.close() 427 | 428 | read_only = self.open_db_file() 429 | self.assertEqual(read_only[b'foo'], b'bar') 430 | self.assertEqual(read_only[b'bar'], b'baz') 431 | self.assertEqual(read_only[b'baz'], b'foo') 432 | read_only.close() 433 | 434 | def test_key_does_not_exist(self): 435 | db = semidbm.open(self.dbdir, 'c') 436 | db['foo'] = 'bar' 437 | db.close() 438 | 439 | read_only = self.open_db_file() 440 | self.assertRaises(KeyError, read_only.__getitem__, 'bar') 441 | read_only.close() 442 | 443 | def test_checksum_failure(self): 444 | db = semidbm.open(self.dbdir, 'c') 445 | db[b'key'] = b'value' 446 | db.close() 447 | data_file = self.open_data_file(mode='rb') 448 | contents = data_file.read() 449 | data_file.close() 450 | # Changing 'value' to 'Value' should cause a checksum failure. 451 | contents = contents.replace(b'value', b'Value') 452 | data_file = self.open_data_file(mode='wb') 453 | data_file.write(contents) 454 | data_file.close() 455 | db = self.open_db_file(verify_checksums=True) 456 | with self.assertRaises(semidbm.DBMChecksumError): 457 | db['key'] 458 | db.close() 459 | # If checksums are not enabled, an exception is not raised. 460 | db = self.open_db_file(verify_checksums=False) 461 | try: 462 | db['key'] 463 | except semidbm.DBMChecksumError: 464 | self.fail("Checksums were suppose to be disabled.") 465 | finally: 466 | db.close() 467 | 468 | def test_unicode_chars(self): 469 | db = semidbm.open(self.dbdir, 'c') 470 | # cafe with the e-accute. 
471 | db[b'caf\xc3\xa9'] = b'caf\xc3\xa9' 472 | self.assertEqual(db[b'caf\xc3\xa9'], b'caf\xc3\xa9') 473 | db.close() 474 | 475 | 476 | class TestWriteMode(SemiDBMTest): 477 | def test_when_index_file_does_not_exist(self): 478 | self.assertRaises(semidbm.DBMError, semidbm.open, self.dbdir, 'w') 479 | 480 | def test_when_data_file_does_not_exist(self): 481 | self.assertRaises(semidbm.DBMError, semidbm.open, self.dbdir, 'w') 482 | 483 | def test_when_files_exist(self): 484 | db = self.open_db_file() 485 | db['foo'] = 'bar' 486 | db.close() 487 | 488 | db_write_mode = semidbm.open(self.dbdir, 'w') 489 | self.assertEqual(db_write_mode['foo'], b'bar') 490 | db_write_mode.close() 491 | 492 | 493 | class TestNewMode(SemiDBMTest): 494 | def test_when_file_does_not_exist(self): 495 | path = os.path.join(self.tempdir, 'foo.db') 496 | db = semidbm.open(path, 'n') 497 | db['foo'] = 'bar' 498 | self.assertEqual(db['foo'], b'bar') 499 | db.close() 500 | 501 | # Opening the file again should basically blank out 502 | # any existing database. 503 | db = semidbm.open(path, 'n') 504 | self.assertEqual(list(db.keys()), []) 505 | db.close() 506 | 507 | 508 | class TestInvalidModeArgument(unittest.TestCase): 509 | def test_invalid_open_arg_raises_exception(self): 510 | self.assertRaises(ValueError, semidbm.open, 'foo.db', 'z') 511 | 512 | 513 | class TestWithChecksumsOn(TestSemiDBM): 514 | def open_db_file(self, **kwargs): 515 | # If they do not explicitly set verify_checksums 516 | # to something, default to it being on. 
517 | if 'verify_checksums' not in kwargs: 518 | kwargs['verify_checksums'] = True 519 | return semidbm.open(self.dbdir, 'c', **kwargs) 520 | 521 | 522 | class TestSimpleFileLoader(TestSemiDBM): 523 | def open_db_file(self, **kwargs): 524 | kwargs = semidbm.db._create_default_params() 525 | kwargs['data_loader'] = SimpleFileLoader() 526 | return semidbm.db._SemiDBM(self.dbdir, **kwargs) 527 | 528 | 529 | if __name__ == '__main__': 530 | unittest.main() 531 | --------------------------------------------------------------------------------