├── .gitignore ├── .travis.yml ├── COPYING ├── MANIFEST.in ├── README.rst ├── docs ├── Makefile ├── api_semidbm.rst ├── benchmarks.rst ├── changelog.rst ├── conf.py ├── details.rst ├── fileformat.rst ├── img │ ├── delete_sequential.png │ ├── fill_sequential.png │ ├── large_delete_sequential.png │ ├── large_fill_sequential.png │ ├── large_read_hot.png │ ├── large_read_random.png │ ├── large_read_sequential.png │ ├── read_hot.png │ ├── read_random.png │ └── read_sequential.png ├── index.rst └── overview.rst ├── requirements.txt ├── scripts ├── adapters │ ├── bdb_btopen.py │ ├── bdb_hashopen.py │ ├── bdb_minimal.py │ └── builtindict.py ├── benchmark ├── loadtime ├── makedb ├── makegraphs └── tps ├── semidbm ├── __init__.py ├── compat.py ├── db.py ├── exceptions.py ├── loaders │ ├── __init__.py │ ├── mmapload.py │ └── simpleload.py └── win32.py ├── setup.cfg ├── setup.py └── test_semidbm.py /.gitignore: -------------------------------------------------------------------------------- 1 | build/* 2 | dist/* 3 | semidbm.egg-info/* 4 | .coverage 5 | _templates/ 6 | _static/ 7 | _build/ 8 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.6" 4 | - "2.7" 5 | - "3.3" 6 | - "3.4" 7 | - "pypy" 8 | - "pypy3" 9 | install: 10 | - if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install unittest2; fi 11 | - pip install coverage coveralls 12 | script: 13 | - coverage erase 14 | - coverage run --source semidbm test_semidbm.py 15 | notifications: 16 | email: 17 | - js@jamesls.com 18 | after_success: coveralls 19 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | Copyright (c) 2011 James Saryerwinnie 2 | All rights reserved. 
3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions 6 | are met: 7 | 1. Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright 10 | notice, this list of conditions and the following disclaimer in the 11 | documentation and/or other materials provided with the distribution. 12 | 3. The name of the author may not be used to endorse or promote products 13 | derived from this software without specific prior written permission. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 16 | IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 17 | OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 18 | IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 19 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 20 | NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 21 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 22 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 24 | THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include COPYING 2 | include README.rst 3 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Overview 3 | ======== 4 | 5 | .. 
image:: https://secure.travis-ci.org/jamesls/semidbm.png?branch=master 6 | :target: http://travis-ci.org/jamesls/semidbm 7 | 8 | .. image:: https://coveralls.io/repos/jamesls/semidbm/badge.png?branch=master 9 | :target: https://coveralls.io/r/jamesls/semidbm?branch=master 10 | 11 | .. image:: https://img.shields.io/pypi/v/semidbm.svg 12 | :target: https://pypi.python.org/pypi/semidbm/ 13 | :alt: Latest Version 14 | 15 | .. image:: https://img.shields.io/pypi/pyversions/semidbm.svg 16 | :target: https://pypi.python.org/pypi/semidbm/ 17 | :alt: Supported Python versions 18 | 19 | .. image:: https://img.shields.io/pypi/implementation/semidbm.svg 20 | :target: https://pypi.python.org/pypi/semidbm/ 21 | :alt: Supported Python implementations 22 | 23 | .. image:: https://img.shields.io/pypi/l/semidbm.svg 24 | :target: https://pypi.python.org/pypi/semidbm/ 25 | :alt: License 26 | 27 | .. image:: https://img.shields.io/pypi/wheel/semidbm.svg 28 | :target: https://pypi.python.org/pypi/semidbm/ 29 | :alt: Wheel Status 30 | 31 | 32 | Semidbm is a fast, pure python implementation of a dbm, which is a 33 | persistent key value store. It allows you to get and set keys through 34 | a dict interface:: 35 | 36 | import semidbm 37 | db = semidbm.open('testdb', 'c') 38 | db['foo'] = 'bar' 39 | print db['foo'] 40 | db.close() 41 | 42 | These values are persisted to disk, and you can later retrieve 43 | these key/value pairs:: 44 | 45 | # Then at a later time: 46 | db = semidbm.open('testdb', 'r') 47 | # prints "bar" 48 | print db['foo'] 49 | 50 | 51 | It was written with these things in mind: 52 | 53 | * Pure python, supporting python 2.6, 2.7, 3.3, and 3.4. 54 | * Cross platform, works on Windows, Linux, Mac OS X. 55 | * Supports CPython, pypy, and jython (versions 2.7-b3 and higher). 56 | * Simple and Fast (See `Benchmarking Semidbm `__). 57 | 58 | 59 | Supported Python Versions 60 | ========================= 61 | 62 | Semidbm supports python 2.6, 2.7, 3.3, and 3.4. 
63 | 64 | ============= 65 | Official Docs 66 | ============= 67 | 68 | Read the `semidbm docs `_ for more information 69 | and how to use semidbm. 70 | 71 | 72 | ======== 73 | Features 74 | ======== 75 | 76 | Semidbm originally started off as an improvement over the 77 | `dumbdbm `__ 78 | library in the python standard library. Below are a list of some of the 79 | improvements over dumbdbm. 80 | 81 | 82 | Single Data File 83 | ================ 84 | 85 | Instead of an index file and a data file, the index and data have been 86 | consolidated into a single file. This single data file is always appended to, 87 | data written to the file is never modified. 88 | 89 | 90 | Data File Compaction 91 | ==================== 92 | 93 | Semidbm uses an append only file format. This has the potential to grow to 94 | large sizes as space is never reclaimed. Semidbm addresses this by adding a 95 | ``compact()`` method that will rewrite the data file to a minimal size. 96 | 97 | 98 | Performance 99 | =========== 100 | 101 | Semidbm is significantly faster than dumbdbm (keep in mind both are pure python 102 | libraries) in just about every way. The documentation shows the 103 | `results `_ 104 | of semidbm vs. other dbms, along with how to run the benchmarking 105 | script yourself. 106 | 107 | 108 | =========== 109 | Limitations 110 | =========== 111 | 112 | * Not thread safe; can't be accessed by multiple processes. 113 | * The entire index must fit in memory. This essentially means that all of the 114 | keys must fit in memory. 115 | 116 | 117 | Post feedback and issues on `github issues`_, or check out the 118 | latest changes at the github `repo`_. 119 | 120 | 121 | .. _github issues: https://github.com/jamesls/semidbm/issues 122 | .. 
_repo: https://github.com/jamesls/semidbm 123 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 14 | # the i18n builder cannot share the environment and doctrees with the others 15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 16 | 17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 18 | 19 | help: 20 | @echo "Please use \`make ' where is one of" 21 | @echo " html to make standalone HTML files" 22 | @echo " dirhtml to make HTML files named index.html in directories" 23 | @echo " singlehtml to make a single large HTML file" 24 | @echo " pickle to make pickle files" 25 | @echo " json to make JSON files" 26 | @echo " htmlhelp to make HTML files and a HTML help project" 27 | @echo " qthelp to make HTML files and a qthelp project" 28 | @echo " devhelp to make HTML files and a Devhelp project" 29 | @echo " epub to make an epub" 30 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 31 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 32 | @echo " text to make text files" 33 | @echo " man to make manual pages" 34 | @echo " texinfo to make Texinfo files" 35 | @echo " info to make Texinfo files and run them through makeinfo" 36 | @echo " gettext to make PO message catalogs" 37 | @echo " changes to make an overview of all changed/added/deprecated items" 38 | @echo " linkcheck to check all external links for 
integrity" 39 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 40 | 41 | clean: 42 | -rm -rf $(BUILDDIR)/* 43 | 44 | html: 45 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 46 | @echo 47 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 48 | 49 | dirhtml: 50 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 51 | @echo 52 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 53 | 54 | singlehtml: 55 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 56 | @echo 57 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 58 | 59 | pickle: 60 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 61 | @echo 62 | @echo "Build finished; now you can process the pickle files." 63 | 64 | json: 65 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 66 | @echo 67 | @echo "Build finished; now you can process the JSON files." 68 | 69 | htmlhelp: 70 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 71 | @echo 72 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 73 | ".hhp project file in $(BUILDDIR)/htmlhelp." 74 | 75 | qthelp: 76 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 77 | @echo 78 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 79 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 80 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/semidbm.qhcp" 81 | @echo "To view the help file:" 82 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/semidbm.qhc" 83 | 84 | devhelp: 85 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 86 | @echo 87 | @echo "Build finished." 
88 | @echo "To view the help file:" 89 | @echo "# mkdir -p $$HOME/.local/share/devhelp/semidbm" 90 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/semidbm" 91 | @echo "# devhelp" 92 | 93 | epub: 94 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 95 | @echo 96 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 97 | 98 | latex: 99 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 100 | @echo 101 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 102 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 103 | "(use \`make latexpdf' here to do that automatically)." 104 | 105 | latexpdf: 106 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 107 | @echo "Running LaTeX files through pdflatex..." 108 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 109 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 110 | 111 | text: 112 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 113 | @echo 114 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 115 | 116 | man: 117 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 118 | @echo 119 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 120 | 121 | texinfo: 122 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 123 | @echo 124 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 125 | @echo "Run \`make' in that directory to run these through makeinfo" \ 126 | "(use \`make info' here to do that automatically)." 127 | 128 | info: 129 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 130 | @echo "Running Texinfo files through makeinfo..." 131 | make -C $(BUILDDIR)/texinfo info 132 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 133 | 134 | gettext: 135 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 136 | @echo 137 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 
138 | 139 | changes: 140 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 141 | @echo 142 | @echo "The overview file is in $(BUILDDIR)/changes." 143 | 144 | linkcheck: 145 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 146 | @echo 147 | @echo "Link check complete; look for any errors in the above output " \ 148 | "or in $(BUILDDIR)/linkcheck/output.txt." 149 | 150 | doctest: 151 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 152 | @echo "Testing of doctests in the sources finished, look at the " \ 153 | "results in $(BUILDDIR)/doctest/output.txt." 154 | -------------------------------------------------------------------------------- /docs/api_semidbm.rst: -------------------------------------------------------------------------------- 1 | =============== 2 | API for semidbm 3 | =============== 4 | 5 | .. autofunction:: semidbm.db.open 6 | 7 | 8 | .. autoclass:: semidbm.db._SemiDBM 9 | :members: 10 | -------------------------------------------------------------------------------- /docs/benchmarks.rst: -------------------------------------------------------------------------------- 1 | ==================== 2 | Benchmarking Semidbm 3 | ==================== 4 | 5 | Semidbm was not written to be the fastest dbm available, but its performance is 6 | surprisingly good for a pure python dbm. Before showing the benchmark results, 7 | it's important to note that these benchmark results can vary across machines 8 | and should in no way be considered definitive or comprehensive. And yes, 9 | there are other things besides performance that are important when considering 10 | a dbm. 11 | 12 | 13 | Benchmarking Approach 14 | ===================== 15 | 16 | The benchmarks used for semidbm are based off the benchmark scripts for 17 | `leveldb `_. You can run the benchmark 18 | scripts yourself using the `scripts/benchmark` script in the repo.
By default, 19 | the benchmark uses a db of one million keys with 16 byte keys and 100 byte 20 | values (these are the values used for leveldb's benchmarks). All of these 21 | parameters can be changed via command line arguments ( `-n`, `-k`, `-s` 22 | respectively). 23 | 24 | The benchmark script is written in a way to be compatible with any module 25 | supporting the dbm interface. Given the dbm interface isn't entirely 26 | standardized, this is what is required: 27 | 28 | * An `open()` function in the module (that behaves like 29 | `dumbdbm.open `_, 30 | `gdbm.open `_, etc). 31 | * The returned object from `open()` is a "dbm" like object. All the object 32 | needs to support is `__getitem__`, `__setitem__`, `__delitem__`, and 33 | `close()`. 34 | 35 | To specify what dbm module to use, use the `-d` argument. The value of this 36 | argument should be the module name of the dbm, for example, to run the benchmarks 37 | against semidbm:: 38 | 39 | scripts/benchmark -d semidbm 40 | 41 | The `-d` argument can be specified multiple times. 42 | 43 | If a dbm does not support a dbm interface, an adapter module can be written for 44 | the dbm. The directory `scripts/adapters` is added to `sys.path` before the 45 | benchmarks are run, so benchmarking a 3rd party dbm is straightforward.
For 46 | example, in order to benchmark Berkeley DB using the bsddb3 module, a 47 | `scripts/adapters/bdb_minimal.py` file was created:: 48 | 49 | 50 | import bsddb3.db 51 | def open(filename, mode): 52 | db = bsddb3.db.DB(None) 53 | if mode == 'r': 54 | flags = bsddb3.db.DB_RDONLY 55 | elif mode == 'rw': 56 | flags = 0 57 | elif mode == 'w': 58 | flags = bsddb3.db.DB_CREATE 59 | elif mode == 'c': 60 | flags = bsddb3.db.DB_CREATE 61 | elif mode == 'n': 62 | flags = bsddb3.db.DB_TRUNCATE | bsddb3.db.DB_CREATE 63 | else: 64 | raise bsddb3.db.DBError( 65 | "flags should be one of 'r', 'w', 'c' or 'n' or use the " 66 | "bsddb.db.DB_* flags") 67 | db.open(filename, None, bsddb3.db.DB_HASH, flags) 68 | return db 69 | 70 | The `bsddb3.db.DB `_ 71 | object can now be benchmarked using:: 72 | 73 | scripts/benchmark -d bdb_minimal 74 | 75 | 76 | Benchmark Results 77 | ================= 78 | 79 | Below are the results of benchmarking various dbms. 80 | Although `scripts/benchmark` shows the results in various forms of measurement, 81 | the measurement chosen here is the average number of operations per second over 82 | the total number of keys. For this measurement, **higher is better**. 83 | 84 | The dbms chosen for this benchmark are: 85 | 86 | * semidbm 87 | * gdbm (GNU dbm) 88 | * bdb (minimal Berkeley DB interface, `scripts/adapters/bdb_minimal.py`) 89 | * dumbdbm 90 | 91 | The `dbm` module was not included because it was not able to add 1000000 keys to its 92 | db; it raises an exception around 420000 keys with an "Out of overflow pages" 93 | error. 94 | 95 | 96 | This first benchmark shows the ops/sec for adding one million keys to the db. 97 | 98 | 99 | .. image:: img/fill_sequential.png 100 | 101 | The second benchmark shows the ops/sec for repeatedly accessing 1% of the keys 102 | (randomly selected). 103 | 104 | 105 | ..
image:: img/read_hot.png 106 | 107 | 108 | The next benchmark shows the ops/sec for reading all one million keys in the 109 | same order that they were added. 110 | 111 | .. image:: img/read_sequential.png 112 | 113 | The next benchmark shows the ops/sec for reading all one million keys in a 114 | randomly selected order. 115 | 116 | .. image:: img/read_random.png 117 | 118 | And the last benchmark shows the ops/sec for deleting all one million keys in 119 | the same order that they were added. 120 | 121 | .. image:: img/delete_sequential.png 122 | 123 | 124 | Note that dumbdbm is not shown in the chart above. This is because deletion of 125 | keys in dumbdbm is extremely slow. It also appears to have O(n) behavior (it 126 | writes out its data file on every delete). To give you an idea of the 127 | performance, running this benchmark against dumbdbm with 1000 keys gave an 128 | average ops/sec for the delete_sequential benchmark of **800**. For 10000 129 | keys dumbdbm resulted in **104** ops/sec. 130 | 131 | 132 | The table below shows the actual numbers for the charts above. 
133 | 134 | +-------------------+-------------+------------+--------+---------+ 135 | | | semidbm | gdbm | bdb | dumbdbm | 136 | +===================+=============+============+========+=========+ 137 | | fill_sequential | **73810** | 63177 | 73614 | 5460 | 138 | +-------------------+-------------+------------+--------+---------+ 139 | | read_hot | **218651** | 202432 | 200111 | 59569 | 140 | +-------------------+-------------+------------+--------+---------+ 141 | | read_sequential | 257668 | **417320** | 209696 | 62605 | 142 | +-------------------+-------------+------------+--------+---------+ 143 | | read_random | 219962 | **406594** | 197690 | 59258 | 144 | +-------------------+-------------+------------+--------+---------+ 145 | | delete_sequential | **144265** | 119167 | 135137 | 0 | 146 | +-------------------+-------------+------------+--------+---------+ 147 | 148 | 149 | Benchmarking With Large Values 150 | ------------------------------ 151 | 152 | One area where semidbm benchmarks really well is when dealing with large 153 | values. The same 5 benchmarks were repeated, but with only 1000 total keys, 154 | 16 byte keys, and 100000 byte values. 155 | 156 | 157 | The first benchmark shows the ops/sec for 1000 sequential writes. 158 | 159 | 160 | .. image:: img/large_fill_sequential.png 161 | 162 | 163 | The second benchmark shows the ops/sec for repeatedly accessing 1% of the keys 164 | (randomly selected). 165 | 166 | 167 | .. image:: img/large_read_hot.png 168 | 169 | 170 | The third benchmark shows the ops/sec for sequentially reading all 1000 keys. 171 | 172 | .. image:: img/large_read_sequential.png 173 | 174 | The fourth benchmark shows the ops/sec for reading all 1000 keys in a 175 | randomly selected order. 176 | 177 | .. image:: img/large_read_random.png 178 | 179 | And the last benchmark shows the ops/sec for deleting all 1000 keys in 180 | the same order that they were added. 181 | 182 | .. 
image:: img/large_delete_sequential.png 183 | 184 | Below is the raw data used to generate the above charts. 185 | 186 | +----------------------+------------+-----------+-----------+-------------+-----------+ 187 | | n=1000,k=16,v=100000 | semidbm | dbm | gdbm | bdb_minimal | dumbdbm | 188 | +======================+============+===========+===========+=============+===========+ 189 | | fill_sequential | 2653 | 2591 | **5525** | 4677 | 1330 | 190 | +----------------------+------------+-----------+-----------+-------------+-----------+ 191 | | read_hot | **61016** | 8363 | 23104 | 11782 | 31624 | 192 | +----------------------+------------+-----------+-----------+-------------+-----------+ 193 | | read_sequential | **42421** | 8822 | 1508 | 11519 | 26757 | 194 | +----------------------+------------+-----------+-----------+-------------+-----------+ 195 | | read_random | **42133** | 8720 | 16442 | 11162 | 23778 | 196 | +----------------------+------------+-----------+-----------+-------------+-----------+ 197 | | delete_sequential | **141379** | 21167 | 17695 | 7267 | 780 | 198 | +----------------------+------------+-----------+-----------+-------------+-----------+ 199 | 200 | You can see that with the exception of fill_sequential (in which the fastest 201 | module, gdbm, was roughly twice as fast as semidbm), semidbm completely 202 | outperforms all the other dbms. In the case of read_sequential, semidbm is **28 203 | times faster than gdbm.** 204 | 205 | 206 | Overall, semidbm's performance is comparable to the performance of other dbms 207 | with small keys and values, but is surprisingly faster than other dbms when 208 | reading large values. It's also clear that semidbm is faster than dumbdbm in all 209 | of the benchmarks shown here.
210 | 211 | 212 | Running the Benchmarks 213 | ---------------------- 214 | 215 | You are encouraged to run the benchmarks yourself, to recreate the benchmark 216 | above, you can run:: 217 | 218 | scripts/benchmark -d semidbm -d gdbm -d bdb_minimal -d dumbdbm 219 | 220 | Though keep in mind that you will probably want to stop the benchmark 221 | once dumbdbm reaches the delete_sequential benchmark. Either that or you can 222 | leave off dumbdbm and run it with a smaller number of keys:: 223 | 224 | scripts/benchmark -d dumbdbm -n 10000 225 | -------------------------------------------------------------------------------- /docs/changelog.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | Changelog 3 | ========= 4 | 5 | 0.5.1 6 | ===== 7 | 8 | * Remove unused code. 9 | * Add support for wheels. 10 | 11 | 12 | 0.5.0 13 | ===== 14 | 15 | * Remove mmap read only dbm subclass. This functionality 16 | has not been available in a public interface since 17 | b265e60c5f4c0b1e8e9e4343f5f2300b5e017bf0 (1.5 years ago) 18 | so it's now removed. 19 | * Added non mmap based dbm loader for platforms that do not 20 | support mmap (jython). 21 | * Atomic renames on windows during db compaction. 22 | 23 | 24 | 0.4.0 25 | ===== 26 | 27 | 0.4.0 is a backwards incompatible release with 0.3.1. 28 | Data files created with 0.3.1 will not work with 0.4.0. 29 | The reasons for switching to 0.4.0 include: 30 | 31 | * Data format switched from ASCII to binary file format, 32 | this resulted in a nice performance boost. 33 | * Index and data file consolidated to a single file, resulting 34 | in improved write performance. 35 | * Checksums are written for all entries. Checksums can 36 | be verified for every __getitem__ call (off by default). 37 | * Python 3 support (officially python 3.3.x). 38 | 39 | 40 | 0.3.1 41 | ===== 42 | 43 | * Windows support. 
44 | 45 | 46 | 0.3.0 47 | ===== 48 | 49 | * The data file and the index file are kept in a separate directory. To load 50 | the db you specify the directory name instead of the data filename. 51 | * Non-mmapped read only version is used when the db is opened with ``r``. 52 | * Write performance improvements. 53 | 54 | 55 | 0.2.1 56 | ===== 57 | 58 | * DB can be opened with ``r``, ``c``, ``w``, and ``n``. 59 | * Add a memory mapped read only implementation for reading 60 | from the DB (if your entire data file can be mmapped this 61 | provides a huge performance boost for reads). 62 | * Benchmark scripts rewritten to provide more useful information. 63 | 64 | 65 | 0.2.0 66 | ===== 67 | 68 | * New ``sync()`` method to ensure data is written to disk. 69 | 70 | * ``sync()`` is called during compaction and on ``close()``. 71 | 72 | * Add a ``DBMLoadError`` exception for catching semidbm loading errors. 73 | 74 | 75 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # semidbm documentation build configuration file, created by 4 | # sphinx-quickstart on Sun Feb 12 21:08:54 2012. 5 | # 6 | # This file is execfile()d with the current directory set to its containing dir. 7 | # 8 | # Note that not all possible configuration values are present in this 9 | # autogenerated file. 10 | # 11 | # All configuration values have a default; values that are commented out 12 | # serve to show the default. 13 | 14 | import sys, os 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here.
19 | sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) 20 | 21 | # -- General configuration ----------------------------------------------------- 22 | 23 | # If your documentation needs a minimal Sphinx version, state it here. 24 | #needs_sphinx = '1.0' 25 | 26 | on_rtd = os.environ.get('READTHEDOCS') == 'True' 27 | 28 | # Add any Sphinx extension module names here, as strings. They can be extensions 29 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 30 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode'] 31 | 32 | # Add any paths that contain templates here, relative to this directory. 33 | templates_path = ['_templates'] 34 | 35 | # The suffix of source filenames. 36 | source_suffix = '.rst' 37 | 38 | # The encoding of source files. 39 | #source_encoding = 'utf-8-sig' 40 | 41 | # The master toctree document. 42 | master_doc = 'index' 43 | 44 | # General information about the project. 45 | project = u'semidbm' 46 | copyright = u'2012, James Saryerwinnie Jr' 47 | 48 | # The version info for the project you're documenting, acts as replacement for 49 | # |version| and |release|, also used in various other places throughout the 50 | # built documents. 51 | # 52 | # The short X.Y version. 53 | version = '0.5' 54 | # The full version, including alpha/beta/rc tags. 55 | release = '0.5.1' 56 | 57 | # The language for content autogenerated by Sphinx. Refer to documentation 58 | # for a list of supported languages. 59 | #language = None 60 | 61 | # There are two options for replacing |today|: either, you set today to some 62 | # non-false value, then it is used: 63 | #today = '' 64 | # Else, today_fmt is used as the format for a strftime call. 65 | #today_fmt = '%B %d, %Y' 66 | 67 | # List of patterns, relative to source directory, that match files and 68 | # directories to ignore when looking for source files. 69 | exclude_patterns = ['_build'] 70 | 71 | # The reST default role (used for this markup: `text`) to use for all documents. 
72 | #default_role = None 73 | 74 | # If true, '()' will be appended to :func: etc. cross-reference text. 75 | #add_function_parentheses = True 76 | 77 | # If true, the current module name will be prepended to all description 78 | # unit titles (such as .. function::). 79 | #add_module_names = True 80 | 81 | # If true, sectionauthor and moduleauthor directives will be shown in the 82 | # output. They are ignored by default. 83 | #show_authors = False 84 | 85 | 86 | 87 | # A list of ignored prefixes for module index sorting. 88 | #modindex_common_prefix = [] 89 | 90 | 91 | # -- Options for HTML output --------------------------------------------------- 92 | # The name of the Pygments (syntax highlighting) style to use. 93 | if not on_rtd: 94 | try: 95 | import sphinx_rtd_theme 96 | except ImportError: 97 | sys.stderr.write("Could not import sphinx_rtd_theme, you can " 98 | "run 'pip install sphinx_rtd_theme' to install " 99 | "this package.\n") 100 | raise 101 | html_theme = 'sphinx_rtd_theme' 102 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 103 | html_theme_options = { 104 | } 105 | 106 | 107 | # Add any paths that contain custom static files (such as style sheets) here, 108 | # relative to this directory. They are copied after the builtin static files, 109 | # so a file named "default.css" will overwrite the builtin "default.css". 110 | html_static_path = ['_static'] 111 | 112 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 113 | # using the given strftime format. 114 | #html_last_updated_fmt = '%b %d, %Y' 115 | 116 | # If true, SmartyPants will be used to convert quotes and dashes to 117 | # typographically correct entities. 118 | #html_use_smartypants = True 119 | 120 | # Custom sidebar templates, maps document names to template names. 121 | #html_sidebars = {} 122 | 123 | # Additional templates that should be rendered to pages, maps page names to 124 | # template names. 
125 | #html_additional_pages = {} 126 | 127 | # If false, no module index is generated. 128 | #html_domain_indices = True 129 | 130 | # If false, no index is generated. 131 | #html_use_index = True 132 | 133 | # If true, the index is split into individual pages for each letter. 134 | #html_split_index = False 135 | 136 | # If true, links to the reST sources are added to the pages. 137 | #html_show_sourcelink = True 138 | 139 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 140 | #html_show_sphinx = True 141 | 142 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 143 | #html_show_copyright = True 144 | 145 | # If true, an OpenSearch description file will be output, and all pages will 146 | # contain a tag referring to it. The value of this option must be the 147 | # base URL from which the finished HTML is served. 148 | #html_use_opensearch = '' 149 | 150 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 151 | #html_file_suffix = None 152 | 153 | # Output file base name for HTML help builder. 154 | htmlhelp_basename = 'semidbmdoc' 155 | 156 | 157 | # -- Options for LaTeX output -------------------------------------------------- 158 | 159 | latex_elements = { 160 | # The paper size ('letterpaper' or 'a4paper'). 161 | #'papersize': 'letterpaper', 162 | 163 | # The font size ('10pt', '11pt' or '12pt'). 164 | #'pointsize': '10pt', 165 | 166 | # Additional stuff for the LaTeX preamble. 167 | #'preamble': '', 168 | } 169 | 170 | # Grouping the document tree into LaTeX files. List of tuples 171 | # (source start file, target name, title, author, documentclass [howto/manual]). 172 | latex_documents = [ 173 | ('index', 'semidbm.tex', u'semidbm Documentation', 174 | u'James Saryerwinnie Jr', 'manual'), 175 | ] 176 | 177 | # The name of an image file (relative to this directory) to place at the top of 178 | # the title page. 
179 | #latex_logo = None 180 | 181 | # For "manual" documents, if this is true, then toplevel headings are parts, 182 | # not chapters. 183 | #latex_use_parts = False 184 | 185 | # If true, show page references after internal links. 186 | #latex_show_pagerefs = False 187 | 188 | # If true, show URL addresses after external links. 189 | #latex_show_urls = False 190 | 191 | # Documents to append as an appendix to all manuals. 192 | #latex_appendices = [] 193 | 194 | # If false, no module index is generated. 195 | #latex_domain_indices = True 196 | 197 | 198 | # -- Options for manual page output -------------------------------------------- 199 | 200 | # One entry per manual page. List of tuples 201 | # (source start file, name, description, authors, manual section). 202 | man_pages = [ 203 | ('index', 'semidbm', u'semidbm Documentation', 204 | [u'James Saryerwinnie Jr'], 1) 205 | ] 206 | 207 | # If true, show URL addresses after external links. 208 | #man_show_urls = False 209 | 210 | 211 | # -- Options for Texinfo output ------------------------------------------------ 212 | 213 | # Grouping the document tree into Texinfo files. List of tuples 214 | # (source start file, target name, title, author, 215 | # dir menu entry, description, category) 216 | texinfo_documents = [ 217 | ('index', 'semidbm', u'semidbm Documentation', 218 | u'James Saryerwinnie Jr', 'semidbm', 'One line description of project.', 219 | 'Miscellaneous'), 220 | ] 221 | 222 | # Documents to append as an appendix to all manuals. 223 | #texinfo_appendices = [] 224 | 225 | # If false, no module index is generated. 226 | #texinfo_domain_indices = True 227 | 228 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 
229 | #texinfo_show_urls = 'footnote' 230 | -------------------------------------------------------------------------------- /docs/details.rst: -------------------------------------------------------------------------------- 1 | =============== 2 | SemiDBM Details 3 | =============== 4 | 5 | This guide goes into the details of how semidbm works. 6 | 7 | Writing a Value 8 | =============== 9 | 10 | One of the key characteristics of semidbm is that it only writes to the end of 11 | a file. **Once data has been written to a file, it is never changed.** This 12 | makes it easy to guarantee that once the data is written to disk, you can be 13 | certain that semidbm will not corrupt the data. This also makes semidbm 14 | simpler because we don't have to worry about how to modify data in a way that 15 | prevents corruption in the event of a crash. 16 | 17 | Even updates to existing values are written as new values at the end of 18 | a file. When the data file is loaded, these transactions are "replayed" 19 | so that the last change will "win". For example, given these operations:: 20 | 21 | add key "foo" with value "bar" 22 | add key "foo2" with value "bar2" 23 | delete key "foo2" 24 | add key "foo" with value "new value" 25 | 26 | this would represent a dictionary that looked like this:: 27 | 28 | {"foo": "new value"} 29 | 30 | .. note:: 31 | 32 | This is just the conceptual view of the transactions. The actual 33 | format is a binary format specified in :doc:`fileformat`. 34 | 35 | You can imagine that a db with a large number of updates can cause 36 | the file to grow to a much larger size than is needed. To reclaim 37 | free space, you can use the ``compact()`` method. This will 38 | rewrite the data file in the shortest amount of transactions 39 | needed. The above example can be compacted to:: 40 | 41 | add key "foo" with value "new value" 42 | 43 | When a compaction occurs, a new data file is written out (the original 44 | data file is left untouched).
Once all the compacted data has been 45 | written out to the new data file (and fsync'd!), the new data file 46 | is renamed over the original data file, completing the compaction. 47 | This way, if a crash occurs during compaction, the original data file 48 | is not corrupted. 49 | 50 | 51 | Reading Values 52 | ============== 53 | 54 | The index that is stored in memory does not contain the actual 55 | data associated with the key. Instead, it contains the location 56 | within the file where the value is located, conceptually:: 57 | 58 | db = {'foo': DiskLocation(offset=40, size=10)} 59 | 60 | When the value for a key is requested, the offset and size are looked 61 | up. A disk seek is performed and a read is performed for the 62 | specified size associated with the value. This translates to 63 | 2 syscalls:: 64 | 65 | lseek(fd, offset, os.SEEKSET) 66 | data = read(fs, value_size) 67 | 68 | Data Verification 69 | ================= 70 | 71 | Every write to a semidbm db file also includes a crc32 checksum. 72 | When a value is read from disk, semidbm can verify this crc32 checksum. 73 | By default, this verification step is turned off, but can be enabled using the 74 | ``verify_checksums`` argument:: 75 | 76 | >>> db = semidbm.open('dbname', 'c', verify_checksums=True) 77 | 78 | If a checksum error is detected a ``DBMChecksumError`` is raised:: 79 | 80 | >>> db[b'foo'] 81 | Traceback (most recent call last): 82 | File "", line 1, in 83 | File "./semidbm/db.py", line 192, in __getitem__ 84 | return self._verify_checksum_data(key, data) 85 | File "./semidbm/db.py", line 203, in _verify_checksum_data 86 | "Corrupt data detected: invalid checksum for key %s" % key) 87 | semidbm.db.DBMChecksumError: Corrupt data detected: invalid checksum for key b'foo' 88 | 89 | 90 | Read Only Mode 91 | ============== 92 | 93 | SemiDBM includes an optimized read only mode. 
If you know you only 94 | want to read values from the database without writing new values you 95 | can take advantage of this optimized read only mode. To open a db 96 | file as read only, use the ``'r'`` option:: 97 | 98 | db = semidbm.open('dbname', 'r') 99 | -------------------------------------------------------------------------------- /docs/fileformat.rst: -------------------------------------------------------------------------------- 1 | ====================== 2 | File Format of DB file 3 | ====================== 4 | 5 | :author: James Saryerwinnie 6 | :status: Draft 7 | :target-version: 0.4.0 8 | :date: April 15, 2013 9 | 10 | Abstract 11 | ======== 12 | 13 | This document proposes a new file format for semidbm. This is a backwards 14 | incompatible change. 15 | 16 | Motivation 17 | ========== 18 | 19 | When python3 support was added, ``semidbm`` received a significant performance 20 | degradation. This was mainly due to the str vs. bytes differentiation, and 21 | the fact that semidbm was a text based format. All of the integer sizes and 22 | checksum information was written as ASCII strings, and as a result, encoding 23 | the string to a byte sequence added additional overhead. 24 | 25 | In order to improve performance, ``semidbm`` should adopt a binary format, 26 | specifically the sizes of the keys and values as well as the checksums should 27 | be written as binary values. This will avoid the need to use string formatting 28 | when writing values. It will also improve the load time of a db file. 29 | 30 | 31 | Specification 32 | ============= 33 | 34 | A semidbm file will consist of a header and a sequence of entries. 35 | All multibyte sequences are written in network byte order. 36 | 37 | 38 | Header 39 | ====== 40 | 41 | The semidbm header format consists of: 42 | 43 | * 4 byte magic number (``53 45 4d 49``) 44 | * 4 byte version number consisting of 2 byte major version and 2 byte 45 | minor version (currently (1, 1)).
46 | 47 | 48 | Entries 49 | ======= 50 | 51 | After the header, the file contains a sequence of 52 | entries. Each entry has this format: 53 | 54 | * 4 byte key size 55 | * 4 byte value size 56 | * Key contents 57 | * Value content 58 | * 4 byte CRC32 checksum of Key + Value 59 | 60 | If a key is deleted it will have a value size of -1 and no value content. 61 | -------------------------------------------------------------------------------- /docs/img/delete_sequential.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jamesls/semidbm/adcd3b2ad4aa24093402f4d9d8c41b71334cda5f/docs/img/delete_sequential.png -------------------------------------------------------------------------------- /docs/img/fill_sequential.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jamesls/semidbm/adcd3b2ad4aa24093402f4d9d8c41b71334cda5f/docs/img/fill_sequential.png -------------------------------------------------------------------------------- /docs/img/large_delete_sequential.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jamesls/semidbm/adcd3b2ad4aa24093402f4d9d8c41b71334cda5f/docs/img/large_delete_sequential.png -------------------------------------------------------------------------------- /docs/img/large_fill_sequential.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jamesls/semidbm/adcd3b2ad4aa24093402f4d9d8c41b71334cda5f/docs/img/large_fill_sequential.png -------------------------------------------------------------------------------- /docs/img/large_read_hot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jamesls/semidbm/adcd3b2ad4aa24093402f4d9d8c41b71334cda5f/docs/img/large_read_hot.png 
-------------------------------------------------------------------------------- /docs/img/large_read_random.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jamesls/semidbm/adcd3b2ad4aa24093402f4d9d8c41b71334cda5f/docs/img/large_read_random.png -------------------------------------------------------------------------------- /docs/img/large_read_sequential.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jamesls/semidbm/adcd3b2ad4aa24093402f4d9d8c41b71334cda5f/docs/img/large_read_sequential.png -------------------------------------------------------------------------------- /docs/img/read_hot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jamesls/semidbm/adcd3b2ad4aa24093402f4d9d8c41b71334cda5f/docs/img/read_hot.png -------------------------------------------------------------------------------- /docs/img/read_random.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jamesls/semidbm/adcd3b2ad4aa24093402f4d9d8c41b71334cda5f/docs/img/read_random.png -------------------------------------------------------------------------------- /docs/img/read_sequential.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jamesls/semidbm/adcd3b2ad4aa24093402f4d9d8c41b71334cda5f/docs/img/read_sequential.png -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Semidbm 2 | ======= 3 | 4 | Semidbm is a fast, pure python implementation of a dbm, which is a 5 | persistent key value store. 
It allows you to get and set keys through 6 | a dict interface:: 7 | 8 | import semidbm 9 | db = semidbm.open('testdb', 'c') 10 | db['foo'] = 'bar' 11 | print db['foo'] 12 | db.close() 13 | 14 | These values are persisted to disk, and you can later retrieve 15 | these key/value pairs:: 16 | 17 | # Then at a later time: 18 | db = semidbm.open('testdb', 'r') 19 | # prints "bar" 20 | print db['foo'] 21 | 22 | 23 | It was written with these things in mind: 24 | 25 | * Pure python, supporting python 2.6, 2.7, 3.3, and 3.4. 26 | * Cross platform, works on Windows, Linux, Mac OS X. 27 | * Supports CPython, pypy, and jython (versions 2.7-b3 and higher). 28 | * Simple and Fast (See :doc:`benchmarks`). 29 | 30 | 31 | Post feedback and issues on 32 | `github issues `_, or check out the 33 | latest changes at the `github repo `_. 34 | 35 | 36 | Topics 37 | ------ 38 | 39 | .. toctree:: 40 | :maxdepth: 2 41 | 42 | overview 43 | details 44 | benchmarks 45 | changelog 46 | 47 | 48 | Developer Documentation 49 | ----------------------- 50 | 51 | .. toctree:: 52 | :maxdepth: 2 53 | 54 | api_semidbm 55 | fileformat 56 | 57 | 58 | Indices and tables 59 | ================== 60 | 61 | * :ref:`genindex` 62 | * :ref:`modindex` 63 | * :ref:`search` 64 | 65 | -------------------------------------------------------------------------------- /docs/overview.rst: -------------------------------------------------------------------------------- 1 | ====================== 2 | An Overview of Semidbm 3 | ====================== 4 | 5 | The easiest way to think of semidbm is as an improvement over python's 6 | `dumbdbm `_ module. 7 | 8 | While the standard library has faster dbms based on well established C 9 | libraries (GNU dbm, Berkeley DB, ndbm), dumbdbm is the only pure python 10 | portable dbm in the standard library. 11 | 12 | Semidbm offers a few improvements over dumbdbm including: 13 | 14 | * Better overall performance (more on this later). 
15 | * Only a single file is used (no separate index and data file). 16 | * Data file compaction. Free space can be reclaimed (though this 17 | only happens whenever explicitly asked to do so 18 | using the `compact()` method). 19 | * Get/set/delete require O(1) IO. 20 | 21 | Like dumbdbm, semidbm is cross platform. It has been tested on: 22 | 23 | * Linux (Ubuntu 11.10, debian) 24 | * Mac OS X (Lion/Mountain Lion) 25 | * Windows 7/8. 26 | 27 | There are also a few caveats to consider when using semidbm: 28 | 29 | * The entire index must fit in memory, this means all keys must 30 | fit in memory. 31 | * Not thread safe; can only be accessed by a single process. 32 | * While the performance is reasonable, it still will not beat one of the 33 | standard dbms (GNU dbm, Berkeley DB, etc). 34 | 35 | 36 | Using Semidbm 37 | ============= 38 | 39 | To create a new db, specify the name of the directory:: 40 | 41 | import semidbm 42 | db = semidbm.open('mydb', 'c') 43 | 44 | This will create a *mydb* directory. This directory is where semidbm will 45 | place all the files it needs to keep track of the keys and values stored in the 46 | db. If the directory does not exist, it will be created. 47 | 48 | 49 | Once the db has been created, you can get and set values:: 50 | 51 | db['key1'] = 'value1' 52 | print db['key1'] 53 | 54 | **Keys and values can be either str or bytes.** 55 | 56 | ``str`` types will be encoded to utf-8 before writing to disk. 57 | You can avoid this encoding step by providing a byte string 58 | directly:: 59 | 60 | db[b'key1'] = b'value1' 61 | 62 | Otherwise, semidbm will do the equivalent of:: 63 | 64 | db['key1'.encode('utf-8')] = 'value1'.encode('utf-8') 65 | 66 | It is recommended that you handle the encoding of your strings 67 | in your application, and only use ``bytes`` when working with 68 | semidbm.
The reason for this is that when a value 69 | is retrieved, it is returned as a bytestring (semidbm can't 70 | know the encoding of the bytes it retrieved). For example (this 71 | is with python 3.3):: 72 | 73 | >>> db['foo'] = 'value' 74 | >>> db['foo'] 75 | b'value' 76 | >>> db['kēy'] = 'valueē' 77 | >>> db['kēy'] 78 | b'value\xc4\x93' 79 | 80 | To avoid this confusion, encode your strings before storing with 81 | with semidbm. 82 | 83 | The reason this automatic conversion is supported is that this is 84 | what is done with the DBMs in the python standard library (including 85 | ``dumbdbm`` which this module was intended to be a drop in replacement 86 | for). In order to be able to be a drop in replacement, this 87 | automatic encoding process needs to be supported (but not recommended). 88 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Sphinx==1.2b1 2 | -------------------------------------------------------------------------------- /scripts/adapters/bdb_btopen.py: -------------------------------------------------------------------------------- 1 | import bsddb3 2 | open = bsddb3.btopen 3 | -------------------------------------------------------------------------------- /scripts/adapters/bdb_hashopen.py: -------------------------------------------------------------------------------- 1 | import bsddb3 2 | open = bsddb3.hashopen 3 | -------------------------------------------------------------------------------- /scripts/adapters/bdb_minimal.py: -------------------------------------------------------------------------------- 1 | """A minimal version of bsddb3.DB.""" 2 | import bsddb3.db 3 | 4 | # This might be somewhere in the bsddb3 module, but I wanted to compare the 5 | # performance of bsddb3 with semidbm and I could not have a bare bones dict 6 | # interface exposed with a shelve like interface. 
7 | 8 | def open(filename, mode): 9 | db = bsddb3.db.DB(None) 10 | if mode == 'r': 11 | flags = bsddb3.db.DB_RDONLY 12 | elif mode == 'rw': 13 | flags = 0 14 | elif mode == 'w': 15 | flags = bsddb3.db.DB_CREATE 16 | elif mode == 'c': 17 | flags = bsddb3.db.DB_CREATE 18 | elif mode == 'n': 19 | flags = bsddb3.db.DB_TRUNCATE | bsddb3.db.DB_CREATE 20 | else: 21 | raise bsddb3.db.DBError( 22 | "flags should be one of 'r', 'w', 'c' or 'n' or use the " 23 | "bsddb.db.DB_* flags") 24 | db.open(filename, None, bsddb3.db.DB_HASH, flags) 25 | return db 26 | -------------------------------------------------------------------------------- /scripts/adapters/builtindict.py: -------------------------------------------------------------------------------- 1 | _DB = {} 2 | 3 | def open(*args, **kwargs): 4 | return _DB 5 | -------------------------------------------------------------------------------- /scripts/benchmark: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Very simple script for profiling various dbms. 3 | 4 | The point of this script is to give a rough 5 | estimate for how semidbm does compared to other 6 | dbms. You can run this script with no args or 7 | specify the dbms you want to benchmark using 8 | the --dbm arg. 
9 | 10 | """ 11 | import os 12 | import sys 13 | import stat 14 | import json 15 | import shutil 16 | import optparse 17 | import time 18 | import string 19 | import tempfile 20 | import random 21 | import traceback 22 | 23 | try: 24 | _range = xrange 25 | except NameError: 26 | _range = range 27 | 28 | random.seed(100) 29 | 30 | 31 | _potential_dbms = ['dbhash', 'dbm', 'gdbm', 'dumbdbm', 'semidbm'] 32 | 33 | ADAPTER_DIR = os.path.join(os.path.dirname(__file__), 'adapters') 34 | sys.path.append(ADAPTER_DIR) 35 | out = sys.stdout.write 36 | 37 | 38 | def _rand_key(key_length, chars=string.printable): 39 | return bytes(bytearray(''.join(random.choice(chars) for i in 40 | _range(key_length)))) 41 | 42 | 43 | def set_dbms(dbms): 44 | dbms_found = [] 45 | for potential in dbms: 46 | try: 47 | d = __import__(potential, fromlist=[potential]) 48 | dbms_found.append(d) 49 | except ImportError as e: 50 | sys.stderr.write("Could not import %s: %s\n" % (potential, e)) 51 | continue 52 | return dbms_found 53 | 54 | 55 | class Options(object): 56 | num_keys = 1000000 57 | key_size_bytes = 16 58 | value_size_bytes = 100 59 | 60 | def __init__(self, **kwargs): 61 | self.__dict__.update(kwargs) 62 | 63 | def print_options(self): 64 | stats = (" num_keys : %(num_keys)s\n" 65 | " key_size : %(key_size_bytes)s\n" 66 | " value_size: %(value_size_bytes)s" % self.__dict__) 67 | return stats 68 | 69 | @property 70 | def key_format(self): 71 | return '%0' + str(self.key_size_bytes) + 'd' 72 | 73 | 74 | class StatsReporter(object): 75 | def __init__(self, name, total_time, total_bytes, total_ops): 76 | self._name = name 77 | self._total_time = total_time 78 | self._total_bytes = total_bytes 79 | self._total_ops = total_ops 80 | 81 | def micros_per_op(self): 82 | # Leveldb uses this, so it's useful to compare. 
83 | total_micros = self._total_time * 1e6 84 | return total_micros / self._total_ops 85 | 86 | def ops_per_second(self): 87 | return self._total_ops / float(self._total_time) 88 | 89 | def megabytes_per_second(self): 90 | return self._total_bytes / (1024.0 * 1024) / self._total_time 91 | 92 | def print_report(self): 93 | out("%-20s:" % self._name) 94 | out(" time: %9.3f, micros/ops: %9.3f, ops/s: %10.3f, " 95 | "MB/s: %10.3f\n" % (self._total_time, self.micros_per_op(), 96 | self.ops_per_second(), 97 | self.megabytes_per_second())) 98 | 99 | @property 100 | def name(self): 101 | return self._name 102 | 103 | 104 | class Benchmarks(object): 105 | def __init__(self, options, tmpdir): 106 | self.options = options 107 | self.tmpdir = tmpdir 108 | self.random_values = self._generate_random_string(1024 * 1024) 109 | 110 | def _generate_random_string(self, string_size): 111 | out("Generating random data.\n") 112 | c = chr 113 | rand = random.randint 114 | r = bytes(bytearray([rand(0, 255) for i in _range(string_size)])) 115 | return r 116 | 117 | def run(self, dbm): 118 | print("Benchmarking:", dbm) 119 | print(self.options.print_options()) 120 | all_reports = [] 121 | try: 122 | for name in ['fill_random', 'fill_sequential', 'read_cold', 123 | 'read_sequential', 'read_hot', 'read_random', 124 | 'delete_sequential']: 125 | method = getattr(self, name) 126 | report = method(dbm) 127 | report.print_report() 128 | all_reports.append(report) 129 | finally: 130 | self.delete_dbm() 131 | print 132 | return all_reports 133 | 134 | def fill_random(self, dbm): 135 | db = self._load_dbm(dbm) 136 | random_values = self.random_values 137 | maxlen = len(random_values) 138 | position = 0 139 | value_size = self.options.value_size_bytes 140 | key_size = self.options.key_size_bytes 141 | num_keys = self.options.num_keys 142 | indices = [_rand_key(key_size) for i in _range(num_keys)] 143 | 144 | t = time.time 145 | out = sys.stdout.write 146 | flush = sys.stdout.flush 147 | start = t() 
148 | for i in _range(num_keys): 149 | db[indices[i]] = random_values[position:position+value_size] 150 | position += value_size 151 | if position + value_size > maxlen: 152 | position = 0 153 | out("(%s/%s)\r" % (i, num_keys)) 154 | flush() 155 | total = t() - start 156 | self._close_db(db) 157 | self.delete_dbm() 158 | return StatsReporter( 159 | 'fill_random', total, 160 | (value_size * num_keys) + (self.options.key_size_bytes * num_keys), 161 | num_keys) 162 | 163 | def fill_sequential(self, dbm): 164 | db = self._load_dbm(dbm) 165 | key_format = self.options.key_format 166 | random_values = self.random_values 167 | maxlen = len(random_values) 168 | position = 0 169 | value_size = self.options.value_size_bytes 170 | num_keys = self.options.num_keys 171 | indices = [(key_format % i).encode('utf-8') for i in _range(num_keys)] 172 | 173 | t = time.time 174 | out = sys.stdout.write 175 | flush = sys.stdout.flush 176 | start = t() 177 | for i in _range(num_keys): 178 | db[indices[i]] = random_values[position:position+value_size] 179 | position += value_size 180 | if position + value_size > maxlen: 181 | position = 0 182 | out("(%s/%s)\r" % (i, num_keys)) 183 | flush() 184 | total = t() - start 185 | self._close_db(db) 186 | return StatsReporter( 187 | 'fill_sequential', total, 188 | (value_size * num_keys) + (self.options.key_size_bytes * num_keys), 189 | num_keys) 190 | 191 | def read_sequential(self, dbm, name='read_sequential'): 192 | # Assumes fill_sequential has been called. 
193 | db = self._load_dbm(dbm, 'r') 194 | key_format = self.options.key_format 195 | num_keys = self.options.num_keys 196 | 197 | indices = [(key_format % i).encode('utf-8') for i in _range(num_keys)] 198 | t = time.time 199 | start = t() 200 | for i in _range(num_keys): 201 | db[indices[i]] 202 | total = t() - start 203 | self._close_db(db) 204 | total_bytes = (self.options.key_size_bytes * num_keys + 205 | self.options.value_size_bytes * num_keys) 206 | return StatsReporter(name, total, total_bytes, num_keys) 207 | 208 | def read_cold(self, dbm): 209 | # read_cold is intended to be called before read_sequential or any 210 | # other reads to test the performance of a "cold" read. 211 | return self.read_sequential(dbm, name='read_cold') 212 | 213 | def read_hot(self, dbm): 214 | # Assumes fill_sequential has been called. 215 | # Read from 1% of the database self.options.num_keys times. 216 | # This should test the effectiveness of any caching being used. 217 | num_keys = self.options.num_keys 218 | unique_keys = int(num_keys * 0.01) 219 | indices = [(self.options.key_format % i).encode('utf-8') 220 | for i in random.sample(_range(num_keys), unique_keys)] 221 | indices = indices * (int(num_keys / unique_keys)) 222 | db = self._load_dbm(dbm, 'r') 223 | t = time.time 224 | start = t() 225 | for i in _range(num_keys): 226 | db[indices[i]] 227 | total = t() - start 228 | self._close_db(db) 229 | total_bytes = (self.options.key_size_bytes * num_keys + 230 | self.options.value_size_bytes * num_keys) 231 | return StatsReporter('read_hot', total, total_bytes, 232 | num_keys) 233 | 234 | def read_random(self, dbm): 235 | # This doesn't matter to semidbm because the keys 236 | # aren't ordered, but other dbms might be impacted. 
237 | num_keys = self.options.num_keys 238 | key_format = self.options.key_format 239 | indices = [(key_format % i).encode('utf-8') for i in range(num_keys)] 240 | random.shuffle(indices) 241 | db = self._load_dbm(dbm, 'r') 242 | t = time.time 243 | start = t() 244 | for i in _range(num_keys): 245 | db[indices[i]] 246 | total = t() - start 247 | self._close_db(db) 248 | total_bytes = (self.options.key_size_bytes * num_keys + 249 | self.options.value_size_bytes * num_keys) 250 | return StatsReporter('read_random', total, total_bytes, 251 | num_keys) 252 | 253 | def delete_sequential(self, dbm): 254 | # Assumes fill_sequential has been called. 255 | db = self._load_dbm(dbm, 'c') 256 | key_format = self.options.key_format 257 | num_keys = self.options.num_keys 258 | 259 | indices = [(key_format % i).encode('utf-8') for i in _range(num_keys)] 260 | t = time.time 261 | start = t() 262 | for i in _range(num_keys): 263 | del db[indices[i]] 264 | total = t() - start 265 | self._close_db(db) 266 | total_bytes = (self.options.key_size_bytes * num_keys + 267 | self.options.value_size_bytes * num_keys) 268 | return StatsReporter('delete_sequential', total, total_bytes, num_keys) 269 | 270 | def delete_dbm(self): 271 | # Just wipe out everything under tmpdir. 272 | self._rmtree(self.tmpdir) 273 | 274 | def _rmtree(self, tmpdir): 275 | # Delete everything under tmpdir but don't actually 276 | # delete tmpdir itself. 277 | for path in os.listdir(tmpdir): 278 | full_path = os.path.join(tmpdir, path) 279 | mode = os.lstat(full_path).st_mode 280 | if stat.S_ISDIR(mode): 281 | shutil.rmtree(full_path) 282 | else: 283 | os.remove(full_path) 284 | 285 | def _load_dbm(self, dbm, flags='c'): 286 | db = dbm.open(os.path.join(self.tmpdir, 'db'), flags) 287 | return db 288 | 289 | def _close_db(self, db): 290 | # If the db has a close() method call it. Basically a hack 291 | # so we can benchmark a normal python dict. 
292 | if hasattr(db, 'close'): 293 | db.close() 294 | 295 | 296 | def generate_report(filename, options, reports): 297 | """Create a json report grouped by benchmarks rather than by dbm. 298 | 299 | Since this is going to be used to autogenerate the 300 | charts/tables, a comparison across dbms for a given benchmark 301 | is more useful. The output should look like:: 302 | 303 | {num_keys: 100, key_size_bytes: 16, value_size_bytes: 1000, 304 | dbms: ['semidbm', 'gdbm'], 305 | benchmarks: 306 | [['fill_sequential', [ 307 | {total_time: 100, micros_per_op: 1, 308 | ops_per_second: 123, mb_per_second: 100}]], 309 | ... 310 | ] 311 | } 312 | 313 | 314 | """ 315 | # Generating a report requires python >= 2.7. 316 | from collections import OrderedDict 317 | output = { 318 | 'num_keys': options.num_keys, 319 | 'key_size_bytes': options.key_size_bytes, 320 | 'value_size_bytes': options.value_size_bytes 321 | } 322 | by_benchmarks = OrderedDict() 323 | dbms = [] 324 | for dbm, benchmarks in reports: 325 | dbms.append(dbm) 326 | for benchmark in benchmarks: 327 | by_benchmarks.setdefault(benchmark.name, []).append({ 328 | 'total_time': benchmark.total_time(), 329 | 'micros_per_op': benchmark.micros_per_op(), 330 | 'ops_per_second': benchmark.ops_per_second(), 331 | 'megabytes_per_second': benchmark.megabytes_per_second(), 332 | }) 333 | output['dbms'] = dbms 334 | output['benchmarks'] = by_benchmarks 335 | json.dump(output, open(filename, 'w'), indent=4) 336 | 337 | 338 | def main(): 339 | parser = optparse.OptionParser() 340 | parser.add_option('-d', '--dbm', dest='dbms', action='append') 341 | # These are the same defaults as the leveldb benchmark, 342 | # which this scripts is based off of. 
343 | parser.add_option('-n', '--num-keys', default=1000000, type=int) 344 | parser.add_option('-k', '--key-size-bytes', default=16, type=int) 345 | parser.add_option('-s', '--value-size-bytes', default=100, type=int) 346 | parser.add_option('-r', '--report', help="Generate a summary report " 347 | "in json to specified location.") 348 | opts, args = parser.parse_args() 349 | 350 | 351 | dbm_names = opts.__dict__.pop('dbms') or _potential_dbms 352 | dbms = set_dbms(dbm_names) 353 | if not dbms: 354 | sys.stderr.write("List of dbms is empty.\n") 355 | sys.exit(1) 356 | options = Options(**opts.__dict__) 357 | tmpdir = tempfile.mkdtemp(prefix='dbmprofile') 358 | benchmarks = Benchmarks(options, tmpdir) 359 | all_reports = [] 360 | try: 361 | for dbm in dbms: 362 | try: 363 | all_reports.append((dbm.__name__, benchmarks.run(dbm))) 364 | except Exception as e: 365 | traceback.print_exc() 366 | sys.stderr.write( 367 | "ERROR: exception caught when benchmarking %s: %s\n" % 368 | (dbm, e)) 369 | finally: 370 | shutil.rmtree(tmpdir) 371 | if opts.report: 372 | generate_report(opts.report, options, all_reports) 373 | 374 | 375 | if __name__ == '__main__': 376 | main() 377 | -------------------------------------------------------------------------------- /scripts/loadtime: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import time 3 | import argparse 4 | 5 | import semidbm 6 | 7 | 8 | def measure_laod_time(db_path): 9 | num_loads = 10 10 | times = [] 11 | o = semidbm.open 12 | for i in range(num_loads): 13 | start = time.time() 14 | db = o(db_path, 'c') 15 | times.append(time.time() - start) 16 | db.close() 17 | print "%.5f milliseconds average load time" % ( 18 | (sum(times) / float(num_loads)) * 1000) 19 | 20 | 21 | def main(): 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('db_path') 24 | args = parser.parse_args() 25 | measure_laod_time(args.db_path) 26 | 27 | 28 | if __name__ == 
'__main__': 29 | main() 30 | -------------------------------------------------------------------------------- /scripts/makedb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Time the time to load a semidbm db. 4 | import sys 5 | import random 6 | import string 7 | from argparse import ArgumentParser 8 | import tempfile 9 | import os 10 | 11 | import semidbm 12 | 13 | 14 | try: 15 | _range = xrange 16 | except NameError: 17 | _range = range 18 | 19 | 20 | def _rand_bytes(key_length, chars=string.printable): 21 | return bytes(bytearray(''.join(random.choice(chars) for i in 22 | _range(key_length)))) 23 | 24 | 25 | def populate_db(args): 26 | path = args.output_dir 27 | db = semidbm.open(path, 'c') 28 | sys.stdout.write("Populating the DB...\n") 29 | sys.stdout.write(" - num_keys: %s\n" % args.num_keys) 30 | sys.stdout.write(" - key_size_bytes: %s\n" % args.key_size_bytes) 31 | sys.stdout.write(" - val_size_bytes: %s\n" % args.value_size_bytes) 32 | sys.stdout.flush() 33 | key_size_bytes = args.key_size_bytes 34 | value_size_bytes = args.value_size_bytes 35 | for i in range(args.num_keys): 36 | db[_rand_bytes(key_size_bytes)] = _rand_bytes(value_size_bytes) 37 | sys.stdout.write("\nDone") 38 | db.close() 39 | 40 | 41 | def main(): 42 | parser = ArgumentParser() 43 | parser.add_argument('-n', '--num-keys', default=1000000, type=int) 44 | parser.add_argument('-k', '--key-size-bytes', default=16, type=int) 45 | parser.add_argument('-s', '--value-size-bytes', default=100, type=int) 46 | parser.add_argument('output_dir', help="Location of db to create.") 47 | 48 | args = parser.parse_args() 49 | populate_db(args) 50 | 51 | 52 | if __name__ == '__main__': 53 | main() 54 | -------------------------------------------------------------------------------- /scripts/makegraphs: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # This script is used to 
def generate_charts(results, benchmark_to_use, results_filename):
    """Write one bar chart per benchmark into docs/img/.

    :param results: Parsed benchmark report (contents of the .json file
        produced by scripts/benchmark); must contain 'dbms' and
        'benchmarks' keys plus the num_keys/key_size_bytes/
        value_size_bytes parameters used in the title.
    :param benchmark_to_use: Which per-run metric to plot
        (e.g. "ops_per_second").
    :param results_filename: Name of the report file; its basename is
        used to build the output image filenames.
    """
    labels = results['dbms']
    font = {'size': 7}
    matplotlib.rc('font', **font)

    for name, benchmark in results['benchmarks'].iteritems():
        data = [b[benchmark_to_use] for b in benchmark]
        title = name
        title += ('(numkeys=%(num_keys)s,keysize=%(key_size_bytes)s,'
                  'valsize=%(value_size_bytes)s)' % results)
        xlocations = na.array(range(len(data))) + 0.5
        width = 0.5
        p.gcf().set_size_inches(5, 5)
        p.bar(xlocations, data, width=width)
        ymax = int(max(data)) + 1
        # Clamp the tick step to at least 1: ymax // 10 truncates to 0
        # for ymax < 10, and range() raises ValueError on a zero step.
        p.yticks(range(0, ymax, max(1, ymax // 10)))
        p.ylim(0, ymax)
        p.xticks(xlocations + width / 2, labels)
        p.xlim(0, xlocations[-1] + width * 2)
        p.ylabel(benchmark_to_use)
        p.title(title)
        p.gca().get_xaxis().tick_bottom()
        p.gca().get_yaxis().tick_left()
        p.savefig('docs/img/' + '%s_%s' %
                  (os.path.splitext(results_filename)[0], name))
        p.close('all')
def main():
    """Benchmark write/read transactions-per-second for a dbm module.

    Python 2 only (print statements, xrange).  Creates a throwaway db
    under a tempdir, times sequential writes, optional sequential
    reads, and an optional "chunked" hot-read pass, then cleans up.
    """
    parser = optparse.OptionParser()
    parser.add_option('-n', '--num-transactions', default=1000000, type=int)
    parser.add_option('-c', '--chunk-size', default=10000, type=int,
                      help="For the read chunked tests, this will "
                      "set how many elements to iterate over at a time")
    parser.add_option('-r', '--repeat', default=10, type=int,
                      help="For the read chunked tests, this will "
                      "specify how many times to iterate over the chunks "
                      "before moving on to the next chunk.")
    parser.add_option('-t', '--skip-read-test', action="store_true",
                      default=False, help="Skip the sequential read test "
                      "(useful if you just want to benchmark writes)")
    parser.add_option('-s', '--skip-read-chunk', action="store_true",
                      default=False, help="Skip the read chunk test "
                      "(it can take a while to run).")
    parser.add_option('-d', '--dbm', default='semidbm')
    opts, args = parser.parse_args()

    tempdir = tempfile.mkdtemp(prefix='tps')
    dbname = os.path.join(tempdir, 'tps.db')
    try:
        # Any module exposing the dbm-style open() works here
        # (semidbm by default, see the -d option).
        dbm_module = __import__(opts.dbm, fromlist=[opts.dbm])
    except ImportError:
        sys.stderr.write("Can't import dbm: %s\n" % opts.dbm)
        sys.exit(1)
    db = dbm_module.open(dbname, 'c')

    num_transactions = opts.num_transactions
    groups_of = opts.chunk_size
    repeat = opts.repeat

    # Sequential write benchmark: keys and values are both str(i).
    start = time.time()
    for i in xrange(num_transactions):
        db[str(i)] = str(i)
    end = time.time()
    print "Write ",
    print "Total: %.5f, tps: %.2f" % (end - start,
                                      float(num_transactions) / (end - start))
    if not opts.skip_read_test:
        # Reopen read-only so the read path being measured matches
        # normal read usage.
        db.close()
        db = dbm_module.open(dbname, 'r')
        start = time.time()
        for i in xrange(num_transactions):
            db[str(i)]
        end = time.time()

        print "Read ",
        print "Total: %.5f, tps: %.2f" % (end - start,
                                          float(num_transactions) /
                                          (end - start))

    if not opts.skip_read_chunk:
        # "Hot" reads: re-read each chunk of groups_of keys repeat
        # times before moving on to the next chunk.
        count = 0
        start = time.time()
        for i in xrange(0, num_transactions, groups_of):
            for j in xrange(groups_of):
                # NOTE(review): if num_transactions is not a multiple of
                # groups_of, i + j can index past the last written key --
                # confirm -n/-c are always passed as compatible values.
                for k in xrange(repeat):
                    count += 1
                    db[str(i + j)]
        end = time.time()
        print "Read (grouped)",
        print "count:", count
        print "Total: %.5f, tps: %.2f" % (end - start,
                                          float(count) / (end - start))
    db.close()
    shutil.rmtree(tempdir)
class _SemiDBM(object):
    """An append-only dbm with the full key index kept in memory.

    :param dbdir: The directory containing the dbm files.  If the directory
        does not exist it will be created.

    :param renamer: Callable taking (from_file, to_file), used to rename
        the compacted data file over the live one (platform specific).

    :param data_loader: Object exposing ``iter_keys(filename)``; used to
        scan the data file when (re)building the in memory index.

    :param verify_checksums: If true, the crc32 checksum of every value
        is verified on each ``__getitem__`` call.

    """
    def __init__(self, dbdir, renamer, data_loader=None,
                 verify_checksums=False):
        self._renamer = renamer
        self._data_loader = data_loader
        self._dbdir = dbdir
        self._data_filename = os.path.join(dbdir, 'data')
        # The in memory index, mapping of key to (offset, size).
        self._index = None
        self._data_fd = None
        self._verify_checksums = verify_checksums
        self._current_offset = 0
        self._load_db()

    def _create_db_dir(self):
        # The db "filename" is really a directory; create it on demand.
        if not os.path.exists(self._dbdir):
            os.makedirs(self._dbdir)

    def _load_db(self):
        self._create_db_dir()
        self._index = self._load_index(self._data_filename)
        self._data_fd = os.open(self._data_filename, compat.DATA_OPEN_FLAGS)
        # All writes are appends, so track the end-of-file offset.
        self._current_offset = os.lseek(self._data_fd, 0, os.SEEK_END)

    def _load_index(self, filename):
        # This method is only used upon instantiation to populate
        # the in memory index.
        if not os.path.exists(filename):
            self._write_headers(filename)
            return {}
        try:
            return self._load_index_from_fileobj(filename)
        except ValueError as e:
            raise DBMLoadError("Bad index file %s: %s" % (filename, e))

    def _write_headers(self, filename):
        # Write the 8 byte header of a brand new data file.
        with _open(filename, 'wb') as f:
            # Magic number identifier.
            f.write(FILE_IDENTIFIER)
            # File version format.
            f.write(struct.pack('!HH', *FILE_FORMAT_VERSION))

    def _load_index_from_fileobj(self, filename):
        """Rebuild the in memory index by replaying the data file.

        Entries are replayed in file order, so the last entry written
        for a key determines what the index records for it.

        :return: dict mapping key (bytes) to an (offset, size) tuple.
        """
        index = {}
        for key_name, offset, size in self._data_loader.iter_keys(filename):
            size = int(size)
            offset = int(offset)
            if size == _DELETED:
                # This is a deleted item so we need to make sure that this
                # value is not in the index.  We know that the key is
                # already in the index, because a delete is only written
                # to the db if the key already exists.
                del index[key_name]
            else:
                # New key or update of an existing key: either way the
                # latest (offset, size) wins.  (The original code had an
                # if/else here whose two branches were identical.)
                index[key_name] = (offset, size)
        return index

    def __getitem__(self, key, read=os.read, lseek=os.lseek,
                    seek_set=os.SEEK_SET, str_type=compat.str_type,
                    isinstance=isinstance):
        # The os/compat names are bound as keyword defaults as a CPython
        # micro-optimization: local lookups are faster than globals.
        if isinstance(key, str_type):
            key = key.encode('utf-8')
        offset, size = self._index[key]
        lseek(self._data_fd, offset, seek_set)
        if not self._verify_checksums:
            return read(self._data_fd, size)
        else:
            # Checksum is at the end of the value.
            data = read(self._data_fd, size + 4)
            return self._verify_checksum_data(key, data)

    def _verify_checksum_data(self, key, data):
        # key is the bytes of the key,
        # data is the bytes of the value + 4 byte checksum at the end.
        value = data[:-4]
        expected = struct.unpack('!I', data[-4:])[0]
        # The stored checksum covers key + value, so seed the crc
        # with the key before folding in the value.
        actual = crc32(key)
        actual = crc32(value, actual)
        if actual & 0xffffffff != expected:
            raise DBMChecksumError(
                "Corrupt data detected: invalid checksum for key %s" % key)
        return value

    def __setitem__(self, key, value, len=len, crc32=crc32, write=os.write,
                    str_type=compat.str_type, pack=struct.pack,
                    isinstance=isinstance):
        if isinstance(key, str_type):
            key = key.encode('utf-8')
        if isinstance(value, str_type):
            value = value.encode('utf-8')
        # Append the new entry at the end of the file.  On-disk format:
        #   4 bytes    4 bytes    keysize  valsize  4 bytes
        #   key size | val size | key    | value  | crc32(key + value)
        key_size = len(key)
        val_size = len(value)
        keyval_size = pack('!ii', key_size, val_size)
        keyval = key + value
        checksum = pack('!I', crc32(keyval) & 0xffffffff)
        blob = keyval_size + keyval + checksum

        write(self._data_fd, blob)
        # Update the in memory index: the value starts after the
        # 8 byte size header and the key bytes.
        self._index[key] = (self._current_offset + 8 + key_size,
                            val_size)
        self._current_offset += len(blob)

    def __contains__(self, key):
        return key in self._index

    def __delitem__(self, key, len=len, write=os.write, deleted=_DELETED,
                    str_type=compat.str_type, isinstance=isinstance,
                    crc32=crc32, pack=struct.pack):
        if isinstance(key, str_type):
            key = key.encode('utf-8')
        # A delete is written as a normal entry whose value size is the
        # deleted sentinel (and which carries no value bytes).  Use the
        # prebound `deleted` local; the original bound it as a default
        # but then read the module-level _DELETED anyway.
        key_size = pack('!ii', len(key), deleted)
        crc = pack('!I', crc32(key) & 0xffffffff)
        blob = key_size + key + crc

        write(self._data_fd, blob)
        del self._index[key]
        self._current_offset += len(blob)

    def __iter__(self):
        for key in self._index:
            yield key

    def keys(self):
        """Return all the keys in the db.

        The keys are returned in an arbitrary order.

        """
        return self._index.keys()

    def values(self):
        """Return all the values in the db (arbitrary order)."""
        return [self[key] for key in self._index]

    def close(self, compact=False):
        """Close the db.

        The data is synced to disk and the db is closed.
        Once the db has been closed, no further reads or writes
        are allowed.

        :param compact: Indicate whether or not to compact the db
            before closing the db.

        """
        if compact:
            self.compact()
        self.sync()
        os.close(self._data_fd)

    def sync(self):
        """Sync the db to disk.

        This will flush any of the existing buffers and
        fsync the data to disk.

        You should call this method to guarantee that the data
        is written to disk.  This method is also called whenever
        the dbm is `close()`'d.

        """
        # The files are opened unbuffered so we don't technically
        # need to flush the file objects.
        os.fsync(self._data_fd)

    def compact(self):
        """Compact the db to reduce space.

        This method will compact the data file and the index file.
        This is needed because of the append only nature of the index
        and data files.  This method will read the index and data file
        and write out smaller but equivalent versions of these files.

        As a general rule of thumb, the more non read updates you do,
        the more space you'll save when you compact.

        """
        # Basically, compaction works by opening a new db, writing
        # all the keys from this db to the new db, renaming the
        # new db to the filenames associated with this db, and
        # reopening the files associated with this db.  This
        # implementation can certainly be more efficient, but compaction
        # is really slow anyways.
        new_db = self.__class__(os.path.join(self._dbdir, 'compact'),
                                data_loader=self._data_loader,
                                renamer=self._renamer)
        for key in self._index:
            new_db[key] = self[key]
        new_db.sync()
        new_db.close()
        os.close(self._data_fd)
        self._renamer(new_db._data_filename, self._data_filename)
        os.rmdir(new_db._dbdir)
        # The index is already compacted so we don't need to compact it.
        self._load_db()
class _SemiDBMNew(_SemiDBM):
    """Always start from an empty db (the ``'n'`` open flag).

    Any data file left over from a previous db in the same directory is
    removed before the normal load takes place.

    """
    def _load_db(self):
        self._create_db_dir()
        self._remove_files_in_dbdir()
        super(_SemiDBMNew, self)._load_db()

    def _remove_files_in_dbdir(self):
        # A brand new db must not see data from an earlier one, so
        # clear out any existing data file first.
        if not os.path.exists(self._data_filename):
            return
        os.remove(self._data_filename)
def open(filename, flag='r', mode=0o666, verify_checksums=False):
    """Open a semidbm database.

    :param filename: The name of the db.  Note that for semidbm,
        this is actually a directory name.  The argument is named
        `filename` to be compatible with the dbm interface.

    :param flag: Specifies how the db should be opened.
        `flag` can be any of these values:

        +---------+-------------------------------------------+
        | Value   | Meaning                                   |
        +=========+===========================================+
        | ``'r'`` | Open existing database for reading only   |
        |         | (default)                                 |
        +---------+-------------------------------------------+
        | ``'w'`` | Open existing database for reading and    |
        |         | writing                                   |
        +---------+-------------------------------------------+
        | ``'c'`` | Open database for reading and writing,    |
        |         | creating it if it doesn't exist           |
        +---------+-------------------------------------------+
        | ``'n'`` | Always create a new, empty database, open |
        |         | for reading and writing                   |
        +---------+-------------------------------------------+

    :param mode: Not currently used (provided to be compatible with
        the dbm interface).

    :param verify_checksums: Verify the checksums for each value
        are correct on every __getitem__ call (defaults to False).

    """
    kwargs = _create_default_params(verify_checksums=verify_checksums)
    # Map each dbm flag to the class implementing that open mode.
    flag_to_class = {
        'r': _SemiDBMReadOnly,
        'c': _SemiDBM,
        'w': _SemiDBMReadWrite,
        'n': _SemiDBMNew,
    }
    db_class = flag_to_class.get(flag)
    if db_class is None:
        raise ValueError("flag argument must be 'r', 'c', 'w', or 'n'")
    return db_class(filename, **kwargs)
class MMapLoader(DBMLoader):
    """Load the db index by mmap'ing the data file.

    The file is scanned through a sliding window of
    ``_MAPPED_LOAD_PAGES`` allocation granules that is remapped as the
    scan crosses a window boundary, so very large files never need to
    be mapped in one piece.

    """
    def __init__(self):
        pass

    def iter_keys(self, filename):
        # yields keyname, offset, size
        f = compat.file_open(filename, 'rb')
        header = f.read(8)
        self._verify_header(header)
        contents = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
        remap_size = mmap.ALLOCATIONGRANULARITY * _MAPPED_LOAD_PAGES
        # We need to track the max_index to use as the upper bound
        # in the .find() calls to be compatible with python 2.6.
        # There's a bug in python 2.6 where if an offset is specified
        # along with a size of 0, then the size for mmap() is the size
        # of the file instead of the size of the file - offset.  To
        # fix this, we track this ourself and make sure we never go passed
        # max_index.  If we don't do this, python2.6 will crash with
        # a bus error (python2.7 works fine without this workaround).
        # See http://bugs.python.org/issue10916 for more info.
        max_index = os.path.getsize(filename)
        file_size_bytes = max_index
        num_resizes = 0
        # Start just past the 8 byte file header.
        current = 8
        try:
            while current != max_index:
                try:
                    # Each entry starts with two 4-byte sizes:
                    # key size and value size.
                    key_size, val_size = struct.unpack(
                        '!ii', contents[current:current+8])
                except struct.error:
                    raise DBMLoadError()
                key = contents[current+8:current+8+key_size]
                if len(key) != key_size:
                    raise DBMLoadError()
                # Absolute file offset of the value: account for the
                # windows already scanned past (current is relative to
                # the currently mapped window).
                offset = (remap_size * num_resizes) + current + 8 + key_size
                if offset + val_size > file_size_bytes:
                    # If this happens then the index is telling us
                    # to read past the end of the file.  What we need
                    # to do is stop reading from the index.
                    return
                yield (key, offset, val_size)
                if val_size == _DELETED:
                    # Tombstone entries carry no value bytes to skip.
                    val_size = 0
                # Also need to skip past the 4 byte checksum, hence
                # the '+ 4' at the end
                current = current + 8 + key_size + val_size + 4
                if current >= remap_size:
                    # Crossed a window boundary: remap the next window.
                    contents.close()
                    num_resizes += 1
                    offset = num_resizes * remap_size
                    # Windows python2.6 bug.  You can't specify a length of
                    # 0 with an offset, otherwise you get a WindowsError, not
                    # enough storage is available to process this command.
                    # Couldn't find an issue for this, but the workaround
                    # is to specify the actual length of the mmap'd region
                    # which is the total size minus the offset we want.
                    contents = mmap.mmap(f.fileno(), file_size_bytes - offset,
                                         access=mmap.ACCESS_READ,
                                         offset=offset)
                    # current/max_index are window-relative from here on.
                    current -= remap_size
                    max_index -= remap_size
        finally:
            contents.close()
            f.close()
def rename(src, dst):
    """Atomically rename ``src`` onto ``dst`` on Windows.

    Equivalent to os.rename() in POSIX; uses ReplaceFile because
    os.rename() on windows fails when the destination already exists.

    :raises OSError: if ReplaceFile reports failure.
    """
    # Atomic renames in windows!
    # Equivalent to os.rename() in POSIX.
    # Yes the args here seem backwards but this is in fact
    # the awesomeness of windows just being different.
    # (ReplaceFile takes the replaced file first, then the replacement.)
    rc = kernel32.ReplaceFile(LPCTSTR(dst), LPCTSTR(src), None, 0, None, None)
    if rc == 0:
        # ReplaceFile returns zero on failure.
        # While some sort of error is better than nothing,
        # I think there's a way to get a better error message
        # from another win32 call.
        # NOTE(review): GetLastError via a plain windll call can be
        # clobbered by ctypes' own API use; ctypes.get_last_error()
        # (with use_last_error=True on the DLL) is the documented
        # reliable route -- confirm before trusting this error code.
        raise OSError("can't rename file, error: %s" % kernel32.GetLastError())
class SemiDBMTest(unittest.TestCase):
    """Shared fixture: a scratch db directory plus raw data-file helpers."""

    def setUp(self):
        # Every test runs against its own throwaway directory.
        self.tempdir = tempfile.mkdtemp(prefix='semidbm_ut')
        self.dbdir = os.path.join(self.tempdir, 'myfile.db')

    def tearDown(self):
        shutil.rmtree(self.tempdir)

    def open_db_file(self, **kwargs):
        # Open (creating on first use) the db under test.
        return semidbm.open(self.dbdir, 'c', **kwargs)

    def open_data_file(self, dbdir=None, mode='r'):
        # Return a plain file object over the raw 'data' file, creating
        # the containing directory on demand.
        target_dir = self.dbdir if dbdir is None else dbdir
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        return open(os.path.join(target_dir, 'data'), mode=mode)

    def truncate_data_file(self, bytes_from_end):
        # Chop bytes_from_end bytes off the data file to simulate a
        # partial (truncated) write.
        with self.open_data_file(mode='rb') as data_file:
            remaining = data_file.read()[:-bytes_from_end]
        with self.open_data_file(mode='wb') as data_file:
            data_file.write(remaining)
58 | db = self.open_db_file() 59 | db['foo'] = 'bar' 60 | db.close() 61 | 62 | db2 = self.open_db_file() 63 | self.assertEqual(db2['foo'], b'bar') 64 | db2.close() 65 | 66 | def test_insert_multiple(self): 67 | db = self.open_db_file() 68 | db['one'] = '1' 69 | db['two'] = '2' 70 | db['three'] = '3' 71 | self.assertEqual(db['one'], b'1') 72 | self.assertEqual(db['two'], b'2') 73 | self.assertEqual(db['three'], b'3') 74 | db.close() 75 | 76 | def test_intermixed_inserts_and_retrievals(self): 77 | db = self.open_db_file() 78 | db['one'] = '1' 79 | db['two'] = '2' 80 | self.assertEqual(db['one'], b'1') 81 | db['three'] = '3' 82 | self.assertEqual(db['two'], b'2') 83 | self.assertEqual(db['three'], b'3') 84 | db.close() 85 | 86 | def test_keyerror_raised_when_key_does_not_exist(self): 87 | db = self.open_db_file() 88 | self.assertRaises(KeyError, db.__getitem__, 'one') 89 | db.close() 90 | 91 | def test_updates(self): 92 | db = self.open_db_file() 93 | db['one'] = 'foo' 94 | db['one'] = 'bar' 95 | self.assertEqual(db['one'], b'bar') 96 | db['one'] = 'baz' 97 | self.assertEqual(db['one'], b'baz') 98 | db.close() 99 | 100 | def test_updates_persist(self): 101 | db = self.open_db_file() 102 | db['one'] = 'foo' 103 | db['one'] = 'bar' 104 | db['one'] = 'baz' 105 | db.close() 106 | 107 | db2 = self.open_db_file() 108 | self.assertEqual(db2['one'], b'baz') 109 | db2.close() 110 | 111 | def test_contains(self): 112 | db = self.open_db_file() 113 | db[b'one'] = 'foo' 114 | self.assertTrue(b'one' in db) 115 | db.close() 116 | 117 | def test_deletes(self): 118 | db = self.open_db_file() 119 | db['foo'] = 'bar' 120 | del db['foo'] 121 | self.assertTrue('foo' not in db) 122 | db.close() 123 | 124 | def test_delete_key_not_there_when_reopened(self): 125 | db = self.open_db_file() 126 | db['foo'] = 'foo' 127 | db['bar'] = 'bar' 128 | del db['foo'] 129 | db.close() 130 | 131 | db2 = self.open_db_file() 132 | self.assertTrue('foo' not in db2) 133 | self.assertEqual(db2['bar'], b'bar') 
134 | db2.close() 135 | 136 | def test_multiple_deletes(self): 137 | db = self.open_db_file() 138 | db['foo'] = 'foo' 139 | del db['foo'] 140 | db['foo'] = 'foo' 141 | del db['foo'] 142 | db['foo'] = 'foo' 143 | del db['foo'] 144 | db['bar'] = 'bar' 145 | db.close() 146 | db2 = self.open_db_file() 147 | self.assertTrue('foo' not in db2) 148 | self.assertEqual(db2['bar'], b'bar') 149 | db2.close() 150 | 151 | def test_keys_method(self): 152 | db = self.open_db_file() 153 | db['one'] = 'bar' 154 | db['two'] = 'bar' 155 | db['three'] = 'bar' 156 | self.assertEqual(set(db.keys()), set([b'one', b'two', b'three'])) 157 | db.close() 158 | 159 | def test_values_method(self): 160 | db = self.open_db_file() 161 | db['one'] = 'one_value' 162 | db['two'] = 'two_value' 163 | db['three'] = 'three_value' 164 | self.assertEqual(set(db.values()), set([b'one_value', b'two_value', 165 | b'three_value'])) 166 | db.close() 167 | 168 | def test_iterate(self): 169 | db = self.open_db_file() 170 | db['one'] = 'foo' 171 | db['two'] = 'bar' 172 | db['three'] = 'baz' 173 | self.assertEqual(set(db), set([b'one', b'two', b'three'])) 174 | db.close() 175 | 176 | def test_sync_contents(self): 177 | # So there's not really a good way to test this, so 178 | # I'm just making sure you can call it, and you can see the data. 
179 | db = self.open_db_file() 180 | db['foo'] = 'bar' 181 | db.sync() 182 | db.close() 183 | db2 = self.open_db_file() 184 | self.assertEqual(db2['foo'], b'bar') 185 | db2.close() 186 | 187 | def test_compaction_does_not_leave_behind_files(self): 188 | db = self.open_db_file() 189 | before = len(os.listdir(self.dbdir)) 190 | for i in range(10): 191 | db[str(i)] = str(i) 192 | for i in range(10): 193 | del db[str(i)] 194 | db.close() 195 | db2 = self.open_db_file() 196 | db2.compact() 197 | db2.close() 198 | after = len(os.listdir(self.dbdir)) 199 | self.assertEqual(before, after, os.listdir(self.dbdir)) 200 | 201 | def test_inserts_after_deletes(self): 202 | db = self.open_db_file() 203 | db['one'] = b'one' 204 | del db['one'] 205 | db['two'] = b'two' 206 | 207 | self.assertEqual(db['two'], b'two') 208 | db.close() 209 | 210 | def test_mixed_updates_and_deletes(self): 211 | db = self.open_db_file() 212 | db['one'] = 'one' 213 | db['CHECK'] = 'original' 214 | db['two'] = 'two' 215 | db['CHECK'] = 'updated' 216 | del db['CHECK'] 217 | db['three'] = 'three' 218 | 219 | self.assertEqual(db['one'], b'one') 220 | self.assertEqual(db['two'], b'two') 221 | self.assertEqual(db['three'], b'three') 222 | db.close() 223 | 224 | def test_compact_and_retrieve_data(self): 225 | db = self.open_db_file() 226 | db['one'] = 'foo' 227 | db['key'] = 'original' 228 | db['two'] = 'bar' 229 | db['key'] = 'updated' 230 | del db['key'] 231 | db['three'] = 'baz' 232 | db.compact() 233 | self.assertEqual(db['one'], b'foo') 234 | self.assertEqual(db['two'], b'bar') 235 | self.assertEqual(db['three'], b'baz') 236 | db.close() 237 | 238 | def test_compact_on_close(self): 239 | db = self.open_db_file() 240 | db['key'] = 'original' 241 | del db['key'] 242 | db.close(compact=True) 243 | # Header is 8 bytes. 
244 | self.assertEqual(len(open(db._data_filename).read()), 8) 245 | 246 | def test_compact_then_write_data(self): 247 | db = self.open_db_file() 248 | db['before'] = 'before' 249 | del db['before'] 250 | db.compact() 251 | db['after'] = 'after' 252 | db.close() 253 | 254 | db2 = self.open_db_file() 255 | self.assertEqual(db2['after'], b'after') 256 | db2.close() 257 | 258 | def test_bad_magic_number(self): 259 | db = self.open_db_file() 260 | db['foo'] = 'bar' 261 | db.close() 262 | with self.open_data_file(mode='rb+') as f: 263 | f.seek(0) 264 | f.write(b'Z') 265 | # Opening the db file should now fail. 266 | self.assertRaises(semidbm.DBMLoadError, self.open_db_file) 267 | 268 | def test_incompatible_version_number(self): 269 | db = self.open_db_file() 270 | db['foo'] = 'bar' 271 | db.close() 272 | with self.open_data_file(mode='rb+') as f: 273 | f.seek(4) 274 | f.write(struct.pack('!H', 2)) 275 | # Opening the db file should now fail. 276 | self.assertRaises(semidbm.DBMLoadError, self.open_db_file) 277 | 278 | def test_recover_from_last_failed_write(self): 279 | # Testing this scenario: 280 | # - we're writing a large object, we write the entry 281 | # header properly but we crash so we don't write out the 282 | # full value. The next time the db is loaded we should 283 | # be able to recover from this situation. 284 | db = self.open_db_file() 285 | # First write a few good keys. 286 | db['foobar'] = 'foobar' 287 | db['key'] = 'value' 288 | db['key2'] = 'value2' 289 | # Now simulate a failing write. 290 | db['largevalue'] = 'foobarbaz' * 1024 291 | db.close() 292 | # This is implementation specific, but we're going to read the raw data 293 | # file and truncate it. 
294 | with self.open_data_file(mode='rb') as f: 295 | filename = f.name 296 | original_size = os.path.getsize(filename) 297 | self.truncate_data_file(bytes_from_end=100) 298 | db2 = self.open_db_file() 299 | self.assertEquals(db2['foobar'], b'foobar') 300 | self.assertEquals(db2['key'], b'value') 301 | self.assertEquals(db2['key2'], b'value2') 302 | # But largevalue is not there, we recovered and just removed it. 303 | self.assertNotIn('largevalue', db2) 304 | # And when we compact the data file, the junk data 305 | # is ignored and not written to the new file. 306 | db2.compact() 307 | db2.close() 308 | new_size = os.path.getsize(filename) 309 | self.assertTrue(new_size < original_size) 310 | 311 | def test_file_thats_truncated(self): 312 | # Let's say that the file header is fine, but part 313 | # of the header for an individual record has been 314 | # trunated. 315 | db = self.open_db_file() 316 | db['foo'] = 'bar' 317 | db.close() 318 | # Now let's truncate the file to only 10 bytes which 319 | # will include the file header and part of an entry 320 | # header. 321 | with self.open_data_file(mode='rb') as f: 322 | contents = f.read() 323 | with self.open_data_file(mode='wb') as f2: 324 | f2.write(contents[:10]) 325 | self.assertRaises(semidbm.DBMLoadError, self.open_db_file) 326 | 327 | def test_key_size_says_to_read_past_end_of_file(self): 328 | # We can create this situation by creating an entry 329 | # and truncating the key/value part. 330 | db = self.open_db_file() 331 | db['foo'] = 'bar' 332 | db.close() 333 | # From the end we have a 4 byte checksum + 3 bytes for 334 | # the key and 3 bytes for the value, or a total of 335 | # 10 bytes. We'll chop off 8 which means we're missing 336 | # the checksum, the value, and one byte of the key. 
337 | self.truncate_data_file(bytes_from_end=8) 338 | self.assertRaises(semidbm.DBMLoadError, self.open_db_file) 339 | 340 | 341 | @unittest.skipIf(mmap is None, 'mmap required') 342 | class TestRemapping(SemiDBMTest): 343 | def setUp(self): 344 | import semidbm.loaders.mmapload 345 | super(TestRemapping, self).setUp() 346 | self.original = semidbm.loaders.mmapload._MAPPED_LOAD_PAGES 347 | # Change the number of mapped pages to 1 so that we don't have to write 348 | # as much data. The logic in the code uses this constant, so changing 349 | # the value of the constant won't affect the code logic, it'll just 350 | # make the test run faster. 351 | semidbm.loaders.mmapload._MAPPED_LOAD_PAGES = 1 352 | 353 | def tearDown(self): 354 | super(TestRemapping, self).tearDown() 355 | semidbm.loaders.mmapload._MAPPED_LOAD_PAGES = self.original 356 | 357 | def test_remap_required(self): 358 | # Verify the loading buffer logic works. This is 359 | # really slow. 360 | size = ( 361 | semidbm.loaders.mmapload._MAPPED_LOAD_PAGES * 362 | mmap.ALLOCATIONGRANULARITY * 4) 363 | db = self.open_db_file() 364 | # 100 byte values. 
365 | values = b'abcd' * 25 366 | for i in range(int(size / 100)): 367 | db[str(i)] = values 368 | db.close() 369 | 370 | db2 = self.open_db_file() 371 | for k in db2: 372 | self.assertEqual(db2[k], values) 373 | db2.close() 374 | 375 | 376 | class TestReadOnlyMode(SemiDBMTest): 377 | def open_db_file(self, **kwargs): 378 | return semidbm.open(self.dbdir, 'r', **kwargs) 379 | 380 | def test_cant_setitem(self): 381 | db = self.open_db_file() 382 | self.assertRaises(semidbm.DBMError, db.__setitem__, 'foo', 'bar') 383 | db.close() 384 | 385 | def test_cant_sync(self): 386 | db = self.open_db_file() 387 | self.assertRaises(semidbm.DBMError, db.sync) 388 | db.close() 389 | 390 | def test_cant_compact(self): 391 | db = self.open_db_file() 392 | self.assertRaises(semidbm.DBMError, db.compact) 393 | db.close() 394 | 395 | def test_cant_delitem(self): 396 | db = self.open_db_file() 397 | self.assertRaises(semidbm.DBMError, db.__delitem__, 'foo') 398 | db.close() 399 | 400 | def test_close_never_compacts_index(self): 401 | db = self.open_db_file() 402 | db.calls = [] 403 | db.compact = lambda: db.calls.append('compact') 404 | db.sync = lambda: db.calls.append('sync') 405 | 406 | db.close(compact=True) 407 | 408 | self.assertEqual(db.calls, []) 409 | 410 | def test_open_read_multiple_times(self): 411 | db = semidbm.open(self.dbdir, 'c') 412 | db['foo'] = 'bar' 413 | db.close() 414 | # Open then close db immediately. 
415 | db2 = self.open_db_file() 416 | db2.close() 417 | read_only = self.open_db_file() 418 | self.assertEqual(read_only['foo'], b'bar') 419 | read_only.close() 420 | 421 | def test_can_read_items(self): 422 | db = semidbm.open(self.dbdir, 'c') 423 | db['foo'] = 'bar' 424 | db['bar'] = 'baz' 425 | db['baz'] = 'foo' 426 | db.close() 427 | 428 | read_only = self.open_db_file() 429 | self.assertEqual(read_only[b'foo'], b'bar') 430 | self.assertEqual(read_only[b'bar'], b'baz') 431 | self.assertEqual(read_only[b'baz'], b'foo') 432 | read_only.close() 433 | 434 | def test_key_does_not_exist(self): 435 | db = semidbm.open(self.dbdir, 'c') 436 | db['foo'] = 'bar' 437 | db.close() 438 | 439 | read_only = self.open_db_file() 440 | self.assertRaises(KeyError, read_only.__getitem__, 'bar') 441 | read_only.close() 442 | 443 | def test_checksum_failure(self): 444 | db = semidbm.open(self.dbdir, 'c') 445 | db[b'key'] = b'value' 446 | db.close() 447 | data_file = self.open_data_file(mode='rb') 448 | contents = data_file.read() 449 | data_file.close() 450 | # Changing 'value' to 'Value' should cause a checksum failure. 451 | contents = contents.replace(b'value', b'Value') 452 | data_file = self.open_data_file(mode='wb') 453 | data_file.write(contents) 454 | data_file.close() 455 | db = self.open_db_file(verify_checksums=True) 456 | with self.assertRaises(semidbm.DBMChecksumError): 457 | db['key'] 458 | db.close() 459 | # If checksums are not enabled, an exception is not raised. 460 | db = self.open_db_file(verify_checksums=False) 461 | try: 462 | db['key'] 463 | except semidbm.DBMChecksumError: 464 | self.fail("Checksums were suppose to be disabled.") 465 | finally: 466 | db.close() 467 | 468 | def test_unicode_chars(self): 469 | db = semidbm.open(self.dbdir, 'c') 470 | # cafe with the e-accute. 
471 | db[b'caf\xc3\xa9'] = b'caf\xc3\xa9' 472 | self.assertEqual(db[b'caf\xc3\xa9'], b'caf\xc3\xa9') 473 | db.close() 474 | 475 | 476 | class TestWriteMode(SemiDBMTest): 477 | def test_when_index_file_does_not_exist(self): 478 | self.assertRaises(semidbm.DBMError, semidbm.open, self.dbdir, 'w') 479 | 480 | def test_when_data_file_does_not_exist(self): 481 | self.assertRaises(semidbm.DBMError, semidbm.open, self.dbdir, 'w') 482 | 483 | def test_when_files_exist(self): 484 | db = self.open_db_file() 485 | db['foo'] = 'bar' 486 | db.close() 487 | 488 | db_write_mode = semidbm.open(self.dbdir, 'w') 489 | self.assertEqual(db_write_mode['foo'], b'bar') 490 | db_write_mode.close() 491 | 492 | 493 | class TestNewMode(SemiDBMTest): 494 | def test_when_file_does_not_exist(self): 495 | path = os.path.join(self.tempdir, 'foo.db') 496 | db = semidbm.open(path, 'n') 497 | db['foo'] = 'bar' 498 | self.assertEqual(db['foo'], b'bar') 499 | db.close() 500 | 501 | # Opening the file again should basically blank out 502 | # any existing database. 503 | db = semidbm.open(path, 'n') 504 | self.assertEqual(list(db.keys()), []) 505 | db.close() 506 | 507 | 508 | class TestInvalidModeArgument(unittest.TestCase): 509 | def test_invalid_open_arg_raises_exception(self): 510 | self.assertRaises(ValueError, semidbm.open, 'foo.db', 'z') 511 | 512 | 513 | class TestWithChecksumsOn(TestSemiDBM): 514 | def open_db_file(self, **kwargs): 515 | # If they do not explicitly set verify_checksums 516 | # to something, default to it being on. 
517 | if 'verify_checksums' not in kwargs: 518 | kwargs['verify_checksums'] = True 519 | return semidbm.open(self.dbdir, 'c', **kwargs) 520 | 521 | 522 | class TestSimpleFileLoader(TestSemiDBM): 523 | def open_db_file(self, **kwargs): 524 | kwargs = semidbm.db._create_default_params() 525 | kwargs['data_loader'] = SimpleFileLoader() 526 | return semidbm.db._SemiDBM(self.dbdir, **kwargs) 527 | 528 | 529 | if __name__ == '__main__': 530 | unittest.main() 531 | --------------------------------------------------------------------------------