├── python ├── diskhash │ ├── tests │ │ ├── __init__.py │ │ ├── test_smoke.py │ │ └── test_larger.py │ ├── diskhash_version.py │ ├── __init__.py │ └── _diskhash.c └── .gitignore ├── Setup.hs ├── src ├── .gitignore ├── rtable.py ├── Makefile ├── primes.py ├── primes.h ├── disktest.c ├── diskhashtools.cpp ├── diskhash.hpp ├── diskhash.h ├── rtable.h └── diskhash.c ├── .gitignore ├── stack.yaml ├── MANIFEST.in ├── haskell └── Data │ ├── diskhash2.c │ ├── DiskHash │ └── Tests.hs │ └── DiskHash.hs ├── Makefile ├── stack.yaml.lock ├── .github └── workflows │ ├── build_haskell_w_nix.yml │ └── build_python_w_nix.yml ├── COPYING ├── ChangeLog ├── diskhash.cabal ├── default.nix ├── setup.py └── README.md /python/diskhash/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Setup.hs: -------------------------------------------------------------------------------- 1 | import Distribution.Simple 2 | main = defaultMain 3 | -------------------------------------------------------------------------------- /python/diskhash/diskhash_version.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.4.2" 2 | -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | disktest 3 | libdht.so 4 | diskhashtools 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.stack-work/ 2 | *.so 3 | /build/ 4 | /dist/ 5 | /diskhash.egg-info/ 6 | /.cache/ 7 | -------------------------------------------------------------------------------- /python/.gitignore: -------------------------------------------------------------------------------- 1 | /build/ 2 | /dist/ 3 | 
"""Print a C definition of ``rtable``: 256 random 64-bit words plus a sentinel.

The output is C source text written to stdout (presumably redirected into
``rtable.h`` -- confirm against the build instructions).
"""
from random import random

TABLE_SIZE = 256   # number of random entries printed before the sentinel
WORD_BITS = 64     # width of each entry; the C array is declared as uint64_t


def random_word(bits=WORD_BITS):
    """Return a random non-negative integer built from *bits* fair coin flips.

    The result is always in ``range(2 ** bits)``; ``bits == 0`` yields 0.
    """
    val = 0
    for _ in range(bits):
        # Shift left by one and append a random low bit.
        val = val * 2 + (1 if random() < .5 else 0)
    return val


def main():
    """Write the complete ``rtable`` array definition to stdout."""
    # uint64_t is declared by <stdint.h>; a bare "#include" does not compile.
    print("#include <stdint.h>")
    print("uint64_t rtable [] = {")
    for _ in range(TABLE_SIZE):
        print("{}LLU,".format(random_word()))
    print("0 /* sentinel */")
    print("};")


if __name__ == "__main__":
    main()
check_haskell 12 | 13 | check_haskell: 14 | stack test 15 | 16 | build_python: 17 | python setup.py build_ext --inplace 18 | 19 | check_python: build_python 20 | pytest 21 | 22 | install_python: 23 | python setup.py install 24 | 25 | .PHONY: build_python check_python install_python 26 | .PHONY: all c python haskell check check_haskell 27 | -------------------------------------------------------------------------------- /stack.yaml.lock: -------------------------------------------------------------------------------- 1 | # This file was autogenerated by Stack. 2 | # You should not edit this file by hand. 3 | # For more information, please see the documentation at: 4 | # https://docs.haskellstack.org/en/stable/lock_files 5 | 6 | packages: [] 7 | snapshots: 8 | - completed: 9 | sha256: 1b4c2669e26fa828451830ed4725e4d406acc25a1fa24fcc039465dd13d7a575 10 | size: 714100 11 | url: https://raw.githubusercontent.com/commercialhaskell/stackage-snapshots/master/lts/22/6.yaml 12 | original: lts-22.6 13 | -------------------------------------------------------------------------------- /src/Makefile: -------------------------------------------------------------------------------- 1 | all: disktest libdht.so 2 | 3 | diskhashtools: diskhashtools.cpp diskhash.hpp diskhash.o 4 | g++ $(CFLAGS) -o diskhashtools diskhashtools.cpp diskhash.o 5 | 6 | disktest: disktest.o diskhash.o 7 | gcc $(CFLAGS) -o $@ disktest.o diskhash.o 8 | 9 | libdht.so: diskhash.o 10 | gcc -shared $(CFLAGS) -o $@ $< 11 | 12 | diskhash.o: diskhash.c diskhash.h rtable.h 13 | gcc $(CFLAGS) -fPIC -o $@ -Wall -c diskhash.c 14 | 15 | disktest.o: disktest.c diskhash.o diskhash.h 16 | gcc $(CFLAGS) -o $@ -Wall -c $< 17 | 18 | .PHONY: all 19 | -------------------------------------------------------------------------------- /python/diskhash/tests/test_smoke.py: -------------------------------------------------------------------------------- 1 | from diskhash import Str2int 2 | from os import unlink, path 3 | 4 | filename 
"""Print a C definition of ``primes``: a geometrically growing table of primes.

Starting at 7, each entry is the first prime at or after 1.7 times the
previous entry; the table stops once an entry reaches 2**46 and is terminated
by a 0 sentinel.  The output is C source text written to stdout.
"""

GROWTH = 1.7      # ratio between consecutive table entries (before rounding)
LIMIT = 2 ** 46   # 64 Teraelements should be enough for everybody


# This code is terribly inefficient, but only runs once.
def prime(n):
    '''Check if a number is prime

    Returns False for every n < 2 (including negatives), True for 2 and 3.
    '''
    if n < 2:
        return False
    # 2 and 3 must be accepted *before* the divisibility test below,
    # otherwise they are rejected as multiples of themselves.
    if n <= 3:
        return True
    if n % 2 == 0 or n % 3 == 0:
        return False

    # Every remaining prime has the form 6k-1 or 6k+1.
    i = 5
    while i * i <= n:
        if n % i == 0 or n % (i + 2) == 0:
            return False
        i += 6
    return True


def next_prime(p):
    '''Return the smallest prime that is >= p'''
    if p <= 2:
        return 2
    if p % 2 == 0:
        p += 1
    # Only odd candidates remain, so step by 2.
    while not prime(p):
        p += 2
    return p


def main():
    '''Write the complete ``primes`` array definition to stdout.'''
    p = 7
    # uint64_t is declared by <stdint.h>; a bare "#include" does not compile.
    print("#include <stdint.h>")
    print("uint64_t primes [] = {")
    while p < LIMIT:
        print("{}, ".format(p))
        p = next_prime(int(p * GROWTH))
    print("0 /* sentinel */")
    print("};")


if __name__ == "__main__":
    main()
23379599, 35 | 39745319, 36 | 67567091, 37 | 114864059, 38 | 195268963, 39 | 331957243, 40 | 564327347, 41 | 959356547, 42 | 1630906177, 43 | 2772540503, 44 | 4713318893, 45 | 8012642153, 46 | 13621491703, 47 | 23156535937, 48 | 39366111097, 49 | 66922388873, 50 | 113768061101, 51 | 193405703917, 52 | 328789696673, 53 | 558942484349, 54 | 950202223409, 55 | 1615343779807, 56 | 2746084425691, 57 | 4668343523687, 58 | 7936183990283, 59 | 13491512783501, 60 | 22935571732013, 61 | 38990471944439, 62 | 66283802305549, 63 | 0 /* sentinel */ 64 | }; 65 | 66 | #endif 67 | -------------------------------------------------------------------------------- /.github/workflows/build_haskell_w_nix.yml: -------------------------------------------------------------------------------- 1 | name: "Build & test (haskell with nix)" 2 | on: 3 | pull_request: 4 | push: 5 | jobs: 6 | tests: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v4 10 | - uses: cachix/install-nix-action@v25 11 | with: 12 | nix_path: nixpkgs=channel:nixos-unstable 13 | extra_nix_config: | 14 | trusted-public-keys = cache.nixos.org-1:6NCHdD59X431o0gWypbMrAURkbJ16ZPMQFGspcDShjY= nix-community.cachix.org-1:mB9FSh9qf2dCimDSUo8Zy7bkq5CX+/rkCWyvRCYg3Fs= hydra.iohk.io:f/Ea+s+dFdN+3Y/G+FDgSq+a5NEWhJGzdjvKNGv0/EQ= 15 | substituters = https://cache.nixos.org https://nix-community.cachix.org https://hydra.iohk.io 16 | - uses: cachix/cachix-action@v14 17 | with: 18 | name: luispedro 19 | # If you chose API tokens for write access OR if you have a private cache 20 | authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}' 21 | 22 | - run: nix build -f . 
haskell.diskhash.components.tests.diskhashtest --out-link haskell-test 23 | - run: haskell-test/bin/diskhashtest 24 | 25 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 2 | Luis Pedro Coelho 3 | 4 | Permission is hereby granted, free of charge, to any person 5 | obtaining a copy of this software and associated documentation 6 | files (the "Software"), to deal in the Software without 7 | restriction, including without limitation the rights to use, 8 | copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the 10 | Software is furnished to do so, subject to the following 11 | conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 18 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 20 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 21 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 22 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 23 | OTHER DEALINGS IN THE SOFTWARE. 
24 | -------------------------------------------------------------------------------- /.github/workflows/build_python_w_nix.yml: -------------------------------------------------------------------------------- 1 | name: "Build & test (python on nix)" 2 | on: 3 | pull_request: 4 | push: 5 | jobs: 6 | tests: 7 | runs-on: ubuntu-latest 8 | strategy: 9 | matrix: 10 | python-version: 11 | - python38 12 | - python39 13 | - python310 14 | - python311 15 | 16 | steps: 17 | - uses: actions/checkout@v4 18 | - uses: cachix/install-nix-action@v25 19 | with: 20 | nix_path: nixpkgs=channel:nixos-unstable 21 | extra_nix_config: | 22 | trusted-public-keys = cache.nixos.org-1:6NCHdD59X431o0gWypbMrAURkbJ16ZPMQFGspcDShjY= nix-community.cachix.org-1:mB9FSh9qf2dCimDSUo8Zy7bkq5CX+/rkCWyvRCYg3Fs= hydra.iohk.io:f/Ea+s+dFdN+3Y/G+FDgSq+a5NEWhJGzdjvKNGv0/EQ= 23 | substituters = https://cache.nixos.org https://nix-community.cachix.org https://hydra.iohk.io 24 | - uses: cachix/cachix-action@v14 25 | with: 26 | name: luispedro 27 | # If you chose API tokens for write access OR if you have a private cache 28 | authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}' 29 | - run: nix build -f . 
python --argstr pythonVersion ${{ matrix.python-version }} 30 | 31 | 32 | -------------------------------------------------------------------------------- /ChangeLog: -------------------------------------------------------------------------------- 1 | Version 0.0.4.2 2019-11-11 by luispedro 2 | * Fix non-ASCII keys 3 | 4 | Version 0.0.4.1 2019-11-05 by luispedro 5 | * Fix Python extension compilation 6 | 7 | Version 0.0.4.0 2017-11-27 by luispedro 8 | * Load hash table into memory 9 | 10 | Version 0.0.3.2 2017-11-09 by luispedro 11 | * Fix Haskell distribution 12 | 13 | Version 0.0.3.1 2017-11-09 by luispedro 14 | * Add rtable.c to distributions 15 | 16 | Version 0.0.3.0 2017-11-09 by luispedro 17 | * Better hash function 18 | 19 | Version 0.0.2.3 2017-10-24 by luispedro 20 | * Fix crashes on very large hashes 21 | 22 | Version 0.0.2.2 2017-10-23 by luispedro 23 | * Better error message for ftruncate failures 24 | 25 | Version 0.0.2.1 2017-10-12 by luispedro 26 | * Export reserve() in Python interface 27 | 28 | Version 0.0.2.0 2017-10-05 by luispedro 29 | * Python improvement: support generic types through struct 30 | * Do not crash when attempting to write to RO tables 31 | * More flexible option checking 32 | * Better error messages 33 | 34 | Version 0.0.1.2 2017-06-27 by luispedro 35 | * Fix cabal release: include C header files in extra-source-files 36 | 37 | Version 0.0.1.1 2017-06-27 by luispedro 38 | * Fix cabal release: include C header files 39 | 40 | Version 0.0.1 2017-06-27 by luispedro 41 | * First release. 
Basic funcionality 42 | -------------------------------------------------------------------------------- /diskhash.cabal: -------------------------------------------------------------------------------- 1 | name: diskhash 2 | version: 0.0.4.2 3 | synopsis: Disk-based hash table 4 | description: Disk-based hash table 5 | category: Data 6 | author: Luis Pedro Coelho 7 | maintainer: Luis Pedro Coelho 8 | license: MIT 9 | license-file: COPYING 10 | cabal-version: >= 1.10 11 | build-type: Simple 12 | bug-reports: https://github.com/luispedro/diskhash/issues 13 | extra-source-files: README.md ChangeLog src/diskhash.h src/primes.h src/rtable.h 14 | 15 | library 16 | default-language: Haskell2010 17 | exposed-modules: Data.DiskHash 18 | hs-source-dirs: haskell/ 19 | C-sources: haskell/Data/diskhash2.c src/diskhash.c 20 | Include-dirs: src/ 21 | ghc-options: -Wall 22 | build-depends: 23 | base > 4.8 && < 5, 24 | bytestring 25 | 26 | Test-Suite diskhashtest 27 | default-language: Haskell2010 28 | type: exitcode-stdio-1.0 29 | main-is: Data/DiskHash/Tests.hs 30 | other-modules: Data.DiskHash 31 | ghc-options: -Wall 32 | hs-source-dirs: haskell/ 33 | include-dirs: src/ 34 | build-depends: 35 | base > 4.8 && < 5, 36 | bytestring, 37 | directory, 38 | diskhash, 39 | tasty, 40 | tasty-quickcheck, 41 | tasty-th, 42 | tasty-hunit 43 | 44 | source-repository head 45 | type: git 46 | location: https://github.com/luispedro/diskhash 47 | -------------------------------------------------------------------------------- /src/disktest.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "diskhash.h" 9 | 10 | 11 | typedef int64_t data_t; 12 | int dht_ismember(HashTable* ht, const char* k) { 13 | return dht_lookup(ht, k) != NULL; 14 | } 15 | 16 | void chomp(char* p) { 17 | for ( ; *p ; ++p) { 18 | if (*p == '\n') { 19 | *p = '\0'; 20 | return; 21 | } 22 | } 23 | } 24 
| 25 | data_t dht_lookup_data_or(HashTable* ht, const char* k, data_t def) { 26 | void* data = dht_lookup(ht, k); 27 | if (!data) return def; 28 | memcpy(&def, data, sizeof(def)); 29 | return def; 30 | } 31 | int main() { 32 | HashTableOpts opts; 33 | opts.key_maxlen = 15; 34 | opts.object_datalen = sizeof(data_t); 35 | char* err; 36 | HashTable* ht = dht_open("testing.dht", opts, O_RDWR|O_CREAT, &err); 37 | if (!ht) { 38 | fprintf(stderr, "Failed opening hash table: %s.\n", err); 39 | free(err); 40 | return 1; 41 | } 42 | 43 | char buffer[256]; 44 | data_t i = 9; 45 | while (fgets(buffer, 255, stdin)) { 46 | chomp(buffer); 47 | printf("Looking for %s: %ld\n", buffer, dht_lookup_data_or(ht, buffer, -1)); 48 | int v = dht_insert(ht, buffer, &i, &err); 49 | if (v < 1) { 50 | printf("dht_insert returned %d; %s.\n", v, err); 51 | free(err); 52 | } 53 | ++i; 54 | } 55 | show_ht(ht); 56 | dht_free(ht); 57 | } 58 | -------------------------------------------------------------------------------- /default.nix: -------------------------------------------------------------------------------- 1 | { pythonVersion ? "python39" }: 2 | let 3 | sources = { 4 | haskellNix = builtins.fetchTarball { 5 | name = "haskell-nix-snap"; 6 | url = "https://github.com/input-output-hk/haskell.nix/archive/c689f01730e5b6c6c16d3947a15689569844c38c.tar.gz"; 7 | sha256 = "09lw2419a5dd9g0ja31hjfqf6d4bzcgr5mrqx0vrvlksmp7a1kzk"; 8 | }; 9 | }; 10 | 11 | haskellNix = import sources.haskellNix { }; 12 | 13 | # Import nixpkgs and pass the haskell.nix provided nixpkgsArgs 14 | pkgs = import 15 | # haskell.nix provides access to the nixpkgs pins which are used by our CI, 16 | # hence you will be more likely to get cache hits when using these. 17 | # But you can also just use your own, e.g. ''. 18 | haskellNix.sources.nixpkgs-unstable 19 | # These arguments passed to nixpkgs, include some patches and also 20 | # the haskell.nix functionality itself as an overlay. 
class StructHash(object):
    '''Hash table from string keys to fixed-size records described by a
    `struct` format string, backed by the `_Diskhash` extension object.
    '''

    def __init__(self, fname, keysize, structformat, mode, load=False):
        '''
        fname : file name
        keysize : max key size
        structformat : same argument as in the standard Python struct.Struct constructor
        mode: 'r' or 'rw'
        load: whether to load the hash table into memory
        '''
        self.s = Struct(structformat)
        # The backend stores opaque byte records of exactly `self.s.size` bytes.
        self.dh = _Diskhash(fname, keysize, self.s.size, mode, int(load))

    def insert(self, key, *value):
        '''Insert a value into the hash

        Parameters
        ----------
        key: a string
        value: the value to insert

        The value will be passed to the `struct.pack` function with the format
        used to build this object

        Returns
        -------

        Whether the object was inserted (if an object already existed, it is
        *not* inserted).
        '''
        return self.dh.insert(key, memoryview(self.s.pack(*value)))

    def lookup(self, key):
        '''Lookup

        Returns the unpacked record as a tuple, or None if the key is not
        found
        '''
        r = self.dh.lookup(key)
        if r is None:
            return None
        return self.s.unpack(r)

    def reserve(self, n):
        '''Reserve space for future expansion

        Pre-reserving space can make building large hashes significantly
        faster.
        '''
        return self.dh.reserve(n)

    def size(self):
        'Return the size()'
        return self.dh.size()


class Str2int(StructHash):
    '''StructHash specialised to a single integer value per key.

    Records use the struct format "l" (native C long, so the record size
    follows the platform's `long`).
    '''

    def __init__(self, fname, keysize, mode, load=False):
        '''
        fname : file name
        keysize : max key size
        mode: 'r' or 'rw'
        load: whether to load the hash table into memory
        '''
        StructHash.__init__(self, fname, keysize, "l", mode, load)

    def lookup(self, key):
        '''Returns the integer value, or None if the key is not found'''
        val = StructHash.lookup(self, key)
        if val is None:
            return None
        return val[0]
(line.length() > key_maxlen) { 30 | std::cerr << "Key too long: '" << line << "'. Aborting.\n"; 31 | return 2; 32 | } 33 | const bool inserted = ht.insert(line.c_str(), ix); 34 | if (!inserted) { 35 | std::cerr << "Found repeated key '" << line << "' (ignored).\n"; 36 | } 37 | ++ix; 38 | } 39 | } else if (mode == "lookup") { 40 | if (argc < 5 || std::atol(argv[3]) < 0) { 41 | std::cerr << "Usage:\n" 42 | << argv[0] << " lookup FILE.dht key-size input-file\n"; 43 | return 1; 44 | } 45 | const size_t key_maxlen = std::atol(argv[3]); 46 | dht::DiskHash ht(argv[2], key_maxlen, dht::DHOpenRO); 47 | std::string line; 48 | std::ifstream finput(argv[4]); 49 | while (std::getline(finput, line)) { 50 | if (line.length() > key_maxlen) { 51 | std::cout << "-1\n"; 52 | } else { 53 | const uint64_t* val = ht.lookup(line.c_str()); 54 | std::cout << (val ? *val : -1) << '\n'; 55 | } 56 | } 57 | 58 | } else { 59 | std::cerr << "Unknown subcommand: '" << mode << "'\n"; 60 | return 1; 61 | } 62 | return 0; 63 | } 64 | 65 | -------------------------------------------------------------------------------- /src/diskhash.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DISKHASH_HPP_INCLUDE_GUARD__ 2 | #define DISKHASH_HPP_INCLUDE_GUARD__ 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | #include "diskhash.h" 14 | 15 | namespace dht { 16 | enum OpenMode { DHOpenRO, DHOpenRW, DHOpenRWNoCreate }; 17 | 18 | template 19 | struct DiskHash { 20 | static_assert(std::is_trivially_copyable::value, 21 | "DiskHash only works for POD (plain old data) types that can be mempcy()ed around"); 22 | public: 23 | /*** 24 | * Open a diskhash from disk 25 | */ 26 | DiskHash(const char* fname, const int keysize, OpenMode m):ht_(0) { 27 | char* err = nullptr; 28 | int flags; 29 | if (m == DHOpenRO) { 30 | flags = O_RDONLY; 31 | } else if (m == DHOpenRW) { 32 | flags = O_RDWR|O_CREAT; 33 | } else { 34 | flags = 
O_RDWR; 35 | } 36 | HashTableOpts opts; 37 | opts.key_maxlen = keysize; 38 | opts.object_datalen = sizeof(T); 39 | ht_ = dht_open(fname, opts, flags, &err); 40 | if (!ht_) { 41 | if (!err) throw std::bad_alloc(); 42 | std::string error = "Error opening file '" + std::string(fname) + "': " + std::string(err); 43 | std::free(err); 44 | throw std::runtime_error(error); 45 | } 46 | } 47 | DiskHash(DiskHash&& other):ht_(other.ht_) { other.ht_ = 0; } 48 | 49 | ~DiskHash() { 50 | if (ht_) dht_free(ht_); 51 | } 52 | 53 | /** 54 | * Check if key is a member 55 | */ 56 | bool is_member(const char* key) const { return const_cast*>(this)->lookup(key); } 57 | 58 | /** 59 | * Return a pointer to the element (if present, otherwise nullptr). 60 | * 61 | * Note that if the diskhash was not opened in read-write mode, then 62 | * the memory will not be writeable. 63 | */ 64 | T* lookup(const char* key) { 65 | if (!ht_) return nullptr; 66 | return static_cast(dht_lookup(ht_, key)); 67 | } 68 | 69 | /** 70 | * Insert an element 71 | * 72 | * Returns true if element was inserted (else false and nothing is 73 | * modified). 
74 | */ 75 | bool insert(const char* key, const T& val) { 76 | char* err = nullptr; 77 | const int icode = dht_insert(ht_, key, &val, &err); 78 | if (icode == 0) return false; 79 | if (icode == 1) return true; 80 | if (!err) { throw std::bad_alloc(); } 81 | std::string error = "Error inserting key '" + std::string(key) + "': " + std::string(err); 82 | std::free(err); 83 | throw std::runtime_error(error); 84 | } 85 | 86 | DiskHash(const DiskHash&) = delete; 87 | DiskHash& operator=(const DiskHash&) = delete; 88 | private: 89 | HashTable* ht_; 90 | }; 91 | 92 | } 93 | 94 | #endif /* DISKHASH_HPP_INCLUDE_GUARD__ */ 95 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2017-2022, Luis Pedro Coelho 3 | # vim: set ts=4 sts=4 sw=4 expandtab smartindent: 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
# IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

from __future__ import division

import io
import os
import sys

try:
    import setuptools
except ImportError:
    print('''
setuptools not found.

On linux, the package is often called python-setuptools''')
    sys.exit(1)

# Execute the version file in this namespace so that `__version__` is defined
# without importing the (not yet built) package.
VERSION_FILE = 'python/diskhash/diskhash_version.py'
with open(VERSION_FILE) as version_file:
    exec(compile(version_file.read(), VERSION_FILE, 'exec'))

# io.open accepts `encoding` on both Python 2 and Python 3.
with io.open('README.md', encoding='utf-8') as readme:
    long_description = readme.read()

# DEBUG=1 (any non-empty value) re-enables assertions by undefining NDEBUG;
# DEBUG=2 additionally defines _GLIBCXX_DEBUG.
undef_macros = []
define_macros = []
if os.environ.get('DEBUG'):
    undef_macros = ['NDEBUG']
    if os.environ.get('DEBUG') == '2':
        define_macros = [('_GLIBCXX_DEBUG', '1')]


packages = setuptools.find_packages('python')


classifiers = [
    'Intended Audience :: Developers',
    'Topic :: Software Development :: Libraries',
    'Programming Language :: Python',
    'Programming Language :: Python :: 2',
    'Programming Language :: Python :: 2.7',
    'Programming Language :: Python :: 3',
    'Programming Language :: Python :: 3.7',
    'Programming Language :: Python :: 3.8',
    'Programming Language :: Python :: 3.9',
    'Programming Language :: Python :: 3.10',
    'Operating System :: OS Independent',
    'License :: OSI Approved :: MIT License',
]

extension = setuptools.Extension(
    'diskhash._diskhash',
    sources=['python/diskhash/_diskhash.c', 'src/diskhash.c'],
    depends=['src/diskhash.h'],
    # Pass the DEBUG-derived macro settings through to the compiler.
    undef_macros=undef_macros,
    define_macros=define_macros,
)

setuptools.setup(name = 'diskhash',
      version = __version__,
      description = 'Disk-based hashtable',
      long_description = long_description,
      long_description_content_type = 'text/markdown',
      author = 'Luis Pedro Coelho',
      author_email = 'luis@luispedro.org',
      license = 'MIT',
      platforms = ['Any'],
      classifiers = classifiers,
      url = 'https://github.com/luispedro/diskhash',
      packages = packages,
      package_dir = {'':'python'},
      ext_modules = [extension],
      )
removeFileIfExists outname 49 | 50 | case_open_close = do 51 | withDiskHashRW outname 15 $ \ht -> do 52 | s <- htSizeRW ht 53 | assertEqual "new table has size 0" s 0 54 | inserted <- htInsert "key" (9 :: Int64) ht 55 | assertBool "inserted should have return True" inserted 56 | ht <- htOpenRO outname 15 57 | assertEqual "read-only table after reopen" (htSizeRO ht) 1 58 | assertEqual "Lookup" (Just (9 :: Int64)) (htLookupRO "key" ht) 59 | removeFileIfExists outname 60 | 61 | case_open_close_load = do 62 | withDiskHashRW outname 15 $ \ht -> do 63 | s <- htSizeRW ht 64 | assertEqual "new table has size 0" s 0 65 | inserted <- htInsert "key" (9 :: Int64) ht 66 | assertBool "inserted should have return True" inserted 67 | ht <- htLoadRO outname 15 68 | assertEqual "read-only table after reopen (load)" (htSizeRO ht) 1 69 | assertEqual "Lookup" (Just (9 :: Int64)) (htLookupRO "key" ht) 70 | removeFileIfExists outname 71 | 72 | prop_insert_find :: [(ASCIIString, Int64)] -> Property 73 | prop_insert_find args = ioProperty $ do 74 | let args' = normArgs args 75 | found <- withDiskHashRW outname 15 $ \ht -> do 76 | forM_ args' $ \(k,val) -> htInsert k val ht 77 | forM args' $ \(k, val) -> do 78 | v <- htLookupRW k ht 79 | return $! v == Just val 80 | removeFileIfExists outname 81 | return $! and found 82 | 83 | 84 | normArgs :: [(ASCIIString, Int64)] -> [(B.ByteString, Int64)] 85 | normArgs = normArgs' [] . map (first normKey) 86 | where 87 | normKey = B8.pack . (filter (/= '\0')) . 
getASCIIString 88 | normArgs' r [] = r 89 | normArgs' r (x@(k,_):xs) 90 | | k `elem` (map fst r) = normArgs' r xs 91 | | B.length k >= 15 = normArgs' r xs 92 | | otherwise = normArgs' (x:r) xs 93 | -------------------------------------------------------------------------------- /python/diskhash/tests/test_larger.py: -------------------------------------------------------------------------------- 1 | from diskhash import Str2int, StructHash 2 | from os import unlink, path 3 | 4 | filename = 'testing.dht' 5 | 6 | def test_insert_check(): 7 | if path.exists(filename): 8 | unlink(filename) 9 | ht = Str2int(filename, 17, 'rw') 10 | 11 | items = [ 12 | ('one', 1), 13 | ('two', 2), 14 | ('three', 3), 15 | ('four', 4), 16 | ('five', 5), 17 | ('six', 6), 18 | ('seven', 7), 19 | ('eight', 8), 20 | ('nine', 9), 21 | ('ten', 10), 22 | ('eleven', 11), 23 | ] 24 | 25 | for k,v in items: 26 | assert ht.lookup(k) is None 27 | 28 | for k,v in items: 29 | ht.insert(k, v) 30 | 31 | for k,v in items: 32 | assert ht.lookup(k) == v 33 | del ht 34 | 35 | ht = Str2int(filename, 17, 'r') 36 | for k,v in items: 37 | assert ht.lookup(k) == v 38 | del ht 39 | 40 | unlink(filename) 41 | 42 | def test_insert_check_two_ints(): 43 | if path.exists(filename): 44 | unlink(filename) 45 | ht = StructHash(filename, 17, 'll', 'rw') 46 | 47 | items = [ 48 | ('one', (1,2)), 49 | ('two', (2,3)), 50 | ('three', (3,4)), 51 | ('four', (4,0)), 52 | ('five', (5,0)), 53 | ('six', (6,0)), 54 | ('seven', (7,0)), 55 | ('eight', (8,0)), 56 | ('nine', (9,0)), 57 | ('ten', (10,0)), 58 | ('eleven', (11,0)), 59 | ] 60 | 61 | for k,v in items: 62 | assert ht.lookup(k) is None 63 | 64 | for k,v in items: 65 | ht.insert(k, *v) 66 | 67 | for k,v in items: 68 | assert ht.lookup(k) == v 69 | del ht 70 | 71 | ht = StructHash(filename, 17, 'll', 'r') 72 | for k,v in items: 73 | assert ht.lookup(k) == v 74 | del ht 75 | 76 | unlink(filename) 77 | 78 | def test_reserve(): 79 | if path.exists(filename): 80 | unlink(filename) 81 | 
# Basically, just a smoke test: reserve() should have no observable 82 | # behaviour (except better performance) 83 | ht = StructHash(filename, 17, 'll', 'rw') 84 | 85 | items = [ 86 | ('one', (1,2)), 87 | ('two', (2,3)), 88 | ('three', (3,4)), 89 | ('four', (4,0)), 90 | ('five', (5,0)), 91 | ('six', (6,0)), 92 | ('seven', (7,0)), 93 | ('eight', (8,0)), 94 | ('nine', (9,0)), 95 | ('ten', (10,0)), 96 | ('eleven', (11,0)), 97 | ] 98 | 99 | ht.reserve(len(items)) 100 | 101 | for k,v in items: 102 | ht.insert(k, *v) 103 | 104 | for k,v in items: 105 | assert ht.lookup(k) == v 106 | del ht 107 | 108 | ht = StructHash(filename, 17, 'll', 'r') 109 | for k,v in items: 110 | assert ht.lookup(k) == v 111 | del ht 112 | 113 | unlink(filename) 114 | 115 | def test_load(): 116 | if path.exists(filename): 117 | unlink(filename) 118 | ht = StructHash(filename, 17, 'll', 'rw') 119 | 120 | items = [ 121 | ('one', (1,2)), 122 | ('two', (2,3)), 123 | ('three', (3,4)), 124 | ('four', (4,0)), 125 | ('five', (5,0)), 126 | ('six', (6,0)), 127 | ('seven', (7,0)), 128 | ('eight', (8,0)), 129 | ('nine', (9,0)), 130 | ('ten', (10,0)), 131 | ('eleven', (11,0)), 132 | ] 133 | 134 | for k,v in items: 135 | ht.insert(k, *v) 136 | 137 | del ht 138 | 139 | ht = StructHash(filename, 17, 'll', 'r', load=True) 140 | for k,v in items: 141 | assert ht.lookup(k) == v 142 | del ht 143 | 144 | unlink(filename) 145 | -------------------------------------------------------------------------------- /src/diskhash.h: -------------------------------------------------------------------------------- 1 | #ifndef DISKHASH_H_INCLUDE_GUARD__ 2 | #define DISKHASH_H_INCLUDE_GUARD__ 3 | #include 4 | 5 | #ifdef __cplusplus 6 | extern "C" { 7 | #endif 8 | 9 | 10 | /** 11 | * key_maxlen is the maximum key length not including the terminator NUL, i.e., 12 | * diskhash will check that for every key you insert `strlen(key) < 13 | * opts.key_maxlen`. 
14 | * 15 | * Internally, space is allocated on 8-Byte aligned boundaries, so numbers such 16 | * as 7, 15, 23, 31, ... (i.e., multiples of 8 minus 1 for NUL) are good 17 | * choices for key_maxlen. 18 | * 19 | * object_datalen is the number of Bytes that your data elements occupy. 20 | */ 21 | typedef struct HashTableOpts { 22 | size_t key_maxlen; 23 | size_t object_datalen; 24 | } HashTableOpts; 25 | 26 | typedef struct HashTable { 27 | int fd_; 28 | const char* fname_; 29 | void* data_; 30 | size_t datasize_; 31 | int flags_; 32 | } HashTable; 33 | 34 | 35 | /** Zero-valued options 36 | */ 37 | HashTableOpts dht_zero_opts(void); 38 | 39 | /** Open a hash table file 40 | * 41 | * fpath is the file path 42 | * flags are passed to call to open() and the user should read the documentation therein 43 | * 44 | * Values returned from dht_open must be freed with dht_free. 45 | * 46 | * Examples: 47 | * 48 | * Read-write: 49 | * 50 | * HashTableOpts opts; 51 | * opts.key_maxlen = 15; 52 | * opts.object_datalen = 8; 53 | * char* err; 54 | * HashTable* ht = dht_open("hashtable.dht", opts, O_RDWR|O_CREAT, &err); 55 | * 56 | * Read-only: 57 | * 58 | * char* err; 59 | * HashTable* ht = dht_open("hashtable.dht", opts, O_RDONLY, &err); 60 | * 61 | * When opening an existing disk table, you can pass `{ 0, 0 }` (the return 62 | * value of `dht_zero_opts()`) as the options, in which case the values will be 63 | * taken from the table on disk. If you do pass values > 0, they are checked 64 | * against the values on disk and it is an error if there is a mismatch 65 | * (passing zero to one of the option fields and not the other is supported: 66 | * only the non-zero field is checked). 67 | * 68 | * The last argument is an error output argument. If it is set to a non-NULL 69 | * value, then the memory must be released with free(). Passing NULL is valid 70 | * (and no error message will be produced). 
An error return with *err == NULL 71 | * will mean an out-of-memory error (when dht fails to allocate memory, it does 72 | * not try to allocate memory for an error message). 73 | */ 74 | HashTable* dht_open(const char* fpath, HashTableOpts opts, int flags, char**); 75 | 76 | /** Load table into memory 77 | * 78 | * Return: 79 | * 0 : success 80 | * 81 | * 1 : impossible operation: nothing has been done. Attempting to load a 82 | * previously loaded table or a read/write table is impossible. 83 | * 84 | * 2 : error: the HashTable has been freed and must not be used. 85 | */ 86 | int dht_load_to_memory(HashTable*, char**); 87 | 88 | /** Lookup a value by key 89 | * 90 | * If the hash table was opened in read-write mode, then the memory returned 91 | * can be written to (the hash table itself does not inspect the values in any 92 | * way). Writing to a read-only hashtable will probably trigger a segmentation 93 | * fault. 94 | * 95 | * If the object is not found, returns NULL. 96 | * 97 | * Thread safety: multiple concurrent reads are perfectly safe. No guarantees 98 | * are given whenever writing is performed. Similarly, if you write to the 99 | * output of this function (the ht_data field), no guarantees are given. 100 | */ 101 | void* dht_lookup(const HashTable*, const char* key); 102 | 103 | /** Insert a value. 104 | * 105 | * The hashtable must be opened in read write mode. 106 | * 107 | * If a value with the given key is already present in the table, then no 108 | * action is performed and 0 is returned. If you want to overwrite that value, 109 | * you can use `dht_lookup` and write to its output. 110 | * 111 | * This operation is typically O(1) amortized. However, if table is at capacity 112 | * when dht_insert is called, then it must be grown which can be a 113 | * time-consuming operation as all the values are copied to the newly allocated 114 | * memory block (see dht_reserve). 
115 | * 116 | * Errors can occur if table expansion is needed and memory cannot be 117 | * allocated. 118 | * 119 | * Returns 1 if the value was inserted. 120 | * 0 if the key was already present in the table. The hash table was 121 | * not modified. 122 | * -EINVAL : key is too long 123 | * -EACCES : attempted to insert into a read-only table. 124 | * -ENOMEM : dht_reserve failed. 125 | * 126 | * The last argument is an error output argument. If it is set to a non-NULL 127 | * value, then the memory must be released with free(). Passing NULL is valid 128 | * (and no error message will be produced). An error return with *err == NULL 129 | * will mean an out-of-memory error (when dht fails to allocate memory, it does 130 | * not try to allocate memory for an error message). 131 | */ 132 | int dht_insert(HashTable*, const char* key, const void* data, char** err); 133 | 134 | /** Preallocate memory for the table. 135 | * 136 | * Calling this function if the number of elements is known apriori can improve 137 | * performance. Additionally, if capacity exists, then dht_insert never fails. 138 | * 139 | * This function returns the actual capacity allocated (which may be more than 140 | * requested, but never less). Calling dht_reserve asking for _less_ capacity 141 | * than is currently used is a no-op. 142 | * 143 | * If capacity cannot be allocated, this function returns 0 (but no changes to 144 | * the hash table are made). 145 | * 146 | * This function can be used to query the current capacity by passing the value 147 | * 1 as the desired capacity. 148 | * 149 | * The last argument is an error output argument. If it is set to a non-NULL 150 | * value, then the memory must be released with free(). Passing NULL is valid 151 | * (and no error message will be produced). 152 | * 153 | * Attempting to call this function on a read-only table will fail (return 154 | * value: -EACCES). 
155 | */ 156 | size_t dht_reserve(HashTable*, size_t capacity, char** err); 157 | 158 | /** 159 | * Return the number of elements 160 | */ 161 | size_t dht_size(const HashTable*); 162 | 163 | /** Free the hashtable and sync to disk. 164 | */ 165 | void dht_free(HashTable*); 166 | 167 | /** For debug use only */ 168 | void show_ht(const HashTable*); 169 | 170 | 171 | #ifdef __cplusplus 172 | } /* extern "C" */ 173 | #endif 174 | 175 | #endif /* DISKHASH_H_INCLUDE_GUARD__*/ 176 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Disk-based hashtable 2 | 3 | [![Build & test (Haskell)](https://github.com/luispedro/diskhash/actions/workflows/build_haskell_w_nix.yml/badge.svg)](https://github.com/luispedro/diskhash/actions/workflows/build_haskell_w_nix.yml) 4 | [![Build & test (Python)](https://github.com/luispedro/diskhash/actions/workflows/build_python_w_nix.yml/badge.svg)](https://github.com/luispedro/diskhash/actions/workflows/build_python_w_nix.yml) 5 | [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT) 6 | 7 | 8 | A simple disk-based hash table (i.e., persistent hash table). 9 | 10 | It is a hashtable implemented on memory-mapped disk, so that it can be loaded 11 | with a single `mmap()` system call and used in memory directly (being as fast 12 | as an in-memory hashtable once it is loaded from disk). 13 | 14 | The code is in C, wrappers are provided for Python, Haskell, and C++. The 15 | wrappers follow similar APIs with variations to accommodate the language 16 | specificity. They all use the same underlying code, so you can open a hashtable 17 | created in C from Haskell, modify it within your Haskell code, and later open 18 | the result in Python. 
19 | 20 | Cross-language functionality will only work for simple types where you can 21 | control their binary representation (64-bit integers, for example). 22 | 23 | Reading does not touch the disk representation at all and, thus, can be done on 24 | top of read-only files or using multiple threads (and different processes will 25 | share the memory: the operating system does that for you). Writing or modifying 26 | values is, however, not thread-safe. 27 | 28 | ## Examples 29 | 30 | The following examples all create a hashtable to store longs (`int64_t`), then 31 | set the value associated with the key `"key"` to 9. In the current API, the 32 | maximum size of the keys needs to be pre-specified, which is the value `15` 33 | below. 34 | 35 | ### Raw C 36 | 37 | ```c 38 | #include <stdio.h> 39 | #include <inttypes.h> 40 | #include "diskhash.h" 41 | 42 | int main(void) { 43 | HashTableOpts opts; 44 | opts.key_maxlen = 15; 45 | opts.object_datalen = sizeof(int64_t); 46 | char* err = NULL; 47 | HashTable* ht = dht_open("testing.dht", opts, O_RDWR|O_CREAT, &err); 48 | if (!ht) { 49 | if (!err) err = "Unknown error"; 50 | fprintf(stderr, "Failed opening hash table: %s.\n", err); 51 | return 1; 52 | } 53 | long i = 9; 54 | dht_insert(ht, "key", &i, NULL); 55 | 56 | long* val = (long*) dht_lookup(ht, "key"); 57 | printf("Looked up value: %ld\n", *val); 58 | 59 | dht_free(ht); 60 | return 0; 61 | } 62 | ``` 63 | 64 | The C API relies on error codes and error strings (the `&err` argument above). 65 | The header file has [decent 66 | documentation](https://github.com/luispedro/diskhash/blob/master/src/diskhash.h). 67 | 68 | ### Haskell 69 | 70 | In Haskell, you have different types/functions for read-write and read-only 71 | hashtables. Read-write operations are `IO` operations, read-only hashtables are 72 | pure. 
73 | 74 | Read write example: 75 | 76 | ```haskell 77 | import Data.DiskHash 78 | import Data.Int 79 | main = do 80 | ht <- htOpenRW "testing.dht" 15 81 | htInsert "key" (9 :: Int64) ht 82 | val <- htLookupRW "key" ht 83 | print val 84 | ``` 85 | 86 | Read only example (`htLookupRO` is pure in this case): 87 | 88 | ```haskell 89 | import Data.DiskHash 90 | import Data.Int 91 | main = do 92 | ht <- htOpenRO "testing.dht" 15 93 | let val :: Maybe Int64 94 | val = htLookupRO "key" ht 95 | print val 96 | ``` 97 | 98 | 99 | ### Python 100 | 101 | Python's interface is based on the [struct 102 | module](https://docs.python.org/3/library/struct.html). For example, `'ll'` 103 | refers to a pair of 64-bit ints (_longs_): 104 | 105 | ```python 106 | import diskhash 107 | 108 | tb = diskhash.StructHash( 109 | fname="testing.dht", 110 | keysize=15, 111 | structformat='ll', # store pairs of longs 112 | mode='rw', 113 | ) 114 | value = [1, 2] # pair of longs 115 | tb.insert("key", *value) 116 | print(tb.lookup("key")) 117 | ``` 118 | 119 | The Python interface is currently Python 3 only. Patches to extend it to 2.7 120 | are welcome, but it's not a priority. 121 | 122 | 123 | ### C++ 124 | 125 | In C++, a simple wrapper is defined, which provides a modicum of type-safety. 126 | You use the `DiskHash` template. Additionally, errors are reported through 127 | exceptions (both `std::bad_alloc` and `std::runtime_error` can be thrown) and 128 | not return codes. 129 | 130 | ```c++ 131 | #include <iostream> 132 | #include <string> 133 | 134 | #include <diskhash.hpp> 135 | 136 | int main() { 137 | const int key_maxlen = 15; 138 | dht::DiskHash<uint64_t> ht("testing.dht", key_maxlen, dht::DHOpenRW); 139 | std::string line; 140 | uint64_t ix = 0; 141 | while (std::getline(std::cin, line)) { 142 | if (line.length() > key_maxlen) { 143 | std::cerr << "Key too long: '" << line << "'. 
Aborting.\n"; 144 | return 2; 145 | } 146 | const bool inserted = ht.insert(line.c_str(), ix); 147 | if (!inserted) { 148 | std::cerr << "Found repeated key '" << line << "' (ignored).\n"; 149 | } 150 | ++ix; 151 | } 152 | return 0; 153 | } 154 | ``` 155 | 156 | ## Stability 157 | 158 | This is _beta_ software. It is good enough that I am using it, but the API can 159 | change in the future with little warning. The binary format is versioned (the 160 | magic string encodes its version), so changes can be detected and you will get 161 | an error message in the future rather than some silent misbehaviour. 162 | 163 | [Automated unit testing](https://travis-ci.com/luispedro/diskhash) ensures that 164 | basic mistakes will not go uncaught. 165 | 166 | ## Limitations 167 | 168 | - You must specify the maximum key size. This can be worked around either by 169 | pre-hashing the keys (with a strong hash) or using multiple hash tables for 170 | different key sizes. Neither is currently implemented in diskhash. 171 | 172 | - You cannot delete objects. This was not a necessity for my uses, so it was 173 | not implemented. A simple implementation could be done by marking objects as 174 | "deleted" in place and recompacting when the hash table size changes or with 175 | an explicit `dht_gc()` call. It may also be important to add functionality to 176 | shrink hashtables so as to not waste disk space. 177 | 178 | - The algorithm is a rather naïve implementation of linear addressing. It would 179 | not be hard to switch to [robin hood 180 | hashing](https://www.sebastiansylvan.com/post/robin-hood-hashing-should-be-your-default-hash-table-implementation/) 181 | and this may indeed happen in the near future. 
182 | 183 | License: MIT 184 | 185 | -------------------------------------------------------------------------------- /src/rtable.h: -------------------------------------------------------------------------------- 1 | #include 2 | uint64_t rtable [] = { 3 | 7161894671141868069LLU, 4 | 2026477282954470369LLU, 5 | 2429508451144552337LLU, 6 | 17635849675291803701LLU, 7 | 11131238007097670752LLU, 8 | 14575476040221120054LLU, 9 | 9448375414748162182LLU, 10 | 2454672299959216175LLU, 11 | 9038789348187905929LLU, 12 | 10933259439552310726LLU, 13 | 2604799974320421089LLU, 14 | 5824527580030745861LLU, 15 | 5125964867552333118LLU, 16 | 1032915946974892656LLU, 17 | 7131376882090382684LLU, 18 | 14307526071761074839LLU, 19 | 3292238237244544521LLU, 20 | 17717635889128793053LLU, 21 | 7823470298035437952LLU, 22 | 1975366557820174427LLU, 23 | 10130351589661861436LLU, 24 | 9571000392841473940LLU, 25 | 6202913113537891909LLU, 26 | 16922673442642735572LLU, 27 | 2156829192340373188LLU, 28 | 17495964840236106615LLU, 29 | 13101909516477009795LLU, 30 | 4817099012745200872LLU, 31 | 14440198504627065274LLU, 32 | 14651729576330546182LLU, 33 | 16980202802087429883LLU, 34 | 12900263937584820877LLU, 35 | 2306027069114081229LLU, 36 | 1780864555642068862LLU, 37 | 9043601808960553234LLU, 38 | 15810640748094581704LLU, 39 | 17810874749899385925LLU, 40 | 4370190832352558107LLU, 41 | 17458810291480308649LLU, 42 | 17137251597030639508LLU, 43 | 10210885020493337176LLU, 44 | 6152269424340619539LLU, 45 | 10958960557696769058LLU, 46 | 8165460651316620468LLU, 47 | 12486315176958405201LLU, 48 | 17592774145591647916LLU, 49 | 8342048574340997121LLU, 50 | 3154714627493747480LLU, 51 | 15876451744546896617LLU, 52 | 343989707369051175LLU, 53 | 18365325630196641291LLU, 54 | 2477841560223049522LLU, 55 | 3898012761994223425LLU, 56 | 13369082662290243962LLU, 57 | 15815494733174249928LLU, 58 | 6304647780032933422LLU, 59 | 10269137979996697992LLU, 60 | 6277295857270470928LLU, 61 | 6000011608783271869LLU, 62 | 
13901041329075672746LLU, 63 | 5669978874985545944LLU, 64 | 5455469485091664306LLU, 65 | 65335843055329217LLU, 66 | 7934898480926769168LLU, 67 | 1273073282087458907LLU, 68 | 16888695346150306798LLU, 69 | 8377368602892110731LLU, 70 | 1830469377103303016LLU, 71 | 17077332288245307211LLU, 72 | 5023390539089620699LLU, 73 | 14022792370712725528LLU, 74 | 13636537031872854434LLU, 75 | 7870715805107021553LLU, 76 | 9484630489072256949LLU, 77 | 1785134210141330253LLU, 78 | 8281677267421262728LLU, 79 | 15726046893482748791LLU, 80 | 1797508209107071112LLU, 81 | 12663465705322475788LLU, 82 | 8481880801368525635LLU, 83 | 4369078805123407981LLU, 84 | 903851902053625478LLU, 85 | 7837494259996271178LLU, 86 | 2168514567018919298LLU, 87 | 6219073417129656739LLU, 88 | 1609380567131206125LLU, 89 | 5153501301389604643LLU, 90 | 3896838726756759252LLU, 91 | 8715831947034564606LLU, 92 | 2079854376561616185LLU, 93 | 3912231126093865119LLU, 94 | 10987109511229875092LLU, 95 | 5769717256290699511LLU, 96 | 13634152406859624118LLU, 97 | 16659329366647962799LLU, 98 | 6409336122833332638LLU, 99 | 2789121606213559319LLU, 100 | 11317161666220426030LLU, 101 | 5185355556384359463LLU, 102 | 8442664884098488832LLU, 103 | 7076397050218785674LLU, 104 | 10088557173876904389LLU, 105 | 7360073205936248365LLU, 106 | 16451313490277755438LLU, 107 | 9084450421159417912LLU, 108 | 7348057751024665700LLU, 109 | 10777225389799347258LLU, 110 | 17763690326796380653LLU, 111 | 2891266716373283937LLU, 112 | 3655130714223786187LLU, 113 | 7755701151370365925LLU, 114 | 2212513937006372504LLU, 115 | 13523931701650912532LLU, 116 | 12113220208962200812LLU, 117 | 9881595722024476612LLU, 118 | 23002422026741603LLU, 119 | 17107233953671023105LLU, 120 | 12923193250737842924LLU, 121 | 17273319855552890761LLU, 122 | 9133832377885676104LLU, 123 | 9902895775500395847LLU, 124 | 7159708893808326203LLU, 125 | 9615663095357430075LLU, 126 | 1173681393331973473LLU, 127 | 10048050401369270562LLU, 128 | 3613131142636259446LLU, 129 | 
8860752244643660461LLU, 130 | 8399862844187873110LLU, 131 | 11183016300248525601LLU, 132 | 1652501197644627033LLU, 133 | 15778225670129936072LLU, 134 | 16574295115244112702LLU, 135 | 4071749887734109238LLU, 136 | 17813113169857863668LLU, 137 | 5866933485061915213LLU, 138 | 2617207803875766346LLU, 139 | 4927492731270466097LLU, 140 | 7688280036888791740LLU, 141 | 10563928654676671801LLU, 142 | 5936083376618012629LLU, 143 | 2075718064826528220LLU, 144 | 9759600056023457767LLU, 145 | 3395252869081930520LLU, 146 | 9580020404503428349LLU, 147 | 15488920383895635663LLU, 148 | 13904943215446818750LLU, 149 | 6352674340782198596LLU, 150 | 14778579765560499187LLU, 151 | 1164565579755687793LLU, 152 | 9437615470246010305LLU, 153 | 12898414888770350367LLU, 154 | 11725805952345397029LLU, 155 | 2006395968798208907LLU, 156 | 6768413660762254955LLU, 157 | 14184745999655108007LLU, 158 | 7813559219758021899LLU, 159 | 18324113513585059371LLU, 160 | 7085574629681949288LLU, 161 | 431441848559906543LLU, 162 | 464267230845345911LLU, 163 | 1723783827948552176LLU, 164 | 13062030383763489855LLU, 165 | 17331570881699461752LLU, 166 | 4623703904617640360LLU, 167 | 12154158654629278230LLU, 168 | 11380594816064729611LLU, 169 | 11043398968000652485LLU, 170 | 16839635851508053491LLU, 171 | 8035951356794912228LLU, 172 | 13812221149347752777LLU, 173 | 16687823876032983453LLU, 174 | 12787887064839641354LLU, 175 | 5607226621089666635LLU, 176 | 14689718028345080763LLU, 177 | 6624890426811243934LLU, 178 | 3028949181253016309LLU, 179 | 6065105387082590706LLU, 180 | 7160481282251925368LLU, 181 | 3358207170046758608LLU, 182 | 17630673089168148155LLU, 183 | 64927047284298113LLU, 184 | 904966882733558578LLU, 185 | 1287129322554954815LLU, 186 | 1075294316230729903LLU, 187 | 1878565720252146877LLU, 188 | 5685951107830131172LLU, 189 | 3952290001879195657LLU, 190 | 735398701777187064LLU, 191 | 13776332351172973742LLU, 192 | 5037200310196218445LLU, 193 | 9527386686783206461LLU, 194 | 7649393134260975078LLU, 195 | 
12061883647515108400LLU, 196 | 485526623820713506LLU, 197 | 10291959463983135641LLU, 198 | 12243304422703205549LLU, 199 | 8572879391592600851LLU, 200 | 12173591873149535986LLU, 201 | 5819027826014763350LLU, 202 | 7427024978563219025LLU, 203 | 14870798251933491692LLU, 204 | 3706320956350599471LLU, 205 | 13766394013514711397LLU, 206 | 13924654713711829920LLU, 207 | 171503584564545799LLU, 208 | 16183518851605836502LLU, 209 | 14630157969039086483LLU, 210 | 15510815886551585460LLU, 211 | 95398970168327875LLU, 212 | 14978281303351638957LLU, 213 | 16546380181433032989LLU, 214 | 12520318463248954619LLU, 215 | 6573340268706252253LLU, 216 | 3265740609522436520LLU, 217 | 5573960834668842001LLU, 218 | 8691344533374800877LLU, 219 | 754361571041746766LLU, 220 | 15161242727912943940LLU, 221 | 4862135978370040353LLU, 222 | 18077714108297608008LLU, 223 | 7011244222376967230LLU, 224 | 11314094752631247342LLU, 225 | 7838448989574755374LLU, 226 | 1085364298322250198LLU, 227 | 917401104898565019LLU, 228 | 18427172238462792360LLU, 229 | 2166707018289349654LLU, 230 | 1790273621506854230LLU, 231 | 11519979079560744544LLU, 232 | 13573964173325008866LLU, 233 | 17754380141665606138LLU, 234 | 6000314738465890293LLU, 235 | 6299478196153293038LLU, 236 | 1360744789613898708LLU, 237 | 2715087463067748498LLU, 238 | 8615301479886349946LLU, 239 | 6325907209709099700LLU, 240 | 5508742753589735059LLU, 241 | 15045665041620723592LLU, 242 | 7641618366800420587LLU, 243 | 8017941082372145823LLU, 244 | 12827110062020410768LLU, 245 | 16212098331480246028LLU, 246 | 10114390025762652463LLU, 247 | 7792159807616260057LLU, 248 | 16490297273177275624LLU, 249 | 2894344997890487060LLU, 250 | 14703898115432444100LLU, 251 | 3209020746801030043LLU, 252 | 7938173398119607796LLU, 253 | 3756451239655888427LLU, 254 | 11339389497709461975LLU, 255 | 13313028615130553489LLU, 256 | 14181252301152349881LLU, 257 | 4872007803117399842LLU, 258 | 17412432177095593973LLU, 259 | 0 /* sentinel */ 260 | }; 261 | 
-------------------------------------------------------------------------------- /python/diskhash/_diskhash.c: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2017 Luis Pedro Coelho 2 | // 3 | // License: MIT (see COPYING file) 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "../../src/diskhash.h" 11 | 12 | typedef struct { 13 | PyObject_HEAD 14 | HashTable* ht; 15 | unsigned object_size; 16 | } htObject; 17 | 18 | 19 | PyObject* htLookup(htObject* self, PyObject* args) { 20 | const char* k; 21 | if (!PyArg_ParseTuple(args, "s", &k)) { 22 | return NULL; 23 | } 24 | void* data = dht_lookup(self->ht, k); 25 | if (!data) { 26 | Py_RETURN_NONE; 27 | } 28 | return PyMemoryView_FromMemory(data, self->object_size, PyBUF_READ); 29 | } 30 | 31 | PyObject* htReserve(htObject* self, PyObject* args) { 32 | int cap; 33 | if (!PyArg_ParseTuple(args, "i", &cap)) { 34 | return NULL; 35 | } 36 | char* err; 37 | long r = dht_reserve(self->ht, cap, &err); 38 | if (r == 0) { 39 | if (!err) { 40 | return PyErr_NoMemory(); 41 | } 42 | PyErr_SetString(PyExc_RuntimeError, err); 43 | free(err); 44 | return NULL; 45 | } 46 | return PyLong_FromLong(r); 47 | } 48 | 49 | PyObject* htInsert(htObject* self, PyObject* args) { 50 | const char* k; 51 | PyObject* v; 52 | if (!PyArg_ParseTuple(args, "sO", &k, &v)) { 53 | return NULL; 54 | } 55 | if (!PyMemoryView_Check(v)) { 56 | PyErr_SetString(PyExc_TypeError, "Diskhash.insert expected a memory view"); 57 | return NULL; 58 | } 59 | Py_buffer* buf = PyMemoryView_GET_BUFFER(v); 60 | char* err; 61 | int r = dht_insert(self->ht, k, buf->buf, &err); 62 | if (r < 0) { 63 | if (!err) { 64 | return PyErr_NoMemory(); 65 | } 66 | PyErr_SetString(PyExc_RuntimeError, err); 67 | free(err); 68 | return NULL; 69 | } 70 | return PyLong_FromLong(r); 71 | } 72 | 73 | PyObject* htLen(htObject* self, PyObject* args) { 74 | long n = dht_size(self->ht); 75 | return PyLong_FromLong(n); 76 | } 
77 | 78 | static PyMethodDef htMethods[] = { 79 | { "lookup", (PyCFunction)htLookup, METH_VARARGS, 80 | "Lookup a value.\n" 81 | "\n" 82 | "Returns value if found, otherwise None.\n" }, 83 | 84 | { "reserve", (PyCFunction)htReserve, METH_VARARGS, 85 | "Reserve space\n" 86 | "\n" 87 | "Parameters\n" 88 | "----------\n" 89 | "\n" 90 | "c: int\n" 91 | " Desired capacity\n" 92 | "\n" 93 | "Returns\n" 94 | "-------\n" 95 | "c : int\n" 96 | " New capacity.\n" }, 97 | 98 | { "insert", (PyCFunction)htInsert, METH_VARARGS, 99 | "Insert an element into the hash.\n" 100 | "\n" 101 | "This function can fail (raising an exception) if there is not enough \n" 102 | "capacity and hash cannot be resized.\n" 103 | "\n" 104 | "Parameters\n" 105 | "----------\n" 106 | "\n" 107 | "key : str\n" 108 | " Key to insert\n" 109 | "value : memoryview\n" 110 | " Value to insert\n" 111 | "\n" 112 | "Returns\n" 113 | "-------\n" 114 | "r : int\n" 115 | " 1 if object was inserted, 0 if not.\n" }, 116 | 117 | { "size", (PyCFunction)htLen, METH_VARARGS, 118 | "Return number of elements." 
}, 119 | 120 | {NULL} /* Sentinel */ 121 | }; 122 | 123 | 124 | 125 | static PyObject * 126 | htNew(PyTypeObject *type, PyObject * args, PyObject * kwargs) { 127 | htObject *self; 128 | 129 | self = (htObject *)type->tp_alloc(type, 0); 130 | if (self != NULL) { 131 | self->ht = 0; 132 | } 133 | 134 | return (PyObject *)self; 135 | } 136 | 137 | static int 138 | htInit(htObject *self, PyObject *args, PyObject *kwds) { 139 | const char* fpath; 140 | const char* mode; 141 | int maxi; 142 | int object_size; 143 | int load; 144 | if (!PyArg_ParseTuple(args, "siisi", &fpath, &maxi, &object_size, &mode, &load)) { 145 | return -1; 146 | } 147 | int mode_flags = 0; 148 | if (!strcmp(mode, "r")) { 149 | mode_flags = O_RDONLY; 150 | } else if (!strcmp(mode, "w") 151 | || !strcmp(mode, "rw") 152 | || !strcmp(mode, "wr") 153 | || !strcmp(mode, "+")) { 154 | mode_flags = O_RDWR|O_CREAT; 155 | } else if (!strcmp(mode, "a")) { 156 | mode_flags = O_RDWR; 157 | } else if (!strcmp(mode, "x")) { 158 | mode_flags = O_RDWR|O_CREAT|O_EXCL; 159 | } 160 | 161 | HashTableOpts opts; 162 | opts.key_maxlen = maxi; 163 | opts.object_datalen = object_size; 164 | 165 | char* err; 166 | self->ht = dht_open(fpath, opts, mode_flags, &err); 167 | self->object_size = object_size; 168 | 169 | if (!self->ht) { 170 | if (!err) { 171 | PyErr_SetNone(PyExc_MemoryError); 172 | } else { 173 | PyErr_SetString(PyExc_RuntimeError, err); 174 | free(err); 175 | } 176 | return -1; 177 | } 178 | if (load) { 179 | int e = dht_load_to_memory(self->ht, &err); 180 | if (e == 2) { 181 | PyErr_SetString(PyExc_RuntimeError, err); 182 | return -1; 183 | } 184 | } 185 | return 0; 186 | } 187 | 188 | 189 | static void 190 | htDealloc(PyObject* obj) { 191 | htObject* ht = (htObject*)obj; 192 | if (ht->ht) dht_free(ht->ht); 193 | } 194 | 195 | 196 | 197 | static PyTypeObject htWrapperType = { 198 | PyVarObject_HEAD_INIT(NULL, 0) 199 | "diskhash.Diskhash", /* tp_name */ 200 | sizeof(htObject), /* tp_basicsize */ 201 | 0, /* 
tp_itemsize */ 202 | htDealloc, /* tp_dealloc */ 203 | 0, /* tp_print */ 204 | 0, /* tp_getattr */ 205 | 0, /* tp_setattr */ 206 | 0, /* tp_reserved */ 207 | 0, /* tp_repr */ 208 | 0, /* tp_as_number */ 209 | 0, /* tp_as_sequence */ 210 | 0, /* tp_as_mapping */ 211 | 0, /* tp_hash */ 212 | 0, /* tp_call */ 213 | 0, /* tp_str */ 214 | 0, /* tp_getattro */ 215 | 0, /* tp_setattro */ 216 | 0, /* tp_as_buffer */ 217 | Py_TPFLAGS_DEFAULT | 218 | Py_TPFLAGS_BASETYPE, /* tp_flags */ 219 | 220 | "Disk based hash table.\n" 221 | "\n" 222 | "See https://github.com/luispedro/diskhash\n", /* tp_doc */ 223 | 224 | 0, /* tp_traverse */ 225 | 0, /* tp_clear */ 226 | 0, /* tp_richcompare */ 227 | 0, /* tp_weaklistoffset */ 228 | 0, /* tp_iter */ 229 | 0, /* tp_iternext */ 230 | htMethods, /* tp_methods */ 231 | 0, /* tp_members */ 232 | 0, /* tp_getset */ 233 | 0, /* tp_base */ 234 | 0, /* tp_dict */ 235 | 0, /* tp_descr_get */ 236 | 0, /* tp_descr_set */ 237 | 0, /* tp_dictoffset */ 238 | (initproc)htInit, /* tp_init */ 239 | 0, /* tp_alloc */ 240 | htNew, /* tp_new */ 241 | }; 242 | 243 | static PyModuleDef pydiskhash = { 244 | PyModuleDef_HEAD_INIT, 245 | "diskhash", 246 | "", 247 | -1, 248 | NULL, NULL, NULL, NULL, NULL 249 | }; 250 | 251 | PyMODINIT_FUNC 252 | PyInit__diskhash(void) 253 | { 254 | PyObject* m; 255 | 256 | htWrapperType.tp_new = PyType_GenericNew; 257 | if (PyType_Ready(&htWrapperType) < 0) 258 | return NULL; 259 | 260 | m = PyModule_Create(&pydiskhash); 261 | if (m == NULL) 262 | return NULL; 263 | 264 | Py_INCREF(&htWrapperType); 265 | PyModule_AddObject(m, "Diskhash", (PyObject *)&htWrapperType); 266 | return m; 267 | } 268 | 269 | 270 | 271 | -------------------------------------------------------------------------------- /haskell/Data/DiskHash.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE FlexibleContexts, ScopedTypeVariables #-} 2 | {-| 3 | 4 | Disk based hash table 5 | 6 | The Haskell interface has 
two types, distinguishing between read-only and
read-write hash tables. Operations on the RW variant are in the IO monad, while
operations on RO tables are all pure (after the 'htOpenRO' call, naturally).
Using read-write hashtables with more than one thread is undefined behaviour,
but the read-only variant is perfectly thread safe.

All data structures are strict (naturally: they write to disk).

The Haskell API can be used to access diskhashes created from other languages
as long as the types are compatible.
-}

module Data.DiskHash
    ( DiskHashRO
    , DiskHashRW
    , htOpenRO
    , htLoadRO
    , htOpenRW
    , withDiskHashRW
    , htLookupRO
    , htLookupRW
    , htSizeRW
    , htSizeRO
    , htInsert
    , htModify
    , htReserve
    ) where

import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as B8
import Control.Exception (bracket, throwIO)
import Control.Monad (when)
import System.IO.Unsafe (unsafeDupablePerformIO)
import Foreign.Ptr (Ptr, FunPtr, castPtr, nullPtr)
import Foreign.ForeignPtr (ForeignPtr, newForeignPtr, withForeignPtr, finalizeForeignPtr)
import Foreign.Storable (Storable(..))
import Foreign.Marshal.Alloc (alloca, free)
import Foreign.C.Types (CInt(..), CSize(..))
import Foreign.C.String (CString, peekCString)

-- | Handle to the C @HashTable@ structure; its finalizer is @dht_free@.
type HashTable_t = ForeignPtr ()

-- | Represents a read-only diskhash storing type 'a'
newtype DiskHashRO a = DiskHashRO HashTable_t

-- | Represents a read-write diskhash storing type 'a'
newtype DiskHashRW a = DiskHashRW HashTable_t

-- Raw bindings to the C library. Functions taking a 'Ptr CString' report
-- errors by storing a heap-allocated message through that pointer.
foreign import ccall "dht_open2" c_dht_open2:: CString -> CInt -> CInt -> CInt -> Ptr CString -> IO (Ptr ())
foreign import ccall "dht_lookup" c_dht_lookup :: Ptr () -> CString -> IO (Ptr ())
-- dht_reserve takes and returns a size_t in C (0 signals failure). Declaring
-- the result as '()' made the returned capacity unobservable from Haskell.
foreign import ccall "dht_reserve" c_dht_reserve :: Ptr () -> CSize -> Ptr CString -> IO CSize
foreign import ccall "dht_insert" c_dht_insert :: Ptr () -> CString -> Ptr () -> Ptr CString -> IO CInt
foreign import ccall "dht_size" c_dht_size :: Ptr () -> IO CSize
foreign import ccall "dht_load_to_memory" c_dht_load_to_memory :: Ptr () -> Ptr CString -> IO CInt
foreign import ccall "&dht_free" c_dht_free_p :: FunPtr (Ptr () -> IO ())

-- | Internal function to handle error message interface
--
-- If argument points to NULL, then return "No message"
-- Otherwise, return its contents and release memory
getError :: Ptr CString -> IO String
getError err = do
    err' <- peek err
    if err' == nullPtr
        then return "No message"
        else do
            m <- peekCString err'
            -- the message was allocated by the C side; we own it now
            free err'
            return m

-- | open a hash table in read-write mode
htOpenRW :: forall a. (Storable a) => FilePath
                        -- ^ file path
                        -> Int
                        -- ^ maximum key size
                        -> IO (DiskHashRW a)
htOpenRW fpath maxk = DiskHashRW <$> open' (undefined :: a) fpath maxk rwCreateFlags False
    where
        -- Passed verbatim as the @flags@ argument of open(2).
        -- NOTE(review): 66 is O_RDWR .|. O_CREAT with the Linux values
        -- (2 .|. 64); other platforms may number O_CREAT differently - confirm.
        rwCreateFlags = 66

-- | open a hash table in read-only mode
--
-- The 'maxk' argument can be 0, in which case the value of the maximum key
-- will be taken from the disk file. If not zero, then it is checked against
-- the value on disk and an exception is raised if there is a mismatch.
htOpenRO :: forall a. (Storable a) => FilePath
                        -- ^ file path
                        -> Int
                        -- ^ maximum key size
                        -> IO (DiskHashRO a)
htOpenRO fpath maxk = DiskHashRO <$> open' (undefined :: a) fpath maxk 0 False

-- | open a hash table in read-only mode and load it into memory
--
-- The 'maxk' argument can be 0, in which case the value of the maximum key
-- will be taken from the disk file. If not zero, then it is checked against
-- the value on disk and an exception is raised if there is a mismatch.
--
-- @since 0.0.4.0
htLoadRO :: forall a. (Storable a) => FilePath
                        -- ^ file path
                        -> Int
                        -- ^ maximum key size
                        -> IO (DiskHashRO a)
htLoadRO fpath maxk = DiskHashRO <$> open' (undefined :: a) fpath maxk 0 True

-- | Shared implementation of the @htOpen*@ / 'htLoadRO' functions.
--
-- The first argument is never evaluated; it only selects the 'Storable'
-- instance whose 'sizeOf' is passed to C as the object size. @flags@ is handed
-- to the C side unchanged, and @load@ requests a call to @dht_load_to_memory@
-- once the table is open.
--
-- Throws a 'userError' if the table cannot be opened or loaded.
open' :: forall a. (Storable a) => a -> FilePath -> Int -> CInt -> Bool -> IO HashTable_t
open' unused fpath maxk flags load = B.useAsCString (B8.pack fpath) $ \fpath' ->
    alloca $ \err -> do
        poke err nullPtr
        ht <- c_dht_open2 fpath' (fromIntegral maxk) (fromIntegral $ sizeOf unused) flags err
        if ht == nullPtr
            then do
                errmsg <- getError err
                throwIO $ userError ("Could not open hash table: " ++ show errmsg)
            else do
                when load $ do
                    e <- c_dht_load_to_memory ht err
                    -- A return of 2 means the C side has already released the
                    -- table, so no finalizer may be attached to it.
                    when (e == 2) $ do
                        errmsg <- getError err
                        throwIO $ userError ("Could not load hash table into memory: " ++ show errmsg)
                newForeignPtr c_dht_free_p ht

-- | Open a hash table in read-write mode and pass it to an action
--
-- Once the action is complete, the hashtable is closed (and sync'ed to disk).
-- The table is also closed if the action throws an exception.
withDiskHashRW :: (Storable a) => FilePath
                        -- ^ file path
                        -> Int
                        -- ^ maximum key size
                        -> (DiskHashRW a -> IO b) -> IO b
withDiskHashRW fp s = bracket (htOpenRW fp s) close
    where
        close (DiskHashRW ht') = finalizeForeignPtr ht'


-- | Retrieve the size of the hash table
htSizeRW :: DiskHashRW a -> IO Int
htSizeRW (DiskHashRW ht) = withForeignPtr ht $ \ht' -> fromIntegral <$> (c_dht_size ht')

-- | Retrieve the size of the hash table
--
-- Pure, as a read-only table is not modified through this API.
htSizeRO :: DiskHashRO a -> Int
htSizeRO (DiskHashRO ht) = unsafeDupablePerformIO (htSizeRW (DiskHashRW ht))


-- | insert an element into the hash table
--
-- Returns whether an insertion took place (if an object with that key already
-- exists, no insertion is made).
--
-- This operation can fail (throwing an exception) if space could not be
-- allocated. You can pre-allocate space using 'htReserve'.
--
htInsert :: (Storable a) => B.ByteString
                    -- ^ key
                    -> a
                    -- ^ value
                    -> DiskHashRW a
                    -- ^ hash table
                    -> IO Bool
                    -- ^ True if inserted, False if not
htInsert key val (DiskHashRW ht) =
        withForeignPtr ht $ \ht' ->
            B.useAsCString key $ \key' ->
                alloca $ \val' ->
                    alloca $ \err -> do
                        poke err nullPtr
                        poke val' val
                        r <- c_dht_insert ht' key' (castPtr val') err
                        case r of
                            1 -> return True
                            0 -> return False
                            -- The C function reports failures as negated errno
                            -- values (-EACCES, -EINVAL, -ENOMEM), not as -1.
                            _ | r < 0 -> do
                                    errmsg <- getError err
                                    throwIO $ userError ("insertion failed ("++errmsg++")")
                              | otherwise -> do
                                    errmsg <- getError err
                                    throwIO $ userError ("Unexpected return from dht_insert: " ++ errmsg)

-- | Lookup by key
--
-- This is in the IO Monad to ensure ordering of operations.
htLookupRW :: (Storable a) => B.ByteString
                            -- ^ key
                            -> DiskHashRW a
                            -> IO (Maybe a)
htLookupRW key (DiskHashRW ht) =
    withForeignPtr ht $ \ht' ->
        B.useAsCString key $ \key' -> do
            r <- c_dht_lookup ht' key'
            if r == nullPtr
                then return Nothing
                -- r points at the stored object inside the table
                else Just <$> peek (castPtr r)

-- | Lookup by key
--
-- This is a pure operation
htLookupRO :: (Storable a) => B.ByteString -> DiskHashRO a -> Maybe a
htLookupRO key (DiskHashRO ht) = unsafeDupablePerformIO (htLookupRW key (DiskHashRW ht))

-- | Modify a value
--
-- Applies the function to the value stored under the key and writes the result
-- back in place. Returns 'False' (and changes nothing) if the key is absent.
htModify :: (Storable a) => B.ByteString -> (a -> a) -> DiskHashRW a -> IO Bool
htModify key f (DiskHashRW ht) =
    withForeignPtr ht $ \ht' ->
        B.useAsCString key $ \key' -> do
            r <- castPtr <$> c_dht_lookup ht' key'
            if r == nullPtr
                then return False
                else do
                    val <- peek r
                    poke r (f val)
                    return True

-- |
Reserve space in the hash table 219 | -- 220 | -- Reserving space can ensure that any subsequent 'htInsert' calls will not fail. 221 | -- 222 | -- If the operation fails, an exception is raised 223 | htReserve :: (Storable a) => Int -> DiskHashRW a -> IO Int 224 | htReserve cap (DiskHashRW ht) = 225 | withForeignPtr ht $ \ht' -> 226 | alloca $ \err -> do 227 | poke err nullPtr 228 | cap' <- fromEnum <$> c_dht_reserve ht' (fromIntegral cap) err 229 | if cap' == 0 230 | then do 231 | errmsg <- getError err 232 | throwIO . userError $ "Could not change capacity: " ++ errmsg 233 | else return cap' 234 | 235 | -------------------------------------------------------------------------------- /src/diskhash.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | #include "diskhash.h" 16 | #include "primes.h" 17 | #include "rtable.h" 18 | 19 | enum { 20 | HT_FLAG_CAN_WRITE = 1, 21 | HT_FLAG_HASH_2 = 2, 22 | HT_FLAG_IS_LOADED = 4, 23 | }; 24 | 25 | typedef struct HashTableHeader { 26 | char magic[16]; 27 | HashTableOpts opts_; 28 | size_t cursize_; 29 | size_t slots_used_; 30 | } HashTableHeader; 31 | 32 | typedef struct HashTableEntry { 33 | const char* ht_key; 34 | void* ht_data; 35 | } HashTableEntry; 36 | 37 | static 38 | uint64_t hash_key(const char* k, int use_hash_2) { 39 | /* Taken from http://www.cse.yorku.ca/~oz/hash.html */ 40 | const unsigned char* ku = (const unsigned char*)k; 41 | uint64_t hash = 5381; 42 | uint64_t next; 43 | for ( ; *ku; ++ku) { 44 | hash *= 33; 45 | next = *ku; 46 | if (use_hash_2) { 47 | next = rtable[next]; 48 | } 49 | hash ^= next; 50 | } 51 | return hash; 52 | } 53 | 54 | inline static 55 | size_t aligned_size(size_t s) { 56 | size_t s_8bytes = s & ~0x7; 57 | return s_8bytes == s ? 
s : (s_8bytes + 8); 58 | } 59 | 60 | inline static 61 | HashTableHeader* header_of(HashTable* ht) { 62 | return (HashTableHeader*)ht->data_; 63 | } 64 | 65 | inline static 66 | const HashTableHeader* cheader_of(const HashTable* ht) { 67 | return (const HashTableHeader*)ht->data_; 68 | } 69 | 70 | inline static 71 | int is_64bit(const HashTable* ht) { 72 | return cheader_of(ht)->cursize_ > (1L << 32); 73 | } 74 | 75 | inline static 76 | size_t node_size_opts(HashTableOpts opts) { 77 | return aligned_size(opts.key_maxlen + 1) + aligned_size(opts.object_datalen); 78 | } 79 | 80 | inline static 81 | size_t node_size(const HashTable* ht) { 82 | return node_size_opts(cheader_of(ht)->opts_); 83 | } 84 | 85 | inline static 86 | int entry_empty(const HashTableEntry et) { 87 | return !et.ht_key; 88 | } 89 | 90 | void* hashtable_of(HashTable* ht) { 91 | return (unsigned char*)ht->data_ + sizeof(HashTableHeader); 92 | } 93 | 94 | 95 | static 96 | uint64_t get_table_at(const HashTable* ht, uint64_t ix) { 97 | assert(ix < cheader_of(ht)->cursize_); 98 | if (is_64bit(ht)) { 99 | uint64_t* table = (uint64_t*)hashtable_of((HashTable*)ht); 100 | return table[ix]; 101 | } else { 102 | uint32_t* table = (uint32_t*)hashtable_of((HashTable*)ht); 103 | return table[ix]; 104 | } 105 | } 106 | 107 | static 108 | void set_table_at(HashTable* ht, uint64_t ix, const uint64_t val) { 109 | if (is_64bit(ht)) { 110 | uint64_t* table = (uint64_t*)hashtable_of(ht); 111 | table[ix] = val; 112 | } else { 113 | uint32_t* table = (uint32_t*)hashtable_of(ht); 114 | table[ix] = val; 115 | } 116 | } 117 | 118 | void show_ht(const HashTable* ht) { 119 | fprintf(stderr, "HT {\n" 120 | "\tmagic = \"%s\",\n" 121 | "\tcursize = %d,\n" 122 | "\tslots used = %ld\n" 123 | "\n", cheader_of(ht)->magic, (int)cheader_of(ht)->cursize_, cheader_of(ht)->slots_used_); 124 | 125 | uint64_t i; 126 | for (i = 0; i < cheader_of(ht)->cursize_; ++i) { 127 | fprintf(stderr, "\tTable [ %d ] = %d\n",(int)i, (int)get_table_at(ht, 
i)); 128 | } 129 | fprintf(stderr, "}\n"); 130 | } 131 | 132 | static 133 | HashTableEntry entry_at(const HashTable* ht, size_t ix) { 134 | ix = get_table_at(ht, ix); 135 | HashTableEntry r; 136 | if (ix == 0) { 137 | r.ht_key = 0; 138 | r.ht_data = 0; 139 | return r; 140 | } 141 | --ix; 142 | const size_t sizeof_table_elem = is_64bit(ht) ? sizeof(uint64_t) : sizeof(uint32_t); 143 | const char* node_data = (const char*)ht->data_ 144 | + sizeof(HashTableHeader) 145 | + cheader_of(ht)->cursize_ * sizeof_table_elem; 146 | r.ht_key = node_data + ix * node_size(ht); 147 | r.ht_data = (void*)( node_data + ix * node_size(ht) + aligned_size(cheader_of(ht)->opts_.key_maxlen + 1) ); 148 | return r; 149 | } 150 | 151 | HashTableOpts dht_zero_opts() { 152 | HashTableOpts r; 153 | r.key_maxlen = 0; 154 | r.object_datalen = 0; 155 | return r; 156 | } 157 | 158 | HashTable* dht_open(const char* fpath, HashTableOpts opts, int flags, char** err) { 159 | if (!fpath || !*fpath) return NULL; 160 | const int fd = open(fpath, flags, 0644); 161 | int needs_init = 0; 162 | if (fd < 0) { 163 | if (err) { *err = strdup("open call failed."); } 164 | return NULL; 165 | } 166 | HashTable* rp = (HashTable*)malloc(sizeof(HashTable)); 167 | if (!rp) { 168 | if (err) { *err = NULL; } 169 | return NULL; 170 | } 171 | rp->fd_ = fd; 172 | rp->fname_ = strdup(fpath); 173 | if (!rp->fname_) { 174 | if (err) { *err = NULL; } 175 | close(rp->fd_); 176 | free(rp); 177 | return NULL; 178 | } 179 | struct stat st; 180 | fstat(rp->fd_, &st); 181 | rp->datasize_ = st.st_size; 182 | if (rp->datasize_ == 0) { 183 | needs_init = 1; 184 | rp->datasize_ = sizeof(HashTableHeader) + 7 * sizeof(uint32_t) + 3 * node_size_opts(opts); 185 | if (ftruncate(fd, rp->datasize_) < 0) { 186 | if (err) { 187 | *err = malloc(256); 188 | if (*err) { 189 | snprintf(*err, 256, "Could not allocate disk space. 
Error: %s.", strerror(errno)); 190 | } 191 | } 192 | close(rp->fd_); 193 | free((char*)rp->fname_); 194 | free(rp); 195 | return NULL; 196 | } 197 | } 198 | rp->flags_ = HT_FLAG_HASH_2; 199 | const int prot = (flags == O_RDONLY) ? 200 | PROT_READ 201 | : PROT_READ|PROT_WRITE; 202 | if (prot & PROT_WRITE) rp->flags_ |= HT_FLAG_CAN_WRITE; 203 | rp->data_ = mmap(NULL, 204 | rp->datasize_, 205 | prot, 206 | MAP_SHARED, 207 | rp->fd_, 208 | 0); 209 | if (rp->data_ == MAP_FAILED) { 210 | if (err) { *err = strdup("mmap() call failed."); } 211 | close(rp->fd_); 212 | free((char*)rp->fname_); 213 | free(rp); 214 | return NULL; 215 | } 216 | if (needs_init) { 217 | strcpy(header_of(rp)->magic, "DiskBasedHash11"); 218 | header_of(rp)->opts_ = opts; 219 | header_of(rp)->cursize_ = 7; 220 | header_of(rp)->slots_used_ = 0; 221 | } else if (strcmp(header_of(rp)->magic, "DiskBasedHash11")) { 222 | if (!strcmp(header_of(rp)->magic, "DiskBasedHash10")) { 223 | rp->flags_ &= ~HT_FLAG_HASH_2; 224 | } else { 225 | char start[16]; 226 | strncpy(start, header_of(rp)->magic, 14); 227 | start[13] = '\0'; 228 | if (!strcmp(start, "DiskBasedHash")) { 229 | if (err) { *err = strdup("Version mismatch. 
This code can only load version 1.0 or 1.1."); } 230 | } else { 231 | if (err) { *err = strdup("No magic number found."); } 232 | } 233 | dht_free(rp); 234 | return 0; 235 | } 236 | } else if ((header_of(rp)->opts_.key_maxlen != opts.key_maxlen && opts.key_maxlen != 0) 237 | || (header_of(rp)->opts_.object_datalen != opts.object_datalen && opts.object_datalen != 0)) { 238 | if (err) { *err = strdup("Options mismatch (diskhash table on disk was not created with the same options used to open it)."); } 239 | dht_free(rp); 240 | return 0; 241 | } 242 | return rp; 243 | } 244 | 245 | int dht_load_to_memory(HashTable* ht, char** err) { 246 | if (ht->flags_ & HT_FLAG_CAN_WRITE) { 247 | if (err) *err = "Cannot call dht_load_to_memory on a read/write Diskhash"; 248 | return 1; 249 | } 250 | if (ht->flags_ & HT_FLAG_IS_LOADED) { 251 | if (err) *err = "dht_load_to_memory had already been called."; 252 | return 1; 253 | } 254 | munmap(ht->data_, ht->datasize_); 255 | ht->data_ = malloc(ht->datasize_); 256 | if (ht->data_) { 257 | size_t n = read(ht->fd_, ht->data_, ht->datasize_); 258 | if (n == ht->datasize_) return 0; 259 | else if (err) *err = "dht_load_to_memory: could not read data from file"; 260 | } else { 261 | if (err) *err = "dht_load_to_memory: could not allocate memory."; 262 | } 263 | free(ht->data_); 264 | fsync(ht->fd_); 265 | close(ht->fd_); 266 | free((char*)ht->fname_); 267 | free(ht); 268 | return 2; 269 | 270 | } 271 | 272 | void dht_free(HashTable* ht) { 273 | if (ht->flags_ & HT_FLAG_IS_LOADED) { 274 | free(ht->data_); 275 | } else { 276 | munmap(ht->data_, ht->datasize_); 277 | } 278 | fsync(ht->fd_); 279 | close(ht->fd_); 280 | free((char*)ht->fname_); 281 | free(ht); 282 | } 283 | 284 | char random_char(void) { 285 | const char* available = 286 | "0123456789" 287 | "abcdefghijklmnopqrstuvwxyz" 288 | "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; 289 | return available[rand() % (26*2 + 10)]; 290 | } 291 | 292 | 293 | char* generate_tempname_from(const char* base) { 294 
| char* res = (char*)malloc(strlen(base) + 21); 295 | if (!res) return NULL; 296 | strcpy(res, base); 297 | char* p = res; 298 | while (*p) ++p; 299 | *p++ = '.'; 300 | int i; 301 | for (i = 0; i < 19; ++i) { 302 | *p++ = random_char(); 303 | } 304 | *p = 0; 305 | return res; 306 | } 307 | 308 | size_t dht_reserve(HashTable* ht, size_t cap, char** err) { 309 | if (!(ht->flags_ & HT_FLAG_CAN_WRITE)) { 310 | if (err) { *err = strdup("Hash table is read-only. Cannot call dht_reserve."); } 311 | return -EACCES; 312 | } 313 | if (header_of(ht)->cursize_ / 2 > cap) { 314 | return header_of(ht)->cursize_ / 2; 315 | } 316 | const uint64_t starting_slots = cheader_of(ht)->slots_used_; 317 | const uint64_t min_slots = cap * 2 + 1; 318 | uint64_t i = 0; 319 | while (primes[i] && primes[i] < min_slots) ++i; 320 | const uint64_t n = primes[i]; 321 | cap = n / 2; 322 | const size_t sizeof_table_elem = is_64bit(ht) ? sizeof(uint64_t) : sizeof(uint32_t); 323 | const size_t total_size = sizeof(HashTableHeader) + n * sizeof_table_elem + cap * node_size(ht); 324 | 325 | HashTable* temp_ht = (HashTable*)malloc(sizeof(HashTable)); 326 | while (1) { 327 | temp_ht->fname_ = generate_tempname_from(ht->fname_); 328 | if (!temp_ht->fname_) { 329 | if (err) { *err = NULL; } 330 | free(temp_ht); 331 | return 0; 332 | } 333 | temp_ht->fd_ = open(temp_ht->fname_, O_EXCL | O_CREAT | O_RDWR, 0600 ); 334 | if (temp_ht->fd_) break; 335 | free((char*)temp_ht->fname_); 336 | } 337 | if (ftruncate(temp_ht->fd_, total_size) < 0) { 338 | if (err) { 339 | *err = malloc(256); 340 | if (*err) { 341 | snprintf(*err, 256, "Could not allocate disk space. 
Error: %s.", strerror(errno)); 342 | } 343 | } 344 | free((char*)temp_ht->fname_); 345 | free(temp_ht); 346 | return 0; 347 | } 348 | temp_ht->datasize_ = total_size; 349 | temp_ht->data_ = mmap(NULL, 350 | temp_ht->datasize_, 351 | PROT_READ|PROT_WRITE, 352 | MAP_SHARED, 353 | temp_ht->fd_, 354 | 0); 355 | temp_ht->flags_ = ht->flags_; 356 | if (temp_ht->data_ == MAP_FAILED) { 357 | if (err) { 358 | const int errorbufsize = 512; 359 | *err = (char*)malloc(errorbufsize); 360 | if (*err) { 361 | snprintf(*err, errorbufsize, "Could not mmap() new hashtable: %s.\n", strerror(errno)); 362 | } 363 | } 364 | close(temp_ht->fd_); 365 | unlink(temp_ht->fname_); 366 | free((char*)temp_ht->fname_); 367 | free(temp_ht); 368 | return 0; 369 | } 370 | memcpy(header_of(temp_ht), header_of(ht), sizeof(HashTableHeader)); 371 | header_of(temp_ht)->cursize_ = n; 372 | header_of(temp_ht)->slots_used_ = 0; 373 | 374 | if (!strcmp(header_of(temp_ht)->magic, "DiskBasedHash10")) { 375 | strcpy(header_of(temp_ht)->magic, "DiskBasedHash11"); 376 | temp_ht->flags_ |= HT_FLAG_HASH_2; 377 | } 378 | 379 | HashTableEntry et; 380 | for (i = 0; i < header_of(ht)->slots_used_; ++i) { 381 | set_table_at(ht, 0, i + 1); 382 | et = entry_at(ht, 0); 383 | dht_insert(temp_ht, et.ht_key, et.ht_data, NULL); 384 | } 385 | 386 | char* temp_fname = strdup(temp_ht->fname_); 387 | if (!temp_fname) { 388 | if (err) { *err = NULL; } 389 | unlink(temp_ht->fname_); 390 | dht_free(temp_ht); 391 | return 0; 392 | } 393 | 394 | dht_free(temp_ht); 395 | const HashTableOpts opts = header_of(ht)->opts_; 396 | 397 | munmap(ht->data_, ht->datasize_); 398 | close(ht->fd_); 399 | 400 | rename(temp_fname, ht->fname_); 401 | free(temp_fname); 402 | 403 | temp_ht = dht_open(ht->fname_, opts, O_RDWR, err); 404 | if (!temp_ht) { 405 | /* err is set by dht_open */ 406 | return 0; 407 | } 408 | free((char*)ht->fname_); 409 | memcpy(ht, temp_ht, sizeof(HashTable)); 410 | free(temp_ht); 411 | assert(starting_slots == 
cheader_of(ht)->slots_used_); 412 | return cap; 413 | } 414 | 415 | size_t dht_size(const HashTable* ht) { 416 | return cheader_of(ht)->slots_used_; 417 | } 418 | 419 | void* dht_lookup(const HashTable* ht, const char* key) { 420 | uint64_t h = hash_key(key, ht->flags_ & HT_FLAG_HASH_2) % cheader_of(ht)->cursize_; 421 | uint64_t i; 422 | for (i = 0; i < cheader_of(ht)->cursize_; ++i) { 423 | HashTableEntry et = entry_at(ht, h); 424 | if (!et.ht_key) return NULL; 425 | if (!strcmp(et.ht_key, key)) return et.ht_data; 426 | ++h; 427 | if (h == cheader_of(ht)->cursize_) h = 0; 428 | } 429 | fprintf(stderr, "dht_lookup: the code should never have reached this line.\n"); 430 | return NULL; 431 | } 432 | 433 | int dht_insert(HashTable* ht, const char* key, const void* data, char** err) { 434 | if (!(ht->flags_ & HT_FLAG_CAN_WRITE)) { 435 | if (err) { *err = strdup("Hash table is read-only. Cannot insert."); } 436 | return -EACCES; 437 | } 438 | if (strlen(key) >= header_of(ht)->opts_.key_maxlen) { 439 | if (err) { *err = strdup("Key is too long"); } 440 | return -EINVAL; 441 | } 442 | /* Max load is 50% */ 443 | if (cheader_of(ht)->cursize_ / 2 <= cheader_of(ht)->slots_used_) { 444 | if (!dht_reserve(ht, cheader_of(ht)->slots_used_ + 1, err)) return -ENOMEM; 445 | } 446 | uint64_t h = hash_key(key, ht->flags_ & HT_FLAG_HASH_2) % cheader_of(ht)->cursize_; 447 | while (1) { 448 | HashTableEntry et = entry_at(ht, h); 449 | if (entry_empty(et)) break; 450 | if (!strcmp(et.ht_key, key)) { 451 | return 0; 452 | } 453 | ++h; 454 | if (h == cheader_of(ht)->cursize_) { 455 | h = 0; 456 | } 457 | } 458 | set_table_at(ht, h, header_of(ht)->slots_used_ + 1); 459 | ++header_of(ht)->slots_used_; 460 | HashTableEntry et = entry_at(ht, h); 461 | 462 | strcpy((char*)et.ht_key, key); 463 | memcpy(et.ht_data, data, cheader_of(ht)->opts_.object_datalen); 464 | 465 | return 1; 466 | } 467 | 468 | --------------------------------------------------------------------------------